遗传分解hgvs，没有p. 采用c.

hgvs 采用标准转录本
污染添加17和160图
2024-02-26 10:07:00 +08:00 · 2024-02-26 10:06:23 +08:00 · 2024-02-26 10:05:47 +08:00
4 changed files with 73 additions and 16 deletions
--- a/codes/filter_snpindel.pl
+++ b/codes/filter_snpindel.pl
@ -189,18 +189,22 @@ while (<IN>) {
            push @reason, 'not_need_spl_inron';
        }
        my @hgvs = split(/,/, $line[9]);
-        my $hgvs = $hgvs[0];
+        # my $hgvs = $hgvs[0];
        my $transcript_gene;
        $transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
        my $hgvs;
        if (grep {/$transcript_gene/} @hgvs) {
            $hgvs = (grep {/$transcript_gene/} @hgvs)[0];
-        }
+            $line[9] = $hgvs;
        $line[9] = $hgvs;
-        $hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
+            $hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
-        $protein = $3;
+            $protein = $3;
-        if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
+            if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
-            $protein = 'Truncating Mutations';
+                $protein = 'Truncating Mutations';
            }
        }
        else {
            push @reason, 'not_correct_hgvs';
        }
    }
--- a/codes/hereditary.py
+++ b/codes/hereditary.py
@ -6,6 +6,32 @@ import re
 import pandas as pd
 def split_hgvs(hgvs):
    hgvs_split = hgvs.split(':')
    if len(hgvs_split) == 4:
        gene, position, transcript_version, coordinate_type = hgvs_split
        # pattern = r'c\.\d+([\+\-])[12]\D+>\D+'
        # match = re.search(pattern, coordinate_type)
        # # if match:
        # #     transcript_version =
        # #     if match.group(1) == '-':X
        variant_version = None
    elif len(hgvs_split) == 5:
        gene, position, transcript_version, coordinate_type, variant_version = hgvs_split
    else:
        raise ValueError(f'Invalid HGVS format{hgvs}')
    return {
        'gene': gene,
        'transcript': position,
        'exon': transcript_version,
        'nacid': coordinate_type,
        'aacid': variant_version
    }
 class HereditaryRun:
    def __init__(self, database, project, output_dir, name, file):
@ -30,21 +56,19 @@ class HereditaryRun:
        result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
        for _, rows in data.iterrows():
-            matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
+            # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
-            row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
+            matches = split_hgvs(rows['AAChange_refGene'])
-            gene, mutation = '', ''
+            gene = matches['gene']
-            if matches:
+            aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
-                gene = matches.group(1)
+            row_df = pd.DataFrame(
-                mutation = matches.group(2)
+                columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
            else:
                raise UserWarning('HGVS 解析错误！')
            selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]
            row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
            row_df['inheritance'] = selected_rows['inheritance']
            row_df['Gene'] = gene
-            row_df['mutation'] = mutation
+            row_df['mutation'] = aacid
            row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
            row_df['ClinicalSign'] = str(rows['ClinicalSign'])
--- a/codes/pollution.py
+++ b/codes/pollution.py
@ -46,9 +46,13 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
    bed_regions = load_bed_regions(bed_file)
    vcf = pysam.VariantFile(vcf_file)
    res_pos = list()
    count_normal = 0
    count_exception = 0
    vcf_out = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
    for line in str(vcf.header).strip().split('\n'):
        vcf_out.write(line + '\n')
    for record in vcf:
        contig = record.contig
@ -79,7 +83,9 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
                        freq=freq,
                        res=res
                    ))
                    vcf_out.write(str(record) + '\n')
                    break
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
--- a/wdl/pollution.wdl
+++ b/wdl/pollution.wdl
@ -61,6 +61,19 @@ task run_generate_png {
    }
 }
 task run_single_generate_png {
    String name
    String probe
    String cnvkit_tumor_vcf
    String cnv_cnr
    String cnv_cns
    String output_dir
    command {
        cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_tumor_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_tumor.png
    }
 }
 workflow call_pollution {
    Boolean run=true
@ -114,6 +127,16 @@ workflow call_pollution {
                    probe=probe,
                    vcf=initial_vcf
            }
            call run_single_generate_png {
                input:
                    name=tumor,
                    probe=probe,
                    cnvkit_tumor_vcf=run_pollution_paired.cnvkit_tumor_vcf,
                    cnv_cnr=cnv_cnr,
                    cnv_cns=cnv_cns,
                    output_dir=output_dir
            }
        }
    }
Author	SHA1	Message	Date
chaopower	fb36b97329	遗传分解hgvs，没有p. 采用c.	2024-02-26 10:07:00 +08:00
chaopower	778d55ed5b	hgvs 采用标准转录本	2024-02-26 10:06:23 +08:00
chaopower	157fa94ef8	污染添加17和160图	2024-02-26 10:05:47 +08:00