4 changed files with 16 additions and 73 deletions
--- a/codes/filter_snpindel.pl
+++ b/codes/filter_snpindel.pl
@ -189,12 +189,12 @@ while (<IN>) {
            push @reason, 'not_need_spl_inron';
        }
        my @hgvs = split(/,/, $line[9]);
-        # my $hgvs = $hgvs[0];
+        my $hgvs = $hgvs[0];
        my $transcript_gene;
        $transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
        my $hgvs;
        if (grep {/$transcript_gene/} @hgvs) {
            $hgvs = (grep {/$transcript_gene/} @hgvs)[0];
        }
        $line[9] = $hgvs;
        $hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
@ -202,10 +202,6 @@ while (<IN>) {
        if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
            $protein = 'Truncating Mutations';
        }
        }
        else {
            push @reason, 'not_correct_hgvs';
        }
    }
--- a/codes/hereditary.py
+++ b/codes/hereditary.py
@ -6,32 +6,6 @@ import re
 import pandas as pd
 def split_hgvs(hgvs):
    hgvs_split = hgvs.split(':')
    if len(hgvs_split) == 4:
        gene, position, transcript_version, coordinate_type = hgvs_split
        # pattern = r'c\.\d+([\+\-])[12]\D+>\D+'
        # match = re.search(pattern, coordinate_type)
        # # if match:
        # #     transcript_version =
        # #     if match.group(1) == '-':X
        variant_version = None
    elif len(hgvs_split) == 5:
        gene, position, transcript_version, coordinate_type, variant_version = hgvs_split
    else:
        raise ValueError(f'Invalid HGVS format{hgvs}')
    return {
        'gene': gene,
        'transcript': position,
        'exon': transcript_version,
        'nacid': coordinate_type,
        'aacid': variant_version
    }
 class HereditaryRun:
    def __init__(self, database, project, output_dir, name, file):
@ -56,19 +30,21 @@ class HereditaryRun:
        result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
        for _, rows in data.iterrows():
-            # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
+            matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
-            matches = split_hgvs(rows['AAChange_refGene'])
+            row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
-            gene = matches['gene']
+            gene, mutation = '', ''
-            aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
+            if matches:
-            row_df = pd.DataFrame(
+                gene = matches.group(1)
-                columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
+                mutation = matches.group(2)
            else:
                raise UserWarning('HGVS 解析错误！')
            selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]
            row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
            row_df['inheritance'] = selected_rows['inheritance']
            row_df['Gene'] = gene
-            row_df['mutation'] = aacid
+            row_df['mutation'] = mutation
            row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
            row_df['ClinicalSign'] = str(rows['ClinicalSign'])
--- a/codes/pollution.py
+++ b/codes/pollution.py
@ -46,13 +46,9 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
    bed_regions = load_bed_regions(bed_file)
    vcf = pysam.VariantFile(vcf_file)
    res_pos = list()
    count_normal = 0
    count_exception = 0
    vcf_out = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
    for line in str(vcf.header).strip().split('\n'):
        vcf_out.write(line + '\n')
    for record in vcf:
        contig = record.contig
@ -83,9 +79,7 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
                        freq=freq,
                        res=res
                    ))
                    vcf_out.write(str(record) + '\n')
                    break
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
--- a/wdl/pollution.wdl
+++ b/wdl/pollution.wdl
@ -61,19 +61,6 @@ task run_generate_png {
    }
 }
 task run_single_generate_png {
    String name
    String probe
    String cnvkit_tumor_vcf
    String cnv_cnr
    String cnv_cns
    String output_dir
    command {
        cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_tumor_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_tumor.png
    }
 }
 workflow call_pollution {
    Boolean run=true
@ -127,16 +114,6 @@ workflow call_pollution {
                    probe=probe,
                    vcf=initial_vcf
            }
            call run_single_generate_png {
                input:
                    name=tumor,
                    probe=probe,
                    cnvkit_tumor_vcf=run_pollution_paired.cnvkit_tumor_vcf,
                    cnv_cnr=cnv_cnr,
                    cnv_cns=cnv_cns,
                    output_dir=output_dir
            }
        }
    }