4 changed files with 16 additions and 73 deletions
--- a/codes/filter_snpindel.pl
+++ b/codes/filter_snpindel.pl
@@ -189,12 +189,12 @@ while (<IN>) {
            push @reason, 'not_need_spl_inron';
        }
        my @hgvs = split(/,/, $line[9]);
-        # my $hgvs = $hgvs[0];
+        my $hgvs = $hgvs[0];
        my $transcript_gene;
        $transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
-        my $hgvs;
        if (grep {/$transcript_gene/} @hgvs) {
            $hgvs = (grep {/$transcript_gene/} @hgvs)[0];
+        }
        $line[9] = $hgvs;

        $hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
@@ -202,10 +202,6 @@ while (<IN>) {
        if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
            $protein = 'Truncating Mutations';
        }
-        }
-        else {
-            push @reason, 'not_correct_hgvs';
-        }

    }

--- a/codes/hereditary.py
+++ b/codes/hereditary.py
@@ -6,32 +6,6 @@ import re
 import pandas as pd


-def split_hgvs(hgvs):
-    hgvs_split = hgvs.split(':')
-    if len(hgvs_split) == 4:
-        gene, position, transcript_version, coordinate_type = hgvs_split
-
-        # pattern = r'c\.\d+([\+\-])[12]\D+>\D+'
-        # match = re.search(pattern, coordinate_type)
-        # # if match:
-        # #     transcript_version =
-        # #     if match.group(1) == '-':X
-
-        variant_version = None
-    elif len(hgvs_split) == 5:
-        gene, position, transcript_version, coordinate_type, variant_version = hgvs_split
-    else:
-        raise ValueError(f'Invalid HGVS format{hgvs}')
-
-    return {
-        'gene': gene,
-        'transcript': position,
-        'exon': transcript_version,
-        'nacid': coordinate_type,
-        'aacid': variant_version
-    }
-
-
 class HereditaryRun:

    def __init__(self, database, project, output_dir, name, file):
@@ -56,19 +30,21 @@ class HereditaryRun:

        result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
        for _, rows in data.iterrows():
-            # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
-            matches = split_hgvs(rows['AAChange_refGene'])
-            gene = matches['gene']
-            aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
-            row_df = pd.DataFrame(
-                columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
+            matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
+            row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
+            gene, mutation = '', ''
+            if matches:
+                gene = matches.group(1)
+                mutation = matches.group(2)
+            else:
+                raise UserWarning('HGVS 解析错误！')

            selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]

            row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
            row_df['inheritance'] = selected_rows['inheritance']
            row_df['Gene'] = gene
-            row_df['mutation'] = aacid
+            row_df['mutation'] = mutation
            row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
            row_df['ClinicalSign'] = str(rows['ClinicalSign'])

--- a/codes/pollution.py
+++ b/codes/pollution.py
@@ -46,13 +46,9 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
    bed_regions = load_bed_regions(bed_file)

    vcf = pysam.VariantFile(vcf_file)
-
    res_pos = list()
    count_normal = 0
    count_exception = 0
-    vcf_out = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
-    for line in str(vcf.header).strip().split('\n'):
-        vcf_out.write(line + '\n')

    for record in vcf:
        contig = record.contig
@@ -83,9 +79,7 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
                        freq=freq,
                        res=res
                    ))
-                    vcf_out.write(str(record) + '\n')
                    break
-
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
--- a/wdl/pollution.wdl
+++ b/wdl/pollution.wdl
@@ -61,19 +61,6 @@ task run_generate_png {
    }
 }

-task run_single_generate_png {
-    String name
-    String probe
-    String cnvkit_tumor_vcf
-    String cnv_cnr
-    String cnv_cns
-    String output_dir
-
-    command {
-        cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_tumor_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_tumor.png
-    }
-}
-
 workflow call_pollution {

    Boolean run=true
@@ -127,16 +114,6 @@ workflow call_pollution {
                    probe=probe,
                    vcf=initial_vcf
            }
-            call run_single_generate_png {
-                input:
-                    name=tumor,
-                    probe=probe,
-                    cnvkit_tumor_vcf=run_pollution_paired.cnvkit_tumor_vcf,
-                    cnv_cnr=cnv_cnr,
-                    cnv_cns=cnv_cns,
-                    output_dir=output_dir
-
-            }
        }
    }