遗传分解hgvs，没有p. 采用c.

hgvs 采用标准转录本
污染添加17和160图
2024-02-26 10:07:00 +08:00 · 2024-02-26 10:06:23 +08:00 · 2024-02-26 10:05:47 +08:00
4 changed files with 73 additions and 16 deletions
--- a/codes/filter_snpindel.pl
+++ b/codes/filter_snpindel.pl
@ -189,12 +189,12 @@ while (<IN>) {
            push @reason, 'not_need_spl_inron';
        }
        my @hgvs = split(/,/, $line[9]);
-        my $hgvs = $hgvs[0];
+        # my $hgvs = $hgvs[0];
        my $transcript_gene;
        $transcript_gene = $transcript{$gene} if (exists $transcript{$gene});
+        my $hgvs;
        if (grep {/$transcript_gene/} @hgvs) {
            $hgvs = (grep {/$transcript_gene/} @hgvs)[0];
-        }
            $line[9] = $hgvs;

            $hgvs =~ /:(NM_\d+):exon\d+:(c\.\S+):p\.(\S+)$/;
@ -202,6 +202,10 @@ while (<IN>) {
            if ($protein =~ /\d+X$|\d+\*$/ or $line[8] eq 'stopgain' or $line[8] eq 'frameshift deletion' or $line[8] eq 'frameshift insertion') {
                $protein = 'Truncating Mutations';
            }
+        }
+        else {
+            push @reason, 'not_correct_hgvs';
+        }

    }

--- a/codes/hereditary.py
+++ b/codes/hereditary.py
@ -6,6 +6,32 @@ import re
 import pandas as pd


+def split_hgvs(hgvs):
+    hgvs_split = hgvs.split(':')
+    if len(hgvs_split) == 4:
+        gene, position, transcript_version, coordinate_type = hgvs_split
+
+        # pattern = r'c\.\d+([\+\-])[12]\D+>\D+'
+        # match = re.search(pattern, coordinate_type)
+        # # if match:
+        # #     transcript_version =
+        # #     if match.group(1) == '-':X
+
+        variant_version = None
+    elif len(hgvs_split) == 5:
+        gene, position, transcript_version, coordinate_type, variant_version = hgvs_split
+    else:
+        raise ValueError(f'Invalid HGVS format{hgvs}')
+
+    return {
+        'gene': gene,
+        'transcript': position,
+        'exon': transcript_version,
+        'nacid': coordinate_type,
+        'aacid': variant_version
+    }
+
+
 class HereditaryRun:

    def __init__(self, database, project, output_dir, name, file):
@ -30,21 +56,19 @@ class HereditaryRun:

        result_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation'])
        for _, rows in data.iterrows():
-            matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
-            row_df = pd.DataFrame(columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])
-            gene, mutation = '', ''
-            if matches:
-                gene = matches.group(1)
-                mutation = matches.group(2)
-            else:
-                raise UserWarning('HGVS 解析错误！')
+            # matches = re.match(r"([A-Za-z0-9]+):.*:(p\..*)", rows['AAChange_refGene'])
+            matches = split_hgvs(rows['AAChange_refGene'])
+            gene = matches['gene']
+            aacid = matches['aacid'] if matches['aacid'] else matches['nacid']
+            row_df = pd.DataFrame(
+                columns=['Gene', 'Syndrome_Cn', 'inheritance', 'genotype', 'mutation', 'ClinicalSign'])

            selected_rows = expanded_database[expanded_database['Gene'].str.split(';').apply(lambda x: gene in x)]

            row_df['Syndrome_Cn'] = selected_rows['Syndrome_Cn']
            row_df['inheritance'] = selected_rows['inheritance']
            row_df['Gene'] = gene
-            row_df['mutation'] = mutation
+            row_df['mutation'] = aacid
            row_df['genotype'] = '纯合' if rows['Freq'] > 0.9 else '杂合'
            row_df['ClinicalSign'] = str(rows['ClinicalSign'])

--- a/codes/pollution.py
+++ b/codes/pollution.py
@ -46,9 +46,13 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
    bed_regions = load_bed_regions(bed_file)

    vcf = pysam.VariantFile(vcf_file)
+
    res_pos = list()
    count_normal = 0
    count_exception = 0
+    vcf_out = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
+    for line in str(vcf.header).strip().split('\n'):
+        vcf_out.write(line + '\n')

    for record in vcf:
        contig = record.contig
@ -79,7 +83,9 @@ def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
                        freq=freq,
                        res=res
                    ))
+                    vcf_out.write(str(record) + '\n')
                    break
+
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
--- a/wdl/pollution.wdl
+++ b/wdl/pollution.wdl
@ -61,6 +61,19 @@ task run_generate_png {
    }
 }

+task run_single_generate_png {
+    String name
+    String probe
+    String cnvkit_tumor_vcf
+    String cnv_cnr
+    String cnv_cns
+    String output_dir
+
+    command {
+        cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_tumor_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_tumor.png
+    }
+}
+
 workflow call_pollution {

    Boolean run=true
@ -114,6 +127,16 @@ workflow call_pollution {
                    probe=probe,
                    vcf=initial_vcf
            }
+            call run_single_generate_png {
+                input:
+                    name=tumor,
+                    probe=probe,
+                    cnvkit_tumor_vcf=run_pollution_paired.cnvkit_tumor_vcf,
+                    cnv_cnr=cnv_cnr,
+                    cnv_cns=cnv_cns,
+                    output_dir=output_dir
+
+            }
        }
    }
Author	SHA1	Message	Date
chaopower	fb36b97329	遗传分解hgvs，没有p. 采用c.	2024-02-26 10:07:00 +08:00
chaopower	778d55ed5b	hgvs 采用标准转录本	2024-02-26 10:06:23 +08:00
chaopower	157fa94ef8	污染添加17和160图	2024-02-26 10:05:47 +08:00