流程更新

2023-12-25 14:06:30 +08:00 · 2023-12-25 14:06:30 +08:00 · bd66bc6612
parent c9f525b2bf
commit bd66bc6612
24 changed files with 254 additions and 162 deletions
--- a/codes/chemo.py
+++ b/codes/chemo.py
@ -1,4 +1,5 @@
 #! /usr/bin/env python3
+
 import argparse
 import logging
 import os
--- a/codes/drug_dedup.py
+++ b/codes/drug_dedup.py
@ -1,114 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: UTF-8 -*-
-
-import os
-import re
-import sys
-
-import pandas as pd
-
-if len(sys.argv) != 3:
-    print(" ".join(['usage:python3', sys.argv[0], 'output_dir', 'name']))
-    sys.exit()
-
-output_dir = sys.argv[1]
-name = sys.argv[2]
-snv_file = os.path.join(output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.txt')
-snv_file_new = os.path.join(output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.dedup.txt')
-
-fusion_file = os.path.join(output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.txt')
-fusion_file_new = os.path.join(output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.dedup.txt')
-
-cnv_file = os.path.join(output_dir, 'cnv', f'{name}.rmdup.cns.filter.pos.txt')
-cnv_file_new = os.path.join(output_dir, 'cnvkit', f'{name}.rmdup.cns.filter.pos.dedup.txt')
-
-# gm_snv_file = os.path.join(output_dir, '/mutation/', name, '.snvindel.Germline.pos.txt')
-# gm_snv_file_new = os.path.join(output_dir, '/mutation/', name, '.snvindel.Germline.pos.dedup.txt')
-
-open(snv_file_new, "w")
-open(fusion_file_new, "w")
-open(cnv_file_new, "w")
-# open(gm_snv_file_new, "w")
-##Evidence_Source_C及标签排序
-df_mapping_1 = pd.DataFrame({
-    'Evidence_Source_C': ['FDA', 'NMPA', 'NCCN', '临床III期', '临床II期', '临床I期', '临床试验', '回顾性研究', '个案', '临床前研究'],
-})
-sort_mapping_1 = df_mapping_1.reset_index().set_index('Evidence_Source_C')
-df_mapping_2 = pd.DataFrame({'标签': ['适应症', '非适应症', '.']})
-sort_mapping_2 = df_mapping_2.reset_index().set_index('标签')
-
-##snvindel处理
-snv_size = os.path.getsize(snv_file)
-if snv_size > 0:
-    data = pd.read_table(snv_file, sep="\t")
-    data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
-    data['level2'] = data['标签'].map(sort_mapping_2['index'])
-    data.sort_values(by=['AAChange.refGene', 'level2', 'level1'], ascending=True, inplace=True)
-    data.drop(['level1', 'level2'], axis=1, inplace=True)
-    info = {}
-    for index, row in data.iterrows():
-        if re.search(r'敏感', row['Response_Type_C']):
-            if row['标签'] == '适应症':
-                info[row['AAChange.refGene'] + row['Drug']] = '1'
-            else:
-                if (row['AAChange.refGene'] + row['Drug']) in info.keys():
-                    data.drop([index], inplace=True)
-    data.insert(0, '可信', 1)
-    data.to_csv(snv_file_new, index=False, sep='\t')
-
-# ##germline snv/indel处理
-# gm_snv_size = os.path.getsize(gm_snv_file)
-# if gm_snv_size > 0:
-#     data = pd.read_table(gm_snv_file, sep="\t")
-#     data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
-#     data['level2'] = data['标签'].map(sort_mapping_2['index'])
-#     data.sort_values(by=['AAChange.refGene', 'level2', 'level1'], ascending=True, inplace=True)
-#     data.drop(['level1', 'level2'], axis=1, inplace=True)
-#     info = {}
-#     for index, row in data.iterrows():
-#         if re.search(r'敏感', row['Response_Type_C']):
-#             if row['标签'] == '适应症':
-#                 info[row['AAChange.refGene'] + row['Drug']] = '1'
-#             else:
-#                 if (row['AAChange.refGene'] + row['Drug']) in info.keys():
-#                     data.drop([index], inplace=True)
-#     data.insert(0, '可信', 1)
-#     data.to_csv(gm_snv_file_new, index=False, sep='\t')
-
-##fusion处理
-fusion_size = os.path.getsize(fusion_file)
-if fusion_size > 0:
-    data = pd.read_table(fusion_file, sep="\t")
-    data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
-    data['level2'] = data['标签'].map(sort_mapping_2['index'])
-    data.sort_values(by=['FUSION', 'level2', 'level1'], ascending=True, inplace=True)
-    data.drop(['level1', 'level2'], axis=1, inplace=True)
-    info = {}
-    for index, row in data.iterrows():
-        if re.search(r'敏感', row['Response_Type_C']):
-            if row['标签'] == '适应症':
-                info[row['FUSION'] + row['Drug']] = '1'
-            else:
-                if (row['FUSION'] + row['Drug']) in info.keys():
-                    data.drop([index], inplace=True)
-    data.insert(0, '可信', 1)
-    data.to_csv(fusion_file_new, index=False, sep='\t')
-
-##cnv处理
-cnv_size = os.path.getsize(cnv_file)
-if cnv_size > 0:
-    data = pd.read_table(cnv_file, sep="\t")
-    data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
-    data['level2'] = data['标签'].map(sort_mapping_2['index'])
-    data.sort_values(by=['Gene_Symbol', 'level2', 'level1'], ascending=True, inplace=True)
-    data.drop(['level1', 'level2'], axis=1, inplace=True)
-    info = {}
-    for index, row in data.iterrows():
-        if re.search(r'敏感', row['Response_Type_C']):
-            if row['标签'] == '适应症':
-                info[row['Gene_Symbol'] + row['Drug']] = '1'
-            else:
-                if (row['Gene_Symbol'] + row['Drug']) in info.keys():
-                    data.drop([index], inplace=True)
-    data.insert(0, '可信', 1)
-    data.to_csv(cnv_file_new, index=False, sep='\t')
--- a/codes/filter_longindel.pl
+++ b/codes/filter_longindel.pl
@ -30,14 +30,15 @@ while (<IN>) {
    }
    my @line = split(/\t/);
    $line[7] =~ /Gene.refGene=(.*?);/;
-    if ((grep {$1 =~ /$_/} @longindels) && ($_ =~ /SVTYPE=DEL/ || $_ =~ /SVTYPE=DUP/ || $_ =~ /SVTYPE=INS/)) {
-        if ($1 eq "BCL2L11") {
+    my $gene = $1;
+    if ((grep {$gene =~ /$_/} @longindels) && ($_ =~ /SVTYPE=DEL/ || $_ =~ /SVTYPE=DUP/ || $_ =~ /SVTYPE=INS/)) {
+        if ($gene eq "BCL2L11") {
            if ($line[1] == '111883194') {
-                print LONGINDEL join("\n", @pos) . "\n";
+                print LONGINDEL $_;
            }
        }
        else {
-            print LONGINDEL join("\n", @pos) . "\n";
+            print LONGINDEL $_;
        }
    }
 }
--- a/codes/filter_snpindel.pl
+++ b/codes/filter_snpindel.pl
@ -1,4 +1,5 @@
 #!/usr/bin/env perl
+
 use strict;
 #use warnings;
 use List::Util qw(sum);
@ -131,12 +132,12 @@ while (<IN>) {
                $line[9] = join(":", ($gene, $hgvs));
            }
            elsif ($spl =~ /\d+\-[1|2]\D+/) {
-                my $intron = $exon-1;
+                my $intron = $exon - 1;
                $hgvs =~ s/exon(\d+)/intron$intron;exon$exon/;
                $line[9] = join(":", ($gene, $hgvs));
            }
            elsif ($gene eq "MET") {
-                $line[9] = join(":", ($gene, "exon14", "c.xxx"));
+                $line[9] = join(":", ($gene, "NM_000245", "exon14", "c.xxx"));
                $line[8] = 'skipping'
            }
            else {
--- a/codes/netchop.pl
+++ b/codes/netchop.pl
@ -0,0 +1,78 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use List::Util qw(min max);
+#max_length:最大的epitope长度
+die "usage:perl $0 outputDir tumor_prefix" if @ARGV != 2;
+
+my ($outputDir, $tumor_prefix) = @ARGV;
+
+open IN, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.fasta";
+my %fa;
+while (<IN>) {
+    if (/^>MT/) {
+        open OUT, ">$outputDir/neoantigen/MHC_Class_I/tmp.fa";
+        print OUT;
+        $_ =~ /MT\.(\d+)\./;
+        my $id = $1;
+        my $seq = <IN>;
+        print OUT $seq;
+        chomp $seq;
+        $fa{$id} = $seq;
+        system "predict.py -m netchop -n $outputDir/neoantigen/MHC_Class_I/tmp.fa >>$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.cleavage.txt";
+        close OUT;
+    }
+}
+unlink "$outputDir/neoantigen/MHC_Class_I/tmp.fa";
+
+my %score;
+open IN, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.cleavage.txt";
+while (<IN>) {
+    next unless /^\d+/;
+    chomp;
+    my @line = split;
+    $line[3] =~ /MT\.(\d+)\./;
+    $score{$1}{$line[0]} = $line[2];
+}
+open IN, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.tsv";
+open OUT, ">$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.txt";
+open OUT2, ">$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.txt";
+#my %neopt;
+my $head = <IN>;
+chomp $head;
+print OUT "$head\tcleavage_score\n";
+while (<IN>) {
+    chomp;
+    my @line = split(/\t/);
+    $line[44] =~ /^(\d+)\./;
+    my $id = $1;
+    my $pep = $line[18];
+    if (exists $fa{$id}) {
+        my $index = index($fa{$id}, $pep) + length($pep);
+        my $cleavage_score = $score{$id}{$index};
+        print OUT "$_\t$cleavage_score\n";
+        if ($line[21] <= 5000 and ($line[23] eq "NA" or $line[23] >= 1)) {
+            print OUT2 "$_\t$cleavage_score\n";
+        }
+    }
+    else {
+        print OUT "$_\tNA\n";
+    }
+}
+system "sort -k 22 -n $outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.txt >$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.sort.txt";
+unlink "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.txt";
+open SORT, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.sort.txt";
+open OUT3, ">$outputDir/neoantigen/MHC_Class_I/neoantigen.txt";
+print OUT3 "序号\tHLA分型\t基因\t多肽\t亲和力\t剪切效率\n";
+my %pep;
+my $bool = 0;
+while (<SORT>) {
+    chomp;
+    my @line = split(/\t/);
+    if (not exists $pep{$line[18]}) {
+        $pep{$line[18]}++;
+        $bool += 1;
+        print OUT3 "$bool\t$line[14]\t$line[11]\t$line[18]\t$line[21]\t$line[53]\n";
+    }
+}
--- a/codes/pollution.py
+++ b/codes/pollution.py
@ -110,7 +110,7 @@ def process_judge_vcf(input_vcf, output_vcf):
            if not line.startswith("#"):
                fields = line.strip().split('\t')
                info = fields[9].split(":")
-                percentage = float(info[4])
+                percentage = float(info[6])

                if 0.1 <= percentage <= 0.9:
                    b = 0.5
@ -135,6 +135,9 @@ def merge_and_sort_files(matched_file, unmatched_file, output_file):
        return output_file

    # 如果 unmatched_file 不为空，继续合并和排序操作
+    if os.stat(matched_file).st_size == 0:
+        matched_df = pd.DataFrame()
+    else:
        matched_df = pd.read_csv(matched_file, sep='\t', header=None)
    unmatched_df = pd.read_csv(unmatched_file, sep='\t', header=None)

@ -195,7 +198,7 @@ def select_cnvkit_vcf(vcf, bed, output_file):
                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
                              line.split()[3]) < 2 and len(line.split()[4]) < 2]
        for line in filtered_lines:
-            p_value_str = line.split()[9].split(":")[4]
+            p_value_str = line.split()[9].split(":")[6]
            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)
@ -218,7 +221,7 @@ def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed,
    # 处理胚系，根据bed筛选
    select_position_output_file3 = os.path.join(output_dir, f'{name}_germline_matched.vcf')
    select_position_output_file4 = os.path.join(output_dir, f'{name}_germline_unmatched.vcf')
-    Germline_matched_file, Germline_unmatched_file = select_position(germline_vcf, ref_bed,
+    germline_matched_file, germline_unmatched_file = select_position(germline_vcf, ref_bed,
                                                                     select_position_output_file3,
                                                                     select_position_output_file4)
    # 处理体系，数值转换
@ -226,14 +229,14 @@ def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed,
    somatic_matched_add_judge_file = process_judge_vcf(somatic_matched_file, process_judge_vcf_file1)
    # 处理胚系，数值转换
    process_judge_vcf_file2 = os.path.join(output_dir, f'{name}_germline_matched_add_judge.vcf')
-    germline_matched_add_judge_file = process_judge_vcf(Germline_matched_file, process_judge_vcf_file2)
+    germline_matched_add_judge_file = process_judge_vcf(germline_matched_file, process_judge_vcf_file2)
    # 合并体系，将匹配到的和未匹配到bed的的合并
    merge_and_sort_files_file1 = os.path.join(output_dir, f'{name}_somatic_merged.vcf')
    somatic_merged_file = merge_and_sort_files(somatic_matched_add_judge_file, somatic_unmatched_file,
                                               merge_and_sort_files_file1)
    # 合并胚系，将匹配到的和未匹配到bed的的合并
    merge_and_sort_files_file2 = os.path.join(output_dir, f'{name}_germline__merged.vcf')
-    Germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, Germline_unmatched_file,
+    Germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, germline_unmatched_file,
                                                merge_and_sort_files_file2)
    # 合并胚系，体系，将体系，胚系两个合并文件再合并
    result_pro_file = os.path.join(output_dir, f'{name}_result_pro.txt')
--- a/codes/postprocess.py
+++ b/codes/postprocess.py
@ -75,7 +75,7 @@ class PostProcess:
    def txt_2_excel(path):
        try:
            df = pd.read_csv(path, sep='\t')
-        except pd.errors.EmptyDataError:
+        except (pd.errors.EmptyDataError, FileNotFoundError):
            return []
        return df.to_dict('records')

@ -179,6 +179,8 @@ class PostProcess:
        filter_neg = os.path.join(self.path, 'mutation',
                                  f'{self.sample_name}.snp_indel.somatic.hg19_multianno.filter.sum.neg.txt')

+        tmb_file = os.path.join(self.path, 'tmb', f'{self.sample_name}.tmb.txt')
+
        filter_sum_pos_res = list()
        # 从pos_files中获取药物信息
        pos_check = check_file_exist_and_empty(filter_pos)
@ -219,7 +221,7 @@ class PostProcess:
            neg['AMP_mut_level'] = 'IIII'
            neg_dict = neg.set_index(['Chr', 'Start', 'End'])['AMP_mut_level'].to_dict()

-        filter_sum_res = list()
+        filter_sum_df = pd.DataFrame()
        filter_sum_check = check_file_exist_and_empty(filter_sum)
        if not filter_sum_check:
            filter_sum_df = pd.read_csv(filter_sum, sep='\t')
@ -228,6 +230,31 @@ class PostProcess:
            level_dict.update(vus_dict)
            level_dict.update(neg_dict)
            filter_sum_df['AMP_mut_level'] = filter_sum_df.set_index(['Chr', 'Start', 'End']).index.map(level_dict)
+
+        cols = list(filter_sum_df.columns)
+
+        tmb_file_check = check_file_exist_and_empty(tmb_file)
+        if not tmb_file_check:
+            tmb_df = pd.read_csv(tmb_file, sep='\t')
+            key_cols = ['Chr', 'Start', 'End']
+            filter_sum_df = filter_sum_df.set_index(key_cols)
+            tmb_df = tmb_df.set_index(key_cols)
+
+            # 在filter_sum_df中的process列中追加字符串";tmb"，对应tmb_df中的行 并且 是非 1，2类突变
+            filter_sum_df['process'] = filter_sum_df.index.map(
+                lambda x: filter_sum_df.at[x, 'process'] + ';tmb' if x in tmb_df.index and filter_sum_df.at[
+                    x, 'AMP_mut_level'] not in ['I', 'II'] else filter_sum_df.at[x, 'process'])
+
+            # 找到tmb_df中不在filter_sum_df中的行，并将这些新的行添加到filter_sum_df中
+            new_rows = tmb_df[~tmb_df.index.isin(filter_sum_df.index)]
+            filter_sum_df = pd.concat([filter_sum_df, new_rows])
+
+            # 重置索引
+            filter_sum_df = filter_sum_df.reset_index()
+
+            # 按之前列排
+            filter_sum_df = filter_sum_df[cols]
+
        filter_sum_df = filter_sum_df.fillna('.')
        filter_sum_res = filter_sum_df.to_dict('records')

@ -429,11 +456,62 @@ class PostProcess:
            print(file_check)

    def longindel(self):
-        longindel_files = glob.glob(
-            os.path.join(self.path, 'fusion', '*.longindel.pos.txt'))
-        if longindel_files:
-            # return self.txt_2_excel(longindel_files[0])
-            self.sheet['longindel'] = self.txt_2_excel(longindel_files[0])
+
+        filter_sum_pos = os.path.join(self.path, 'fusion',
+                                      f'{self.sample_name}.longindel.hg19_multianno.filter.pos.txt')
+        filter_sum_pos_check = check_file_exist_and_empty(filter_sum_pos)
+
+        filter_sum_pos_res = list()
+        pos_dict = dict()
+
+        if not filter_sum_pos_check:
+            filter_sum_pos_df = pd.read_csv(filter_sum_pos, sep='\t')
+            # 添加常规列
+            filter_sum_pos_df = self._add_columns(filter_sum_pos_df)
+            # 添加基因功能
+            filter_sum_pos_df = self._add_gene_function(filter_sum_pos_df, colname='ref_gene')
+            # 药物处理
+            self.drug_parse(filter_sum_pos_df['DrugCn'].to_list())
+            filter_sum_pos_df['Validated'] = 1
+            filter_sum_pos_df = filter_sum_pos_df.fillna('.')
+
+            grouped_df = filter_sum_pos_df.groupby(['#CHROM', 'POS', 'REF', 'ALT'])
+            # 对每个分组进行操作
+            for group_name, group_data in grouped_df:
+                chr, pos, ref, alt = group_name
+                if any(group_data['AMP_mut_level'] == 'I'):
+                    filter_condition = (filter_sum_pos_df['#CHROM'] == chr) & \
+                                       (filter_sum_pos_df['POS'] == pos) & \
+                                       (filter_sum_pos_df['REF'] == ref) & \
+                                       (filter_sum_pos_df['ALT'] == alt)
+                    filter_sum_pos_df.loc[filter_condition, 'AMP_mut_level'] = 'I'
+
+            pos_dict = filter_sum_pos_df.set_index(['#CHROM', 'POS', 'REF', 'ALT'])['AMP_mut_level'].to_dict()
+            filter_sum_pos_res = filter_sum_pos_df.to_dict('records')
+
+        filter_sum = os.path.join(self.path, 'fusion', f'{self.sample_name}.longindel.hg19_multianno.filter.txt')
+        filter_sum_check = check_file_exist_and_empty(filter_sum)
+        filter_sum_res = list()
+        if not filter_sum_check:
+            filter_sum_df = pd.read_csv(filter_sum, sep='\t')
+
+            filter_sum_df['Validated'] = 1
+            level_dict = dict()
+            level_dict.update(pos_dict)
+            filter_sum_df['AMP_mut_level'] = filter_sum_df.set_index(['#CHROM', 'POS', 'REF', 'ALT']).index.map(
+                level_dict)
+            filter_sum_df = filter_sum_df.fillna('.')
+            filter_sum_res = filter_sum_df.to_dict('records')
+
+        self.sheet['longindel_mut'] = filter_sum_res
+        self.sheet['longindel_drug'] = filter_sum_pos_res
+
+    def neoantigen(self):
+        neoantigen = os.path.join(self.path, 'neoantigen', f'MHC_Class_I', 'neoantigen.txt')
+        hla = os.path.join(self.path, 'neoantigen', f'hla', f'{self.normal_name}_result.tsv')
+
+        self.sheet['neoantigen'] = self.txt_2_excel(neoantigen)
+        self.sheet['hla'] = self.txt_2_excel(hla)

    def qc(self):
        qc_files = glob.glob(os.path.join(self.path, 'qc', '*_qc.txt'))
@ -452,17 +530,18 @@ class PostProcess:
    def collect(self):
        writer = pd.ExcelWriter(self.outpath)
        self.cms()
+        self.qc()
        self.snv()
        self.fusion()
+        self.longindel()
        self.cnv()
        self.msi()
        self.germline()
        self.heredity()
        self.heredity_res()
-        self.longindel()
        self.chemo()
        self.indication()
-        self.qc()
+        self.neoantigen()
        self.drugs()

        # 遍历CSV文件列表
--- a/codes/run_pipeline.py
+++ b/codes/run_pipeline.py
@ -2,6 +2,7 @@

 import argparse
 import os
+from datetime import datetime

 run_wdl_path = os.path.join(os.path.dirname(__file__), 'run_wdl.py')

@ -42,13 +43,15 @@ if __name__ == '__main__':
    if not os.path.exists(res_path):
        os.makedirs(res_path)

+    logname = datetime.now().strftime("%m%d%H%M")
+
    cmd = f'nohup python ' \
          f'{run_wdl_path} -n {args.barcode} -s {args.normal} ' \
          f'{"-u " if args.umi else ""} -i {args.input_dir} ' \
          f'-node {args.start_node} ' \
          f'-o {res_path} -b {args.probe} -p {args.project} -c {args.cancer} -w {args.wdl} ' \
-          f'> {res_path}/{args.barcode}_run.log ' \
-          f'2>> {res_path}/{args.barcode}_run.log &'
+          f'> {res_path}/{args.barcode}_{logname}_run.log ' \
+          f'2>> {res_path}/{args.barcode}_{logname}_run.log &'
    # with open(os.path.join(res_path, 'exec'), 'w') as execfile:
    #     execfile.write(cmd + '\n')
    os.system(cmd)
--- a/codes/run_wdl.py
+++ b/codes/run_wdl.py
@ -3,6 +3,7 @@ import json
 import os
 import subprocess
 import time
+from datetime import datetime

 import pandas as pd

@ -80,7 +81,8 @@ def run(barcode, normal, umi, input_dir, output_dir, project, cancer, probe, wdl
    arg = {key: value for key, value in arg.items() if value not in (None, '', False)}

    # generate json
-    jsfile_path = os.path.join(output_dir, f'{barcode}.json')
+    logname = datetime.now().strftime("%m%d%H%M")
+    jsfile_path = os.path.join(output_dir, f'{barcode}_{logname}.json')
    with open(jsfile_path, 'w') as jsfile:
        jsfile.write(json.dumps(arg, indent=4, ensure_ascii=False))

@ -127,7 +129,7 @@ if __name__ == '__main__':
    parser.add_argument('-p', '--project', help="project", required=True)
    parser.add_argument('-c', '--cancer', help="cancer", required=True)
    parser.add_argument('-b', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True)
-    parser.add_argument('-w', '--wdl', help="wdl", default='/home/zhangchao/project/pipeline/workflow/pipeline.wdl')
+    parser.add_argument('-w', '--wdl', help="wdl", default='$WORKFLOW/pipeline.wdl')
    parser.add_argument('-node', '--start_node',
                        help="node begain to run; 'addQc', 'addAlignment', "
                             "'addTarget', 'addFusion', 'addCnv', 'addMsi', 'addChemo',"
--- a/codes/target_therapy_longindel.pl
+++ b/codes/target_therapy_longindel.pl
@ -6,10 +6,10 @@ die "useage:perl $0 input output cancer_type" unless @ARGV == 3;
 my ($input, $output, $cancer_type) = @ARGV;

 my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public";
-print "Fusion药物注释使用public路径：$public_path\n";
+print "Longindel药物注释使用public路径：$public_path\n";

 my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";
-print "Fusion药物注释使用路径：$database_path\n";
+print "Longindel药物注释使用路径：$database_path\n";

 open MUT, "$database_path/fusion.csv";
 <MUT>;
@ -26,7 +26,8 @@ my %therapy;
 while (<THERAPY>) {
    chomp;
    my @line = split("\t");
-    push @{$therapy{$line[0]}{$line[1]}}, $_ if ($line[1] =~ /fusion/i and $line[9] ne 'D' and $line[2] !~ /Leukemia|Lymphoma|Myeloid/i);
+    # push @{$therapy{$line[0]}{$line[1]}}, $_ if ($line[1] =~ /fusion/i and $line[9] ne 'D' and $line[2] !~ /Leukemia|Lymphoma|Myeloid/i);
+    push @{$therapy{$line[0]}{$line[1]}}, $_ if ($line[9] ne 'D' and $line[2] !~ /Leukemia|Lymphoma|Myeloid/i);
 }

 ##药物翻译信息
@ -106,23 +107,38 @@ while (<IN>) {
    my @splitline = split(/\t/);

    my $freq = (split(/:/, $splitline[9]))[9] / (split(/:/, $splitline[9]))[7];
-    my $gene;
-    if ($_ =~ /Gene\.refGene=([^;]+)/) {
-        $gene = $1;
-    }

    if (exists $therapy{'BCL2L11'}{'DELETION POLYMORPHISM'}) {
+        print "$freq\n";
        foreach my $entry (@{$therapy{'BCL2L11'}{'DELETION POLYMORPHISM'}}) {
+
            my @line = split("\t", $entry);
-            if (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] =~ /$cancer_type|solid tumor/i) {
+            if (($line[14] eq 'A') and (grep {lc $line[2] eq lc $_} @{$dis2{$cancer_type}})) {
+                # push @pos, "$_\t.\t" . join("\t", @line[0 .. 9, 14]) . "\t适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
                push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
            }
-            elsif (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] !~ /$cancer_type|solid tumor/i) {
+            elsif (($line[14] eq 'A') and (grep {lc $line[2] ne lc $_} @{$dis2{$cancer_type}})) {
+                # push @pos, "$_\t.\t" . join("\t", @line[0 .. 9, 14]) . "\t非适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
                push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t非适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
            }
-            elsif ($line[2] =~ /$cancer_type|solid tumor/i) {
+            elsif (grep {lc $line[2] eq lc $_} @{$dis2{$cancer_type}}) {
+                # push @pos, "$_\t.\t" . join("\t", @line[0 .. 9, 14]) . "\t\.\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
                push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t\.\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
            }
+
+            # my @line = split("\t", $entry);
+            # if (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] =~ /$cancer_type|solid tumor/i) {
+            #     push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
+            # }
+            # elsif (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] !~ /$cancer_type|solid tumor/i) {
+            #     push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t非适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
+            # }
+            # elsif ($line[2] =~ /$cancer_type|solid tumor/i) {
+            #     push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t\.\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
+            # }
+            # else {
+            #     print "未匹配到"
+            # }
        }
    }

--- a/database/chemo_database.xlsx
+++ b/database/chemo_database.xlsx
--- a/database/info.csv
+++ b/database/info.csv
--- a/pipeline.wdl
+++ b/pipeline.wdl
@ -1,3 +1,4 @@
+
 import "./wdl/qc.wdl"
 import "./wdl/alignment.wdl"
 import "./wdl/call_mutation.wdl"
@ -199,6 +200,7 @@ workflow pipeline {
            msi=call_msi.msi_txt,
            hereditary=call_hereditary.hereditary_txt,
            chemo=call_chemo.chemo_res,
+            neoantigen=call_neoantigen.neoantigen_txt,
            pollution=call_pollution.pollution_res,
            name=tumor,
            normal=normal,
--- a/wdl/call_mutation.wdl
+++ b/wdl/call_mutation.wdl
@ -1,3 +1,4 @@
+
 task mutation_calling_umi {
    String name
    String output_dir
--- a/wdl/chemo.wdl
+++ b/wdl/chemo.wdl
@ -1,3 +1,4 @@
+
 task run_chemo {
    String name
    String output_dir
@ -10,11 +11,12 @@ task run_chemo {
        if [ ! -d ${output_dir}/chemo ];then
        mkdir ${output_dir}/chemo
        fi
+
        chemo.py -d $DATABASE/chemo_database.xlsx -probe ${probe} -n ${name} -v ${vcf} -o ${output_dir}/chemo -c ${cancer} -p ${project}
    >>>

    output {
-        String chemo_res = "${output_dir}/chemo/${name}.drug.res.txt"
+        String run_chemo_res = "${output_dir}/chemo/${name}.drug.res.txt"
    }
 }

--- a/wdl/cnv.wdl
+++ b/wdl/cnv.wdl
@ -1,3 +1,4 @@
+
 task cnv_single {
    String name
    String output_dir
--- a/wdl/hereditary.wdl
+++ b/wdl/hereditary.wdl
@ -1,3 +1,4 @@
+
 task run_hereditary {
    String name
    String output_dir
@ -8,6 +9,7 @@ task run_hereditary {
        if [ ! -d ${output_dir}/hereditary ];then
        mkdir ${output_dir}/hereditary
        fi
+
        hereditary.py -d $DATABASE/hereditary_database.xlsx -p ${project} -n ${name} -f ${filter_txt} -o ${output_dir}/hereditary
    >>>

--- a/wdl/msi.wdl
+++ b/wdl/msi.wdl
@ -1,3 +1,4 @@
+
 task msi_single {
    String name
    String bed
@ -18,7 +19,7 @@ task msi_single {
    >>>

    output {
-        String msi_txt = "${output_dir}/msi/${name}.msi.txt"
+        String run_msi_txt = "${output_dir}/msi/${name}.msi.txt"
    }
 }

@ -44,7 +45,7 @@ task msi_paired {
    >>>

    output {
-        String msi_txt = "${output_dir}/msi/${name}.msi.txt"
+        String run_msi_txt = "${output_dir}/msi/${name}.msi.txt"
    }

 }
--- a/wdl/neoantigen.wdl
+++ b/wdl/neoantigen.wdl
@ -1,3 +1,4 @@
+
 task run_neoantigen {
    String tumor
    String? normal
@ -12,7 +13,7 @@ task run_neoantigen {
    command <<<

        if [ ! -d ${output_dir}/neoantigen/hla ];then
-        mkdir ${output_dir}/neoantigen/hla
+        mkdir -p ${output_dir}/neoantigen/hla
        fi

        razers3 -tc 10 -i 95 -m 1 -dr 0 -o ${output_dir}/neoantigen/hla/fished_1.bam \
@ -145,7 +146,7 @@ task run_neoantigen {
        /data

        dos2unix ${output_dir}/neoantigen/MHC_Class_I/*.all_epitopes.tsv
-        #        perl ${output_dir}/netchop.pl $outputDir $name $tumor $max_peptide_length
+        netchop.pl ${output_dir} ${tumor}
    >>>

 }
@ -178,4 +179,8 @@ workflow call_neoantigen {
                sample_type=if umi then 'c' else 't'
        }
    }
+
+    output {
+        String neoantigen_txt = "${output_dir}neoantigen/MHC_Class_I/neoantigen.txt"
+    }
 }
--- a/wdl/pollution.wdl
+++ b/wdl/pollution.wdl
@ -1,3 +1,4 @@
+
 task run_pollution {
    String name
    String output_dir
@ -20,7 +21,7 @@ task run_pollution {
    >>>

    output {
-        String pollution_res = "${output_dir}/pollution/${name}_pollution.csv"
+        String run_pollution_res = "${output_dir}/pollution/${name}_pollution.csv"
    }
 }

--- a/wdl/postprocess.wdl
+++ b/wdl/postprocess.wdl
@ -1,3 +1,4 @@
+
 task run_post {
    String?  mutation
    String? fusion
@ -5,6 +6,7 @@ task run_post {
    String?  msi
    String? hereditary
    String? chemo
+    String? neoantigen
    String? pollution
    String name
    String? normal
@ -22,7 +24,7 @@ task run_post {
    >>>

    output {
-        String merged = "${output_dir}/report/${name}.merged_file.xlsx"
+        String run_merged = "${output_dir}/report/${name}.merged_file.xlsx"
    }
 }

@ -37,6 +39,7 @@ workflow call_postprocess {
    String? hereditary
    String? pollution
    String? chemo
+    String? neoantigen
    String name
    String? normal
    String output_dir
@ -52,6 +55,7 @@ workflow call_postprocess {
                msi=msi,
                hereditary=hereditary,
                chemo=chemo,
+                neoantigen=neoantigen,
                pollution=pollution,
                name=name,
                normal=normal,
--- a/wdl/qc.wdl
+++ b/wdl/qc.wdl
@ -1,3 +1,4 @@
+
 task runqc {
    String name
    String input_dir
--- a/wdl/statistics.wdl
+++ b/wdl/statistics.wdl
@ -1,3 +1,4 @@
+
 task run_statistics {
    String name
    String output_dir
--- a/wdl/tmb.wdl
+++ b/wdl/tmb.wdl
@ -1,3 +1,4 @@
+
 task run_tmb {
    String name
    String file
@ -19,7 +20,7 @@ task run_tmb {
    >>>

    output {
-        String tmb_txt = "${output_dir}/tmb/${name}.tmb.txt"
+        String run_tmb_txt = "${output_dir}/tmb/${name}.tmb.txt"
    }
 }

@ -50,7 +51,7 @@ workflow call_tmb {
                    name=name,
                    file=file,
                    project=project,
-                    sample_type='c',
+                    sample_type='t',
                    output_dir=output_dir
            }