流程更新
parent
c9f525b2bf
commit
bd66bc6612
|
|
@ -1,4 +1,5 @@
|
|||
#! /usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
|
|
|
|||
|
|
@ -1,114 +0,0 @@
|
|||
#!/usr/bin/python3
|
||||
# -*- coding: UTF-8 -*-
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
import pandas as pd
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
print(" ".join(['usage:python3', sys.argv[0], 'output_dir', 'name']))
|
||||
sys.exit()
|
||||
|
||||
output_dir = sys.argv[1]
|
||||
name = sys.argv[2]
|
||||
snv_file = os.path.join(output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.txt')
|
||||
snv_file_new = os.path.join(output_dir, 'mutation', f'{name}.somatic.hg19_multianno.filter.sum.pos.dedup.txt')
|
||||
|
||||
fusion_file = os.path.join(output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.txt')
|
||||
fusion_file_new = os.path.join(output_dir, 'fusion', f'{name}.fusion.hg19_multianno.filter.fusion.pos.dedup.txt')
|
||||
|
||||
cnv_file = os.path.join(output_dir, 'cnv', f'{name}.rmdup.cns.filter.pos.txt')
|
||||
cnv_file_new = os.path.join(output_dir, 'cnvkit', f'{name}.rmdup.cns.filter.pos.dedup.txt')
|
||||
|
||||
# gm_snv_file = os.path.join(output_dir, '/mutation/', name, '.snvindel.Germline.pos.txt')
|
||||
# gm_snv_file_new = os.path.join(output_dir, '/mutation/', name, '.snvindel.Germline.pos.dedup.txt')
|
||||
|
||||
open(snv_file_new, "w")
|
||||
open(fusion_file_new, "w")
|
||||
open(cnv_file_new, "w")
|
||||
# open(gm_snv_file_new, "w")
|
||||
##Evidence_Source_C及标签排序
|
||||
df_mapping_1 = pd.DataFrame({
|
||||
'Evidence_Source_C': ['FDA', 'NMPA', 'NCCN', '临床III期', '临床II期', '临床I期', '临床试验', '回顾性研究', '个案', '临床前研究'],
|
||||
})
|
||||
sort_mapping_1 = df_mapping_1.reset_index().set_index('Evidence_Source_C')
|
||||
df_mapping_2 = pd.DataFrame({'标签': ['适应症', '非适应症', '.']})
|
||||
sort_mapping_2 = df_mapping_2.reset_index().set_index('标签')
|
||||
|
||||
##snvindel处理
|
||||
snv_size = os.path.getsize(snv_file)
|
||||
if snv_size > 0:
|
||||
data = pd.read_table(snv_file, sep="\t")
|
||||
data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
|
||||
data['level2'] = data['标签'].map(sort_mapping_2['index'])
|
||||
data.sort_values(by=['AAChange.refGene', 'level2', 'level1'], ascending=True, inplace=True)
|
||||
data.drop(['level1', 'level2'], axis=1, inplace=True)
|
||||
info = {}
|
||||
for index, row in data.iterrows():
|
||||
if re.search(r'敏感', row['Response_Type_C']):
|
||||
if row['标签'] == '适应症':
|
||||
info[row['AAChange.refGene'] + row['Drug']] = '1'
|
||||
else:
|
||||
if (row['AAChange.refGene'] + row['Drug']) in info.keys():
|
||||
data.drop([index], inplace=True)
|
||||
data.insert(0, '可信', 1)
|
||||
data.to_csv(snv_file_new, index=False, sep='\t')
|
||||
|
||||
# ##germline snv/indel处理
|
||||
# gm_snv_size = os.path.getsize(gm_snv_file)
|
||||
# if gm_snv_size > 0:
|
||||
# data = pd.read_table(gm_snv_file, sep="\t")
|
||||
# data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
|
||||
# data['level2'] = data['标签'].map(sort_mapping_2['index'])
|
||||
# data.sort_values(by=['AAChange.refGene', 'level2', 'level1'], ascending=True, inplace=True)
|
||||
# data.drop(['level1', 'level2'], axis=1, inplace=True)
|
||||
# info = {}
|
||||
# for index, row in data.iterrows():
|
||||
# if re.search(r'敏感', row['Response_Type_C']):
|
||||
# if row['标签'] == '适应症':
|
||||
# info[row['AAChange.refGene'] + row['Drug']] = '1'
|
||||
# else:
|
||||
# if (row['AAChange.refGene'] + row['Drug']) in info.keys():
|
||||
# data.drop([index], inplace=True)
|
||||
# data.insert(0, '可信', 1)
|
||||
# data.to_csv(gm_snv_file_new, index=False, sep='\t')
|
||||
|
||||
##fusion处理
|
||||
fusion_size = os.path.getsize(fusion_file)
|
||||
if fusion_size > 0:
|
||||
data = pd.read_table(fusion_file, sep="\t")
|
||||
data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
|
||||
data['level2'] = data['标签'].map(sort_mapping_2['index'])
|
||||
data.sort_values(by=['FUSION', 'level2', 'level1'], ascending=True, inplace=True)
|
||||
data.drop(['level1', 'level2'], axis=1, inplace=True)
|
||||
info = {}
|
||||
for index, row in data.iterrows():
|
||||
if re.search(r'敏感', row['Response_Type_C']):
|
||||
if row['标签'] == '适应症':
|
||||
info[row['FUSION'] + row['Drug']] = '1'
|
||||
else:
|
||||
if (row['FUSION'] + row['Drug']) in info.keys():
|
||||
data.drop([index], inplace=True)
|
||||
data.insert(0, '可信', 1)
|
||||
data.to_csv(fusion_file_new, index=False, sep='\t')
|
||||
|
||||
##cnv处理
|
||||
cnv_size = os.path.getsize(cnv_file)
|
||||
if cnv_size > 0:
|
||||
data = pd.read_table(cnv_file, sep="\t")
|
||||
data['level1'] = data['Evidence_Source_C'].map(sort_mapping_1['index'])
|
||||
data['level2'] = data['标签'].map(sort_mapping_2['index'])
|
||||
data.sort_values(by=['Gene_Symbol', 'level2', 'level1'], ascending=True, inplace=True)
|
||||
data.drop(['level1', 'level2'], axis=1, inplace=True)
|
||||
info = {}
|
||||
for index, row in data.iterrows():
|
||||
if re.search(r'敏感', row['Response_Type_C']):
|
||||
if row['标签'] == '适应症':
|
||||
info[row['Gene_Symbol'] + row['Drug']] = '1'
|
||||
else:
|
||||
if (row['Gene_Symbol'] + row['Drug']) in info.keys():
|
||||
data.drop([index], inplace=True)
|
||||
data.insert(0, '可信', 1)
|
||||
data.to_csv(cnv_file_new, index=False, sep='\t')
|
||||
|
|
@ -30,14 +30,15 @@ while (<IN>) {
|
|||
}
|
||||
my @line = split(/\t/);
|
||||
$line[7] =~ /Gene.refGene=(.*?);/;
|
||||
if ((grep {$1 =~ /$_/} @longindels) && ($_ =~ /SVTYPE=DEL/ || $_ =~ /SVTYPE=DUP/ || $_ =~ /SVTYPE=INS/)) {
|
||||
if ($1 eq "BCL2L11") {
|
||||
my $gene = $1;
|
||||
if ((grep {$gene =~ /$_/} @longindels) && ($_ =~ /SVTYPE=DEL/ || $_ =~ /SVTYPE=DUP/ || $_ =~ /SVTYPE=INS/)) {
|
||||
if ($gene eq "BCL2L11") {
|
||||
if ($line[1] == '111883194') {
|
||||
print LONGINDEL join("\n", @pos) . "\n";
|
||||
print LONGINDEL $_;
|
||||
}
|
||||
}
|
||||
else {
|
||||
print LONGINDEL join("\n", @pos) . "\n";
|
||||
print LONGINDEL $_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
#use warnings;
|
||||
use List::Util qw(sum);
|
||||
|
|
@ -131,12 +132,12 @@ while (<IN>) {
|
|||
$line[9] = join(":", ($gene, $hgvs));
|
||||
}
|
||||
elsif ($spl =~ /\d+\-[1|2]\D+/) {
|
||||
my $intron = $exon-1;
|
||||
my $intron = $exon - 1;
|
||||
$hgvs =~ s/exon(\d+)/intron$intron;exon$exon/;
|
||||
$line[9] = join(":", ($gene, $hgvs));
|
||||
}
|
||||
elsif ($gene eq "MET") {
|
||||
$line[9] = join(":", ($gene, "exon14", "c.xxx"));
|
||||
$line[9] = join(":", ($gene, "NM_000245", "exon14", "c.xxx"));
|
||||
$line[8] = 'skipping'
|
||||
}
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use List::Util qw(min max);
|
||||
#max_length:最大的epitope长度
|
||||
die "usage:perl $0 outputDir tumor_prefix" if @ARGV != 2;
|
||||
|
||||
my ($outputDir, $tumor_prefix) = @ARGV;
|
||||
|
||||
open IN, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.fasta";
|
||||
my %fa;
|
||||
while (<IN>) {
|
||||
if (/^>MT/) {
|
||||
open OUT, ">$outputDir/neoantigen/MHC_Class_I/tmp.fa";
|
||||
print OUT;
|
||||
$_ =~ /MT\.(\d+)\./;
|
||||
my $id = $1;
|
||||
my $seq = <IN>;
|
||||
print OUT $seq;
|
||||
chomp $seq;
|
||||
$fa{$id} = $seq;
|
||||
system "predict.py -m netchop -n $outputDir/neoantigen/MHC_Class_I/tmp.fa >>$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.cleavage.txt";
|
||||
close OUT;
|
||||
}
|
||||
}
|
||||
unlink "$outputDir/neoantigen/MHC_Class_I/tmp.fa";
|
||||
|
||||
my %score;
|
||||
open IN, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.cleavage.txt";
|
||||
while (<IN>) {
|
||||
next unless /^\d+/;
|
||||
chomp;
|
||||
my @line = split;
|
||||
$line[3] =~ /MT\.(\d+)\./;
|
||||
$score{$1}{$line[0]} = $line[2];
|
||||
}
|
||||
open IN, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.tsv";
|
||||
open OUT, ">$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.txt";
|
||||
open OUT2, ">$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.txt";
|
||||
#my %neopt;
|
||||
my $head = <IN>;
|
||||
chomp $head;
|
||||
print OUT "$head\tcleavage_score\n";
|
||||
while (<IN>) {
|
||||
chomp;
|
||||
my @line = split(/\t/);
|
||||
$line[44] =~ /^(\d+)\./;
|
||||
my $id = $1;
|
||||
my $pep = $line[18];
|
||||
if (exists $fa{$id}) {
|
||||
my $index = index($fa{$id}, $pep) + length($pep);
|
||||
my $cleavage_score = $score{$id}{$index};
|
||||
print OUT "$_\t$cleavage_score\n";
|
||||
if ($line[21] <= 5000 and ($line[23] eq "NA" or $line[23] >= 1)) {
|
||||
print OUT2 "$_\t$cleavage_score\n";
|
||||
}
|
||||
}
|
||||
else {
|
||||
print OUT "$_\tNA\n";
|
||||
}
|
||||
}
|
||||
system "sort -k 22 -n $outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.txt >$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.sort.txt";
|
||||
unlink "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.txt";
|
||||
open SORT, "$outputDir/neoantigen/MHC_Class_I/${tumor_prefix}.all_epitopes.netchop.filter.sort.txt";
|
||||
open OUT3, ">$outputDir/neoantigen/MHC_Class_I/neoantigen.txt";
|
||||
print OUT3 "序号\tHLA分型\t基因\t多肽\t亲和力\t剪切效率\n";
|
||||
my %pep;
|
||||
my $bool = 0;
|
||||
while (<SORT>) {
|
||||
chomp;
|
||||
my @line = split(/\t/);
|
||||
if (not exists $pep{$line[18]}) {
|
||||
$pep{$line[18]}++;
|
||||
$bool += 1;
|
||||
print OUT3 "$bool\t$line[14]\t$line[11]\t$line[18]\t$line[21]\t$line[53]\n";
|
||||
}
|
||||
}
|
||||
|
|
@ -110,7 +110,7 @@ def process_judge_vcf(input_vcf, output_vcf):
|
|||
if not line.startswith("#"):
|
||||
fields = line.strip().split('\t')
|
||||
info = fields[9].split(":")
|
||||
percentage = float(info[4])
|
||||
percentage = float(info[6])
|
||||
|
||||
if 0.1 <= percentage <= 0.9:
|
||||
b = 0.5
|
||||
|
|
@ -135,6 +135,9 @@ def merge_and_sort_files(matched_file, unmatched_file, output_file):
|
|||
return output_file
|
||||
|
||||
# 如果 unmatched_file 不为空,继续合并和排序操作
|
||||
if os.stat(matched_file).st_size == 0:
|
||||
matched_df = pd.DataFrame()
|
||||
else:
|
||||
matched_df = pd.read_csv(matched_file, sep='\t', header=None)
|
||||
unmatched_df = pd.read_csv(unmatched_file, sep='\t', header=None)
|
||||
|
||||
|
|
@ -195,7 +198,7 @@ def select_cnvkit_vcf(vcf, bed, output_file):
|
|||
line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
|
||||
line.split()[3]) < 2 and len(line.split()[4]) < 2]
|
||||
for line in filtered_lines:
|
||||
p_value_str = line.split()[9].split(":")[4]
|
||||
p_value_str = line.split()[9].split(":")[6]
|
||||
p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
|
||||
if 0.1 <= p_value <= 0.9:
|
||||
result_data.append(line)
|
||||
|
|
@ -218,7 +221,7 @@ def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed,
|
|||
# 处理胚系,根据bed筛选
|
||||
select_position_output_file3 = os.path.join(output_dir, f'{name}_germline_matched.vcf')
|
||||
select_position_output_file4 = os.path.join(output_dir, f'{name}_germline_unmatched.vcf')
|
||||
Germline_matched_file, Germline_unmatched_file = select_position(germline_vcf, ref_bed,
|
||||
germline_matched_file, germline_unmatched_file = select_position(germline_vcf, ref_bed,
|
||||
select_position_output_file3,
|
||||
select_position_output_file4)
|
||||
# 处理体系,数值转换
|
||||
|
|
@ -226,14 +229,14 @@ def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed,
|
|||
somatic_matched_add_judge_file = process_judge_vcf(somatic_matched_file, process_judge_vcf_file1)
|
||||
# 处理胚系,数值转换
|
||||
process_judge_vcf_file2 = os.path.join(output_dir, f'{name}_germline_matched_add_judge.vcf')
|
||||
germline_matched_add_judge_file = process_judge_vcf(Germline_matched_file, process_judge_vcf_file2)
|
||||
germline_matched_add_judge_file = process_judge_vcf(germline_matched_file, process_judge_vcf_file2)
|
||||
# 合并体系,将匹配到的和未匹配到bed的的合并
|
||||
merge_and_sort_files_file1 = os.path.join(output_dir, f'{name}_somatic_merged.vcf')
|
||||
somatic_merged_file = merge_and_sort_files(somatic_matched_add_judge_file, somatic_unmatched_file,
|
||||
merge_and_sort_files_file1)
|
||||
# 合并胚系,将匹配到的和未匹配到bed的的合并
|
||||
merge_and_sort_files_file2 = os.path.join(output_dir, f'{name}_germline__merged.vcf')
|
||||
Germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, Germline_unmatched_file,
|
||||
Germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, germline_unmatched_file,
|
||||
merge_and_sort_files_file2)
|
||||
# 合并胚系,体系,将体系,胚系两个合并文件再合并
|
||||
result_pro_file = os.path.join(output_dir, f'{name}_result_pro.txt')
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ class PostProcess:
|
|||
def txt_2_excel(path):
|
||||
try:
|
||||
df = pd.read_csv(path, sep='\t')
|
||||
except pd.errors.EmptyDataError:
|
||||
except (pd.errors.EmptyDataError, FileNotFoundError):
|
||||
return []
|
||||
return df.to_dict('records')
|
||||
|
||||
|
|
@ -179,6 +179,8 @@ class PostProcess:
|
|||
filter_neg = os.path.join(self.path, 'mutation',
|
||||
f'{self.sample_name}.snp_indel.somatic.hg19_multianno.filter.sum.neg.txt')
|
||||
|
||||
tmb_file = os.path.join(self.path, 'tmb', f'{self.sample_name}.tmb.txt')
|
||||
|
||||
filter_sum_pos_res = list()
|
||||
# 从pos_files中获取药物信息
|
||||
pos_check = check_file_exist_and_empty(filter_pos)
|
||||
|
|
@ -219,7 +221,7 @@ class PostProcess:
|
|||
neg['AMP_mut_level'] = 'IIII'
|
||||
neg_dict = neg.set_index(['Chr', 'Start', 'End'])['AMP_mut_level'].to_dict()
|
||||
|
||||
filter_sum_res = list()
|
||||
filter_sum_df = pd.DataFrame()
|
||||
filter_sum_check = check_file_exist_and_empty(filter_sum)
|
||||
if not filter_sum_check:
|
||||
filter_sum_df = pd.read_csv(filter_sum, sep='\t')
|
||||
|
|
@ -228,6 +230,31 @@ class PostProcess:
|
|||
level_dict.update(vus_dict)
|
||||
level_dict.update(neg_dict)
|
||||
filter_sum_df['AMP_mut_level'] = filter_sum_df.set_index(['Chr', 'Start', 'End']).index.map(level_dict)
|
||||
|
||||
cols = list(filter_sum_df.columns)
|
||||
|
||||
tmb_file_check = check_file_exist_and_empty(tmb_file)
|
||||
if not tmb_file_check:
|
||||
tmb_df = pd.read_csv(tmb_file, sep='\t')
|
||||
key_cols = ['Chr', 'Start', 'End']
|
||||
filter_sum_df = filter_sum_df.set_index(key_cols)
|
||||
tmb_df = tmb_df.set_index(key_cols)
|
||||
|
||||
# 在filter_sum_df中的process列中追加字符串";tmb",对应tmb_df中的行 并且 是非 1,2类突变
|
||||
filter_sum_df['process'] = filter_sum_df.index.map(
|
||||
lambda x: filter_sum_df.at[x, 'process'] + ';tmb' if x in tmb_df.index and filter_sum_df.at[
|
||||
x, 'AMP_mut_level'] not in ['I', 'II'] else filter_sum_df.at[x, 'process'])
|
||||
|
||||
# 找到tmb_df中不在filter_sum_df中的行,并将这些新的行添加到filter_sum_df中
|
||||
new_rows = tmb_df[~tmb_df.index.isin(filter_sum_df.index)]
|
||||
filter_sum_df = pd.concat([filter_sum_df, new_rows])
|
||||
|
||||
# 重置索引
|
||||
filter_sum_df = filter_sum_df.reset_index()
|
||||
|
||||
# 按之前列排
|
||||
filter_sum_df = filter_sum_df[cols]
|
||||
|
||||
filter_sum_df = filter_sum_df.fillna('.')
|
||||
filter_sum_res = filter_sum_df.to_dict('records')
|
||||
|
||||
|
|
@ -429,11 +456,62 @@ class PostProcess:
|
|||
print(file_check)
|
||||
|
||||
def longindel(self):
|
||||
longindel_files = glob.glob(
|
||||
os.path.join(self.path, 'fusion', '*.longindel.pos.txt'))
|
||||
if longindel_files:
|
||||
# return self.txt_2_excel(longindel_files[0])
|
||||
self.sheet['longindel'] = self.txt_2_excel(longindel_files[0])
|
||||
|
||||
filter_sum_pos = os.path.join(self.path, 'fusion',
|
||||
f'{self.sample_name}.longindel.hg19_multianno.filter.pos.txt')
|
||||
filter_sum_pos_check = check_file_exist_and_empty(filter_sum_pos)
|
||||
|
||||
filter_sum_pos_res = list()
|
||||
pos_dict = dict()
|
||||
|
||||
if not filter_sum_pos_check:
|
||||
filter_sum_pos_df = pd.read_csv(filter_sum_pos, sep='\t')
|
||||
# 添加常规列
|
||||
filter_sum_pos_df = self._add_columns(filter_sum_pos_df)
|
||||
# 添加基因功能
|
||||
filter_sum_pos_df = self._add_gene_function(filter_sum_pos_df, colname='ref_gene')
|
||||
# 药物处理
|
||||
self.drug_parse(filter_sum_pos_df['DrugCn'].to_list())
|
||||
filter_sum_pos_df['Validated'] = 1
|
||||
filter_sum_pos_df = filter_sum_pos_df.fillna('.')
|
||||
|
||||
grouped_df = filter_sum_pos_df.groupby(['#CHROM', 'POS', 'REF', 'ALT'])
|
||||
# 对每个分组进行操作
|
||||
for group_name, group_data in grouped_df:
|
||||
chr, pos, ref, alt = group_name
|
||||
if any(group_data['AMP_mut_level'] == 'I'):
|
||||
filter_condition = (filter_sum_pos_df['#CHROM'] == chr) & \
|
||||
(filter_sum_pos_df['POS'] == pos) & \
|
||||
(filter_sum_pos_df['REF'] == ref) & \
|
||||
(filter_sum_pos_df['ALT'] == alt)
|
||||
filter_sum_pos_df.loc[filter_condition, 'AMP_mut_level'] = 'I'
|
||||
|
||||
pos_dict = filter_sum_pos_df.set_index(['#CHROM', 'POS', 'REF', 'ALT'])['AMP_mut_level'].to_dict()
|
||||
filter_sum_pos_res = filter_sum_pos_df.to_dict('records')
|
||||
|
||||
filter_sum = os.path.join(self.path, 'fusion', f'{self.sample_name}.longindel.hg19_multianno.filter.txt')
|
||||
filter_sum_check = check_file_exist_and_empty(filter_sum)
|
||||
filter_sum_res = list()
|
||||
if not filter_sum_check:
|
||||
filter_sum_df = pd.read_csv(filter_sum, sep='\t')
|
||||
|
||||
filter_sum_df['Validated'] = 1
|
||||
level_dict = dict()
|
||||
level_dict.update(pos_dict)
|
||||
filter_sum_df['AMP_mut_level'] = filter_sum_df.set_index(['#CHROM', 'POS', 'REF', 'ALT']).index.map(
|
||||
level_dict)
|
||||
filter_sum_df = filter_sum_df.fillna('.')
|
||||
filter_sum_res = filter_sum_df.to_dict('records')
|
||||
|
||||
self.sheet['longindel_mut'] = filter_sum_res
|
||||
self.sheet['longindel_drug'] = filter_sum_pos_res
|
||||
|
||||
def neoantigen(self):
|
||||
neoantigen = os.path.join(self.path, 'neoantigen', f'MHC_Class_I', 'neoantigen.txt')
|
||||
hla = os.path.join(self.path, 'neoantigen', f'hla', f'{self.normal_name}_result.tsv')
|
||||
|
||||
self.sheet['neoantigen'] = self.txt_2_excel(neoantigen)
|
||||
self.sheet['hla'] = self.txt_2_excel(hla)
|
||||
|
||||
def qc(self):
|
||||
qc_files = glob.glob(os.path.join(self.path, 'qc', '*_qc.txt'))
|
||||
|
|
@ -452,17 +530,18 @@ class PostProcess:
|
|||
def collect(self):
|
||||
writer = pd.ExcelWriter(self.outpath)
|
||||
self.cms()
|
||||
self.qc()
|
||||
self.snv()
|
||||
self.fusion()
|
||||
self.longindel()
|
||||
self.cnv()
|
||||
self.msi()
|
||||
self.germline()
|
||||
self.heredity()
|
||||
self.heredity_res()
|
||||
self.longindel()
|
||||
self.chemo()
|
||||
self.indication()
|
||||
self.qc()
|
||||
self.neoantigen()
|
||||
self.drugs()
|
||||
|
||||
# 遍历CSV文件列表
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
import argparse
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
run_wdl_path = os.path.join(os.path.dirname(__file__), 'run_wdl.py')
|
||||
|
||||
|
|
@ -42,13 +43,15 @@ if __name__ == '__main__':
|
|||
if not os.path.exists(res_path):
|
||||
os.makedirs(res_path)
|
||||
|
||||
logname = datetime.now().strftime("%m%d%H%M")
|
||||
|
||||
cmd = f'nohup python ' \
|
||||
f'{run_wdl_path} -n {args.barcode} -s {args.normal} ' \
|
||||
f'{"-u " if args.umi else ""} -i {args.input_dir} ' \
|
||||
f'-node {args.start_node} ' \
|
||||
f'-o {res_path} -b {args.probe} -p {args.project} -c {args.cancer} -w {args.wdl} ' \
|
||||
f'> {res_path}/{args.barcode}_run.log ' \
|
||||
f'2>> {res_path}/{args.barcode}_run.log &'
|
||||
f'> {res_path}/{args.barcode}_{logname}_run.log ' \
|
||||
f'2>> {res_path}/{args.barcode}_{logname}_run.log &'
|
||||
# with open(os.path.join(res_path, 'exec'), 'w') as execfile:
|
||||
# execfile.write(cmd + '\n')
|
||||
os.system(cmd)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ import json
|
|||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
|
@ -80,7 +81,8 @@ def run(barcode, normal, umi, input_dir, output_dir, project, cancer, probe, wdl
|
|||
arg = {key: value for key, value in arg.items() if value not in (None, '', False)}
|
||||
|
||||
# generate json
|
||||
jsfile_path = os.path.join(output_dir, f'{barcode}.json')
|
||||
logname = datetime.now().strftime("%m%d%H%M")
|
||||
jsfile_path = os.path.join(output_dir, f'{barcode}_{logname}.json')
|
||||
with open(jsfile_path, 'w') as jsfile:
|
||||
jsfile.write(json.dumps(arg, indent=4, ensure_ascii=False))
|
||||
|
||||
|
|
@ -127,7 +129,7 @@ if __name__ == '__main__':
|
|||
parser.add_argument('-p', '--project', help="project", required=True)
|
||||
parser.add_argument('-c', '--cancer', help="cancer", required=True)
|
||||
parser.add_argument('-b', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True)
|
||||
parser.add_argument('-w', '--wdl', help="wdl", default='/home/zhangchao/project/pipeline/workflow/pipeline.wdl')
|
||||
parser.add_argument('-w', '--wdl', help="wdl", default='$WORKFLOW/pipeline.wdl')
|
||||
parser.add_argument('-node', '--start_node',
|
||||
help="node begain to run; 'addQc', 'addAlignment', "
|
||||
"'addTarget', 'addFusion', 'addCnv', 'addMsi', 'addChemo',"
|
||||
|
|
|
|||
|
|
@ -6,10 +6,10 @@ die "useage:perl $0 input output cancer_type" unless @ARGV == 3;
|
|||
my ($input, $output, $cancer_type) = @ARGV;
|
||||
|
||||
my $public_path = defined $ENV{'PUBLIC'} ? $ENV{'PUBLIC'} : "/dataseq/jmdna/codes/public";
|
||||
print "Fusion药物注释使用public路径:$public_path\n";
|
||||
print "Longindel药物注释使用public路径:$public_path\n";
|
||||
|
||||
my $database_path = defined $ENV{'DATABASE'} ? $ENV{'DATABASE'} : "/dataseq/jmdna/codes/reportbase";
|
||||
print "Fusion药物注释使用路径:$database_path\n";
|
||||
print "Longindel药物注释使用路径:$database_path\n";
|
||||
|
||||
open MUT, "$database_path/fusion.csv";
|
||||
<MUT>;
|
||||
|
|
@ -26,7 +26,8 @@ my %therapy;
|
|||
while (<THERAPY>) {
|
||||
chomp;
|
||||
my @line = split("\t");
|
||||
push @{$therapy{$line[0]}{$line[1]}}, $_ if ($line[1] =~ /fusion/i and $line[9] ne 'D' and $line[2] !~ /Leukemia|Lymphoma|Myeloid/i);
|
||||
# push @{$therapy{$line[0]}{$line[1]}}, $_ if ($line[1] =~ /fusion/i and $line[9] ne 'D' and $line[2] !~ /Leukemia|Lymphoma|Myeloid/i);
|
||||
push @{$therapy{$line[0]}{$line[1]}}, $_ if ($line[9] ne 'D' and $line[2] !~ /Leukemia|Lymphoma|Myeloid/i);
|
||||
}
|
||||
|
||||
##药物翻译信息
|
||||
|
|
@ -106,23 +107,38 @@ while (<IN>) {
|
|||
my @splitline = split(/\t/);
|
||||
|
||||
my $freq = (split(/:/, $splitline[9]))[9] / (split(/:/, $splitline[9]))[7];
|
||||
my $gene;
|
||||
if ($_ =~ /Gene\.refGene=([^;]+)/) {
|
||||
$gene = $1;
|
||||
}
|
||||
|
||||
if (exists $therapy{'BCL2L11'}{'DELETION POLYMORPHISM'}) {
|
||||
print "$freq\n";
|
||||
foreach my $entry (@{$therapy{'BCL2L11'}{'DELETION POLYMORPHISM'}}) {
|
||||
|
||||
my @line = split("\t", $entry);
|
||||
if (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] =~ /$cancer_type|solid tumor/i) {
|
||||
if (($line[14] eq 'A') and (grep {lc $line[2] eq lc $_} @{$dis2{$cancer_type}})) {
|
||||
# push @pos, "$_\t.\t" . join("\t", @line[0 .. 9, 14]) . "\t适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
}
|
||||
elsif (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] !~ /$cancer_type|solid tumor/i) {
|
||||
elsif (($line[14] eq 'A') and (grep {lc $line[2] ne lc $_} @{$dis2{$cancer_type}})) {
|
||||
# push @pos, "$_\t.\t" . join("\t", @line[0 .. 9, 14]) . "\t非适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t非适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
}
|
||||
elsif ($line[2] =~ /$cancer_type|solid tumor/i) {
|
||||
elsif (grep {lc $line[2] eq lc $_} @{$dis2{$cancer_type}}) {
|
||||
# push @pos, "$_\t.\t" . join("\t", @line[0 .. 9, 14]) . "\t\.\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t\.\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
}
|
||||
|
||||
# my @line = split("\t", $entry);
|
||||
# if (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] =~ /$cancer_type|solid tumor/i) {
|
||||
# push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
# }
|
||||
# elsif (($line[5] eq "FDA" or $line[5] eq "NCCN" or $line[5] eq "NMPA") and $line[2] !~ /$cancer_type|solid tumor/i) {
|
||||
# push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t非适应症" . "\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
# }
|
||||
# elsif ($line[2] =~ /$cancer_type|solid tumor/i) {
|
||||
# push @pos, "$_\tc\.394+1479_394+4381del\tBCL2L11:NM_001204106:intron2:c\.394+1479_394+4381del\t" . $freq . "\t" . join("\t", @line[0 .. 9]) . "\t\.\t" . &drug($line[3]) . "\t" . $dis{lc $line[2]};
|
||||
# }
|
||||
# else {
|
||||
# print "未匹配到"
|
||||
# }
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Binary file not shown.
File diff suppressed because one or more lines are too long
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
import "./wdl/qc.wdl"
|
||||
import "./wdl/alignment.wdl"
|
||||
import "./wdl/call_mutation.wdl"
|
||||
|
|
@ -199,6 +200,7 @@ workflow pipeline {
|
|||
msi=call_msi.msi_txt,
|
||||
hereditary=call_hereditary.hereditary_txt,
|
||||
chemo=call_chemo.chemo_res,
|
||||
neoantigen=call_neoantigen.neoantigen_txt,
|
||||
pollution=call_pollution.pollution_res,
|
||||
name=tumor,
|
||||
normal=normal,
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task mutation_calling_umi {
|
||||
String name
|
||||
String output_dir
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task run_chemo {
|
||||
String name
|
||||
String output_dir
|
||||
|
|
@ -10,11 +11,12 @@ task run_chemo {
|
|||
if [ ! -d ${output_dir}/chemo ];then
|
||||
mkdir ${output_dir}/chemo
|
||||
fi
|
||||
|
||||
chemo.py -d $DATABASE/chemo_database.xlsx -probe ${probe} -n ${name} -v ${vcf} -o ${output_dir}/chemo -c ${cancer} -p ${project}
|
||||
>>>
|
||||
|
||||
output {
|
||||
String chemo_res = "${output_dir}/chemo/${name}.drug.res.txt"
|
||||
String run_chemo_res = "${output_dir}/chemo/${name}.drug.res.txt"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task cnv_single {
|
||||
String name
|
||||
String output_dir
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task run_hereditary {
|
||||
String name
|
||||
String output_dir
|
||||
|
|
@ -8,6 +9,7 @@ task run_hereditary {
|
|||
if [ ! -d ${output_dir}/hereditary ];then
|
||||
mkdir ${output_dir}/hereditary
|
||||
fi
|
||||
|
||||
hereditary.py -d $DATABASE/hereditary_database.xlsx -p ${project} -n ${name} -f ${filter_txt} -o ${output_dir}/hereditary
|
||||
>>>
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task msi_single {
|
||||
String name
|
||||
String bed
|
||||
|
|
@ -18,7 +19,7 @@ task msi_single {
|
|||
>>>
|
||||
|
||||
output {
|
||||
String msi_txt = "${output_dir}/msi/${name}.msi.txt"
|
||||
String run_msi_txt = "${output_dir}/msi/${name}.msi.txt"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -44,7 +45,7 @@ task msi_paired {
|
|||
>>>
|
||||
|
||||
output {
|
||||
String msi_txt = "${output_dir}/msi/${name}.msi.txt"
|
||||
String run_msi_txt = "${output_dir}/msi/${name}.msi.txt"
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task run_neoantigen {
|
||||
String tumor
|
||||
String? normal
|
||||
|
|
@ -12,7 +13,7 @@ task run_neoantigen {
|
|||
command <<<
|
||||
|
||||
if [ ! -d ${output_dir}/neoantigen/hla ];then
|
||||
mkdir ${output_dir}/neoantigen/hla
|
||||
mkdir -p ${output_dir}/neoantigen/hla
|
||||
fi
|
||||
|
||||
razers3 -tc 10 -i 95 -m 1 -dr 0 -o ${output_dir}/neoantigen/hla/fished_1.bam \
|
||||
|
|
@ -145,7 +146,7 @@ task run_neoantigen {
|
|||
/data
|
||||
|
||||
dos2unix ${output_dir}/neoantigen/MHC_Class_I/*.all_epitopes.tsv
|
||||
# perl ${output_dir}/netchop.pl $outputDir $name $tumor $max_peptide_length
|
||||
netchop.pl ${output_dir} ${tumor}
|
||||
>>>
|
||||
|
||||
}
|
||||
|
|
@ -178,4 +179,8 @@ workflow call_neoantigen {
|
|||
sample_type=if umi then 'c' else 't'
|
||||
}
|
||||
}
|
||||
|
||||
output {
|
||||
String neoantigen_txt = "${output_dir}neoantigen/MHC_Class_I/neoantigen.txt"
|
||||
}
|
||||
}
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task run_pollution {
|
||||
String name
|
||||
String output_dir
|
||||
|
|
@ -20,7 +21,7 @@ task run_pollution {
|
|||
>>>
|
||||
|
||||
output {
|
||||
String pollution_res = "${output_dir}/pollution/${name}_pollution.csv"
|
||||
String run_pollution_res = "${output_dir}/pollution/${name}_pollution.csv"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task run_post {
|
||||
String? mutation
|
||||
String? fusion
|
||||
|
|
@ -5,6 +6,7 @@ task run_post {
|
|||
String? msi
|
||||
String? hereditary
|
||||
String? chemo
|
||||
String? neoantigen
|
||||
String? pollution
|
||||
String name
|
||||
String? normal
|
||||
|
|
@ -22,7 +24,7 @@ task run_post {
|
|||
>>>
|
||||
|
||||
output {
|
||||
String merged = "${output_dir}/report/${name}.merged_file.xlsx"
|
||||
String run_merged = "${output_dir}/report/${name}.merged_file.xlsx"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -37,6 +39,7 @@ workflow call_postprocess {
|
|||
String? hereditary
|
||||
String? pollution
|
||||
String? chemo
|
||||
String? neoantigen
|
||||
String name
|
||||
String? normal
|
||||
String output_dir
|
||||
|
|
@ -52,6 +55,7 @@ workflow call_postprocess {
|
|||
msi=msi,
|
||||
hereditary=hereditary,
|
||||
chemo=chemo,
|
||||
neoantigen=neoantigen,
|
||||
pollution=pollution,
|
||||
name=name,
|
||||
normal=normal,
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task runqc {
|
||||
String name
|
||||
String input_dir
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task run_statistics {
|
||||
String name
|
||||
String output_dir
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
task run_tmb {
|
||||
String name
|
||||
String file
|
||||
|
|
@ -19,7 +20,7 @@ task run_tmb {
|
|||
>>>
|
||||
|
||||
output {
|
||||
String tmb_txt = "${output_dir}/tmb/${name}.tmb.txt"
|
||||
String run_tmb_txt = "${output_dir}/tmb/${name}.tmb.txt"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -50,7 +51,7 @@ workflow call_tmb {
|
|||
name=name,
|
||||
file=file,
|
||||
project=project,
|
||||
sample_type='c',
|
||||
sample_type='t',
|
||||
output_dir=output_dir
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue