diff --git a/codes/chemo.py b/codes/chemo.py index 4720dcf..1cb7fec 100755 --- a/codes/chemo.py +++ b/codes/chemo.py @@ -10,7 +10,7 @@ import pysam class ChemoRun: - def __init__(self, database, probe, cancer, project, output_dir, name, vcf): + def __init__(self, database, probe, cancer, project, output_dir, name, vcf, bam, ref): self.database = database self.probe = probe self.output_dir = output_dir @@ -18,6 +18,8 @@ class ChemoRun: self.vcf = vcf self.cancer = cancer self.project = project + self.bam_file = pysam.AlignmentFile(bam, "rb") + self.ref_file = pysam.FastaFile(ref) @staticmethod def read_info(project): @@ -62,6 +64,22 @@ class ChemoRun: records[(chrom, pos)] = record return records + def get_position_depth(self, chr, pos): + """ + 获取位点测序深度 >=30X + """ + + depths = self.bam_file.count_coverage(chr, start=pos - 1, stop=pos) + depth_sum = sum([sum(arr) for arr in depths]) + return depth_sum + + def get_ref_base(self, chr, pos): + """ + 获取参考基因组 + """ + base = self.ref_file.fetch(chr, pos - 1, pos) + return base + def parsedata(self): """ 处理chemo_drug_rsid_snppos.xlsx, 并生成对应bed文件 @@ -97,55 +115,57 @@ class ChemoRun: for _, row in beddata.iterrows(): chrom = row['chr'] end = row['end'] - fliter = pd.DataFrame() + # bed 文件在 vcf 中 if (chrom, end) in records: record = records[(chrom, end)] ref = record.ref alt = record.alts[0] - # gt = '/'.join(list(map(str, sorted(record.samples.get(record.samples.keys()[0]).get('GT'))))) freq = record.samples.get(record.samples.keys()[-1]).get('AF')[0] depth = record.samples.get(record.samples.keys()[-1]).get('DP') + if freq > 0.9: gt = '1/1' elif 0.9 >= freq > 0.1: gt = '0/1' else: gt = '0/0' - match_drug_rsid_data = drug_rsid_data[ - (drug_rsid_data['chr'] == chrom) & - (drug_rsid_data['end'] == end) & - (drug_rsid_data['ref'] == ref) & - (drug_rsid_data['alt'] == alt) & - (drug_rsid_data['genotype'] == gt) - ] - match_drug_rsid_data = match_drug_rsid_data.reset_index() - match_drug_rsid_data['chr'] = chrom - match_drug_rsid_data['pos'] = end - match_drug_rsid_data['freq'] = freq - match_drug_rsid_data['depth'] = depth - fliter = pd.concat([fliter, match_drug_rsid_data]) + # 不在vcf中 去bam文件中获取该位点的测序深度 去ref 获取碱基 没有的就为NA了 + else: + depth = self.get_position_depth(chrom, end) + ref = self.get_ref_base(chrom, end) + alt = ref + freq = 1 + gt = '0/0' + # 测序深度小于30 排除 + if depth < 30: + alt = 'NA' + freq = 'NA' + gt = 'NA' - if fliter.empty: - match_drug_rsid_data = drug_rsid_data[ - (drug_rsid_data['chr'] == chrom) & - (drug_rsid_data['end'] == end) & - (drug_rsid_data['genotype'] == '0/0') - ] - match_drug_rsid_data = match_drug_rsid_data.reset_index() - match_drug_rsid_data['chr'] = chrom - match_drug_rsid_data['pos'] = end - match_drug_rsid_data['freq'] = '.' - match_drug_rsid_data['depth'] = '.' - fliter = pd.concat([fliter, match_drug_rsid_data]) + match_drug_rsid_data = drug_rsid_data[ + (drug_rsid_data['chr'] == chrom) & + (drug_rsid_data['end'] == end) & + (drug_rsid_data['ref'] == ref) & + (drug_rsid_data['alt'] == alt) & + (drug_rsid_data['genotype'] == gt) + ] + match_drug_rsid_data = match_drug_rsid_data.reset_index() + if match_drug_rsid_data.empty: + match_drug_rsid_data = pd.DataFrame([ + dict( + ref=ref, + alt=alt, + genotype=gt + ) + ]) + match_drug_rsid_data['chr'] = chrom + match_drug_rsid_data['pos'] = end + match_drug_rsid_data['freq'] = freq + match_drug_rsid_data['depth'] = depth + fliterdata = pd.concat([fliterdata, match_drug_rsid_data]) - if fliter.empty: - raise UserWarning( - 'chr: %s , end: %s 数据库未能匹配, 野生型0/0也未能匹配' % (chrom, end)) - fliterdata = pd.concat([fliterdata, fliter]) # 生成过滤之后文件 - respath = os.path.join(self.output_dir, f'{self.name}.chemo.res.csv') - if not fliterdata.empty: - fliterdata['probe'] = self.probe + respath = os.path.join(self.output_dir, f'{self.name}.chemo.res.txt') fliterdata.to_csv(respath, sep='\t', index=False) # 分类汇总 同位点,药物合并 drug.infos.txt @@ -256,6 +276,9 @@ if __name__ == "__main__": parser.add_argument('-c', '--cancer', help="cancer", required=True) parser.add_argument('-p', '--project', help="Project", required=True) parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='') + parser.add_argument('-b', '--bam', help="Bamfile for sample", required=True) + parser.add_argument('-r', '--ref', help="ref fasta", required=True) args = parser.parse_args() - chemorun = ChemoRun(args.database, args.probe, args.cancer, args.project, args.output_dir, args.name, args.vcf) + chemorun = ChemoRun(args.database, args.probe, args.cancer, args.project, args.output_dir, args.name, args.vcf, + args.bam, args.ref) chemorun.run() diff --git a/pipeline.wdl b/pipeline.wdl index a8274f7..2ef32e7 100644 --- a/pipeline.wdl +++ b/pipeline.wdl @@ -166,7 +166,11 @@ workflow pipeline { probe=probe, vcf=call_mutation.germline_vcf, cancer=cancer, - project=project + project=project, + normal=normal, + tumor_rmdup_bam=alignment.tumor_rmdup_bam, + normal_rmdup_bam=alignment.normal_rmdup_bam, + ref=ref } call neoantigen.call_neoantigen as call_neoantigen { diff --git a/wdl/chemo.wdl b/wdl/chemo.wdl index 5b2fe75..c9e1586 100755 --- a/wdl/chemo.wdl +++ b/wdl/chemo.wdl @@ -7,12 +7,14 @@ task run_chemo { String vcf String cancer String project + String bam + String ref command { if [ ! -d ${output_dir}/chemo ];then mkdir ${output_dir}/chemo fi - chemo.py -d $DATABASE/chemo_database.xlsx -probe ${probe} -n ${name} -v ${vcf} -o ${output_dir}/chemo -c ${cancer} -p ${project} + chemo.py -d $DATABASE/chemo_database.xlsx -probe ${probe} -n ${name} -v ${vcf} -o ${output_dir}/chemo -c ${cancer} -p ${project} -b ${bam} -r ${ref} } output { @@ -25,22 +27,44 @@ workflow call_chemo { Boolean run=true String name + String? normal String output_dir String probe String vcf String cancer String project + String tumor_rmdup_bam + String? normal_rmdup_bam + String ref if (run) { - call run_chemo { - input: - name=name, - output_dir=output_dir, - probe=probe, - vcf=vcf, - cancer=cancer, - project=project + if (defined(normal)) { + call run_chemo as run_chemo_normal { + input: + name=name, + output_dir=output_dir, + probe=probe, + vcf=vcf, + cancer=cancer, + project=project, + bam=normal_rmdup_bam, + ref=ref + } } + if (!defined(normal)) { + call run_chemo as run_chemo_tumor { + input: + name=name, + output_dir=output_dir, + probe=probe, + vcf=vcf, + cancer=cancer, + project=project, + bam=tumor_rmdup_bam, + ref=ref + } + } + } output {