化疗修改,不在vcf中出现的从bam文件中获取深度
parent
39f51ccf2e
commit
bf1d9de10e
|
|
@ -10,7 +10,7 @@ import pysam
|
|||
|
||||
class ChemoRun:
|
||||
|
||||
def __init__(self, database, probe, cancer, project, output_dir, name, vcf):
|
||||
def __init__(self, database, probe, cancer, project, output_dir, name, vcf, bam, ref):
|
||||
self.database = database
|
||||
self.probe = probe
|
||||
self.output_dir = output_dir
|
||||
|
|
@ -18,6 +18,8 @@ class ChemoRun:
|
|||
self.vcf = vcf
|
||||
self.cancer = cancer
|
||||
self.project = project
|
||||
self.bam_file = pysam.AlignmentFile(bam, "rb")
|
||||
self.ref_file = pysam.FastaFile(ref)
|
||||
|
||||
@staticmethod
|
||||
def read_info(project):
|
||||
|
|
@ -62,6 +64,22 @@ class ChemoRun:
|
|||
records[(chrom, pos)] = record
|
||||
return records
|
||||
|
||||
def get_position_depth(self, chr, pos):
|
||||
"""
|
||||
获取位点测序深度 >=30X
|
||||
"""
|
||||
|
||||
depths = self.bam_file.count_coverage(chr, start=pos - 1, stop=pos)
|
||||
depth_sum = sum([sum(arr) for arr in depths])
|
||||
return depth_sum
|
||||
|
||||
def get_ref_base(self, chr, pos):
|
||||
"""
|
||||
获取参考基因组
|
||||
"""
|
||||
base = self.ref_file.fetch(chr, pos - 1, pos)
|
||||
return base
|
||||
|
||||
def parsedata(self):
|
||||
"""
|
||||
处理chemo_drug_rsid_snppos.xlsx, 并生成对应bed文件
|
||||
|
|
@ -97,55 +115,57 @@ class ChemoRun:
|
|||
for _, row in beddata.iterrows():
|
||||
chrom = row['chr']
|
||||
end = row['end']
|
||||
fliter = pd.DataFrame()
|
||||
# bed 文件在 vcf 中
|
||||
if (chrom, end) in records:
|
||||
record = records[(chrom, end)]
|
||||
ref = record.ref
|
||||
alt = record.alts[0]
|
||||
# gt = '/'.join(list(map(str, sorted(record.samples.get(record.samples.keys()[0]).get('GT')))))
|
||||
freq = record.samples.get(record.samples.keys()[-1]).get('AF')[0]
|
||||
depth = record.samples.get(record.samples.keys()[-1]).get('DP')
|
||||
|
||||
if freq > 0.9:
|
||||
gt = '1/1'
|
||||
elif 0.9 >= freq > 0.1:
|
||||
gt = '0/1'
|
||||
else:
|
||||
gt = '0/0'
|
||||
match_drug_rsid_data = drug_rsid_data[
|
||||
(drug_rsid_data['chr'] == chrom) &
|
||||
(drug_rsid_data['end'] == end) &
|
||||
(drug_rsid_data['ref'] == ref) &
|
||||
(drug_rsid_data['alt'] == alt) &
|
||||
(drug_rsid_data['genotype'] == gt)
|
||||
]
|
||||
match_drug_rsid_data = match_drug_rsid_data.reset_index()
|
||||
match_drug_rsid_data['chr'] = chrom
|
||||
match_drug_rsid_data['pos'] = end
|
||||
match_drug_rsid_data['freq'] = freq
|
||||
match_drug_rsid_data['depth'] = depth
|
||||
fliter = pd.concat([fliter, match_drug_rsid_data])
|
||||
# 不在vcf中 去bam文件中获取该位点的测序深度 去ref 获取碱基 没有的就为NA了
|
||||
else:
|
||||
depth = self.get_position_depth(chrom, end)
|
||||
ref = self.get_ref_base(chrom, end)
|
||||
alt = ref
|
||||
freq = 1
|
||||
gt = '0/0'
|
||||
# 测序深度小于30 排除
|
||||
if depth < 30:
|
||||
alt = 'NA'
|
||||
freq = 'NA'
|
||||
gt = 'NA'
|
||||
|
||||
if fliter.empty:
|
||||
match_drug_rsid_data = drug_rsid_data[
|
||||
(drug_rsid_data['chr'] == chrom) &
|
||||
(drug_rsid_data['end'] == end) &
|
||||
(drug_rsid_data['genotype'] == '0/0')
|
||||
]
|
||||
match_drug_rsid_data = match_drug_rsid_data.reset_index()
|
||||
match_drug_rsid_data['chr'] = chrom
|
||||
match_drug_rsid_data['pos'] = end
|
||||
match_drug_rsid_data['freq'] = '.'
|
||||
match_drug_rsid_data['depth'] = '.'
|
||||
fliter = pd.concat([fliter, match_drug_rsid_data])
|
||||
match_drug_rsid_data = drug_rsid_data[
|
||||
(drug_rsid_data['chr'] == chrom) &
|
||||
(drug_rsid_data['end'] == end) &
|
||||
(drug_rsid_data['ref'] == ref) &
|
||||
(drug_rsid_data['alt'] == alt) &
|
||||
(drug_rsid_data['genotype'] == gt)
|
||||
]
|
||||
match_drug_rsid_data = match_drug_rsid_data.reset_index()
|
||||
if match_drug_rsid_data.empty:
|
||||
match_drug_rsid_data = pd.DataFrame([
|
||||
dict(
|
||||
ref=ref,
|
||||
alt=alt,
|
||||
genotype=gt
|
||||
)
|
||||
])
|
||||
match_drug_rsid_data['chr'] = chrom
|
||||
match_drug_rsid_data['pos'] = end
|
||||
match_drug_rsid_data['freq'] = freq
|
||||
match_drug_rsid_data['depth'] = depth
|
||||
fliterdata = pd.concat([fliterdata, match_drug_rsid_data])
|
||||
|
||||
if fliter.empty:
|
||||
raise UserWarning(
|
||||
'chr: %s , end: %s 数据库未能匹配, 野生型0/0也未能匹配' % (chrom, end))
|
||||
fliterdata = pd.concat([fliterdata, fliter])
|
||||
# 生成过滤之后文件
|
||||
respath = os.path.join(self.output_dir, f'{self.name}.chemo.res.csv')
|
||||
if not fliterdata.empty:
|
||||
fliterdata['probe'] = self.probe
|
||||
respath = os.path.join(self.output_dir, f'{self.name}.chemo.res.txt')
|
||||
fliterdata.to_csv(respath, sep='\t', index=False)
|
||||
|
||||
# 分类汇总 同位点,药物合并 drug.infos.txt
|
||||
|
|
@ -256,6 +276,9 @@ if __name__ == "__main__":
|
|||
parser.add_argument('-c', '--cancer', help="cancer", required=True)
|
||||
parser.add_argument('-p', '--project', help="Project", required=True)
|
||||
parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
|
||||
parser.add_argument('-b', '--bam', help="Bamfile for sample", required=True)
|
||||
parser.add_argument('-r', '--ref', help="ref fasta", required=True)
|
||||
args = parser.parse_args()
|
||||
chemorun = ChemoRun(args.database, args.probe, args.cancer, args.project, args.output_dir, args.name, args.vcf)
|
||||
chemorun = ChemoRun(args.database, args.probe, args.cancer, args.project, args.output_dir, args.name, args.vcf,
|
||||
args.bam, args.ref)
|
||||
chemorun.run()
|
||||
|
|
|
|||
|
|
@ -166,7 +166,11 @@ workflow pipeline {
|
|||
probe=probe,
|
||||
vcf=call_mutation.germline_vcf,
|
||||
cancer=cancer,
|
||||
project=project
|
||||
project=project,
|
||||
normal=normal,
|
||||
tumor_rmdup_bam=alignment.tumor_rmdup_bam,
|
||||
normal_rmdup_bam=alignment.normal_rmdup_bam,
|
||||
ref=ref
|
||||
}
|
||||
|
||||
call neoantigen.call_neoantigen as call_neoantigen {
|
||||
|
|
|
|||
|
|
@ -7,12 +7,14 @@ task run_chemo {
|
|||
String vcf
|
||||
String cancer
|
||||
String project
|
||||
String bam
|
||||
String ref
|
||||
command {
|
||||
if [ ! -d ${output_dir}/chemo ];then
|
||||
mkdir ${output_dir}/chemo
|
||||
fi
|
||||
|
||||
chemo.py -d $DATABASE/chemo_database.xlsx -probe ${probe} -n ${name} -v ${vcf} -o ${output_dir}/chemo -c ${cancer} -p ${project}
|
||||
chemo.py -d $DATABASE/chemo_database.xlsx -probe ${probe} -n ${name} -v ${vcf} -o ${output_dir}/chemo -c ${cancer} -p ${project} -b ${bam} -r ${ref}
|
||||
|
||||
}
|
||||
output {
|
||||
|
|
@ -25,22 +27,44 @@ workflow call_chemo {
|
|||
Boolean run=true
|
||||
|
||||
String name
|
||||
String? normal
|
||||
String output_dir
|
||||
String probe
|
||||
String vcf
|
||||
String cancer
|
||||
String project
|
||||
String tumor_rmdup_bam
|
||||
String? normal_rmdup_bam
|
||||
String ref
|
||||
|
||||
if (run) {
|
||||
call run_chemo {
|
||||
input:
|
||||
name=name,
|
||||
output_dir=output_dir,
|
||||
probe=probe,
|
||||
vcf=vcf,
|
||||
cancer=cancer,
|
||||
project=project
|
||||
if (defined(normal)) {
|
||||
call run_chemo as run_chemo_normal {
|
||||
input:
|
||||
name=name,
|
||||
output_dir=output_dir,
|
||||
probe=probe,
|
||||
vcf=vcf,
|
||||
cancer=cancer,
|
||||
project=project,
|
||||
bam=normal_rmdup_bam,
|
||||
ref=ref
|
||||
}
|
||||
}
|
||||
if (!defined(normal)) {
|
||||
call run_chemo as run_chemo_tumor {
|
||||
input:
|
||||
name=name,
|
||||
output_dir=output_dir,
|
||||
probe=probe,
|
||||
vcf=vcf,
|
||||
cancer=cancer,
|
||||
project=project,
|
||||
bam=tumor_rmdup_bam,
|
||||
ref=ref
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
output {
|
||||
|
|
|
|||
Loading…
Reference in New Issue