2023-11-30 15:31:35 +08:00
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
2024-01-19 17:56:25 +08:00
|
|
|
|
import pysam
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_bed_regions(bed_file):
|
|
|
|
|
|
bed_regions = set()
|
|
|
|
|
|
if bed_file:
|
|
|
|
|
|
with open(bed_file, 'r') as file:
|
|
|
|
|
|
for line in file:
|
|
|
|
|
|
parts = line.strip().split('\t')
|
|
|
|
|
|
if len(parts) >= 3:
|
|
|
|
|
|
chrom, start, end = parts[0], int(parts[1]), int(parts[2])
|
|
|
|
|
|
bed_regions.add((chrom, start, end))
|
|
|
|
|
|
return bed_regions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_record_dict(vcf_file):
|
|
|
|
|
|
records = {}
|
|
|
|
|
|
for record in vcf_file:
|
|
|
|
|
|
key = (record.contig, record.pos)
|
|
|
|
|
|
records[key] = record
|
|
|
|
|
|
return records
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def simplify_genotype(freq):
|
|
|
|
|
|
if freq < 0.1:
|
|
|
|
|
|
return '0/0'
|
|
|
|
|
|
elif 0.1 <= freq <= 0.9:
|
|
|
|
|
|
return '0/1'
|
|
|
|
|
|
else:
|
|
|
|
|
|
return '1/1'
|
2023-11-30 15:31:35 +08:00
|
|
|
|
|
|
|
|
|
|
|
2024-01-19 17:56:25 +08:00
|
|
|
|
def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
|
|
|
|
|
|
"""
|
|
|
|
|
|
17, 160 单样本处理
|
|
|
|
|
|
rawvcf 对应bed 挑选出 位点,计算位点是否在给定范围内,计算比例得到z_socre
|
|
|
|
|
|
"""
|
2023-11-30 15:31:35 +08:00
|
|
|
|
|
|
|
|
|
|
# bed 处理
|
2024-01-19 17:56:25 +08:00
|
|
|
|
bed_regions = load_bed_regions(bed_file)
|
2023-11-30 15:31:35 +08:00
|
|
|
|
|
2024-01-19 17:56:25 +08:00
|
|
|
|
vcf = pysam.VariantFile(vcf_file)
|
|
|
|
|
|
res_pos = list()
|
2023-11-30 15:31:35 +08:00
|
|
|
|
count_normal = 0
|
|
|
|
|
|
count_exception = 0
|
2024-01-19 17:56:25 +08:00
|
|
|
|
|
|
|
|
|
|
for record in vcf:
|
|
|
|
|
|
contig = record.contig
|
|
|
|
|
|
pos = record.pos
|
|
|
|
|
|
ref = record.ref
|
|
|
|
|
|
alt = record.alts[0]
|
|
|
|
|
|
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
|
|
|
|
|
|
|
|
|
|
|
|
if freq < 0.1 or freq > 0.9:
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
|
|
# 检查记录是否是点突变
|
|
|
|
|
|
if len(record.ref) == 1 and len(record.alts[0]) == 1:
|
|
|
|
|
|
# 在bed 区间内
|
|
|
|
|
|
for (bed_chrom, bed_start, bed_end) in bed_regions:
|
|
|
|
|
|
if contig == bed_chrom and bed_start < pos <= bed_end:
|
|
|
|
|
|
if freq_range[0] <= freq <= freq_range[1]:
|
|
|
|
|
|
count_normal += 1
|
|
|
|
|
|
res = 'yes'
|
|
|
|
|
|
else:
|
|
|
|
|
|
count_exception += 1
|
|
|
|
|
|
res = 'no'
|
|
|
|
|
|
res_pos.append(dict(
|
|
|
|
|
|
chr=contig,
|
|
|
|
|
|
pos=pos,
|
|
|
|
|
|
ref=ref,
|
|
|
|
|
|
alt=alt,
|
|
|
|
|
|
freq=freq,
|
|
|
|
|
|
res=res
|
|
|
|
|
|
))
|
|
|
|
|
|
break
|
2023-11-30 15:31:35 +08:00
|
|
|
|
count_all = count_exception + count_normal
|
|
|
|
|
|
if count_all == 0:
|
|
|
|
|
|
z_score = 0
|
|
|
|
|
|
else:
|
|
|
|
|
|
z_score = count_exception / count_all
|
|
|
|
|
|
|
|
|
|
|
|
res = dict(
|
|
|
|
|
|
barcode=name,
|
|
|
|
|
|
count_normal=count_normal,
|
|
|
|
|
|
count_exception=count_exception,
|
|
|
|
|
|
z_score=z_score
|
|
|
|
|
|
)
|
2024-01-19 17:56:25 +08:00
|
|
|
|
pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution_sum.txt'), sep="\t", index=False)
|
|
|
|
|
|
|
|
|
|
|
|
res_pos_df = pd.DataFrame(res_pos)
|
|
|
|
|
|
if not res_pos_df.empty:
|
|
|
|
|
|
res_pos_df.rename(columns={
|
|
|
|
|
|
'res': f'res{freq_range}',
|
|
|
|
|
|
}, inplace=True)
|
|
|
|
|
|
res_pos_df.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def paired_monitor(name, vcf_file, bed_file, cnvkit_bed, output_dir):
|
|
|
|
|
|
"""
|
|
|
|
|
|
682双样本处理
|
|
|
|
|
|
"""
|
|
|
|
|
|
bed_regions = load_bed_regions(bed_file)
|
|
|
|
|
|
cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
|
|
|
|
|
|
vcf = pysam.VariantFile(vcf_file)
|
|
|
|
|
|
records_dict = build_record_dict(vcf)
|
|
|
|
|
|
res_pos = list()
|
|
|
|
|
|
|
|
|
|
|
|
# 遍历 bed
|
|
|
|
|
|
for (bed_chrom, bed_start, bed_end) in bed_regions:
|
|
|
|
|
|
key = (bed_chrom, bed_end)
|
|
|
|
|
|
ref = '.'
|
|
|
|
|
|
alt = '.'
|
|
|
|
|
|
freq = '.'
|
|
|
|
|
|
genotype = '.'
|
|
|
|
|
|
ref_normal = '.'
|
|
|
|
|
|
alt_normal = '.'
|
|
|
|
|
|
freq_normal = '.'
|
|
|
|
|
|
genotype_normal = '.'
|
|
|
|
|
|
res = 'no'
|
|
|
|
|
|
if key in records_dict:
|
|
|
|
|
|
record = records_dict[key]
|
|
|
|
|
|
ref = record.ref
|
|
|
|
|
|
alt = record.alts[0]
|
|
|
|
|
|
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
|
|
|
|
|
|
freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0]
|
|
|
|
|
|
ref_normal = ref
|
|
|
|
|
|
alt_normal = alt
|
|
|
|
|
|
|
|
|
|
|
|
genotype = simplify_genotype(freq)
|
|
|
|
|
|
genotype_normal = simplify_genotype(freq_normal)
|
|
|
|
|
|
|
|
|
|
|
|
if genotype == genotype_normal:
|
|
|
|
|
|
res = 'yes'
|
|
|
|
|
|
res_pos.append(dict(
|
|
|
|
|
|
chr=bed_chrom,
|
|
|
|
|
|
pos=bed_end,
|
|
|
|
|
|
ref=ref,
|
|
|
|
|
|
alt=alt,
|
|
|
|
|
|
freq=freq,
|
|
|
|
|
|
genotype=genotype,
|
|
|
|
|
|
ref_normal=ref_normal,
|
|
|
|
|
|
alt_normal=alt_normal,
|
|
|
|
|
|
freq_normal=freq_normal,
|
|
|
|
|
|
genotype_normal=genotype_normal,
|
|
|
|
|
|
res=res
|
|
|
|
|
|
))
|
|
|
|
|
|
res_pos_df = pd.DataFrame(res_pos)
|
|
|
|
|
|
res_pos_df_sort = res_pos_df.sort_values(by='chr')
|
|
|
|
|
|
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
|
|
|
|
|
|
|
|
|
|
|
|
output_tumor_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
|
|
|
|
|
|
output_normal_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w')
|
|
|
|
|
|
for line in str(vcf.header).strip().split('\n'):
|
|
|
|
|
|
output_tumor_vcf.write(line + '\n') if line.startswith('##') else output_tumor_vcf.write(
|
|
|
|
|
|
'\t'.join(line.split('\t')[:-1]) + '\n')
|
|
|
|
|
|
output_normal_vcf.write(line + '\n') if line.startswith('##') else output_normal_vcf.write(
|
|
|
|
|
|
'\t'.join(line.split('\t')[:-2] + line.split('\t')[-1:]) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
|
|
|
|
|
|
key = (bed_chrom, bed_end)
|
|
|
|
|
|
if key in records_dict:
|
|
|
|
|
|
record = records_dict[key]
|
|
|
|
|
|
output_tumor_vcf.write('\t'.join(str(record).split('\t')[:-1]) + '\n')
|
|
|
|
|
|
output_normal_vcf.write('\t'.join(str(record).split('\t')[:-2] + str(record).split('\t')[-1:]) + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
output_tumor_vcf.close()
|
|
|
|
|
|
output_normal_vcf.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, bed_file, cnvkit_bed, output_dir):
|
|
|
|
|
|
"""
|
|
|
|
|
|
624双样本处理
|
|
|
|
|
|
"""
|
|
|
|
|
|
bed_regions = load_bed_regions(bed_file)
|
|
|
|
|
|
tumor_vcf = pysam.VariantFile(tumor_vcf_file)
|
|
|
|
|
|
normal_vcf = pysam.VariantFile(normal_vcf_file)
|
|
|
|
|
|
|
|
|
|
|
|
tumor_records_dict = build_record_dict(tumor_vcf)
|
|
|
|
|
|
normal_records_dict = build_record_dict(normal_vcf)
|
|
|
|
|
|
|
|
|
|
|
|
res_pos = list()
|
|
|
|
|
|
|
|
|
|
|
|
# 遍历 bed
|
|
|
|
|
|
for (bed_chrom, bed_start, bed_end) in bed_regions:
|
|
|
|
|
|
key = (bed_chrom, bed_end)
|
|
|
|
|
|
ref = '.'
|
|
|
|
|
|
alt = '.'
|
|
|
|
|
|
freq = '.'
|
|
|
|
|
|
genotype = '.'
|
|
|
|
|
|
ref_normal = '.'
|
|
|
|
|
|
alt_normal = '.'
|
|
|
|
|
|
freq_normal = '.'
|
|
|
|
|
|
genotype_normal = '.'
|
|
|
|
|
|
res = 'no'
|
|
|
|
|
|
|
|
|
|
|
|
if key in tumor_records_dict:
|
|
|
|
|
|
record = tumor_records_dict[key]
|
|
|
|
|
|
ref = record.ref
|
|
|
|
|
|
alt = record.alts[0]
|
|
|
|
|
|
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
|
|
|
|
|
|
genotype = simplify_genotype(freq)
|
|
|
|
|
|
if key in normal_records_dict:
|
|
|
|
|
|
record = normal_records_dict[key]
|
|
|
|
|
|
ref_normal = record.ref
|
|
|
|
|
|
alt_normal = record.alts[0]
|
|
|
|
|
|
freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0]
|
|
|
|
|
|
genotype_normal = simplify_genotype(freq_normal)
|
|
|
|
|
|
if genotype == genotype_normal:
|
|
|
|
|
|
res = 'yes'
|
|
|
|
|
|
res_pos.append(dict(
|
|
|
|
|
|
chr=bed_chrom,
|
|
|
|
|
|
pos=bed_end,
|
|
|
|
|
|
ref=ref,
|
|
|
|
|
|
alt=alt,
|
|
|
|
|
|
freq=freq,
|
|
|
|
|
|
genotype=genotype,
|
|
|
|
|
|
ref_normal=ref_normal,
|
|
|
|
|
|
alt_normal=alt_normal,
|
|
|
|
|
|
freq_normal=freq_normal,
|
|
|
|
|
|
genotype_normal=genotype_normal,
|
|
|
|
|
|
res=res
|
|
|
|
|
|
))
|
|
|
|
|
|
|
|
|
|
|
|
res_pos_df = pd.DataFrame(res_pos)
|
|
|
|
|
|
res_pos_df_sort = res_pos_df.sort_values(by='chr')
|
|
|
|
|
|
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
|
|
|
|
|
|
|
|
|
|
|
|
output_tumor_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w',
|
|
|
|
|
|
header=tumor_vcf.header)
|
|
|
|
|
|
output_normal_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w',
|
|
|
|
|
|
header=normal_vcf.header)
|
|
|
|
|
|
|
|
|
|
|
|
cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
|
|
|
|
|
|
for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
|
|
|
|
|
|
key = (bed_chrom, bed_end)
|
|
|
|
|
|
if key in normal_records_dict:
|
|
|
|
|
|
output_normal_vcf.write(normal_records_dict[key])
|
|
|
|
|
|
if key in tumor_records_dict:
|
|
|
|
|
|
output_tumor_vcf.write(tumor_records_dict[key])
|
|
|
|
|
|
output_tumor_vcf.close()
|
|
|
|
|
|
output_normal_vcf.close()
|
2023-11-30 15:31:35 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
|
parser = argparse.ArgumentParser(description="Pollution Process Script")
|
|
|
|
|
|
|
|
|
|
|
|
parser.add_argument('-n', '--name', help="Name for sample", required=True)
|
|
|
|
|
|
parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True)
|
|
|
|
|
|
parser.add_argument('-v', '--vcf', help="raw vcf for prbe 160 or 17 ; somatic vcf for prbe 682 or 624",
|
|
|
|
|
|
required=True)
|
2024-01-19 17:56:25 +08:00
|
|
|
|
parser.add_argument('-v2', '--vcf2', help="germline vcf; required when prbe 682 or 624", default='', required=False,
|
|
|
|
|
|
nargs='?')
|
2023-11-30 15:31:35 +08:00
|
|
|
|
parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required when prbe 682 or 624")
|
|
|
|
|
|
parser.add_argument('-p', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True)
|
|
|
|
|
|
parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
bed_path = os.path.realpath(args.ref_bed)
|
|
|
|
|
|
print(f'污染检测使用ref_bed: {bed_path}')
|
|
|
|
|
|
probe = args.probe
|
|
|
|
|
|
|
|
|
|
|
|
if probe == '160' or probe == '17':
|
|
|
|
|
|
freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe)
|
2024-01-19 17:56:25 +08:00
|
|
|
|
single_monitor(args.name, args.vcf, bed_path, freq_range, args.output_dir)
|
|
|
|
|
|
elif probe == '682':
|
2023-11-30 15:31:35 +08:00
|
|
|
|
if not args.cnvkit_bed:
|
2024-01-19 17:56:25 +08:00
|
|
|
|
parser.error('--cnvkit_bed is required in probe 624')
|
2023-11-30 15:31:35 +08:00
|
|
|
|
cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
|
|
|
|
|
|
print(f'污染检测使用cnvkit_bed: {cnvkit_bed_path}')
|
2024-01-19 17:56:25 +08:00
|
|
|
|
paired_monitor(args.name, args.vcf, bed_path, args.cnvkit_bed, args.output_dir)
|
|
|
|
|
|
elif probe == '624':
|
|
|
|
|
|
if not args.vcf2:
|
|
|
|
|
|
parser.error('--vcf2 is required in probe 624')
|
|
|
|
|
|
if not args.cnvkit_bed:
|
|
|
|
|
|
parser.error('--cnvkit_bed is required in probe 624')
|
|
|
|
|
|
paired_monitor_vcf(args.name, args.vcf, args.vcf2, bed_path, args.cnvkit_bed, args.output_dir)
|
2023-11-30 15:31:35 +08:00
|
|
|
|
else:
|
|
|
|
|
|
parser.error('probe error. 682, 624, 160, 17 for now')
|