pipeline/codes/pollution.py

292 lines
10 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
import argparse
import os
import pandas as pd
import pysam
def load_bed_regions(bed_file):
bed_regions = set()
if bed_file:
with open(bed_file, 'r') as file:
for line in file:
parts = line.strip().split('\t')
if len(parts) >= 3:
chrom, start, end = parts[0], int(parts[1]), int(parts[2])
bed_regions.add((chrom, start, end))
return bed_regions
def build_record_dict(vcf_file):
records = {}
for record in vcf_file:
key = (record.contig, record.pos)
records[key] = record
return records
def simplify_genotype(freq):
if freq < 0.1:
return '0/0'
elif 0.1 <= freq <= 0.9:
return '0/1'
else:
return '1/1'
def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
"""
17 160 单样本处理
rawvcf 对应bed 挑选出 位点计算位点是否在给定范围内计算比例得到z_socre
"""
# bed 处理
bed_regions = load_bed_regions(bed_file)
vcf = pysam.VariantFile(vcf_file)
res_pos = list()
count_normal = 0
count_exception = 0
vcf_out = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
for line in str(vcf.header).strip().split('\n'):
vcf_out.write(line + '\n')
for record in vcf:
contig = record.contig
pos = record.pos
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
if freq < 0.1 or freq > 0.9:
continue
# 检查记录是否是点突变
if len(record.ref) == 1 and len(record.alts[0]) == 1:
# 在bed 区间内
for (bed_chrom, bed_start, bed_end) in bed_regions:
if contig == bed_chrom and bed_start < pos <= bed_end:
if freq_range[0] <= freq <= freq_range[1]:
count_normal += 1
res = 'yes'
else:
count_exception += 1
res = 'no'
res_pos.append(dict(
chr=contig,
pos=pos,
ref=ref,
alt=alt,
freq=freq,
res=res
))
vcf_out.write(str(record) + '\n')
break
count_all = count_exception + count_normal
if count_all == 0:
z_score = 0
else:
z_score = count_exception / count_all
res = dict(
barcode=name,
count_normal=count_normal,
count_exception=count_exception,
z_score=z_score
)
pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution_sum.txt'), sep="\t", index=False)
res_pos_df = pd.DataFrame(res_pos)
if not res_pos_df.empty:
res_pos_df.rename(columns={
'res': f'res{freq_range}',
}, inplace=True)
res_pos_df.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
def paired_monitor(name, vcf_file, bed_file, cnvkit_bed, output_dir):
"""
682双样本处理
"""
bed_regions = load_bed_regions(bed_file)
cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
vcf = pysam.VariantFile(vcf_file)
records_dict = build_record_dict(vcf)
res_pos = list()
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
res = 'no'
if key in records_dict:
record = records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0]
ref_normal = ref
alt_normal = alt
genotype = simplify_genotype(freq)
genotype_normal = simplify_genotype(freq_normal)
if genotype == genotype_normal:
res = 'yes'
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
res=res
))
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
output_tumor_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
output_normal_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w')
for line in str(vcf.header).strip().split('\n'):
output_tumor_vcf.write(line + '\n') if line.startswith('##') else output_tumor_vcf.write(
'\t'.join(line.split('\t')[:-1]) + '\n')
output_normal_vcf.write(line + '\n') if line.startswith('##') else output_normal_vcf.write(
'\t'.join(line.split('\t')[:-2] + line.split('\t')[-1:]) + '\n')
for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
key = (bed_chrom, bed_end)
if key in records_dict:
record = records_dict[key]
output_tumor_vcf.write('\t'.join(str(record).split('\t')[:-1]) + '\n')
output_normal_vcf.write('\t'.join(str(record).split('\t')[:-2] + str(record).split('\t')[-1:]) + '\n')
output_tumor_vcf.close()
output_normal_vcf.close()
def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, bed_file, cnvkit_bed, output_dir):
"""
624双样本处理
"""
bed_regions = load_bed_regions(bed_file)
tumor_vcf = pysam.VariantFile(tumor_vcf_file)
normal_vcf = pysam.VariantFile(normal_vcf_file)
tumor_records_dict = build_record_dict(tumor_vcf)
normal_records_dict = build_record_dict(normal_vcf)
res_pos = list()
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
res = 'no'
if key in tumor_records_dict:
record = tumor_records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype = simplify_genotype(freq)
if key in normal_records_dict:
record = normal_records_dict[key]
ref_normal = record.ref
alt_normal = record.alts[0]
freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype_normal = simplify_genotype(freq_normal)
if genotype == genotype_normal:
res = 'yes'
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
res=res
))
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
output_tumor_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w',
header=tumor_vcf.header)
output_normal_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w',
header=normal_vcf.header)
cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
key = (bed_chrom, bed_end)
if key in normal_records_dict:
output_normal_vcf.write(normal_records_dict[key])
if key in tumor_records_dict:
output_tumor_vcf.write(tumor_records_dict[key])
output_tumor_vcf.close()
output_normal_vcf.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Pollution Process Script")
parser.add_argument('-n', '--name', help="Name for sample", required=True)
parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True)
parser.add_argument('-v', '--vcf', help="raw vcf for prbe 160 or 17 ; somatic vcf for prbe 682 or 624",
required=True)
parser.add_argument('-v2', '--vcf2', help="germline vcf; required when prbe 682 or 624", default='', required=False,
nargs='?')
parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required when prbe 682 or 624")
parser.add_argument('-p', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True)
parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
args = parser.parse_args()
bed_path = os.path.realpath(args.ref_bed)
print(f'污染检测使用ref_bed: {bed_path}')
probe = args.probe
if probe == '160' or probe == '17':
freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe)
single_monitor(args.name, args.vcf, bed_path, freq_range, args.output_dir)
elif probe == '682':
if not args.cnvkit_bed:
parser.error('--cnvkit_bed is required in probe 624')
cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
print(f'污染检测使用cnvkit_bed: {cnvkit_bed_path}')
paired_monitor(args.name, args.vcf, bed_path, args.cnvkit_bed, args.output_dir)
elif probe == '624':
if not args.vcf2:
parser.error('--vcf2 is required in probe 624')
if not args.cnvkit_bed:
parser.error('--cnvkit_bed is required in probe 624')
paired_monitor_vcf(args.name, args.vcf, args.vcf2, bed_path, args.cnvkit_bed, args.output_dir)
else:
parser.error('probe error. 682, 624, 160, 17 for now')