#!/usr/bin/env python3 import argparse import os import pandas as pd import pysam def load_bed_regions(bed_file): bed_regions = set() if bed_file: with open(bed_file, 'r') as file: for line in file: parts = line.strip().split('\t') if len(parts) >= 3: chrom, start, end = parts[0], int(parts[1]), int(parts[2]) bed_regions.add((chrom, start, end)) return bed_regions def build_record_dict(vcf_file): records = {} for record in vcf_file: key = (record.contig, record.pos) records[key] = record return records def simplify_genotype(freq): if freq < 0.1: return '0/0' elif 0.1 <= freq <= 0.9: return '0/1' else: return '1/1' def single_monitor(name, vcf_file, bed_file, freq_range, output_dir): """ 17, 160 单样本处理 rawvcf 对应bed 挑选出 位点,计算位点是否在给定范围内,计算比例得到z_socre """ # bed 处理 bed_regions = load_bed_regions(bed_file) vcf = pysam.VariantFile(vcf_file) res_pos = list() count_normal = 0 count_exception = 0 for record in vcf: contig = record.contig pos = record.pos ref = record.ref alt = record.alts[0] freq = record.samples.get(record.samples.keys()[0]).get("AF")[0] if freq < 0.1 or freq > 0.9: continue # 检查记录是否是点突变 if len(record.ref) == 1 and len(record.alts[0]) == 1: # 在bed 区间内 for (bed_chrom, bed_start, bed_end) in bed_regions: if contig == bed_chrom and bed_start < pos <= bed_end: if freq_range[0] <= freq <= freq_range[1]: count_normal += 1 res = 'yes' else: count_exception += 1 res = 'no' res_pos.append(dict( chr=contig, pos=pos, ref=ref, alt=alt, freq=freq, res=res )) break count_all = count_exception + count_normal if count_all == 0: z_score = 0 else: z_score = count_exception / count_all res = dict( barcode=name, count_normal=count_normal, count_exception=count_exception, z_score=z_score ) pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution_sum.txt'), sep="\t", index=False) res_pos_df = pd.DataFrame(res_pos) if not res_pos_df.empty: res_pos_df.rename(columns={ 'res': f'res{freq_range}', }, inplace=True) res_pos_df.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False) def paired_monitor(name, vcf_file, bed_file, cnvkit_bed, output_dir): """ 682双样本处理 """ bed_regions = load_bed_regions(bed_file) cnvkit_bed_regions = load_bed_regions(cnvkit_bed) vcf = pysam.VariantFile(vcf_file) records_dict = build_record_dict(vcf) res_pos = list() # 遍历 bed for (bed_chrom, bed_start, bed_end) in bed_regions: key = (bed_chrom, bed_end) ref = '.' alt = '.' freq = '.' genotype = '.' ref_normal = '.' alt_normal = '.' freq_normal = '.' genotype_normal = '.' res = 'no' if key in records_dict: record = records_dict[key] ref = record.ref alt = record.alts[0] freq = record.samples.get(record.samples.keys()[0]).get("AF")[0] freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0] ref_normal = ref alt_normal = alt genotype = simplify_genotype(freq) genotype_normal = simplify_genotype(freq_normal) if genotype == genotype_normal: res = 'yes' res_pos.append(dict( chr=bed_chrom, pos=bed_end, ref=ref, alt=alt, freq=freq, genotype=genotype, ref_normal=ref_normal, alt_normal=alt_normal, freq_normal=freq_normal, genotype_normal=genotype_normal, res=res )) res_pos_df = pd.DataFrame(res_pos) res_pos_df_sort = res_pos_df.sort_values(by='chr') res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False) output_tumor_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w') output_normal_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w') for line in str(vcf.header).strip().split('\n'): output_tumor_vcf.write(line + '\n') if line.startswith('##') else output_tumor_vcf.write( '\t'.join(line.split('\t')[:-1]) + '\n') output_normal_vcf.write(line + '\n') if line.startswith('##') else output_normal_vcf.write( '\t'.join(line.split('\t')[:-2] + line.split('\t')[-1:]) + '\n') for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions: key = (bed_chrom, bed_end) if key in records_dict: record = records_dict[key] output_tumor_vcf.write('\t'.join(str(record).split('\t')[:-1]) + '\n') output_normal_vcf.write('\t'.join(str(record).split('\t')[:-2] + str(record).split('\t')[-1:]) + '\n') output_tumor_vcf.close() output_normal_vcf.close() def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, bed_file, cnvkit_bed, output_dir): """ 624双样本处理 """ bed_regions = load_bed_regions(bed_file) tumor_vcf = pysam.VariantFile(tumor_vcf_file) normal_vcf = pysam.VariantFile(normal_vcf_file) tumor_records_dict = build_record_dict(tumor_vcf) normal_records_dict = build_record_dict(normal_vcf) res_pos = list() # 遍历 bed for (bed_chrom, bed_start, bed_end) in bed_regions: key = (bed_chrom, bed_end) ref = '.' alt = '.' freq = '.' genotype = '.' ref_normal = '.' alt_normal = '.' freq_normal = '.' genotype_normal = '.' res = 'no' if key in tumor_records_dict: record = tumor_records_dict[key] ref = record.ref alt = record.alts[0] freq = record.samples.get(record.samples.keys()[0]).get("AF")[0] genotype = simplify_genotype(freq) if key in normal_records_dict: record = normal_records_dict[key] ref_normal = record.ref alt_normal = record.alts[0] freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0] genotype_normal = simplify_genotype(freq_normal) if genotype == genotype_normal: res = 'yes' res_pos.append(dict( chr=bed_chrom, pos=bed_end, ref=ref, alt=alt, freq=freq, genotype=genotype, ref_normal=ref_normal, alt_normal=alt_normal, freq_normal=freq_normal, genotype_normal=genotype_normal, res=res )) res_pos_df = pd.DataFrame(res_pos) res_pos_df_sort = res_pos_df.sort_values(by='chr') res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False) output_tumor_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w', header=tumor_vcf.header) output_normal_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w', header=normal_vcf.header) cnvkit_bed_regions = load_bed_regions(cnvkit_bed) for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions: key = (bed_chrom, bed_end) if key in normal_records_dict: output_normal_vcf.write(normal_records_dict[key]) if key in tumor_records_dict: output_tumor_vcf.write(tumor_records_dict[key]) output_tumor_vcf.close() output_normal_vcf.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description="Pollution Process Script") parser.add_argument('-n', '--name', help="Name for sample", required=True) parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True) parser.add_argument('-v', '--vcf', help="raw vcf for prbe 160 or 17 ; somatic vcf for prbe 682 or 624", required=True) parser.add_argument('-v2', '--vcf2', help="germline vcf; required when prbe 682 or 624", default='', required=False, nargs='?') parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required when prbe 682 or 624") parser.add_argument('-p', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True) parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='') args = parser.parse_args() bed_path = os.path.realpath(args.ref_bed) print(f'污染检测使用ref_bed: {bed_path}') probe = args.probe if probe == '160' or probe == '17': freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe) single_monitor(args.name, args.vcf, bed_path, freq_range, args.output_dir) elif probe == '682': if not args.cnvkit_bed: parser.error('--cnvkit_bed is required in probe 624') cnvkit_bed_path = os.path.realpath(args.cnvkit_bed) print(f'污染检测使用cnvkit_bed: {cnvkit_bed_path}') paired_monitor(args.name, args.vcf, bed_path, args.cnvkit_bed, args.output_dir) elif probe == '624': if not args.vcf2: parser.error('--vcf2 is required in probe 624') if not args.cnvkit_bed: parser.error('--cnvkit_bed is required in probe 624') paired_monitor_vcf(args.name, args.vcf, args.vcf2, bed_path, args.cnvkit_bed, args.output_dir) else: parser.error('probe error. 682, 624, 160, 17 for now')