#!/usr/bin/env python3 import argparse import os import pandas as pd import pysam def load_bed_regions(bed_file): bed_regions = set() if bed_file: with open(bed_file, 'r') as file: for line in file: if line.startswith('#'): continue parts = line.strip().split('\t') if len(parts) >= 3: chrom, start, end = parts[0], int(parts[1]), int(parts[2]) bed_regions.add((chrom, start, end)) return bed_regions def build_record_dict(vcf_file): records = {} for record in vcf_file: key = (record.contig, record.pos) records[key] = record return records def simplify_genotype(freq): if freq < 0.1: return '0/0' elif 0.1 <= freq <= 0.9: return '0/1' else: return '1/1' def paired_monitor(name, vcf_file, gender_bed, output_dir): """ 682双样本处理 """ gender_bed_regions = load_bed_regions(gender_bed) vcf = pysam.VariantFile(vcf_file) records_dict = build_record_dict(vcf) res_pos = list() # 遍历 bed for (bed_chrom, bed_start, bed_end) in gender_bed_regions: key = (bed_chrom, bed_end) ref = '.' alt = '.' freq = '.' genotype = '.' ref_normal = '.' alt_normal = '.' freq_normal = '.' genotype_normal = '.' if key in records_dict: record = records_dict[key] ref = record.ref alt = record.alts[0] freq = record.samples.get(record.samples.keys()[0]).get("AF")[0] freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0] ref_normal = ref alt_normal = alt genotype = simplify_genotype(freq) genotype_normal = simplify_genotype(freq_normal) res_pos.append(dict( chr=bed_chrom, pos=bed_end, ref=ref, alt=alt, freq=freq, genotype=genotype, ref_normal=ref_normal, alt_normal=alt_normal, freq_normal=freq_normal, genotype_normal=genotype_normal, )) res_pos_df = pd.DataFrame(res_pos) res_pos_df_sort = res_pos_df.sort_values(by='chr') res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False) percentage = (res_pos_df_sort['genotype'].str.count('0/1') > 0).mean() * 100 percentage_nor = (res_pos_df_sort['genotype_normal'].str.count('0/1') > 0).mean() * 100 gender_tumor = '男' if percentage == 0 else '女' gender_normal = '男' if percentage_nor == 0 else '女' res = dict( gender_tumor=gender_tumor, gender_normal=gender_normal ) gender_res_df = pd.DataFrame([res]) gender_res_df.to_csv(os.path.join(output_dir, f'{name}_gender_res.txt'), sep="\t", index=False) vcf.close() def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, gender_bed, output_dir): """ 624双样本处理 """ bed_regions = load_bed_regions(gender_bed) tumor_vcf = pysam.VariantFile(tumor_vcf_file) normal_vcf = pysam.VariantFile(normal_vcf_file) tumor_records_dict = build_record_dict(tumor_vcf) normal_records_dict = build_record_dict(normal_vcf) res_pos = list() # 遍历 bed for (bed_chrom, bed_start, bed_end) in bed_regions: key = (bed_chrom, bed_end) ref = '.' alt = '.' freq = '.' genotype = '.' ref_normal = '.' alt_normal = '.' freq_normal = '.' genotype_normal = '.' if key in tumor_records_dict: record = tumor_records_dict[key] ref = record.ref alt = record.alts[0] freq = record.samples.get(record.samples.keys()[0]).get("AF")[0] genotype = simplify_genotype(freq) if key in normal_records_dict: record = normal_records_dict[key] ref_normal = record.ref alt_normal = record.alts[0] freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0] genotype_normal = simplify_genotype(freq_normal) res_pos.append(dict( chr=bed_chrom, pos=bed_end, ref=ref, alt=alt, freq=freq, genotype=genotype, ref_normal=ref_normal, alt_normal=alt_normal, freq_normal=freq_normal, genotype_normal=genotype_normal, )) res_pos_df = pd.DataFrame(res_pos) res_pos_df_sort = res_pos_df.sort_values(by='chr') res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False) tumor_vcf.close() normal_vcf.close() if __name__ == '__main__': parser = argparse.ArgumentParser(description="Pollution Process Script") parser.add_argument('-n', '--name', help="Name for sample", required=True) parser.add_argument('-v', '--vcf', help="raw vcf for prbe 682", required=True) parser.add_argument('-v2', '--vcf2', help="germline vcf; 624", default='', required=False, nargs='?') parser.add_argument('-g', '--gender_bed', help="gender_bed; required when prbe 682 or 624") parser.add_argument('-p', '--probe', help="probe, 682, 624 for now ", required=True) parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='') args = parser.parse_args() probe = args.probe if not args.gender_bed: parser.error('--gender_bed is required in probe 624') gender_bed_path = os.path.realpath(args.gender_bed) print(f'性别鉴定使用gender_bed: {gender_bed_path}') if probe == '682': paired_monitor(args.name, args.vcf, args.gender_bed, args.output_dir) elif probe == '624': if not args.vcf2: parser.error('--vcf2 is required in probe 624') paired_monitor_vcf(args.name, args.vcf, args.vcf2, args.gender_bed, args.output_dir) else: parser.error('probe error. 682, 624 for now')