diff --git a/codes/gender_dectec.py b/codes/gender_dectec.py new file mode 100755 index 0000000..dbfaf3a --- /dev/null +++ b/codes/gender_dectec.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 + + +import argparse +import os + +import pandas as pd +import pysam + + +def load_bed_regions(bed_file): + bed_regions = set() + if bed_file: + with open(bed_file, 'r') as file: + for line in file: + if line.startswith('#'): + continue + parts = line.strip().split('\t') + if len(parts) >= 3: + chrom, start, end = parts[0], int(parts[1]), int(parts[2]) + bed_regions.add((chrom, start, end)) + return bed_regions + + +def build_record_dict(vcf_file): + records = {} + for record in vcf_file: + key = (record.contig, record.pos) + records[key] = record + return records + + +def simplify_genotype(freq): + if freq < 0.1: + return '0/0' + elif 0.1 <= freq <= 0.9: + return '0/1' + else: + return '1/1' + + +def paired_monitor(name, vcf_file, gender_bed, output_dir): + """ + 682双样本处理 + """ + gender_bed_regions = load_bed_regions(gender_bed) + vcf = pysam.VariantFile(vcf_file) + records_dict = build_record_dict(vcf) + res_pos = list() + + # 遍历 bed + for (bed_chrom, bed_start, bed_end) in gender_bed_regions: + key = (bed_chrom, bed_end) + ref = '.' + alt = '.' + freq = '.' + genotype = '.' + ref_normal = '.' + alt_normal = '.' + freq_normal = '.' + genotype_normal = '.' + if key in records_dict: + record = records_dict[key] + ref = record.ref + alt = record.alts[0] + freq = record.samples.get(record.samples.keys()[0]).get("AF")[0] + freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0] + ref_normal = ref + alt_normal = alt + + genotype = simplify_genotype(freq) + genotype_normal = simplify_genotype(freq_normal) + + res_pos.append(dict( + chr=bed_chrom, + pos=bed_end, + ref=ref, + alt=alt, + freq=freq, + genotype=genotype, + ref_normal=ref_normal, + alt_normal=alt_normal, + freq_normal=freq_normal, + genotype_normal=genotype_normal, + )) + res_pos_df = pd.DataFrame(res_pos) + res_pos_df_sort = res_pos_df.sort_values(by='chr') + res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False) + + percentage = (res_pos_df_sort['genotype'].str.count('0/1') > 0).mean() * 100 + percentage_nor = (res_pos_df_sort['genotype_normal'].str.count('0/1') > 0).mean() * 100 + gender_tumor = '男' if percentage == 0 else '女' + gender_normal = '男' if percentage_nor == 0 else '女' + res = dict( + gender_tumor=gender_tumor, + gender_normal=gender_normal + ) + gender_res_df = pd.DataFrame([res]) + gender_res_df.to_csv(os.path.join(output_dir, f'{name}_gender_res.txt'), sep="\t", index=False) + vcf.close() + + +def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, gender_bed, output_dir): + """ + 624双样本处理 + """ + bed_regions = load_bed_regions(gender_bed) + tumor_vcf = pysam.VariantFile(tumor_vcf_file) + normal_vcf = pysam.VariantFile(normal_vcf_file) + + tumor_records_dict = build_record_dict(tumor_vcf) + normal_records_dict = build_record_dict(normal_vcf) + + res_pos = list() + + # 遍历 bed + for (bed_chrom, bed_start, bed_end) in bed_regions: + key = (bed_chrom, bed_end) + ref = '.' + alt = '.' + freq = '.' + genotype = '.' + ref_normal = '.' + alt_normal = '.' + freq_normal = '.' + genotype_normal = '.' + + if key in tumor_records_dict: + record = tumor_records_dict[key] + ref = record.ref + alt = record.alts[0] + freq = record.samples.get(record.samples.keys()[0]).get("AF")[0] + genotype = simplify_genotype(freq) + if key in normal_records_dict: + record = normal_records_dict[key] + ref_normal = record.ref + alt_normal = record.alts[0] + freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0] + genotype_normal = simplify_genotype(freq_normal) + res_pos.append(dict( + chr=bed_chrom, + pos=bed_end, + ref=ref, + alt=alt, + freq=freq, + genotype=genotype, + ref_normal=ref_normal, + alt_normal=alt_normal, + freq_normal=freq_normal, + genotype_normal=genotype_normal, + )) + + res_pos_df = pd.DataFrame(res_pos) + res_pos_df_sort = res_pos_df.sort_values(by='chr') + res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False) + tumor_vcf.close() + normal_vcf.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Pollution Process Script") + + parser.add_argument('-n', '--name', help="Name for sample", required=True) + parser.add_argument('-v', '--vcf', help="raw vcf for prbe 682", + required=True) + parser.add_argument('-v2', '--vcf2', help="germline vcf; 624", default='', required=False, nargs='?') + parser.add_argument('-g', '--gender_bed', help="gender_bed; required when prbe 682 or 624") + parser.add_argument('-p', '--probe', help="probe, 682, 624 for now ", required=True) + parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='') + args = parser.parse_args() + + probe = args.probe + if not args.gender_bed: + parser.error('--gender_bed is required in probe 624') + gender_bed_path = os.path.realpath(args.gender_bed) + print(f'性别鉴定使用gender_bed: {gender_bed_path}') + if probe == '682': + + paired_monitor(args.name, args.vcf, args.gender_bed, args.output_dir) + elif probe == '624': + if not args.vcf2: + parser.error('--vcf2 is required in probe 624') + paired_monitor_vcf(args.name, args.vcf, args.vcf2, args.gender_bed, args.output_dir) + else: + parser.error('probe error. 682, 624 for now') diff --git a/codes/public/pollution/gender.bed b/codes/public/pollution/gender.bed new file mode 100644 index 0000000..430c671 --- /dev/null +++ b/codes/public/pollution/gender.bed @@ -0,0 +1,11 @@ +#chr start end rs +chrX 39922358 39922359 rs3810694 +chrX 39932906 39932907 rs6520618 +chrX 44833840 44833841 rs6611055 +chrX 76937962 76937963 rs3088074 +chrX 76940533 76940534 rs35268552 +chrX 100608190 100608191 rs1135363 +chrX 100611284 100611285 rs3747288 +chrX 53228147 53228148 rs1977364 +chrX 44929076 44929077 rs2230018 +chrX 47424614 47424615 rs2071776