pipeline/codes/gender_dectec.py

186 lines
6.0 KiB
Python
Raw Normal View History

2024-01-29 13:02:36 +08:00
#!/usr/bin/env python3
import argparse
import os
import pandas as pd
import pysam
def load_bed_regions(bed_file):
bed_regions = set()
if bed_file:
with open(bed_file, 'r') as file:
for line in file:
if line.startswith('#'):
continue
parts = line.strip().split('\t')
if len(parts) >= 3:
chrom, start, end = parts[0], int(parts[1]), int(parts[2])
bed_regions.add((chrom, start, end))
return bed_regions
def build_record_dict(vcf_file):
records = {}
for record in vcf_file:
key = (record.contig, record.pos)
records[key] = record
return records
def simplify_genotype(freq):
if freq < 0.1:
return '0/0'
elif 0.1 <= freq <= 0.9:
return '0/1'
else:
return '1/1'
def paired_monitor(name, vcf_file, gender_bed, output_dir):
"""
682双样本处理
"""
gender_bed_regions = load_bed_regions(gender_bed)
vcf = pysam.VariantFile(vcf_file)
records_dict = build_record_dict(vcf)
res_pos = list()
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in gender_bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
if key in records_dict:
record = records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0]
ref_normal = ref
alt_normal = alt
genotype = simplify_genotype(freq)
genotype_normal = simplify_genotype(freq_normal)
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
))
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False)
percentage = (res_pos_df_sort['genotype'].str.count('0/1') > 0).mean() * 100
percentage_nor = (res_pos_df_sort['genotype_normal'].str.count('0/1') > 0).mean() * 100
gender_tumor = '' if percentage == 0 else ''
gender_normal = '' if percentage_nor == 0 else ''
res = dict(
gender_tumor=gender_tumor,
gender_normal=gender_normal
)
gender_res_df = pd.DataFrame([res])
gender_res_df.to_csv(os.path.join(output_dir, f'{name}_gender_res.txt'), sep="\t", index=False)
vcf.close()
def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, gender_bed, output_dir):
"""
624双样本处理
"""
bed_regions = load_bed_regions(gender_bed)
tumor_vcf = pysam.VariantFile(tumor_vcf_file)
normal_vcf = pysam.VariantFile(normal_vcf_file)
tumor_records_dict = build_record_dict(tumor_vcf)
normal_records_dict = build_record_dict(normal_vcf)
res_pos = list()
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
if key in tumor_records_dict:
record = tumor_records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype = simplify_genotype(freq)
if key in normal_records_dict:
record = normal_records_dict[key]
ref_normal = record.ref
alt_normal = record.alts[0]
freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype_normal = simplify_genotype(freq_normal)
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
))
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False)
tumor_vcf.close()
normal_vcf.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Pollution Process Script")
parser.add_argument('-n', '--name', help="Name for sample", required=True)
parser.add_argument('-v', '--vcf', help="raw vcf for prbe 682",
required=True)
parser.add_argument('-v2', '--vcf2', help="germline vcf; 624", default='', required=False, nargs='?')
parser.add_argument('-g', '--gender_bed', help="gender_bed; required when prbe 682 or 624")
parser.add_argument('-p', '--probe', help="probe, 682, 624 for now ", required=True)
parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
args = parser.parse_args()
probe = args.probe
if not args.gender_bed:
parser.error('--gender_bed is required in probe 624')
gender_bed_path = os.path.realpath(args.gender_bed)
print(f'性别鉴定使用gender_bed: {gender_bed_path}')
if probe == '682':
paired_monitor(args.name, args.vcf, args.gender_bed, args.output_dir)
elif probe == '624':
if not args.vcf2:
parser.error('--vcf2 is required in probe 624')
paired_monitor_vcf(args.name, args.vcf, args.vcf2, args.gender_bed, args.output_dir)
else:
parser.error('probe error. 682, 624 for now')