性别鉴定

master
chaopower 2024-01-29 13:02:36 +08:00
parent 94d5d859e9
commit 0a60e18e93
2 changed files with 196 additions and 0 deletions

View File

@ -0,0 +1,185 @@
#!/usr/bin/env python3
import argparse
import os
import pandas as pd
import pysam
def load_bed_regions(bed_file):
bed_regions = set()
if bed_file:
with open(bed_file, 'r') as file:
for line in file:
if line.startswith('#'):
continue
parts = line.strip().split('\t')
if len(parts) >= 3:
chrom, start, end = parts[0], int(parts[1]), int(parts[2])
bed_regions.add((chrom, start, end))
return bed_regions
def build_record_dict(vcf_file):
records = {}
for record in vcf_file:
key = (record.contig, record.pos)
records[key] = record
return records
def simplify_genotype(freq):
if freq < 0.1:
return '0/0'
elif 0.1 <= freq <= 0.9:
return '0/1'
else:
return '1/1'
def paired_monitor(name, vcf_file, gender_bed, output_dir):
"""
682双样本处理
"""
gender_bed_regions = load_bed_regions(gender_bed)
vcf = pysam.VariantFile(vcf_file)
records_dict = build_record_dict(vcf)
res_pos = list()
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in gender_bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
if key in records_dict:
record = records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0]
ref_normal = ref
alt_normal = alt
genotype = simplify_genotype(freq)
genotype_normal = simplify_genotype(freq_normal)
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
))
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False)
percentage = (res_pos_df_sort['genotype'].str.count('0/1') > 0).mean() * 100
percentage_nor = (res_pos_df_sort['genotype_normal'].str.count('0/1') > 0).mean() * 100
gender_tumor = '' if percentage == 0 else ''
gender_normal = '' if percentage_nor == 0 else ''
res = dict(
gender_tumor=gender_tumor,
gender_normal=gender_normal
)
gender_res_df = pd.DataFrame([res])
gender_res_df.to_csv(os.path.join(output_dir, f'{name}_gender_res.txt'), sep="\t", index=False)
vcf.close()
def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, gender_bed, output_dir):
"""
624双样本处理
"""
bed_regions = load_bed_regions(gender_bed)
tumor_vcf = pysam.VariantFile(tumor_vcf_file)
normal_vcf = pysam.VariantFile(normal_vcf_file)
tumor_records_dict = build_record_dict(tumor_vcf)
normal_records_dict = build_record_dict(normal_vcf)
res_pos = list()
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
if key in tumor_records_dict:
record = tumor_records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype = simplify_genotype(freq)
if key in normal_records_dict:
record = normal_records_dict[key]
ref_normal = record.ref
alt_normal = record.alts[0]
freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype_normal = simplify_genotype(freq_normal)
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
))
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_gender.txt'), sep="\t", index=False)
tumor_vcf.close()
normal_vcf.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Pollution Process Script")
parser.add_argument('-n', '--name', help="Name for sample", required=True)
parser.add_argument('-v', '--vcf', help="raw vcf for prbe 682",
required=True)
parser.add_argument('-v2', '--vcf2', help="germline vcf; 624", default='', required=False, nargs='?')
parser.add_argument('-g', '--gender_bed', help="gender_bed; required when prbe 682 or 624")
parser.add_argument('-p', '--probe', help="probe, 682, 624 for now ", required=True)
parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
args = parser.parse_args()
probe = args.probe
if not args.gender_bed:
parser.error('--gender_bed is required in probe 624')
gender_bed_path = os.path.realpath(args.gender_bed)
print(f'性别鉴定使用gender_bed: {gender_bed_path}')
if probe == '682':
paired_monitor(args.name, args.vcf, args.gender_bed, args.output_dir)
elif probe == '624':
if not args.vcf2:
parser.error('--vcf2 is required in probe 624')
paired_monitor_vcf(args.name, args.vcf, args.vcf2, args.gender_bed, args.output_dir)
else:
parser.error('probe error. 682, 624 for now')

View File

@ -0,0 +1,11 @@
#chr start end rs
chrX 39922358 39922359 rs3810694
chrX 39932906 39932907 rs6520618
chrX 44833840 44833841 rs6611055
chrX 76937962 76937963 rs3088074
chrX 76940533 76940534 rs35268552
chrX 100608190 100608191 rs1135363
chrX 100611284 100611285 rs3747288
chrX 53228147 53228148 rs1977364
chrX 44929076 44929077 rs2230018
chrX 47424614 47424615 rs2071776