pipeline/script/gender_detect.py

97 lines
3.3 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/usr/bin/env python3
#-*-coding:utf-8-*-
import sys
import json
import pandas as pd
def read_position_file(positions_filename):
    """Load the site coordinates listed in a tab-separated positions file.

    Args:
        positions_filename: Path to a tab-delimited table whose header row
            contains an ``end`` column.

    Returns:
        A list holding every value of the ``end`` column converted to
        ``int``, in file order.
    """
    table = pd.read_csv(positions_filename, sep='\t')
    return [int(value) for value in table['end']]
def process_positions(sample_vcf, positions):
    """Collect per-site statistics on chrX for each requested position.

    The VCF is read as a header-less, tab-separated table with 11 columns
    labelled 0-10. pandas treats everything from a ``#`` to the end of the
    line as a comment, which drops the VCF header lines.

    For every record whose column 0 is ``"chrX"`` and whose column 1 equals
    the position, one ``:``-separated field of column 9 is parsed as a
    number (a trailing ``%`` means the value is a percentage and is divided
    by 100).
    NOTE(review): the parsed value is presumably the variant allele
    fraction -- confirm against the VCF FORMAT definition.

    Args:
        sample_vcf: Path to the sample VCF file.
        positions: Iterable of integer coordinates to look up.

    Returns:
        A dict mapping each position to ``{'count': n, 'sum': s}`` where
        ``s`` is the total of the parsed values at that position and ``n``
        is how many of them fall inside the closed interval [0.1, 0.9].
        Positions without a matching record map to ``{'count': 0, 'sum': 0}``.
    """
    # Rows with fewer than 11 fields get NaN in the trailing columns; the
    # original note says this happens for "624" inputs (presumably a panel
    # or sample type -- confirm).
    df = pd.read_csv(sample_vcf, sep='\t', comment='#', header=None, names=range(11))

    # The chromosome filter does not depend on the position, so apply it
    # once instead of rescanning the whole table for every position.
    chrx_rows = df[df[0] == "chrX"]

    result_dict = {}
    for position in positions:
        stats = {'count': 0, 'sum': 0}
        result_dict[position] = stats

        matching_rows = chrx_rows[chrx_rows[1] == position]
        for _, row in matching_rows.iterrows():
            fields = row[9].split(":")
            # When column 10 is missing the value sits in field 4 of
            # column 9; otherwise it is taken from field 5.
            value_str = fields[4] if pd.isna(row[10]) else fields[5]

            if value_str.endswith('%'):
                value = float(value_str[:-1]) / 100  # percentage -> fraction
            else:
                value = float(value_str)

            stats['sum'] += value
            if 0.1 <= value <= 0.9:
                stats['count'] += 1
    return result_dict
def write_output(result_dict):
    """Return the mean ``count`` over all positions in ``result_dict``.

    Despite its name this function writes nothing; it only computes a value.

    Args:
        result_dict: Mapping of position to a dict that has a ``'count'``
            entry.

    Returns:
        The sum of all ``'count'`` values divided by the number of
        positions, or ``0`` when ``result_dict`` is empty.
    """
    counts = [entry['count'] for entry in result_dict.values()]
    if not counts:
        return 0
    return sum(counts) / len(counts)
def sex_detect_probe(average_i):
    """Derive the sample's sex label from the mean per-site count.

    NOTE(review): both outcomes below are the empty string, so the result
    currently does not depend on ``average_i``. The intended labels were
    presumably lost -- confirm and restore them.

    Args:
        average_i: Mean count value to classify.

    Returns:
        The label for the sample (currently always ``""``).
    """
    no_counted_sites = average_i == 0.0
    if no_counted_sites:
        return ""
    return ""
def sex_detect_lims(filename):
    """Read the gender recorded in a LIMS JSON export.

    Args:
        filename: Path to a UTF-8 JSON file whose top-level ``"data"`` entry
            is a list; the first element must carry a ``"gender"`` key.

    Returns:
        The value stored at ``data[0]["gender"]``.
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        payload = json.load(handle)
    first_record = payload["data"][0]
    return first_record["gender"]
def sex_detect(sample_sex_probe, sample_sex_lims):
    """Compare the inferred sex with the one recorded in LIMS.

    Args:
        sample_sex_probe: Sex label inferred from the sample data.
        sample_sex_lims: Sex label read from LIMS.

    Returns:
        ``"性别一致"`` ("sex consistent") when the two labels are equal;
        otherwise a message ("sex inconsistent") that embeds both labels.
    """
    if sample_sex_probe == sample_sex_lims:
        return "性别一致"

    # Build the mismatch report from its two halves, joined by a comma.
    probe_part = f"性别不一致(计算结果:{sample_sex_probe}"
    lims_part = f"lims:{sample_sex_lims})"
    return ",".join((probe_part, lims_part))
def main(out_dir, sample_name, position_file="/dataseq/jmdna/codes/public/gender.bed"):
    """Run the sex check for one sample and return the comparison message.

    Steps: read the site list, collect per-site statistics from the sample
    VCF, average the counts, derive the inferred sex, read the LIMS sex from
    the sample's JSON file, and compare the two.

    Args:
        out_dir: Sample output directory. The VCF is read from
            ``<out_dir>/mutation/<sample_name>.snp.indel.Germline.anno.hg19_multianno.vcf``
            and the LIMS JSON from ``<out_dir>/qc/<sample_name>_post.json``.
        sample_name: Sample identifier used in both file names.
        position_file: Path to the tab-separated site list. Defaults to the
            previously hard-coded location so existing callers are
            unaffected; override it to run outside that environment.

    Returns:
        The message produced by ``sex_detect``.
    """
    sample_vcf = out_dir + "/mutation/" + sample_name + ".snp.indel.Germline.anno.hg19_multianno.vcf"
    post_json_file = out_dir + "/qc/" + sample_name + "_post.json"

    # Infer the sex from the sample data.
    positions = read_position_file(position_file)
    result_dict = process_positions(sample_vcf, positions)
    average_i = write_output(result_dict)
    sample_sex_probe = sex_detect_probe(average_i)

    # Compare against the value recorded in LIMS.
    sample_sex_lims = sex_detect_lims(post_json_file)
    return sex_detect(sample_sex_probe, sample_sex_lims)
if __name__ == "__main__":
    # Both out_dir and sample_name are required, so argv must hold at least
    # three entries (script name plus two arguments). The previous check
    # (< 2) let a single-argument call through to an IndexError below.
    if len(sys.argv) < 3:
        print(f"usage: python3 {sys.argv[0]} out_dir sample_name")
        # Exit non-zero so a calling pipeline can detect the misuse.
        sys.exit(1)
    out_dir = sys.argv[1]
    sample_name = sys.argv[2]
    # NOTE(review): the message returned by main() is discarded here, so
    # running the script produces no output -- confirm whether it should be
    # printed.
    main(out_dir, sample_name)