pipeline/script/gender_detect.py

97 lines
3.3 KiB
Python
Raw Normal View History

2023-08-25 10:06:31 +08:00
#!/usr/bin/env python3
#-*-coding:utf-8-*-
import sys
import json
import pandas as pd
def read_position_file(positions_filename):
    """Load the site positions listed in a tab-separated file.

    Args:
        positions_filename: Path (or file-like object) handed to
            ``pandas.read_csv``; the table must have an ``end`` column.

    Returns:
        list[int]: The values of the ``end`` column, in file order, each
        converted with ``int()``.
    """
    table = pd.read_csv(positions_filename, sep='\t')
    return [int(end_value) for end_value in table['end']]
def process_positions(sample_vcf, positions):
    """Collect per-position statistics for the chrX records of a VCF.

    The VCF is loaded as a headerless, tab-separated table with 11 columns
    labelled 0-10 (``#`` starts a comment). For each requested position, the
    rows whose column 0 is ``"chrX"`` and whose column 1 equals the position
    are inspected. The value is read from the colon-separated column 9: field
    index 4 when column 10 is missing (NaN), field index 5 otherwise. A value
    ending in ``%`` is treated as a percentage and converted to a fraction.

    Args:
        sample_vcf: Path (or file-like object) passed to ``pandas.read_csv``.
        positions: Iterable of positions to look up in column 1.

    Returns:
        dict: Maps every position to ``{'count': n, 'sum': s}`` where ``s`` is
        the total of the parsed values and ``n`` is how many of them fall in
        the closed interval [0.1, 0.9]. Positions without a matching row keep
        ``{'count': 0, 'sum': 0}``.
    """
    # Naming 11 columns lets rows with fewer fields load with NaN in the
    # trailing column (the original note says NaN shows up for "624";
    # presumably a panel/product id -- confirm).
    df = pd.read_csv(sample_vcf, sep='\t', comment='#', header=None, names=range(11))

    # The chromosome test is identical for every position, so apply it once
    # instead of rescanning the full table on each loop iteration.
    chrx_df = df[df[0] == "chrX"]

    result_dict = {}
    for j in positions:
        stats = {'count': 0, 'sum': 0}
        result_dict[j] = stats
        matching = chrx_df[chrx_df[1] == j]
        for sample_field, trailing_field in zip(matching[9], matching[10]):
            # The wanted field sits one slot later when column 10 is populated.
            field_index = 4 if pd.isna(trailing_field) else 5
            genotype_str = sample_field.split(":")[field_index]
            if genotype_str.endswith('%'):
                # Convert a percentage such as "50%" to the fraction 0.5.
                genotype = float(genotype_str[:-1]) / 100
            else:
                genotype = float(genotype_str)
            stats['sum'] += genotype
            if 0.1 <= genotype <= 0.9:
                stats['count'] += 1
    return result_dict
def write_output(result_dict):
    """Return the mean of the per-position ``'count'`` values.

    Despite its name, this function writes nothing; it only computes a value.

    Args:
        result_dict: Mapping of position to a dict holding a ``'count'`` entry.

    Returns:
        The total of all ``'count'`` values divided by the number of
        positions, or ``0`` when ``result_dict`` is empty.
    """
    n_positions = len(result_dict)
    if n_positions == 0:
        return 0
    total_count = sum(entry['count'] for entry in result_dict.values())
    return total_count / n_positions
def sex_detect_probe(average_i):
    """Derive the sex label from the averaged per-position count.

    Args:
        average_i: Average value to classify; it is compared against ``0.0``.

    Returns:
        str: The label for the sample. As written, this is the empty string
        for every input.
    """
    # NOTE(review): both outcomes of the comparison yield the same empty
    # string, so the result never depends on average_i. The labels were
    # presumably meant to differ (and to match the LIMS "gender" values) --
    # confirm the intended strings with the pipeline owner.
    is_zero_average = average_i == 0.0
    return "" if is_zero_average else ""
def sex_detect_lims(filename):
    """Read the sex recorded in LIMS from a JSON file.

    Args:
        filename: Path of a UTF-8 encoded JSON file.

    Returns:
        The value stored at ``["data"][0]["gender"]`` of the parsed document.

    Raises:
        KeyError, IndexError: If the document lacks that entry (the lookup is
            not guarded).
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        payload = json.load(handle)
    first_record = payload["data"][0]
    return first_record["gender"]
def sex_detect(sample_sex_probe, sample_sex_lims):
    """Compare the sex inferred from the sample with the one stored in LIMS.

    Args:
        sample_sex_probe: Sex label computed from the sample.
        sample_sex_lims: Sex label read from LIMS.

    Returns:
        str: A fixed "consistent" message when both labels are equal,
        otherwise a "mismatch" message that embeds both labels.
    """
    if sample_sex_probe != sample_sex_lims:
        return f"性别不一致(计算结果:{sample_sex_probe},lims:{sample_sex_lims})"
    return "性别一致"
def main(out_dir, sample_name, position_file="/dataseq/jmdna/codes/public/gender.bed"):
    """Run the sex check for one sample and report whether it agrees with LIMS.

    Args:
        out_dir: Output directory of the sample; the VCF is read from
            ``<out_dir>/mutation/`` and the LIMS JSON from ``<out_dir>/qc/``.
        sample_name: Sample name used as the prefix of both input files.
        position_file: Tab-separated file listing the positions to inspect.
            Defaults to the previously hard-coded location, so existing
            two-argument callers behave exactly as before.

    Returns:
        str: The comparison message produced by ``sex_detect``.
    """
    sample_vcf = out_dir + "/mutation/" + sample_name + ".snp.indel.Germline.anno.hg19_multianno.vcf"
    post_json_file = out_dir + "/qc/" + sample_name + "_post.json"

    # Sex inferred from the sample's own variant calls.
    positions = read_position_file(position_file)
    result_dict = process_positions(sample_vcf, positions)
    average_i = write_output(result_dict)
    sample_sex_probe = sex_detect_probe(average_i)

    # Sex recorded in LIMS, then the comparison of the two.
    sample_sex_lims = sex_detect_lims(post_json_file)
    return sex_detect(sample_sex_probe, sample_sex_lims)
if __name__ == "__main__":
    # Two positional arguments are required, so argv must hold at least three
    # entries (script name + out_dir + sample_name). The earlier "< 2" check
    # let a single-argument call fall through to an IndexError.
    if len(sys.argv) < 3:
        print("usage: python3 sex_detect.py out_dir sample_name")
        # Exit non-zero so a calling pipeline can detect the usage error.
        sys.exit(1)
    out_dir = sys.argv[1]
    sample_name = sys.argv[2]
    # NOTE(review): the message returned by main() is discarded here, so the
    # script itself prints nothing -- confirm whether it should be emitted.
    main(out_dir, sample_name)