97 lines
3.3 KiB
Python
Executable File
97 lines
3.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
#-*-coding:utf-8-*-
|
||
import sys
|
||
import json
|
||
import pandas as pd
|
||
|
||
def read_position_file(positions_filename):
    """Load the site coordinates listed in a tab-separated position file.

    The file is parsed with pandas using a tab separator and its first line
    as the header; only the ``end`` column is used.

    Args:
        positions_filename: Path to the tab-separated file; it must contain
            a column named ``end``.

    Returns:
        A list with every value of the ``end`` column converted to ``int``,
        in file order.
    """
    table = pd.read_csv(positions_filename, sep='\t')
    return list(map(int, table['end']))
|
||
|
||
def process_positions(sample_vcf, positions):
    """Gather per-position statistics for chrX records of a sample VCF.

    Used to estimate the sample's heterozygosity rate. For every requested
    position, all VCF records on ``chrX`` at that coordinate are inspected
    and a numeric value is read from the colon-separated field in column 9.

    Args:
        sample_vcf: Path to the tab-separated VCF file; lines are parsed
            with ``#`` as the comment character and without a header row.
        positions: Iterable of coordinates compared against column 1.

    Returns:
        A dict mapping each position to ``{'count': int, 'sum': number}``.
        ``sum`` is the total of the parsed values and ``count`` is how many
        of them fall within the closed interval [0.1, 0.9].
    """

    def _as_fraction(text):
        # A trailing '%' marks a percentage, which is converted to a decimal.
        if text.endswith('%'):
            return float(text[:-1]) / 100
        return float(text)

    # Eleven column labels are forced so that records carrying only ten
    # fields get NaN in column 10 (the original note says this happens for
    # "624" data).
    table = pd.read_csv(sample_vcf, sep='\t', comment='#', header=None, names=range(11))
    on_chrx = table[0] == "chrX"

    stats = {}
    for pos in positions:
        entry = {'count': 0, 'sum': 0}
        stats[pos] = entry

        # Keep only chrX records located exactly at this coordinate.
        hits = table[on_chrx & (table[1] == pos)]

        for sample_field, last_field in zip(hits[9], hits[10]):
            parts = sample_field.split(":")
            # With column 10 missing the value sits at index 4 of the
            # colon-separated field, otherwise at index 5.
            value = _as_fraction(parts[4] if pd.isna(last_field) else parts[5])

            entry['sum'] += value
            if 0.1 <= value <= 0.9:
                entry['count'] += 1

    return stats
|
||
|
||
def write_output(result_dict):
    """Compute the average ``count`` value over all positions.

    Despite its name, this function writes nothing; it only returns the mean.

    Args:
        result_dict: Mapping of position to a dict holding a ``'count'`` key.

    Returns:
        The sum of all ``count`` values divided by the number of positions,
        or ``0`` when ``result_dict`` is empty.
    """
    n_positions = len(result_dict)
    if n_positions == 0:
        return 0
    total = sum(entry['count'] for entry in result_dict.values())
    return total / n_positions
|
||
|
||
def sex_detect_probe(average_i):
    """Infer the sample's sex from the averaged heterozygous-site count.

    Args:
        average_i: Average value produced from the per-position counts.

    Returns:
        ``"男"`` (male) when the average equals zero, ``"女"`` (female)
        otherwise.
    """
    return "男" if average_i == 0.0 else "女"
|
||
|
||
def sex_detect_lims(filename):
    """Read the sex recorded in LIMS from a JSON file.

    Args:
        filename: Path to a UTF-8 encoded JSON file whose top-level object
            has a ``"data"`` list; the first element must hold ``"gender"``.

    Returns:
        The ``"gender"`` value of the first entry in ``"data"``.
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        payload = json.load(handle)
    first_record = payload["data"][0]
    return first_record["gender"]
|
||
|
||
def sex_detect(sample_sex_probe, sample_sex_lims):
    """Compare the inferred sex with the sex recorded in LIMS.

    Args:
        sample_sex_probe: Sex inferred from the sample data.
        sample_sex_lims: Sex read from LIMS.

    Returns:
        A fixed "consistent" message when both values are equal; otherwise
        a "mismatch" message that embeds both values.
    """
    if sample_sex_probe == sample_sex_lims:
        return "性别一致"
    return f"性别不一致(计算结果:{sample_sex_probe},lims:{sample_sex_lims})"
|
||
|
||
def main(out_dir, sample_name):
    """Run the sex consistency check for one sample.

    The sex inferred from the sample's VCF is compared with the sex stored
    in the sample's ``_post.json`` file.

    Args:
        out_dir: Output directory containing the ``mutation`` and ``qc``
            sub-directories.
        sample_name: Sample name used as the file-name prefix.

    Returns:
        The message produced by ``sex_detect``.
    """
    # Sex inferred from the VCF at the sites listed in the shared bed file.
    vcf_path = "".join(
        [out_dir, "/mutation/", sample_name, ".snp.indel.Germline.anno.hg19_multianno.vcf"]
    )
    site_list = read_position_file("/dataseq/jmdna/codes/public/gender.bed")
    per_site_stats = process_positions(vcf_path, site_list)
    inferred_sex = sex_detect_probe(write_output(per_site_stats))

    # Sex recorded in LIMS, taken from the sample's QC JSON.
    lims_json_path = "".join([out_dir, "/qc/", sample_name, "_post.json"])
    recorded_sex = sex_detect_lims(lims_json_path)

    return sex_detect(inferred_sex, recorded_sex)
|
||
|
||
if __name__ == "__main__":
    # Two positional arguments are required. sys.argv[0] is the script name,
    # so fewer than three entries means out_dir or sample_name is missing;
    # the previous "< 2" check let a single-argument call fall through to an
    # IndexError on sys.argv[2].
    if len(sys.argv) < 3:
        print("usage: python3 sex_detect.py out_dir sample_name")
        # Exit with a non-zero status so callers can detect the usage error.
        sys.exit(1)

    out_dir = sys.argv[1]
    sample_name = sys.argv[2]

    # NOTE(review): the message returned by main() is discarded here and is
    # neither printed nor stored; confirm whether that is intended.
    main(out_dir, sample_name)
|