97 lines
3.3 KiB
Python
97 lines
3.3 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
#-*-coding:utf-8-*-
|
|||
|
|
import sys
|
|||
|
|
import json
|
|||
|
|
import pandas as pd
|
|||
|
|
|
|||
|
|
def read_position_file(positions_filename):
    """Load the site positions listed in a tab-separated file.

    Args:
        positions_filename: Path to a tab-separated file whose header row
            contains an ``end`` column.

    Returns:
        list: Every value of the ``end`` column converted to ``int``, in
        file order.
    """
    table = pd.read_csv(positions_filename, sep='\t')
    return [int(end_value) for end_value in table['end']]
|
|||
|
|
|
|||
|
|
def _parse_fraction(text):
    """Convert a value such as ``"45%"`` or ``"0.45"`` to a float fraction."""
    if text.endswith('%'):
        # Percentages are converted to a 0-1 fraction.
        return float(text[:-1]) / 100
    return float(text)


def process_positions(sample_vcf, positions):
    """Gather per-position statistics for chrX rows of a VCF-like file.

    The file is read as a headerless, tab-separated table with 11 columns
    (named 0-10); lines starting with ``#`` are skipped. For every requested
    position, rows whose column 0 equals ``"chrX"`` and whose column 1 equals
    the position are examined. One colon-separated field of column 9 is
    parsed as a fraction (presumably the variant allele frequency - confirm
    against the VCF FORMAT definition).

    Args:
        sample_vcf: Path to the sample's VCF file.
        positions: Iterable of integer positions to look up.

    Returns:
        dict: Maps each position to ``{'count': int, 'sum': number}`` where
        ``sum`` is the total of the parsed fractions and ``count`` is how
        many of them fall within ``[0.1, 0.9]``. Positions with no matching
        row keep ``{'count': 0, 'sum': 0}``.
    """
    # names=range(11) pads shorter rows, so column 10 is NaN when a row has
    # only 10 fields (the original note says this happens for "624").
    table = pd.read_csv(sample_vcf, sep='\t', comment='#', header=None, names=range(11))

    # The chromosome filter does not depend on the position, so apply it once
    # instead of rebuilding the mask over the full table for every position.
    chrx_rows = table[table[0] == "chrX"]

    result_dict = {}
    for position in positions:
        stats = {'count': 0, 'sum': 0}
        result_dict[position] = stats

        hits = chrx_rows[chrx_rows[1] == position]
        for sample_field, last_field in zip(hits[9], hits[10]):
            # When column 10 is missing the wanted value sits one field
            # earlier in column 9 (index 4 instead of 5).
            field_index = 4 if pd.isna(last_field) else 5
            fraction = _parse_fraction(sample_field.split(":")[field_index])

            stats['sum'] += fraction
            if 0.1 <= fraction <= 0.9:
                stats['count'] += 1

    return result_dict
|
|||
|
|
|
|||
|
|
def write_output(result_dict):
    """Return the mean of the per-position ``count`` values.

    Despite its name, this function writes nothing; it only computes the
    average.

    Args:
        result_dict: Mapping whose values are dicts holding a ``count`` entry.

    Returns:
        The sum of all ``count`` values divided by the number of entries, or
        ``0`` when ``result_dict`` is empty.
    """
    counts = [stats['count'] for stats in result_dict.values()]
    if not counts:
        return 0
    return sum(counts) / len(counts)
|
|||
|
|
|
|||
|
|
def sex_detect_probe(average_i):
    """Infer the sample sex label from the averaged count.

    Args:
        average_i: Average value computed over the probed positions.

    Returns:
        str: ``"男"`` (male) when ``average_i`` equals ``0.0``; otherwise
        ``"女"`` (female).
    """
    return "男" if average_i == 0.0 else "女"
|
|||
|
|
|
|||
|
|
def sex_detect_lims(filename):
    """Read the sex recorded in LIMS from a JSON file.

    Args:
        filename: Path to a UTF-8 encoded JSON file.

    Returns:
        The ``gender`` value of the first element of the top-level ``data``
        list.
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        payload = json.load(handle)
    first_record = payload["data"][0]
    return first_record["gender"]
|
|||
|
|
|
|||
|
|
def sex_detect(sample_sex_probe, sample_sex_lims):
    """Compare the inferred sex with the sex recorded in LIMS.

    Args:
        sample_sex_probe: Sex label inferred from the sample data.
        sample_sex_lims: Sex label read from LIMS.

    Returns:
        str: ``"性别一致"`` (sex matches) when both labels are equal;
        otherwise a mismatch message that embeds both labels.
    """
    if sample_sex_probe == sample_sex_lims:
        return "性别一致"
    return f"性别不一致(计算结果:{sample_sex_probe},lims:{sample_sex_lims})"
|
|||
|
|
|
|||
|
|
def main(out_dir, sample_name, position_file="/dataseq/jmdna/codes/public/gender.bed"):
    """Run the sex-consistency check for one sample.

    Reads the positions file, gathers statistics from the sample's VCF,
    infers the sex from them, reads the LIMS sex from the sample's post-QC
    JSON and compares the two.

    Args:
        out_dir: Output directory of the sample; the VCF is read from
            ``<out_dir>/mutation/`` and the JSON from ``<out_dir>/qc/``.
        sample_name: Sample name used as the file-name prefix.
        position_file: Path to the tab-separated positions file. Defaults to
            the previously hard-coded location, so existing callers are
            unaffected.

    Returns:
        str: The comparison message produced by ``sex_detect``.
    """
    sample_vcf = f"{out_dir}/mutation/{sample_name}.snp.indel.Germline.anno.hg19_multianno.vcf"
    post_json_file = f"{out_dir}/qc/{sample_name}_post.json"

    # Infer the sex from the sample data.
    positions = read_position_file(position_file)
    result_dict = process_positions(sample_vcf, positions)
    average_i = write_output(result_dict)
    sample_sex_probe = sex_detect_probe(average_i)

    # Compare it with the sex recorded in LIMS.
    sample_sex_lims = sex_detect_lims(post_json_file)
    return sex_detect(sample_sex_probe, sample_sex_lims)
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Both out_dir and sample_name are required, so argv must hold at least
    # three entries (script name plus two arguments). The previous "< 2"
    # check let a single-argument call through to an IndexError below.
    if len(sys.argv) < 3:
        print("usage: python3 sex_detect.py out_dir sample_name")
        sys.exit()

    out_dir = sys.argv[1]
    sample_name = sys.argv[2]

    # NOTE(review): the message returned by main() is neither printed nor
    # stored here - confirm whether that is intended.
    main(out_dir, sample_name)
|