#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Infer a sample's sex from chrX variant calls and compare it with LIMS.

For a list of chrX positions, the script counts the variant calls whose
allele fraction lies in [0.1, 0.9], averages that count over the
positions, derives a sex label from the average and compares it with the
``gender`` value stored in the sample's ``*_post.json`` file.
"""
import json
import sys

import pandas as pd

# Default table of chrX positions; can be overridden through ``main``.
DEFAULT_POSITION_FILE = "/dataseq/jmdna/codes/public/gender.bed"

USAGE = "usage: python3 sex_detect.py out_dir sample_name"


def read_position_file(positions_filename):
    """Read the positions to inspect from a tab-separated table.

    Args:
        positions_filename: Path to a tab-separated file whose header row
            contains an ``end`` column.

    Returns:
        The values of the ``end`` column as a list of ``int``.
    """
    df = pd.read_csv(positions_filename, sep='\t')
    return [int(p) for p in df['end']]


def _parse_fraction(text):
    """Convert ``"45%"`` or ``"0.45"`` style text to a float fraction."""
    if text.endswith('%'):
        # Percentages are converted to a fraction.
        return float(text[:-1]) / 100
    return float(text)


def process_positions(sample_vcf, positions):
    """Collect per-position allele-fraction statistics on chrX.

    Args:
        sample_vcf: Path to a tab-separated VCF; lines are read with ``#``
            as the comment character and up to 11 columns.
        positions: Iterable of integer positions (matched against column 1).

    Returns:
        A dict mapping every position to ``{'count': c, 'sum': s}`` where
        ``s`` is the sum of the parsed fractions of the matching chrX rows
        and ``c`` is how many of them lie in [0.1, 0.9]. Positions without
        a matching row keep ``{'count': 0, 'sum': 0}``.
    """
    # Rows with only 10 fields get NaN in column 10 (the original comment
    # notes this happens for "624" data).
    df = pd.read_csv(sample_vcf, sep='\t', comment='#', header=None,
                     names=range(11))
    # The chromosome filter does not depend on the position, so apply it
    # once instead of re-scanning the whole table for every position.
    chr_x = df[df[0] == "chrX"]

    result_dict = {}
    for pos in positions:
        stats = {'count': 0, 'sum': 0}
        result_dict[pos] = stats
        hits = chr_x[chr_x[1] == pos]
        for sample_field, extra_field in zip(hits[9], hits[10]):
            # The fraction sits at index 4 of the ':'-separated column 9
            # when column 10 is missing, otherwise at index 5.
            # NOTE(review): presumably this is the FREQ sub-field of the
            # sample column -- confirm against the VCF FORMAT definition.
            field_index = 4 if pd.isna(extra_field) else 5
            fraction = _parse_fraction(sample_field.split(":")[field_index])
            stats['sum'] += fraction
            if 0.1 <= fraction <= 0.9:
                stats['count'] += 1
    return result_dict


def write_output(result_dict):
    """Return the mean ``count`` over all positions in ``result_dict``.

    Args:
        result_dict: Mapping as returned by ``process_positions``.

    Returns:
        Sum of the ``count`` values divided by the number of positions,
        or ``0`` when the mapping is empty.
    """
    if not result_dict:
        return 0
    total = sum(stats['count'] for stats in result_dict.values())
    return total / len(result_dict)


def sex_detect_probe(average_i):
    """Derive the sex label from the averaged count.

    Returns "男" when the average is exactly zero, otherwise "女".
    """
    return "男" if average_i == 0.0 else "女"


def sex_detect_lims(filename):
    """Read the recorded sex from a JSON file.

    Args:
        filename: Path to a UTF-8 JSON file.

    Returns:
        The value found at ``["data"][0]["gender"]``.
    """
    with open(filename, 'r', encoding="utf-8") as json_file:
        config_data = json.load(json_file)
    return config_data["data"][0]["gender"]


def sex_detect(sample_sex_probe, sample_sex_lims):
    """Compare the inferred sex with the recorded one.

    Returns:
        "性别一致" when both values are equal, otherwise a message that
        contains both values.
    """
    if sample_sex_probe == sample_sex_lims:
        return "性别一致"
    return f"性别不一致(计算结果:{sample_sex_probe},lims:{sample_sex_lims})"


def main(out_dir, sample_name, position_file=DEFAULT_POSITION_FILE):
    """Run the whole check for one sample.

    Args:
        out_dir: Directory containing ``mutation/`` and ``qc/``.
        sample_name: Sample name used as the file-name prefix.
        position_file: Table of positions to inspect; defaults to the
            path the script used before this became a parameter.

    Returns:
        The comparison message produced by ``sex_detect``.
    """
    sample_vcf = (f"{out_dir}/mutation/{sample_name}"
                  ".snp.indel.Germline.anno.hg19_multianno.vcf")
    post_json_file = f"{out_dir}/qc/{sample_name}_post.json"

    positions = read_position_file(position_file)
    result_dict = process_positions(sample_vcf, positions)
    average_i = write_output(result_dict)
    sample_sex_probe = sex_detect_probe(average_i)
    sample_sex_lims = sex_detect_lims(post_json_file)
    return sex_detect(sample_sex_probe, sample_sex_lims)


def run_from_argv(argv):
    """Command-line entry point.

    Args:
        argv: Argument vector in ``sys.argv`` layout
            (program name, out_dir, sample_name).

    Returns:
        The result of ``main``.

    Raises:
        SystemExit: With status 1 when fewer than two arguments are given.
    """
    # Both out_dir and sample_name are required, so at least three entries
    # (including the program name) must be present.
    if len(argv) < 3:
        print(USAGE)
        sys.exit(1)
    return main(argv[1], argv[2])


if __name__ == "__main__":
    # NOTE(review): the result is discarded here, exactly as before --
    # confirm whether the script is expected to print it.
    run_from_argv(sys.argv)