pipeline/script/gender_detect.py

#!/usr/bin/env python3
#-*-coding:utf-8-*-
import sys
import json
import pandas as pd

def read_position_file(positions_filename):
    """Load the site positions from a tab-separated file.

    Args:
        positions_filename: Path to a tab-separated table that has an
            ``end`` column.

    Returns:
        A list with every value of the ``end`` column converted to ``int``,
        in file order.
    """
    table = pd.read_csv(positions_filename, sep='\t')
    return [int(end_value) for end_value in table['end']]

def process_positions(sample_vcf, positions):
    """Gather per-position statistics for chrX records of a sample VCF.

    Args:
        sample_vcf: Path to a tab-separated VCF file. Anything after a '#'
            is treated as a comment by the reader, so header lines are
            skipped.
        positions: Iterable of integer positions, matched against column 1
            of the VCF.

    Returns:
        A dict mapping every requested position to
        ``{'count': ..., 'sum': ...}``. ``sum`` is the total of the parsed
        values over all matching chrX rows; ``count`` is how many of those
        values lie in the closed interval [0.1, 0.9] (used downstream as
        the heterozygous-call count).
    """
    result_dict = {}
    # Always read 11 columns so that rows with only 10 fields get NaN in
    # column 10 (the original note says this happens for "624" data --
    # presumably a panel/product id; confirm with the pipeline owners).
    df = pd.read_csv(sample_vcf, sep='\t', comment='#', header=None, names=range(11))
    # The chromosome filter does not depend on the position, so apply it
    # once instead of rescanning the whole VCF for every position.
    chrx_rows = df[df[0] == "chrX"]
    for position in positions:
        stats = {'count': 0, 'sum': 0}
        result_dict[position] = stats
        hits = chrx_rows[chrx_rows[1] == position]
        for sample_field, extra_field in zip(hits[9], hits[10]):
            # The value is the 5th ':'-separated item of column 9 when
            # column 10 is empty, otherwise the 6th item.
            value_index = 4 if pd.isna(extra_field) else 5
            raw_value = sample_field.split(":")[value_index]
            if raw_value.endswith('%'):
                # Percentages are converted to a fraction.
                fraction = float(raw_value[:-1]) / 100
            else:
                fraction = float(raw_value)

            stats['sum'] += fraction
            if 0.1 <= fraction <= 0.9:
                stats['count'] += 1

    return result_dict

def write_output(result_dict):
    """Return the mean of the per-position ``count`` values.

    Args:
        result_dict: Mapping of position -> dict holding a ``count`` entry.

    Returns:
        The sum of all ``count`` values divided by the number of positions,
        or ``0`` when the mapping is empty.
    """
    per_position_counts = [entry['count'] for entry in result_dict.values()]
    if not per_position_counts:
        return 0
    return sum(per_position_counts) / len(per_position_counts)

def sex_detect_probe(average_i):
    """Infer the sample sex from the averaged count.

    Args:
        average_i: Average value produced from the per-position counts.

    Returns:
        "男" (male) when the average is exactly zero, otherwise "女" (female).
    """
    return "男" if average_i == 0.0 else "女"

def sex_detect_lims(filename):
    """Read the sex recorded in LIMS from a JSON file.

    Args:
        filename: Path to a UTF-8 JSON file whose top-level ``data`` list
            has a first element carrying a ``gender`` key.

    Returns:
        The value stored under ``data[0]["gender"]``.
    """
    with open(filename, encoding="utf-8") as handle:
        payload = json.load(handle)
    first_record = payload["data"][0]
    return first_record["gender"]

def sex_detect(sample_sex_probe, sample_sex_lims):
    """Compare the inferred sex with the one recorded in LIMS.

    Args:
        sample_sex_probe: Sex inferred from the sample data.
        sample_sex_lims: Sex recorded in LIMS.

    Returns:
        "性别一致" when both values are equal; otherwise a mismatch message
        that embeds both values.
    """
    if sample_sex_probe != sample_sex_lims:
        probe_part = f"性别不一致(计算结果:{sample_sex_probe}"
        lims_part = f"lims:{sample_sex_lims})"
        return ",".join((probe_part, lims_part))
    return "性别一致"

def main(out_dir, sample_name, position_file="/dataseq/jmdna/codes/public/gender.bed"):
    """Run the sex check for one sample and return the comparison message.

    Args:
        out_dir: Sample output directory; the VCF is read from
            ``<out_dir>/mutation/`` and the LIMS JSON from ``<out_dir>/qc/``.
        sample_name: Sample name used as the file-name prefix.
        position_file: Tab-separated file listing the sites to inspect.
            Defaults to the path the pipeline has always used, so existing
            callers are unaffected.

    Returns:
        The message produced by ``sex_detect`` (consistent / inconsistent).
    """
    sample_vcf = out_dir + "/mutation/" + sample_name + ".snp.indel.Germline.anno.hg19_multianno.vcf"
    post_json_file = out_dir + "/qc/" + sample_name + "_post.json"

    # Infer the sex from the sample's calls at the listed sites.
    positions = read_position_file(position_file)
    result_dict = process_positions(sample_vcf, positions)
    average_i = write_output(result_dict)
    sample_sex_probe = sex_detect_probe(average_i)

    # Compare with what LIMS recorded for the sample.
    sample_sex_lims = sex_detect_lims(post_json_file)
    return sex_detect(sample_sex_probe, sample_sex_lims)

if __name__ == "__main__":
    # Two positional arguments are required, so argv must hold at least
    # three entries (script name + out_dir + sample_name). The previous
    # check (< 2) let a single-argument call through to an IndexError.
    if len(sys.argv) < 3:
        print("usage: python3 sex_detect.py out_dir sample_name")
        # Exit non-zero so callers can detect the usage error.
        sys.exit(1)

    out_dir = sys.argv[1]
    sample_name = sys.argv[2]

    # NOTE(review): the message returned by main() is discarded here --
    # confirm whether it should be printed or written somewhere.
    main(out_dir, sample_name)