# pipeline/script/hereditary/hereditary.py

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

"""
Created on: 2021-10-09
@author: cjs
# 用途：处理遗传性并发症信息
# 版本：0.0.5
# 0.0.2 2021-11-09 输出文件1，如果没有阳性结果，输出文件大小为0kb，表头也不写
# 0.0.3 2022-01-28 160和650基因信息用不同的文件
# 0.0.3 2022-01-28 日志中记录参数列表以空格合并
# 0.0.4 2022-12-06 增加pre.txt，检测到的基因原始信息，临床意义包含1、2、3
# 0.0.4 2022-12-06 1.txt修改文件名为:hereditary.txt,临床意义包含1、2、3;2.txt修改为:hereditary.risk.txt
# 0.0.4 2022-12-06 风险等级"高风险"修改为"偏高"
# 0.0.5 2023-03-06 增加624项目
# 最后编辑日期: 2023-03-06
"""

from glob import glob
from collections import defaultdict
import datetime
import traceback
import os
import sys

# Global parameters, assigned by Get_Argvs() while parsing the command line
Output_dir = r''
P_Name = r''
Project = r''
# Must start out empty: Get_Argvs() treats '' as "no valid project type given".
Panle_Txt = r''

# Runtime state of the script
Exe_Path = r''
GLog = None
StartTime = r''

# Configuration files, looked up by project type under <Exe_Path>/config
# Panel gene list per project type
Gene_Panle = {'160gene': '160_panel.hereditary.txt',
              '624gene': '624_panel.hereditary.txt',
              '650gene': '650_panel.hereditary.txt'}
# Gene -> syndrome / tumour-type table per project type
Gene_TD = {'160gene': 'gene_dd.160.txt',
           '624gene': 'gene_dd.624.txt',
           '650gene': 'gene_dd.650.txt'}
Min_Rate = 0.10  # default minimum allele frequency for germline variants

# Common transcript per gene, loaded from <Exe_Path>/ref/<Ref_Txt>
Ref_Txt = 'oncokbgene.txt'
Gene_NM = {}


def usage(df_exe):
    """Print the command-line synopsis for *df_exe* and exit with status 0."""
    synopsis = "%s -o Output_dir -p Project [--n P_Name ]" % df_exe
    print("\n".join(("Usage:", synopsis)))
    raise SystemExit(0)


def Get_Argvs():
    """Parse the command line and locate the Germline filtered.txt input.

    Recognised options (flags are matched case-insensitively):
        -o   project directory, must already exist (required)
        -p   project type, one of the keys of Gene_Panle (required)
        --n  sample name used to narrow the input file search (optional)

    Side effects: assigns the module globals Output_dir, P_Name, Project,
    Panle_Txt, Exe_Path and StartTime, and creates the 'hereditary'
    directory under the project directory when it does not exist yet.

    Every invalid or missing argument prints a message and ends the process
    through usage().

    Returns:
        Path of the first '*Germline*filtered.txt' file matched under
        <Output_dir>/mutation.
    """
    global Output_dir
    global P_Name
    global Project
    global Panle_Txt

    global Exe_Path
    global StartTime

    argvs = sys.argv
    file_real = os.path.realpath(argvs[0])
    Exe_Path = os.path.dirname(file_real)
    bin_name = os.path.basename(file_real)

    StartTime = datetime.datetime.now()

    if len(argvs) < 5:
        print('参数列表数量不对, %s' % argvs)
        usage(bin_name)

    # Option parsing: every flag takes the argument that follows it.
    opt_normal = ''
    for pos, raw in enumerate(argvs[1:], start=1):
        flag = raw.upper()
        if flag not in ('-O', '-P', '--N'):
            continue
        if pos + 1 >= len(argvs):
            # A trailing flag used to raise IndexError; report it instead.
            print('参数 %s 缺少对应的值' % raw)
            usage(bin_name)
        value = argvs[pos + 1]

        # Required options
        if flag == '-O':
            Output_dir = os.path.realpath(value)
            if not os.path.exists(Output_dir):
                print('项目路径不存在: %s' % Output_dir)
                usage(bin_name)
        elif flag == '-P':
            Project = value
        # Optional option
        else:
            opt_normal = value

    # Resolve the panel file from the parsed project type here rather than
    # inside the -p branch, so that a missing -p always yields '' and is
    # rejected below regardless of the module-level default of Panle_Txt.
    Panle_Txt = Gene_Panle.get(Project, '')

    # Validate the parsing result
    if Output_dir == '':
        print('未能解析出项目路径')
        usage(bin_name)
    elif Panle_Txt == '':
        print('-p:%s, 指定的项目类型不正确' % Project)
        usage(bin_name)

    # Locate the input file, optionally narrowed by the sample name
    mut_Path = os.path.join(Output_dir, 'mutation')
    if opt_normal == '':
        ft_txts = glob('%s/*Germline*filtered.txt' % (mut_Path))
    else:
        ft_txts = glob('%s/*%s*Germline*filtered.txt' % (
            mut_Path, opt_normal))
    if not ft_txts:
        if opt_normal == '':
            print('%s,项目路径未发现filtered.txt文件' % Output_dir)
        else:
            print('项目路径未发现名称为:%s的相关filtered.txt文件' % opt_normal)
        usage(bin_name)

    # Only one matching file is expected; the first match is used.
    ff_txt = ft_txts[0]
    txt_base = os.path.basename(ff_txt)
    print('使用的txt文件:\n%s' % txt_base)
    if opt_normal == '':
        # No explicit sample name: use the file name up to the first dot.
        P_Name = txt_base.split('.')[0]
    else:
        P_Name = opt_normal

    # Make sure the output directory exists
    hd_path = os.path.join(Output_dir, 'hereditary')
    os.makedirs(hd_path, exist_ok=True)

    return ff_txt


def Preocess_NM():
    """Fill Gene_NM with the common transcript of each gene.

    Reads the tab-separated file <Exe_Path>/ref/<Ref_Txt>. For every line
    that has more than two columns and whose first column does not start
    with '#', the first column (gene) is mapped to the third column with
    everything from the first '.' onwards (the version suffix) removed.
    When the file does not exist a message is printed and Gene_NM is left
    untouched.
    """
    global Gene_NM
    nm_file = os.path.join(Exe_Path, 'ref', Ref_Txt)
    if not os.path.isfile(nm_file):
        print('不存在ref文件:%s' % nm_file)
        return

    with open(nm_file, 'r', encoding='utf8') as handle:
        for raw in handle:
            cols = raw.strip().split('\t')
            if len(cols) <= 2 or cols[0].startswith('#'):
                continue
            # Drop the version suffix of the transcript id
            transcript = cols[2].partition('.')[0]
            Gene_NM[cols[0]] = transcript


def _load_panel_genes(panel_path):
    """Return the gene names listed in *panel_path*, in file order.

    Blank lines and lines starting with '#' are skipped.
    """
    genes = []
    with open(panel_path, 'r', encoding='utf-8') as ff:
        for line in ff:
            gene = line.strip()
            if gene and not line.startswith('#'):
                genes.append(gene)
    return genes


def _load_gene_diseases(dd_path):
    """Return {gene: second column of *dd_path* split on '|'}.

    Process_Txt reads element 0 as the hereditary syndrome text and
    element 1 as a ';'-separated list of tumour types.
    """
    gene_dd = {}
    with open(dd_path, 'r', encoding='utf-8') as ff:
        for line in ff:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            lns = line.split('\t')
            gene_dd[lns[0]] = lns[1].split('|')
    return gene_dd


def _parse_genotype(normal_info):
    """Return (zygosity label, allele frequency) from a ':'-separated field.

    A parse failure is printed and leaves the frequency at 0, so the
    variant is later rejected by the Min_Rate filter.
    """
    gene_type = ''
    mut_rate = 0
    try:
        mut_infos = normal_info.split(':')
        mut_type = mut_infos[0]
        if mut_type == '0/1':
            gene_type = '杂合'
        elif mut_type == '1/1':
            gene_type = '纯合'
        # Tissue version (frequency written with '%') and blood version
        # (plain fraction) keep the frequency in different sub-fields.
        if '%' in normal_info:
            mut_rate = float(mut_infos[5].strip('%')) / 100
        else:
            mut_rate = float(mut_infos[4].strip())
    except (IndexError, ValueError):
        print(normal_info)
        print(traceback.format_exc())
    return gene_type, mut_rate


def _pick_protein_change(aa_change, common_nm):
    """Return (protein change, found) for the common transcript *common_nm*.

    When no common transcript is known (*common_nm* is ''), the last
    ':'-separated token of *aa_change* is returned with found=False.

    NOTE(review): assumes the ANNOVAR-style layout
    'Gene:NM_id:exon:c.change:p.change[,Gene:NM_id:...]', i.e. the protein
    change sits three tokens after its NM id - confirm against the
    annotation step. The previous code always read token 4, which is only
    right when the common transcript is the first one listed.
    """
    fields = aa_change.split(':')
    if common_nm == '':
        return fields[-1], False
    for idx, field in enumerate(fields):
        if field.startswith('NM_') and field == common_nm:
            p_idx = idx + 3
            if p_idx >= len(fields):
                # Truncated annotation: report as not found, do not crash.
                break
            # The token may carry the next record's gene after a comma.
            return fields[p_idx].split(',')[0], True
    return '', False


def _write_risk_table(risk_path, pro_genes, gene_dd, mutated_genes):
    """Write one 'tumour type / risk' row per distinct tumour type.

    A tumour type is reported as '偏高' when at least one panel gene linked
    to it is in *mutated_genes*, otherwise as '同一般人群'. Rows follow the
    order in which tumour types are first met while walking *pro_genes*.
    """
    cancers_high = set()
    for gene in pro_genes:
        if gene in mutated_genes:
            cancers_high.update(gene_dd.get(gene, ['', ''])[1].split(';'))

    seen = set()
    with open(risk_path, 'w', encoding='utf-8') as ff:
        ff.write('肿瘤类型\t风险值\n')
        for gene in pro_genes:
            for cancer in gene_dd.get(gene, ['', ''])[1].split(';'):
                if cancer in seen:
                    continue
                seen.add(cancer)
                if cancer in cancers_high:
                    cancer_risk = '偏高'
                else:
                    cancer_risk = '同一般人群'
                ff.write('%s\t%s\n' % (cancer, cancer_risk))


# Process the filtered txt file
def Process_Txt(df_txt):
    """Build the hereditary report files from the filtered variant table.

    Reads the panel gene list and the gene/disease table of the current
    project from <Exe_Path>/config, then scans *df_txt* and keeps variants
    whose gene is in the panel and whose allele frequency is at least
    Min_Rate.

    Files written under <Output_dir>/hereditary:
        <P_Name>.hereditary.txt      one row per gene with a kept variant
                                     whose first column is not '3'
        <P_Name>.hereditary.pre.txt  header plus the raw lines of all kept
                                     variants (empty file when none)
        <P_Name>.risk.txt            risk level per tumour type

    Args:
        df_txt: path of the tab-separated filtered variant file; its header
            must contain an 'Otherinfo13' column, otherwise no variant is
            collected.
    """
    Preocess_NM()

    config_dir = os.path.join(Exe_Path, 'config')

    # Gene list of the current project (order kept for the risk table)
    pro_genes = _load_panel_genes(os.path.join(config_dir, Panle_Txt))
    panel_set = set(pro_genes)  # O(1) membership test per variant line

    # Gene -> syndrome / tumour types
    dd_base = Gene_TD.get(Project)
    gene_dd = _load_gene_diseases(os.path.join(config_dir, dd_base))

    # Filter 1: clinical significance 1 (pathogenic) or 2 (likely pathogenic)
    # Filter 2: allele frequency of at least Min_Rate
    muts_dd = defaultdict(list)
    with open(df_txt, 'r', encoding='utf-8') as ff:
        inlines = ff.readlines()
    head = inlines[0]
    head_infos = head.strip().split('\t')
    if "Otherinfo13" in head_infos:
        mut_index = head_infos.index("Otherinfo13")
    else:
        mut_index = -1
        print('未发现Otherinfo13列信息')
    res3 = [head]

    data_lines = inlines[1:] if mut_index > -1 else []
    for line in data_lines:
        lns = line.split('\t')
        # Rows with fewer than 100 columns are ignored
        if len(lns) < 100:
            continue
        gene_type, mut_rate = _parse_genotype(lns[mut_index])
        gene = lns[7]
        if gene in panel_set and mut_rate >= Min_Rate:
            # Every kept panel variant goes to pre.txt
            res3.append(line)
            gene_p, found = _pick_protein_change(
                lns[10], Gene_NM.get(gene, ''))
            if not found:
                print('%s, Gene未发现常见转录本' % gene)
            # Rows whose first column is '3' stay out of the report tables
            if lns[0] != '3':
                muts_dd[gene].append([gene_type, gene_p])

    # Write the result files
    hd_path = os.path.join(Output_dir, 'hereditary')
    txt1_full = os.path.join(hd_path, '%s.hereditary.txt' % P_Name)
    txt3_full = os.path.join(hd_path, '%s.hereditary.pre.txt' % P_Name)
    txt2_full = os.path.join(hd_path, '%s.risk.txt' % P_Name)

    # 1: per-gene table; several variants of one gene are joined with ';'
    with open(txt1_full, 'w', encoding='utf-8') as ff:
        ff.write('基因\t遗传性肿瘤综合征\t遗传方式\t杂合/纯合\t检测结果\n')
        for gene, gene_values in muts_dd.items():
            gene_info = gene_dd.get(gene, ['', ''])[0]
            gene_type = ';'.join(value[0] for value in gene_values)
            gene_p = ';'.join(value[1] for value in gene_values)
            ff.write('%s\t%s\t%s\t%s\t%s\n' % (
                gene, gene_info, 'AD', gene_type, gene_p))

    # 3: raw lines; left empty when only the header was collected
    with open(txt3_full, 'w', encoding='utf-8') as ff:
        if len(res3) > 1:
            ff.writelines(res3)

    # 2: risk level per tumour type
    _write_risk_table(txt2_full, pro_genes, gene_dd, muts_dd)


if __name__ == '__main__':
    # Argument errors end the process inside Get_Argvs() via usage().
    ff_txt = Get_Argvs()

    try:
        Process_Txt(ff_txt)
    except Exception:
        # Catch Exception rather than BaseException so that Ctrl-C and
        # sys.exit() are not swallowed. The failure is only reported; the
        # exit status stays as before so existing callers are unaffected.
        print(traceback.format_exc())
    endtime = datetime.datetime.now()