# pipeline/script/hereditary/Check_Gene.py

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

"""
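Created on: 2021-09-30
@author: cjs
Purpose: process hereditary (tumour) syndrome information and generate the
gene information files required by the hereditary.py script.
- panel.hereditary.txt: genes of the panel that have syndrome information.
- gene_dd.txt: genes explained by a syndrome. Some of these genes are not in
  the panel; they only appear in the syndrome explanations.
Version: 0.0.2
Changelog:
  2022-02-10 0.0.2 The project template file no longer takes gene aliases as
  input, so the alias check was dropped as well.
Last edited: 2022-02-10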
"""

from cjs_test.cjs_logger import Logger
from openpyxl import load_workbook
from collections import defaultdict
import pandas as pd
import datetime
import traceback
import os
import sys

# Global parameters. Both are reassigned in the ``__main__`` block:
# GLog becomes the Logger instance, Exe_Path the directory of this script.
GLog = None
Exe_Path = ''


# Input files, both read from <Exe_Path>/ref.
# XLSX_FILE: workbook with the syndrome / gene / tumour table.
XLSX_FILE = '160panel遗传性肿瘤综合征.庞杰.2022.02.10修.xlsx'
# XLSX_FILE = '650panel遗传性肿瘤综合征.庞杰.2022.02.10修.xlsx'
# TXT_IN: panel gene list, one gene per line.
TXT_IN = '160_panel_genes.txt'
# TXT_IN = '650_panel_genes.txt'

# Output files, both written to <Exe_Path>/config.
DD_OUT = 'gene_dd.160.txt'
TXT_OUT = '160_panel.hereditary.txt'


# Script state shared between the functions below.
Gene_Dd = defaultdict(list)  # gene -> list of [syndrome, tumours] pairs
# HGNC gene symbols. A name found in this list is used as a gene directly;
# any other name is treated as an alias of some other gene.
Genes_Ls = []
# Alias / previous symbol -> HGNC gene symbol
Gas_Dd = {}


# Build the HGNC symbol list and the alias lookup table
def Creat_Gas():
    """Load the HGNC table and fill ``Genes_Ls`` and ``Gas_Dd``.

    Reads ``<Exe_Path>/ref/001/HGNC/gene_with_protein_product2.txt``, a
    tab-separated table. The first non-blank line is the header and must
    contain the ``symbol``, ``alias_symbol`` and ``prev_symbol`` columns.
    Later lines starting with ``#`` are ignored.

    * Every ``symbol`` value is appended once to ``Genes_Ls``.
    * Every ``|``-separated entry of ``alias_symbol`` and ``prev_symbol``
      that is not itself a ``symbol`` anywhere in the table is stored in
      ``Gas_Dd`` as ``alias -> symbol``. Rows are handled in file order,
      aliases before previous symbols, and the first mapping wins.

    Raises:
        ValueError: if the file is empty or a required column is missing.
    """
    global Gas_Dd
    global Genes_Ls
    hgnc_base = 'gene_with_protein_product2.txt'
    hgnc_full = os.path.join(Exe_Path, 'ref', '001', 'HGNC', hgnc_base)

    # Single pass over the file: keep the header apart from the records.
    header = None
    rows = []
    with open(hgnc_full, 'r', encoding='utf8') as ff:
        for line in ff:
            line = line.rstrip('\r\n')
            if not line:
                continue
            if header is None:
                # The header must never be handled as a gene record,
                # otherwise the column titles end up in the gene tables.
                header = [col.strip().strip('"') for col in line.split('\t')]
                continue
            if not line.startswith('#'):
                rows.append(line.split('\t'))

    if header is None:
        raise ValueError('%s is empty' % hgnc_full)
    required = ('symbol', 'alias_symbol', 'prev_symbol')
    missing = [col for col in required if col not in header]
    if missing:
        raise ValueError(
            '%s: missing column(s) %s' % (hgnc_full, ', '.join(missing)))
    name_index, alias_index, prev_index = (
        header.index(col) for col in required)

    # Rows without a symbol cannot contribute a gene or an alias target.
    rows = [lns for lns in rows if len(lns) > name_index and lns[name_index]]

    # Sets give O(1) membership tests; the alias check needs every symbol
    # of the table, so the set is built before the aliases are visited.
    gene_names = {lns[name_index] for lns in rows}
    known = set(Genes_Ls)

    for lns in rows:
        gene_name = lns[name_index]
        if gene_name not in known:
            known.add(gene_name)
            Genes_Ls.append(gene_name)

        # alias_symbol first, then prev_symbol
        for index in (alias_index, prev_index):
            if index >= len(lns):
                # short row: the column is simply absent
                continue
            for gene_symbol in lns[index].split('|'):
                # multi-valued fields may be wrapped in double quotes
                gene_symbol = gene_symbol.replace('"', '')
                if gene_symbol and gene_symbol not in gene_names:
                    Gas_Dd.setdefault(gene_symbol, gene_name)


def _cell_text(cell):
    """Return the cell value as stripped text; an empty cell gives ''."""
    value = cell.value
    if value is None:
        return ''
    return str(value).strip()


# Collect the syndrome / gene / tumour relationships from the workbook
def Get_Gene_Dd(df_xlsx):
    """Fill ``Gene_Dd`` from the first sheet of the workbook ``df_xlsx``.

    The first used row of the sheet is the header. Three columns are located
    by substring match on their titles: syndrome name, causative genes and
    related tumours. If one of them is missing a message is printed for it
    and no data row is read. The inheritance mode is not read here.

    For every data row the gene cell is split on whitespace, ``,``, ``、``
    and ``，``. A gene that is not in ``Genes_Ls`` is reported and replaced
    through ``Gas_Dd`` when an alias mapping exists. ``[syndrome, tumours]``
    is then appended to ``Gene_Dd[gene]``. Empty cells are read as ``''``.

    Args:
        df_xlsx: path of the .xlsx file.
    """
    global Gene_Dd
    # Required header titles
    syndrome_head = '遗传性肿瘤综合征中文名'
    gene_head = '致病基因'
    cancer_head = '相关肿瘤'
    key_list = [syndrome_head, gene_head, cancer_head]
    key_index = {d_key: -1 for d_key in key_list}

    # data_only=True: read the stored values of formula cells
    wb = load_workbook(df_xlsx, data_only=True)
    try:
        ws = wb[wb.sheetnames[0]]
        # openpyxl bounds are 1-based; the row tuples below are 0-based.
        col_max = ws.max_column  # last used column
        col_min = ws.min_column  # first used column
        row_max = ws.max_row  # last used row
        row_min = ws.min_row  # first used row

        sheet_rows = list(ws.rows)

        # Locate the header columns. An empty sheet yields no rows at all,
        # which is reported like any other missing header.
        header_cells = ()
        if len(sheet_rows) >= row_min:
            header_cells = sheet_rows[row_min - 1][col_min - 1:col_max]
        for cell_index, cell in enumerate(header_cells, col_min - 1):
            cell_str = str(cell.value)
            for d_key in key_list:
                if d_key in cell_str:
                    key_index[d_key] = cell_index
                    break

        check_pos = True
        for d_key in key_list:
            if key_index[d_key] == -1:
                check_pos = False
                print('%s, 表头中未发现' % d_key)

        if check_pos:
            hgnc_symbols = set(Genes_Ls)
            for sheet_row in sheet_rows[row_min:row_max]:
                syndrome = _cell_text(sheet_row[key_index[syndrome_head]])
                cancers = _cell_text(sheet_row[key_index[cancer_head]])
                genes = _cell_text(sheet_row[key_index[gene_head]])

                # Turn every supported separator into whitespace;
                # split() then also swallows repeated separators.
                for sep in ('、', '，', ','):
                    genes = genes.replace(sep, ' ')
                for gene in genes.split():
                    if gene not in hgnc_symbols:
                        print(gene, '基因不是HGNC标准名称')
                        gene = Gas_Dd.get(gene, gene)
                    Gene_Dd[gene].append([syndrome, cancers])
    finally:
        wb.close()

    # Debug output. .get() is used because indexing a defaultdict would
    # create an empty BRCA1 entry when the workbook does not list the gene.
    print('BRCA1', Gene_Dd.get('BRCA1', []))


def _unique_terms(texts):
    """Split texts into terms and return them de-duplicated, in first-seen order.

    ``、`` and ``，`` count as ``,``; ``。`` is removed; surrounding
    whitespace and empty terms are dropped.
    """
    terms = []
    for text in texts:
        text = text.replace('、', ',').replace('，', ',').replace('。', '')
        for term in text.split(','):
            # strip so that "A, B" and "A,B" give the same terms
            term = term.strip()
            if term and term not in terms:
                terms.append(term)
    return terms


def Check_Gene_Dd():
    """Write the per-gene syndrome summary to ``<Exe_Path>/config/<DD_OUT>``.

    For every gene in ``Gene_Dd`` the syndrome names and the tumour names of
    all its ``[syndrome, tumours]`` pairs are split into single terms and
    de-duplicated. One line per gene is written::

        <gene>\\t<syndromes joined by ';'>|<tumours joined by ';'>|AD

    The inheritance mode is always ``AD``. The ``config`` directory is
    created when it does not exist.
    """
    txt_dd = {}
    for gene, her_syns in Gene_Dd.items():
        syndromes = _unique_terms(pair[0] for pair in her_syns)
        cancers = _unique_terms(pair[1] for pair in her_syns)
        # inheritance mode defaults to AD
        txt_dd[gene] = [';'.join(syndromes), ';'.join(cancers), 'AD']

    # Write the table to the output file
    out_path = os.path.join(Exe_Path, 'config')
    os.makedirs(out_path, exist_ok=True)
    txt_full = os.path.join(out_path, DD_OUT)
    with open(txt_full, 'w', encoding='utf8') as ff:
        for gene, fields in txt_dd.items():
            ff.write('%s\t%s\n' % (gene, '|'.join(fields)))


def Check_txt(pro_gene_txt):
    """Write the panel genes that are present in ``Gene_Dd``.

    ``pro_gene_txt`` is read line by line: lines starting with ``#`` and
    blank lines are skipped, every other line is stripped and kept once, in
    file order. The genes that are keys of ``Gene_Dd`` are then written, one
    per line and after a ``#`` title line, to ``<Exe_Path>/config/<TXT_OUT>``.
    The ``config`` directory is created when it does not exist.

    Args:
        pro_gene_txt: path of the panel gene list.
    """
    panel_genes = []
    with open(pro_gene_txt, 'r', encoding='utf8') as handle:
        for raw in handle:
            if raw.startswith('#'):
                continue
            name = raw.strip()
            if name and name not in panel_genes:
                panel_genes.append(name)

    config_dir = os.path.join(Exe_Path, 'config')
    if not os.path.exists(config_dir):
        os.makedirs(config_dir)
    target = os.path.join(config_dir, TXT_OUT)
    with open(target, 'w', encoding='utf8') as handle:
        handle.write('#该panel在遗传综合征数据库匹配到的基因\n')
        handle.writelines(
            name + '\n' for name in panel_genes if name in Gene_Dd)


def Check_Pros():
    """Run ``Check_txt`` on the panel gene list ``<Exe_Path>/ref/<TXT_IN>``."""
    Check_txt(os.path.join(Exe_Path, 'ref', TXT_IN))


if __name__ == '__main__':
    # Script entry point: set up the per-run log file, run the four
    # processing steps, then log the elapsed time.

    # Every path is resolved relative to the real location of this script.
    file_real = os.path.realpath(sys.argv[0])
    Exe_Path = os.path.dirname(file_real)
    bin_name = os.path.basename(file_real)
    log_path = os.path.join(Exe_Path, 'logs', bin_name)
    os.makedirs(log_path, exist_ok=True)
    starttime = datetime.datetime.now()
    ymd = starttime.strftime('%Y%m%d_%H%M%S')
    log_base = '%s_%s.log' % (bin_name, ymd)
    log_full = os.path.join(log_path, log_base)
    GLog = Logger(log_full, mode='w')
    GLog.info('start')

    exit_code = 0
    try:
        # The work has to run inside the try block, otherwise a failure is
        # neither written to the log nor followed by GLog.close().
        Creat_Gas()

        xlsx_ff = os.path.join(Exe_Path, 'ref', XLSX_FILE)
        Get_Gene_Dd(xlsx_ff)
        Check_Gene_Dd()
        Check_Pros()
    except Exception:
        GLog.error(traceback.format_exc())
        print(traceback.format_exc())
        # keep reporting the failure to the calling pipeline
        exit_code = 1
    finally:
        endtime = datetime.datetime.now()
        GLog.info('end')
        # total_seconds() also counts whole days, unlike .seconds
        run_seconds = int((endtime - starttime).total_seconds())
        GLog.info('run time:%s seconds' % run_seconds)
        GLog.close()
    if exit_code:
        sys.exit(exit_code)