265 lines
8.8 KiB
Python
265 lines
8.8 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: UTF-8 -*-
|
|||
|
|
|
|||
|
|
"""
|
|||
|
|
Created on: 2021-09-30
|
|||
|
|
@author: cjs
|
|||
|
|
# 用途:处理遗传性并发症信息,生成hereditary.py脚本所需的基因信息文件
|
|||
|
|
# panel.hereditary.txt为panel中有综合征信息的基因
|
|||
|
|
# gene_dd.txt为由综合征解释的基因。有的基因不在panel中,只是在综合征的解释中
|
|||
|
|
# 版本:0.0.2
|
|||
|
|
# 2022-02-10 0.0.2 项目模板文件取消对别称的输入,所以别称的检查也取消
|
|||
|
|
# 最后编辑日期: 2021-09-30
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
from cjs_test.cjs_logger import Logger
|
|||
|
|
from openpyxl import load_workbook
|
|||
|
|
from collections import defaultdict
|
|||
|
|
import pandas as pd
|
|||
|
|
import datetime
|
|||
|
|
import traceback
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
|
|||
|
|
# Global parameters.
# GLog is replaced by a Logger instance and Exe_Path by the directory of the
# running script inside the ``__main__`` block; the functions in this file read
# Exe_Path to locate the ``ref`` inputs and the ``config`` outputs.
GLog = None
Exe_Path = ''


# Input files, both looked up under <Exe_Path>/ref.
# XLSX_FILE is the syndrome workbook; TXT_IN is the panel gene list.
# The commented-out lines are the alternative inputs for the 650 panel.
# NOTE(review): DD_OUT / TXT_OUT below also carry "160" in their names and
# presumably need changing together with the inputs - confirm before switching.
XLSX_FILE = '160panel遗传性肿瘤综合征.庞杰.2022.02.10修.xlsx'
# XLSX_FILE = '650panel遗传性肿瘤综合征.庞杰.2022.02.10修.xlsx'
TXT_IN = '160_panel_genes.txt'
# TXT_IN = '650_panel_genes.txt'

# Output files, both written under <Exe_Path>/config.
# DD_OUT is written by Check_Gene_Dd, TXT_OUT by Check_txt.
DD_OUT = 'gene_dd.160.txt'
TXT_OUT = '160_panel.hereditary.txt'


# Script variables
Gene_Dd = defaultdict(list)  # gene -> list of [syndrome, tumours] pairs
# HGNC gene list: a name found in this list is taken as a gene directly; a name
# that is not in the list is treated as an alias of some other gene.
Genes_Ls = []
# Gene alias dictionary: alias / previous symbol -> HGNC symbol
Gas_Dd = {}
|
|||
|
|
|
|||
|
|
|
|||
|
|
# 处理基因别名
|
|||
|
|
def Creat_Gas():
    """Load the HGNC table and fill ``Genes_Ls`` and ``Gas_Dd``.

    The file ``<Exe_Path>/ref/001/HGNC/gene_with_protein_product2.txt`` is a
    tab-separated table with a header row. It is read twice: once with pandas
    to obtain the column positions and the set of official symbols, and once
    as raw text to walk the rows.

    Side effects:
        * every symbol is appended to the module-level list ``Genes_Ls``
          (no duplicates);
        * every alias / previous symbol that is not itself an official symbol
          is stored in the module-level dict ``Gas_Dd`` as
          ``alias -> official symbol``. The first gene that claims an alias
          keeps it, and alias symbols are examined before previous symbols.

    Raises:
        ValueError: if one of the columns ``symbol``, ``alias_symbol`` or
            ``prev_symbol`` is missing from the header.
    """
    global Gas_Dd
    global Genes_Ls
    hgnc_base = 'gene_with_protein_product2.txt'
    hgnc_full = os.path.join(Exe_Path, 'ref', '001', 'HGNC', hgnc_base)

    # The file has a header row; pandas resolves it for us.
    pd_txt = pd.read_table(hgnc_full)
    # Use .columns rather than .iloc[0] so a table without data rows still works.
    head_list = list(pd_txt.columns)
    name_index = head_list.index('symbol')
    alias_index = head_list.index('alias_symbol')
    prev_index = head_list.index('prev_symbol')

    # Sets give O(1) membership tests; Genes_Ls itself stays a list because
    # other functions in this file read it.
    official = set(pd_txt['symbol'])
    seen = set(Genes_Ls)

    with open(hgnc_full, 'r', encoding='utf8') as ff:
        for line in ff:
            line = line.replace('\n', '')
            if not line or line.startswith('#'):
                continue
            lns = line.split('\t')
            if len(lns) <= name_index:
                # Truncated row: there is no symbol to work with.
                continue
            gene_name = lns[name_index]
            alias_symbols = lns[alias_index] if alias_index < len(lns) else ''
            prev_symbols = lns[prev_index] if prev_index < len(lns) else ''

            # The raw text still contains the header row; it must not be
            # recorded as a gene called 'symbol' with alias 'alias_symbol'.
            if gene_name == 'symbol' and alias_symbols == 'alias_symbol':
                continue

            if gene_name not in seen:
                seen.add(gene_name)
                Genes_Ls.append(gene_name)

            # Alias symbols first, then previous symbols (first claim wins).
            for field in (alias_symbols, prev_symbols):
                for raw_symbol in field.split('|'):
                    # Multi-valued fields are quoted as a whole in the raw
                    # text, so stray quotes end up on the first/last item.
                    gene_symbol = raw_symbol.replace('"', '')
                    if not gene_symbol:
                        continue
                    if gene_symbol in official or gene_symbol in Gas_Dd:
                        continue
                    Gas_Dd[gene_symbol] = gene_name
|
|||
|
|
|
|||
|
|
|
|||
|
|
# 处理综合征,基因,肿瘤的对应关系
|
|||
|
|
def _cell_text(cell):
    """Return a worksheet cell's value as stripped text ('' for an empty cell)."""
    value = cell.value
    # str(None) would give the literal text 'None', which would later be
    # recorded as if it were a real gene, syndrome or tumour name.
    return '' if value is None else str(value).strip()


def _split_gene_names(text):
    """Split a gene cell on '、', full-width comma, ',' or space; drop empties."""
    for sep in ('、', ',', ' '):
        text = text.replace(sep, ',')
    # Consecutive separators produce empty pieces, which are discarded.
    return [piece for piece in text.split(',') if piece]


def Get_Gene_Dd(df_xlsx):
    """Read the syndrome workbook and record syndromes/tumours for each gene.

    Only the first worksheet is read. Its first used row is the header; the
    three columns whose header text contains '遗传性肿瘤综合征中文名'
    (syndrome name), '致病基因' (causative genes) and '相关肿瘤' (related
    tumours) are located by substring match. For every following row, each
    gene named in the gene cell gets ``[syndrome, tumours]`` appended to the
    module-level ``Gene_Dd``. The inheritance mode is not read here.

    A gene that is not in ``Genes_Ls`` is reported on stdout and stored under
    the symbol ``Gas_Dd`` maps it to (or under its own name when no mapping
    exists).

    If any of the three header columns is missing, a message is printed for
    each missing column and no rows are processed.

    Args:
        df_xlsx: path of the .xlsx workbook.
    """
    global Gene_Dd
    # Header texts to look for.
    syndrome_head = '遗传性肿瘤综合征中文名'
    gene_head = '致病基因'
    cancer_head = '相关肿瘤'
    key_list = [syndrome_head, gene_head, cancer_head]
    key_index = dict.fromkeys(key_list, -1)
    known_symbols = set(Genes_Ls)

    # data_only=True returns cached values instead of formulas.
    wb = load_workbook(df_xlsx, data_only=True)
    try:
        ws = wb[wb.sheetnames[0]]
        # 1-based worksheet coordinates of the used range (openpyxl convention).
        col_max = ws.max_column  # last used column
        col_min = ws.min_column  # first used column
        row_max = ws.max_row  # last used row
        row_min = ws.min_row  # first used row

        sheet_rows = list(ws.rows)

        # Header row: remember the tuple index of each wanted column. An empty
        # sheet has no rows; it is then reported as "header not found".
        if sheet_rows:
            header_cells = sheet_rows[row_min - 1][col_min - 1:col_max]
        else:
            header_cells = ()
        for cell_index, cell in enumerate(header_cells, start=col_min - 1):
            cell_str = _cell_text(cell)
            for d_key in key_list:
                if d_key in cell_str:
                    key_index[d_key] = cell_index
                    break

        missing = [d_key for d_key in key_list if key_index[d_key] == -1]
        for d_key in missing:
            print('%s, 表头中未发现' % d_key)

        if not missing:
            # Everything after the header row, up to the last used row.
            for sheet_row in sheet_rows[row_min:row_max]:
                genes = _cell_text(sheet_row[key_index[gene_head]])
                syndrome = _cell_text(sheet_row[key_index[syndrome_head]])
                cancers = _cell_text(sheet_row[key_index[cancer_head]])

                for gene in _split_gene_names(genes):
                    if gene in known_symbols:
                        Gene_Dd[gene].append([syndrome, cancers])
                    else:
                        print(gene, '基因不是HGNC标准名称')
                        gene_gas = Gas_Dd.get(gene, gene)
                        Gene_Dd[gene_gas].append([syndrome, cancers])
    finally:
        # Close the workbook even when reading fails part-way.
        wb.close()

    # Diagnostic output. .get() is used because indexing a defaultdict would
    # silently create an empty 'BRCA1' entry that then leaks into the outputs.
    print('BRCA1', Gene_Dd.get('BRCA1', []))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _collect_items(text, bucket):
    """Split *text* into names and append the new, non-empty ones to *bucket*.

    '、', the full-width comma and ',' all act as separators; the Chinese full
    stop '。' is removed. Each piece is stripped so that "A, B" and "A,B" give
    the same names and duplicates are recognised.
    """
    text = text.strip()
    for sep in ('、', ','):
        text = text.replace(sep, ',')
    text = text.replace('。', '')
    for piece in text.split(','):
        piece = piece.strip()
        if piece and piece not in bucket:
            bucket.append(piece)


def Check_Gene_Dd():
    """Write the gene -> syndromes/tumours table to ``<Exe_Path>/config/<DD_OUT>``.

    For every gene in the module-level ``Gene_Dd`` the syndrome names and the
    tumour names of all its entries are split, de-duplicated (first occurrence
    order is kept) and written as one line::

        <gene>\\t<syndromes joined by ';'>|<tumours joined by ';'>|AD

    The inheritance mode is always written as 'AD'. The ``config`` directory
    is created when it does not exist.
    """
    txt_dd = {}
    for gene, entries in Gene_Dd.items():
        syndrome_names = []
        tumour_names = []
        for entry in entries:
            # entry is [syndrome text, tumour text]
            _collect_items(entry[0], syndrome_names)
            _collect_items(entry[1], tumour_names)

        # Inheritance mode defaults to AD.
        hy_type = ['AD']
        # Names inside one field are separated by ';'.
        txt_dd[gene] = [';'.join(syndrome_names), ';'.join(tumour_names)] + hy_type

    # Write the table to file.
    out_path = os.path.join(Exe_Path, 'config')
    os.makedirs(out_path, exist_ok=True)
    txt_full = os.path.join(out_path, DD_OUT)
    with open(txt_full, 'w', encoding='utf8') as ff:
        for gene, fields in txt_dd.items():
            ff.write('%s\t%s\n' % (gene, '|'.join(fields)))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def Check_txt(pro_gene_txt):
    """Write the panel genes that have syndrome data to ``<Exe_Path>/config/<TXT_OUT>``.

    The panel file holds one gene name per line. Blank lines and lines whose
    first non-blank character is '#' are ignored, and repeated names are kept
    once, in order of first appearance. A gene is written to the output only
    when it is a key of the module-level ``Gene_Dd``. The output starts with a
    fixed '#' header line, and the ``config`` directory is created if needed.

    Args:
        pro_gene_txt: path of the panel gene list (UTF-8 text).
    """
    # A dict is used as an ordered set: O(1) de-duplication, insertion order kept.
    genes = {}
    # utf-8-sig reads plain UTF-8 as well, but drops a leading byte-order mark
    # that would otherwise stick to the first gene name and break its lookup.
    with open(pro_gene_txt, 'r', encoding='utf-8-sig') as ff:
        for line in ff:
            name = line.strip()
            # Strip before testing for '#', so indented comments are skipped too.
            if name and not name.startswith('#'):
                genes.setdefault(name, None)

    out_path = os.path.join(Exe_Path, 'config')
    os.makedirs(out_path, exist_ok=True)
    out_full = os.path.join(out_path, TXT_OUT)
    with open(out_full, 'w', encoding='utf8') as ff:
        ff.write('#该panel在遗传综合征数据库匹配到的基因\n')
        ff.writelines(gene + '\n' for gene in genes if gene in Gene_Dd)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def Check_Pros():
    """Run ``Check_txt`` on the panel gene list at ``<Exe_Path>/ref/<TXT_IN>``."""
    Check_txt(os.path.join(Exe_Path, 'ref', TXT_IN))
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
    # Everything is located relative to the real path of this script:
    # inputs under ref/, outputs under config/, logs under logs/<script name>/.
    file_real = os.path.realpath(sys.argv[0])
    Exe_Path = os.path.dirname(file_real)
    bin_name = os.path.basename(file_real)

    # One log file per run, named after the script and the start time.
    log_path = os.path.join(Exe_Path, 'logs', bin_name)
    os.makedirs(log_path, exist_ok=True)
    starttime = datetime.datetime.now()
    ymd = starttime.strftime('%Y%m%d_%H%M%S')
    log_base = '%s_%s.log' % (bin_name, ymd)
    log_full = os.path.join(log_path, log_base)
    GLog = Logger(log_full, mode='w')
    GLog.info('start')

    exit_code = 0
    try:
        # The pipeline runs inside the try so that a failure is written to
        # the log file and the logger is still closed afterwards.
        Creat_Gas()

        xlsx_ff = os.path.join(Exe_Path, 'ref', XLSX_FILE)
        Get_Gene_Dd(xlsx_ff)
        Check_Gene_Dd()
        Check_Pros()
    except Exception:
        failure = traceback.format_exc()
        GLog.error(failure)
        print(failure)
        # Keep a non-zero exit status so callers can detect the failure.
        exit_code = 1
    finally:
        endtime = datetime.datetime.now()
        GLog.info('end')
        # total_seconds() also counts whole days, unlike timedelta.seconds.
        GLog.info('run time:%s seconds' % int((endtime - starttime).total_seconds()))
        GLog.close()

    if exit_code:
        sys.exit(exit_code)
|