Initialization

main
chaopower 2023-07-31 13:49:34 +08:00
commit 260d86d3f1
25 changed files with 8552 additions and 0 deletions

215
.gitignore vendored 100644

@ -0,0 +1,215 @@
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# ---> Perl
!Build/
.last_cover_stats
/META.yml
/META.json
/MYMETA.*
*.o
*.pm.tdy
*.bs
# Devel::Cover
cover_db/
# Devel::NYTProf
nytprof.out
# Dist::Zilla
/.build/
# Module::Build
_build/
Build
Build.bat
# Module::Install
inc/
# ExtUtils::MakeMaker
/blib/
/_eumm/
/*.gz
/Makefile
/Makefile.old
/MANIFEST.bak
/pm_to_blib
/*.zip
# ---> Perl6
# Gitignore for Perl 6 (http://www.perl6.org)
# As part of https://github.com/github/gitignore
# precompiled files
.precomp
lib/.precomp
nohup.out
log/*
!log/readme.md
example/*
!example/readme.md
/.report/

0
README.md 100644


@ -0,0 +1,28 @@
癌种 用药方案 方案缩写 source
非小细胞肺癌 顺铂+紫杉醇 TP lung85gene
非小细胞肺癌 卡铂+紫杉醇 TP lung85gene
非小细胞肺癌 顺铂+紫杉醇脂质体 LP lung85gene
非小细胞肺癌 卡铂+紫杉醇脂质体 LP lung85gene
非小细胞肺癌 顺铂+白蛋白紫杉醇 nab-TP lung85gene
非小细胞肺癌 卡铂+白蛋白紫杉醇 nab-TP lung85gene
非小细胞肺癌 顺铂+多西他赛 DP lung85gene
非小细胞肺癌 卡铂+多西他赛 DP lung85gene
非小细胞肺癌 奈达铂+多西他赛 DP lung85gene
非小细胞肺癌 顺铂+吉西他滨 GP lung85gene
非小细胞肺癌 卡铂+吉西他滨 GP lung85gene
非小细胞肺癌 顺铂+培美曲塞 PP lung85gene
非小细胞肺癌 卡铂+培美曲塞 PP lung85gene
非小细胞肺癌 顺铂+长春瑞滨 NP lung85gene
非小细胞肺癌 顺铂+依托泊苷 EP lung85gene
小细胞肺癌 顺铂+依托泊苷 EP lung85gene
小细胞肺癌 卡铂+依托泊苷 EC lung85gene
小细胞肺癌 洛铂+依托泊苷 EL lung85gene
小细胞肺癌 顺铂+伊立替康 IP lung85gene
小细胞肺癌 卡铂+伊立替康 IC lung85gene
结直肠癌 奥沙利铂+亚叶酸钙+氟尿嘧啶 FOLFOX crc88gene
结直肠癌 伊立替康+亚叶酸钙+氟尿嘧啶 FOLFIRI crc88gene
结直肠癌 奥沙利铂+卡培他滨 CAPEOX又称Xelox crc88gene
结直肠癌 伊立替康+奥沙利铂+亚叶酸钙+氟尿嘧啶 FOLFOXIRI crc88gene
结直肠癌 伊立替康+卡培他滨 CapIRI或XELIRI crc88gene
结直肠癌 奥沙利铂+雷替曲塞 / crc88gene
结直肠癌 伊立替康+雷替曲塞 / crc88gene
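For reference, a minimal sketch (not part of this commit; the filename chemo_comb.txt is assumed) of how a tab-separated table like the one above can be grouped per cancer type into the name/abbr records consumed by the report template:

import pandas as pd

# Load the tab-separated regimen table (columns: 癌种, 用药方案, 方案缩写, source).
df = pd.read_table("chemo_comb.txt", sep="\t")  # assumed filename
chemo_comb = [
    {"type": cancer,
     "drug": grp[["用药方案", "方案缩写"]]
             .rename(columns={"用药方案": "name", "方案缩写": "abbr"})
             .to_dict("records")}
    for cancer, grp in df.groupby("癌种", sort=False)
]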

24
main.py 100644

@ -0,0 +1,24 @@
import json
import os
import socket
import sys
from docxtpl import DocxTemplate
from tools.parsexlsx import run
def main(path):
resjson = run(path)
res = json.loads(resjson)
barcode = res['c']['barcode']
tplpath = os.path.join(os.path.dirname(__file__), 'template', 'nreport.docx')
tpl = DocxTemplate(tplpath)
tpl.render(res)
path = os.path.join(os.path.dirname(__file__), 'result', f'{barcode}.docx')
tpl.save(path)
return path
if __name__ == '__main__':
main(sys.argv[1])
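A hedged usage sketch (the workbook path below is a placeholder): main() parses what is presumably the pipeline's *.check_new.xlsx workbook via tools.parsexlsx.run, renders template/nreport.docx with docxtpl, and writes result/<barcode>.docx:

from main import main

# Placeholder path; in production the workbook normally arrives via server.py into the xlsx/ directory.
report_path = main("xlsx/0801_SAMPLE.check_new.xlsx")
print(report_path)  # .../result/<barcode>.docx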

Binary file not shown.

Binary file not shown.

0
result/readme.md 100644

Binary file not shown.

Binary file not shown.


@ -0,0 +1,486 @@
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import pandas as pd
from pandas import DataFrame
import numpy as np
import logging
import re
import sys
import os
import json
import glob
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl.drawing.image import Image
if len(sys.argv) != 3:
print(" ".join(['usage:python', sys.argv[0], 'output_dir', 'name']))
sys.exit()
def snv_fusion_cnv(output_dir, name):
out_xlsx = "".join([output_dir, '/report/', name, '.check_new.xlsx'])
# genefunction
genefunction = {}
gf = open("/dataseq/jmdna/codes/reportbase/gene_function.txt", 'r', encoding='utf-8').readlines()
for line in gf[1:]:
gene = line.strip().split("\t")[0]
func = line.strip().split("\t")[1]
genefunction[gene.upper()] = func
genefunction['.'] = '.'
##drug_mechanism
drug_mechanism = {}
drug_fh = open("/dataseq/jmdna/codes/reportbase/target_drug.txt", 'r', encoding='utf-8').readlines()
for line in drug_fh[1:]:
disease = line.split("\t")[8]
mechanism = line.split("\t")[11]
drugs = line.split("\t")[0].split('|') + line.split("\t")[1].split('|')
if disease or mechanism:
for drug in drugs:
drug_mechanism[drug.upper()] = "\\\\".join([disease, mechanism]).strip()
'''
snvindel_sheet
'''
##input
filter_file = "".join([output_dir, '/report/', name, '.snp.indel.Somatic.annoall.hg19_multianno_filtered.txt'])
pos_file = "".join([output_dir, '/mutation/', name, '.snvindel.pos.dedup.txt'])
vus_file = "".join([output_dir, '/mutation/', name, '.snvindel.vus.txt'])
neg_file = "".join([output_dir, '/mutation/', name, '.snvindel.neg.txt'])
##filter_file
if os.path.getsize(filter_file) > 0:
snv = pd.read_table(filter_file, sep="\t")
cols = [index for index, row in snv[snv['可信'] == 0].iterrows()]
snv.drop(cols, inplace=True)
snv.insert(loc=24, column='ACMG_level', value=0)
snv.insert(loc=25, column='Deleterious', value=0)
snv.insert(loc=26, column='freq_high', value=0)
for index, row in snv.iterrows():
if re.search("Likely_pathogenic|drug", (row['CLNSIG']), re.I):
snv.loc[index, 'ACMG_level'] = '2'
elif re.search("pathogenic", (row['CLNSIG']), re.I) and not re.search("Conflicting", (row['CLNSIG']), re.I):
snv.loc[index, 'ACMG_level'] = '1'
else:
snv.loc[index, 'ACMG_level'] = '3'
snv.loc[index, "Deleterious"] = (
snv.loc[index, ['MutationTaster_pred', 'FATHMM_pred', 'MetaLR_pred']].tolist().count("D"))
snv.loc[index, "freq_high"] = ((snv.loc[
index, ['1000g2015aug_all', '1000g2015aug_eas', 'esp6500siv2_all', 'ExAC_nontcga_ALL',
'ExAC_nontcga_EAS', 'gnomAD_genome_ALL', 'gnomAD_genome_EAS']]).replace('.', '0')).max()
snv_1 = snv.iloc[:, list(range(14)) + [15, 17, 18, 20, 23, 24, 25, 26, 111, 112, 113]]
else:
snv_1 = pd.DataFrame(columns=[])
##pos_file
if os.path.getsize(pos_file) > 0:
pos = pd.read_table(pos_file, sep="\t")
pos = pos.iloc[:, [7, 10, 18, 23, 24, 25, 29, 30, 31, 32]]
pos_1 = pd.DataFrame(
columns=['AAChange.refGene', 'OKBSIG', 'AMP_evidence_level', 'AMP_mut_level', 'Indication', 'Drug',
'Response_Type', 'Evidence_Source', 'EfficacyEvidence', 'Drug_Detail', 'Gene_function',
'Drug_Category'])
pos = list(pos.groupby(['Gene.refGene', 'AAChange.refGene', 'fun_change']))
for i in pos:
for index, row in i[1].iterrows():
drugs = row['药物中文名'].replace(" + ", ",")
drugs = list(set(drugs.split(",")))
drug_mm = ''
for drug in drugs:
if drug.upper() in drug_mechanism.keys():
drug_mm += '[[' + drug + ']]' + drug_mechanism[drug.upper()]
i[1].loc[index, ['Drug_Detail']] = drug_mm
if row['标签'] == '非适应症':
row['证据等级'] = 'C'
if (re.search("敏感", row['Response_Type_C']) and row['证据等级'] == 'A'):
i[1].loc[index, ['Drug_Category']] = 'a'
elif re.search("敏感", row['Response_Type_C']) and row['证据等级'] == 'C':
i[1].loc[index, ['Drug_Category']] = 'b'
elif re.search("耐药", row['Response_Type_C']):
i[1].loc[index, ['Drug_Category']] = 'd'
else:
i[1].loc[index, ['Drug_Category']] = 'c'
i[1]['AMP_mut_level'] = i[1]['证据等级'].replace(['A', 'B', 'C', 'D'], ['I', 'I', 'II', 'II'])
pos_1.loc[len(pos_1)] = [i[0][1], i[0][2], '|'.join(list(i[1]['证据等级'])),
'|'.join(list(i[1]['AMP_mut_level'])), '|'.join(list(i[1]['疾病中文名'])),
'|'.join(list(i[1]['药物中文名'])), \
'|'.join(list(i[1]['Response_Type_C'])), '|'.join(list(i[1]['Evidence_Source_C'])),
'|'.join(list(i[1]['EfficacyEvidence'])), '|'.join(list(i[1]['Drug_Detail'])),
genefunction[i[0][0].upper()], '|'.join(list(i[1]['Drug_Category']))]
else:
pos_1 = pd.DataFrame(columns=[])
##vus_file
if os.path.getsize(vus_file) > 0:
vus = pd.read_table(vus_file, sep="\t")
vus_1 = vus.iloc[:, [9, 17]]
vus_1.insert(loc=2, column='AMP_mut_level', value='III')
vus_1 = vus_1.rename(columns={'fun_change': 'OKBSIG'})
else:
vus_1 = pd.DataFrame(columns=[])
##neg_file
if os.path.getsize(neg_file) > 0:
neg = pd.read_table(neg_file, sep="\t")
neg_1 = neg.iloc[:, [9, 17]]
neg_1.insert(loc=2, column='AMP_mut_level', value='IIII')
neg_1 = neg_1.rename(columns={'fun_change': 'OKBSIG'})
else:
neg_1 = pd.DataFrame(columns=[])
snvindel_sheet = pd.DataFrame(
columns=['可信', 'Chr', 'Start', 'End', 'Ref', 'Alt', 'AAChange.refGene', 'mutant_frequency', 'total_reads',
'mutant_reads', 'strand_bias', 'Otherinfo10', 'Func.refGene', 'Gene.refGene', 'ExonicFunc.refGene',
'avsnp150', 'cosmic91', 'CLNDN', 'CLNSIG', 'ACMG_level', 'Deleterious', 'freq_high', 'OKBSIG',
'AMP_evidence_level', 'AMP_mut_level', 'Indication', 'Drug', 'Response_Type', 'Evidence_Source',
'EfficacyEvidence', 'Drug_Detail', 'Gene_function', 'Drug_Category', 'Otherinfo11', 'Otherinfo12',
'Otherinfo13'])
pos_vus_neg = pd.concat([pos_1, vus_1, neg_1])
snv_pos_vus_neg = snv_1.merge(pos_vus_neg, how='left', on='AAChange.refGene')
snvindel_sheet = pd.concat([snvindel_sheet, snv_pos_vus_neg])
snvindel_sheet = snvindel_sheet.replace(np.nan, '.')
snvindel_sheet.rename(columns={"可信": "Validated"}, inplace=True)
'''
fusion_sheet
'''
fusion_pos_file = "".join([output_dir, '/fusion/', name, '.fusion.pos.dedup.txt'])
fusion_vus_file = "".join([output_dir, '/fusion/', name, '.fusion.vus.txt'])
if os.path.getsize(fusion_pos_file) > 0:
fusion_pos = pd.read_table(fusion_pos_file, sep="\t")
else:
fusion_pos = pd.DataFrame(columns=[])
if os.path.getsize(fusion_vus_file) > 0:
fusion_vus = pd.read_table(fusion_vus_file, sep="\t")
fusion_vus.insert(loc=0, column='可信', value=1)
else:
fusion_vus = pd.DataFrame(columns=[])
fusion_pos_vus = pd.concat([fusion_pos, fusion_vus])
fusion_sheet = pd.DataFrame(
columns=['Validated', 'CHROM1', 'POS1', 'CHROM2', 'POS2', 'GENE1', 'GENE2', 'FUSION', 'Support_reads(PE:SR)',
'Depth', 'FREQ1', 'FREQ2', 'OKBSIG', 'AMP_evidence_level', \
'AMP_mut_level', 'Indication', 'Drug', 'Response_Type', 'Evidence_Source', 'Efficacy_Evidence',
'Drug_Detail', 'Gene_function', 'Drug_Category', 'INFO', 'FORMAT', 'Sample'])
if not fusion_pos_vus.empty:
fusion_pos_vus = fusion_pos_vus.replace(np.nan, '.')
fusion = list(fusion_pos_vus.groupby(
['可信', '#CHROM', 'POS', 'CHROM2', 'POS2', 'GENE1', 'GENE2', 'FUSION', 'FREQ1', 'FREQ2', 'fun_change',
'INFO', 'FORMAT', name, 'Gene_Symbol']))
for i in fusion:
for index, row in i[1].iterrows():
drugs = row['药物中文名'].replace(" + ", ",")
drugs = list(set(drugs.split(",")))
drug_mm = ''
for drug in drugs:
if drug.upper() in drug_mechanism.keys():
drug_mm += '[[' + drug + ']]' + drug_mechanism[drug.upper()]
i[1].loc[index, ['Drug_Detail']] = drug_mm
if row['标签'] == '非适应症':
row['证据等级'] = 'C'
if (re.search("敏感", row['Response_Type_C']) and row['证据等级'] == 'A'):
i[1].loc[index, ['Drug_Category']] = 'a'
elif re.search("敏感", row['Response_Type_C']) and row['证据等级'] == 'C':
i[1].loc[index, ['Drug_Category']] = 'b'
elif re.search("耐药", row['Response_Type_C']):
i[1].loc[index, ['Drug_Category']] = 'd'
elif row['Response_Type_C'] == '.':
i[1].loc[index, ['Drug_Category']] = '.'
else:
i[1].loc[index, ['Drug_Category']] = 'c'
i[1]['AMP_mut_level'] = i[1]['证据等级'].replace(['A', 'B', 'C', 'D'], ['I', 'I', 'II', 'II'])
fusion_sheet.loc[len(fusion_sheet)] = list(i[0][0:8]) + [i[0][13].split(":")[1],
i[0][13].split(":")[7]] + list(i[0][8:11]) + [
'|'.join(list(i[1]['证据等级'])),
'|'.join(list(i[1]['AMP_mut_level'])), \
'|'.join(list(i[1]['疾病中文名'])), '|'.join(list(i[1]['药物中文名'])),
'|'.join(list(i[1]['Response_Type_C'])),
'|'.join(list(i[1]['Evidence_Source_C'])),
'|'.join(list(i[1]['EfficacyEvidence'])), \
'|'.join(list(i[1]['Drug_Detail'])),
genefunction[i[0][14].upper()],
'|'.join(list(i[1]['Drug_Category']))] + list(i[0][11:14])
fusion_sheet = fusion_sheet.replace(np.nan, '.')
'''
cnv_sheet
'''
cnv_pos_file = "/home/jm001/test/reference_standard/lung85gene/Tissue/BKDL202603539-1a/cnvkit/BKDL202603539-1a.cnv.pos.dedup.txt"
cnv_sheet = pd.DataFrame(
columns=['Validated', 'Chromosome', 'Start', 'End', 'Gene', 'Depth', 'Probes', 'Copy_number', 'OKBSIG',
'Gene_Symbol', 'AMP_evidence_level', 'AMP_mut_level', \
'Indication', 'Drug', 'Response_Type', 'Evidence_Source', 'Efficacy_Evidence', 'Drug_Detail',
'Gene_Function', 'Drug_Category'])
if os.path.getsize(cnv_pos_file) > 0:
cnv_pos = pd.read_table(cnv_pos_file, sep="\t")
cnv = list(cnv_pos.groupby(
['可信', 'chromosome', 'start', 'end', 'gene', 'depth', 'probes', 'cn', 'fun_change', 'Gene_Symbol']))
for i in cnv:
for index, row in i[1].iterrows():
drugs = row['药物中文名'].replace(" + ", ",")
drugs = list(set(drugs.split(",")))
drug_mm = ''
for drug in drugs:
if drug.upper() in drug_mechanism.keys():
drug_mm += '[[' + drug + ']]' + drug_mechanism[drug.upper()]
i[1].loc[index, ['Drug_Detail']] = drug_mm
if row['标签'] == '非适应症':
row['证据等级'] = 'C'
if (re.search("敏感", row['Response_Type_C']) and row['证据等级'] == 'A'):
i[1].loc[index, ['Drug_Category']] = 'a'
elif re.search("敏感", row['Response_Type_C']) and row['证据等级'] == 'C':
i[1].loc[index, ['Drug_Category']] = 'b'
elif re.search("耐药", row['Response_Type_C']):
i[1].loc[index, ['Drug_Category']] = 'd'
elif row['Response_Type_C'] == '.':
i[1].loc[index, ['Drug_Category']] = '.'
else:
i[1].loc[index, ['Drug_Category']] = 'c'
i[1]['AMP_mut_level'] = i[1]['证据等级'].replace(['A', 'B', 'C', 'D'], ['I', 'I', 'II', 'II'])
cnv_sheet.loc[len(cnv_sheet)] = list(i[0][0:10]) + ['|'.join(list(i[1]['证据等级'])),
'|'.join(list(i[1]['AMP_mut_level'])), \
'|'.join(list(i[1]['疾病中文名'])),
'|'.join(list(i[1]['药物中文名'])),
'|'.join(list(i[1]['Response_Type_C'])),
'|'.join(list(i[1]['Evidence_Source_C'])),
'|'.join(list(i[1]['EfficacyEvidence'])), \
'|'.join(list(i[1]['Drug_Detail'])),
genefunction[i[0][9].upper()],
'|'.join(list(i[1]['Drug_Category']))]
else:
cnv_pos = pd.DataFrame(columns=[])
with pd.ExcelWriter(out_xlsx) as writer:
snvindel_sheet.to_excel(writer, sheet_name="snvindel", index=False)
fusion_sheet.to_excel(writer, sheet_name="fusion", index=False)
cnv_sheet.to_excel(writer, sheet_name="cnv", index=False)
##embed cnvkit/*.cnv.png below the cnv sheet
wb = openpyxl.load_workbook(filename=out_xlsx)
ws = wb['cnv']
mr = ws.max_row
cell = 'C' + str(mr + 4)
cnv_pic = "".join([output_dir, '/cnvkit/', name, '.cnv.png'])
image = Image(cnv_pic)
ws.add_image(image, cell)
wb.save(out_xlsx)
class PostProcess:
"""
Excel post-processing of pipeline result files
"""
def __init__(self, path, outpath):
self.path = path
self.outpath = outpath
self.needcol = self.need_col()
def need_col(self):
"""
Read the required columns for each output sheet from columns.csv
"""
path = os.path.join(os.path.dirname(__file__), 'columns.csv')
cols = pd.read_csv(path)
cols = cols.fillna('')
cols_record = cols.to_dict('list')
for sheet in cols_record:
cols_record[sheet] = [x for x in cols_record[sheet] if x]
return cols_record
def msi(self):
"""
Process msi result files
"""
msi_files = glob.glob(os.path.join(self.path, 'MSI', '*.msi'))
msi_res = dict()
if msi_files:
df = pd.read_csv(msi_files[0], sep='\t')
res = df.to_dict('records')[0]
msi_res['msi_count'] = res['Total_Number_of_Sites']
msi_res['msi_value'] = res['%']
if msi_res['msi_value'] >= 0.3:
msi_res['msi_result'] = 'MSI-H'
msi_res['msi_predict'] = '对免疫检查点抑制剂可能敏感'
else:
msi_res['msi_result'] = 'MSS'
msi_res['msi_predict'] = '对免疫检查点抑制剂可能不敏感'
return [msi_res]
def chemo(self):
"""
化疗
"""
chemo_files = glob.glob(os.path.join(self.path, 'chemo', '*chemo.res.txt'))
chemo_res = []
if chemo_files:
df = pd.read_csv(chemo_files[0], sep='\t')
df = df.fillna('.')
chemo_res = df.to_dict('records')
return chemo_res
def heredity(self):
"""
Hereditary (germline) variants
"""
heredi_files = glob.glob(os.path.join(self.path, 'mutation', '*Germline*filtered.txt'))
heredires = []
if heredi_files:
df = pd.read_csv(heredi_files[0], sep='\t')
df = df.fillna('.')
tmdf1 = df[
['1000g2015aug_all', '1000g2015aug_eas', 'esp6500siv2_all', 'ExAC_nontcga_ALL', 'ExAC_nontcga_EAS',
'gnomAD_genome_ALL', 'gnomAD_genome_EAS']].replace('.', 0).applymap(lambda x: eval(str(x)))
df['freq_high'] = tmdf1.max(axis=1)
tmdf2 = df[['MutationTaster_pred', 'FATHMM_pred', 'MetaLR_pred']]
df['Deleterious'] = tmdf2.apply(lambda x: x.tolist().count('D'), axis=1)
df_need = df[self.needcol.get('HCS', [])]
try:
heredires = df_need.to_dict('records')
except KeyError as e:
raise UserWarning('表头设置和配置文件不对应', e)
return heredires
def MMR(self):
"""
MMR
"""
mmr_files = glob.glob(os.path.join(self.path, 'MMR', '*mmr.pre.txt'))
mmr = []
if mmr_files:
df = pd.read_csv(mmr_files[0], sep='\t')
df = df.fillna('.')
tmdf1 = df[
['1000g2015aug_all', '1000g2015aug_eas', 'esp6500siv2_all', 'ExAC_nontcga_ALL', 'ExAC_nontcga_EAS',
'gnomAD_genome_ALL', 'gnomAD_genome_EAS']].replace('.', 0).applymap(lambda x: eval(str(x)))
df['freq_high'] = tmdf1.max(axis=1)
tmdf2 = df[['MutationTaster_pred', 'FATHMM_pred', 'MetaLR_pred']]
df['Deleterious'] = tmdf2.apply(lambda x: x.tolist().count('D'), axis=1)
df_need = df[self.needcol.get('HCS', [])]
try:
mmr = df_need.to_dict('records')
except KeyError as e:
raise UserWarning('表头设置和配置文件不对应', e)
return mmr
def hotspot(self):
hotspot_files = glob.glob(
os.path.join(self.path, 'mutation', 'hotspot', '*hotspot.snp.indel.filter.anno.hg19_multianno.txt'))
if hotspot_files:
return self.txt_2_excel(hotspot_files[0])
def splicing(self):
splicing_files = glob.glob(
os.path.join(self.path, 'mutation', '*.target.splicing.txt'))
if splicing_files:
return self.txt_2_excel(splicing_files[0])
def indication(self):
indication_files = glob.glob(
os.path.join(self.path, 'mutation', '*indication.txt'))
if indication_files:
return self.txt_2_excel(indication_files[0])
def longindel(self):
longindel_files = glob.glob(
os.path.join(self.path, 'fusion', '*.longindel.pos.txt'))
if longindel_files:
return self.txt_2_excel(longindel_files[0])
def cms(self):
"""
Sample information
"""
cms_files = glob.glob(os.path.join(self.path, 'qc', '*_post.json'))
cms_info_need = []
if cms_files:
file_read = open(cms_files[0], 'r')
cms_info = json.load(file_read)['data']
file_read.close()
df = pd.DataFrame(cms_info)
df_need = df[self.needcol.get('sample_info', [])]
try:
cms_info_need = df_need.to_dict('records')
except KeyError as e:
raise UserWarning('表头设置和配置文件不对应', e)
return cms_info_need
def qc(self):
# the QC table is the tab-separated <name>_qc.txt written by the pipeline (key/value pairs)
qc_files = glob.glob(os.path.join(self.path, 'qc', '*_qc.txt'))
qc_res = []
if qc_files:
df = pd.read_csv(qc_files[0], sep='\t', header=None)
df = df.set_index(0).T
qc_res = df.to_dict('records')
return qc_res
#
# def snv(self):
# # filter file
# filter_files = glob.glob(os.path.join(self.path, 'report', '*snp.indel.Somatic.annoall.hg19_multianno_filtered.txt'))
# if filter_files:
# snv = pd.read_csv(filter_files[0], sep="\t")
# def sign_drug_Category(x):
# if '敏感' in x['Response_Type_C'] and x['证据等级'] == 'A':
# return 'a'
# elif '敏感' in x['Response_Type_C'] and x['证据等级'] == 'C':
# return 'b'
# elif '耐药' in x['Response_Type_C']:
# return 'd'
# else:
# return 'c'
# # pos_file 处理
# pos_files = glob.glob(os.path.join(self.path, 'mutation', '*snvindel.pos.txt'))
# if pos_files:
# pos = pd.read_csv(pos_files[0], sep='\t')
# pos['证据等级'] = pos.apply(lambda x: 'C' if x['标签'] == '非适应症' else x['证据等级'], axis=1)
# pos['Drug_Category'] = pos.apply(sign_drug_Category, axis=1)
# pos['AMP_mut_level'] = pos['证据等级'].replace(['A', 'B', 'C', 'D'], ['I', 'I', 'II', 'II'])
# agg_list = ['证据等级', 'AMP_mut_level', '疾病中文名', '药物中文名', '证据等级', 'Response_Type_C', 'Evidence_Source_C',
# 'EfficacyEvidence', 'Drug_Category']
# agg_dict = {column: ','.join for column in agg_list}
# pos_group =pos.groupby(['Gene.refGene','AAChange.refGene','fun_change']).agg(agg_dict, axis=1)
def txt_2_excel(self, path):
try:
df = pd.read_csv(path, sep='\t')
except pd.errors.EmptyDataError:
return []
return df.to_dict('records')
def collect(self):
writer = pd.ExcelWriter(self.outpath, mode='a', engine='openpyxl')
sheet = {
'MSI': self.msi(),
'chemo': self.chemo(),
'HCS': self.heredity(),
'sample_info': self.cms(),
'MMR': self.MMR(),
'hotspot': self.hotspot(),
'MET': self.splicing(),
'indication': self.indication(),
'longindel': self.longindel(),
'qc': self.qc()
}
# write each result set to its own sheet
for sheet_name in sheet:
# build a DataFrame from the collected records
df = pd.DataFrame(sheet[sheet_name])
df.to_excel(writer, sheet_name=sheet_name, index=False)
# save and close the Excel writer
writer.close()
if __name__ == '__main__':
snv_fusion_cnv(sys.argv[1], sys.argv[2])
# Note: no logging yet; output paths are not parameterized
out_xlsx = "".join([sys.argv[1], '/report/', sys.argv[2], '.check_new.xlsx'])
postprocess = PostProcess(sys.argv[1], out_xlsx)
postprocess.collect()
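The a/b/c/d Drug_Category rule above is repeated verbatim for the snv/indel, fusion and cnv blocks; a condensed restatement of that classification, as a reference sketch rather than part of the committed file:

import re

def drug_category(response_type, evidence_level, label):
    """Mirror of the rule used above: off-label ('非适应症') evidence is downgraded to level C first."""
    if label == '非适应症':
        evidence_level = 'C'
    if response_type == '.':               # no evidence at all (as in the fusion/cnv branches)
        return '.'
    if re.search('敏感', response_type):   # sensitive
        if evidence_level == 'A':
            return 'a'
        return 'b' if evidence_level == 'C' else 'c'
    if re.search('耐药', response_type):   # resistant
        return 'd'
    return 'c'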


@ -0,0 +1,864 @@
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
##https://www.pianshen.com/article/5314917437/
##https://zhuanlan.zhihu.com/p/366902690
##https://itpcb.com/a/277599
import docxtpl
from docx.shared import Mm
from docxtpl import DocxTemplate,RichText
import pandas as pd
from pandas import DataFrame
import re
import sys
import os
import json
import time
if len(sys.argv) != 5:
print(" ".join(['usage: python', sys.argv[0], 'output_dir', 'name', 'sample_type(t for tissue, c for cfdna)', 'project']))
sys.exit()
output_dir=sys.argv[1]
name=sys.argv[2]
Sample_type=sys.argv[3]
project=sys.argv[4]
snv_base="".join([name,'.snvindel.pos.dedup.txt'])
snv_file='/'.join([output_dir,'mutation',snv_base])
snv_base_vus="".join([name,'.snvindel.vus.txt'])
snv_file_vus='/'.join([output_dir,'mutation',snv_base_vus])
fusion_base="".join([name,'.fusion.pos.dedup.txt'])
fusion_file='/'.join([output_dir,'fusion',fusion_base])
cnv_base="".join([name,'.cnv.pos.dedup.txt'])
cnv_file='/'.join([output_dir,'cnvkit',cnv_base])
qc_base=''.join([name,'_qc.txt'])
qc_file='/'.join([output_dir,'qc',qc_base])
report_base="".join([name,'_report.docx'])
report_file='/'.join([output_dir,'report',report_base])
indication_file="".join([output_dir,'/mutation/','indication.txt'])
context = {'list1':[],'list2':{},'list3':{},'clingene1':[],'clingene2':[],'nonclingenes':[],'genefunc':{},
'indication':[],'mmr':[],'chemo':[]}
#genefunction
genefunction={}
gf=open("/dataseq/jmdna/codes/reportbase/gene_function.txt",'r',encoding='utf-8').readlines()
for line in gf[1:]:
gene=line.strip().split("\t")[0]
func=line.strip().split("\t")[1]
genefunction[gene.upper()]=func
##sensitive_resistant_drug
drug_disease={}
drug_mechanism={}
drug_fh=open("/dataseq/jmdna/codes/reportbase/target_drug.txt",'r',encoding='utf-8').readlines()
for line in drug_fh[1:]:
disease=line.split("\t")[8]
mechanism=line.split("\t")[11]
drugs=line.split("\t")[0].split('|')
if disease or mechanism:
for drug in drugs:
drug_disease[drug.upper()]=disease
drug_mechanism[drug.upper()]=mechanism
sensitive_resistant_drug=[{'type':'可能敏感药物','drug':[]},{'type':'可能耐药药物','drug':[]}]
##somatic snv/indel processing
snv_size = os.path.getsize(snv_file)
if snv_size>0:
snv=pd.read_table(snv_file,sep="\t")
cols=[index for index,row in snv[snv['可信']==0].iterrows()]
snv.drop(cols,inplace=True)
genes=snv['Gene.refGene'].drop_duplicates()
if len(genes):
for gene in genes:
rt={}
rt[gene]=[]
muts=snv['AAChange.refGene'][snv['Gene.refGene']==gene].drop_duplicates()
for mut in muts:
info2={mut:[]}
for index,row in snv[snv['AAChange.refGene']==mut].iterrows():
info3={}
info3['drug']=row['药物中文名']
info3['effect']=row['Response_Type_C']
info3['tumor']=row['疾病中文名']
info3['evidence']=row['Evidence_Source_C']
info3['sig']=row['EfficacyEvidence']
info2[mut].append(info3)
##sensitive_resistant_drug
drugs=row['Drug'].replace(" + ",",")
drugs=drugs.split(",")
drugs_chinese=row['药物中文名'].replace(" + ",",")
drugs_chinese=drugs_chinese.split(",")
bool=0
for drug in drugs:
if drug.upper() in drug_disease.keys():
if re.search(r'敏感',row['Response_Type_C']):
sensitive_drug={}
sensitive_drug['name']=drugs_chinese[bool]
sensitive_drug['mechanism']=("\n".join([drug_disease[drug.upper()],drug_mechanism[drug.upper()]])).strip()
if sensitive_drug not in sensitive_resistant_drug[0]['drug']:
sensitive_resistant_drug[0]['drug'].append(sensitive_drug)
elif re.search(r'耐药',row['Response_Type_C']):
resistant_drug={}
resistant_drug['name']=drugs_chinese[bool]
resistant_drug['mechanism']=("\n".join([drug_disease[drug.upper()],drug_mechanism[drug.upper()]])).strip()
if resistant_drug not in sensitive_resistant_drug[1]['drug']:
sensitive_resistant_drug[1]['drug'].append(resistant_drug)
bool+=1
context['list2'][mut]=info2[mut]
info={}
info['gene']=snv['Gene.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]
m=re.search(r'(p\..*)$',mut)
if m:
info['p']=m.group(1)
else:
m=re.search(r'(c\..*)$',mut)
info['p']=m.group(1)
info['freq']=snv['Freq'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]
A=[]
B=[]
C=[]
D=[]
for index,row in snv[(snv['AAChange.refGene']==mut) & (snv['标签']=='适应症') & snv['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence='A'
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
A.extend(ds_new)
for index,row in snv[(snv['AAChange.refGene']==mut) & (snv['标签']=='非适应症') & snv['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence='C'
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
B.extend(ds_new)
for index,row in snv[(snv['AAChange.refGene']==mut) & (snv['标签']=='.') & snv['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence=row['证据等级']
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
C.extend(ds_new)
for index,row in snv[(snv['AAChange.refGene']==mut) & snv['Response_Type_C'].str.contains("耐药")].iterrows():
evidence=''
if row['标签']=='非适应症':
evidence='C'
else:
evidence=row['证据等级']
ds=row['药物中文名'].split(",")
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
D.extend(ds_new)
A=sorted(set(A),key=A.index)
B=sorted(set(B),key=B.index)
C=sorted(set(C),key=C.index)
D=sorted(set(D),key=D.index)
info['drug_A']="\n".join(A)
info['drug_B']="\n".join(B)
info['drug_C']="\n".join(C)
info['drug_D']="\n".join(D)
if not info['drug_A']:
info['drug_A']='/'
if not info['drug_B']:
info['drug_B']='/'
if not info['drug_C']:
info['drug_C']='/'
if not info['drug_D']:
info['drug_D']='/'
context['list1'].append(info)
rt[gene].append("".join([info['p'],'(',info['freq'],')']))
##genefunc
context['genefunc'][mut]=genefunction[gene.upper()]
##clingenes
clingene1={}
clingene2={}
for index,row in snv[(snv['AAChange.refGene']==mut)].iterrows():
if row['标签']=='适应症' or row['证据等级']=='B':
clingene1['freq']=snv['Freq'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]
if len((snv['AAChange.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]).split(":")) == 5:
(clingene1['gene'],clingene1['transcript'],clingene1['exon'],clingene1['nacid'],clingene1['aacid'])=(snv['AAChange.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]).split(":")
else:
(clingene1['gene'],clingene1['transcript'],clingene1['exon'],clingene1['nacid'])=(snv['AAChange.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]).split(":")
clingene1['aacid'] = '/'
clingene1['muttype']=snv['ExonicFunc.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]
if re.match("nonsynonymous SNV",clingene1['muttype']):
clingene1['muttype']='错义突变'
elif re.search("^frameshift",clingene1['muttype']):
clingene1['muttype']='移码突变'
elif re.search("^nonframeshift",clingene1['muttype']):
clingene1['muttype']='非移码突变'
elif re.match("stopgain",clingene1['muttype']):
clingene1['muttype']='提前终止'
else:
clingene1['muttype']='/'
context['clingene1'].append(clingene1)
else:
clingene2['freq']=snv['Freq'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]
if len((snv['AAChange.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]).split(":")) == 5:
(clingene2['gene'],clingene2['transcript'],clingene2['exon'],clingene2['nacid'],clingene2['aacid'])=(snv['AAChange.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]).split(":")
else:
(clingene2['gene'],clingene2['transcript'],clingene2['exon'],clingene2['nacid'])=(snv['AAChange.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]).split(":")
clingene2['aacid'] = '/'
clingene2['muttype']=snv['ExonicFunc.refGene'][snv['AAChange.refGene']==mut].reset_index(drop=True)[0]
if re.match("nonsynonymous SNV",clingene2['muttype']):
clingene2['muttype']='错义突变'
elif re.search("^frameshift",clingene2['muttype']):
clingene2['muttype']='移码突变'
elif re.search("^nonframeshift",clingene2['muttype']):
clingene2['muttype']='非移码突变'
elif re.match("stopgain",clingene2['muttype']):
clingene2['muttype']='提前终止'
else:
clingene2['muttype']='/'
context['clingene2'].append(clingene2)
break
context['list3'][gene]="\n".join(rt[gene])
else:
snv_size=0
##target vus and nontarget vus
snv_size_vus = os.path.getsize(snv_file_vus)
if snv_size_vus>0:
snv_vus=pd.read_table(snv_file_vus,sep="\t")
for index,row in snv_vus.iterrows():
if float(row['Freq'].replace('%',''))>=2:
nonclingene={}
if len(row['AAChange.refGene'].split(":")) == 5:
(nonclingene['gene'],nonclingene['transcript'],nonclingene['exon'],nonclingene['nacid'],nonclingene['aacid'])=row['AAChange.refGene'].split(":")
else:
(nonclingene['gene'],nonclingene['transcript'],nonclingene['exon'],nonclingene['nacid'])=row['AAChange.refGene'].split(":")
nonclingene['aacid'] = '/'
nonclingene['freq']=row['Freq']
nonclingene['muttype']=row['ExonicFunc.refGene']
if re.match("nonsynonymous SNV",nonclingene['muttype']):
nonclingene['muttype']='错义突变'
elif re.search("^frameshift",nonclingene['muttype']):
nonclingene['muttype']='移码突变'
elif re.search("^nonframeshift",nonclingene['muttype']):
nonclingene['muttype']='非移码突变'
elif re.match("stopgain",nonclingene['muttype']):
nonclingene['muttype']='提前终止'
else:
nonclingene['muttype']='/'
context['nonclingenes'].append(nonclingene)
##fusion processing
fusion_size = os.path.getsize(fusion_file)
if fusion_size>0:
fusion=pd.read_table(fusion_file,sep="\t")
cols=[index for index,row in fusion[fusion['可信']==0].iterrows()]
fusion.drop(cols,inplace=True)
genes=fusion['Gene_Symbol'].drop_duplicates()
if len(genes):
for gene in genes:
rt={}
rt[gene]=[]
fusions=fusion['FUSION'][fusion['Gene_Symbol']==gene].drop_duplicates()
for mut in fusions:
info2={mut:[]}
for index,row in fusion[fusion['FUSION']==mut].iterrows():
info3={}
info3['drug']=row['药物中文名']
info3['effect']=row['Response_Type_C']
info3['tumor']=row['疾病中文名']
info3['evidence']=row['Evidence_Source_C']
info3['sig']=row['EfficacyEvidence']
info2[mut].append(info3)
##sensitive_resistant_drug
drugs=row['Drug'].replace(" + ",",")
drugs=drugs.split(",")
drugs_chinese=row['药物中文名'].replace(" + ",",")
drugs_chinese=drugs_chinese.split(",")
bool=0
for drug in drugs:
if drug.upper() in drug_disease.keys():
if re.search(r'敏感',row['Response_Type_C']):
sensitive_drug={}
sensitive_drug['name']=drugs_chinese[bool]
sensitive_drug['mechanism']=("\n".join([drug_disease[drug.upper()],drug_mechanism[drug.upper()]])).strip()
if sensitive_drug not in sensitive_resistant_drug[0]['drug']:
sensitive_resistant_drug[0]['drug'].append(sensitive_drug)
elif re.search(r'耐药',row['Response_Type_C']):
resistant_drug={}
resistant_drug['name']=drugs_chinese[bool]
resistant_drug['mechanism']=("\n".join([drug_disease[drug.upper()],drug_mechanism[drug.upper()]])).strip()
if resistant_drug not in sensitive_resistant_drug[1]['drug']:
sensitive_resistant_drug[1]['drug'].append(resistant_drug)
bool+=1
context['list2'][mut]=info2[mut]
info={}
info['gene']=mut
info['p']='融合'
info['freq']="".join([str(fusion['FREQ1'][fusion['FUSION']==mut].reset_index(drop=True)[0]),'%'])
A=[]
B=[]
C=[]
D=[]
for index,row in fusion[(fusion['FUSION']==mut) & (fusion['标签']=='适应症') & fusion['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence='A'
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
A.extend(ds_new)
for index,row in fusion[(fusion['FUSION']==mut) & (fusion['标签']=='非适应症') & fusion['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence='C'
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
B.extend(ds_new)
for index,row in fusion[(fusion['FUSION']==mut) & (fusion['标签']=='.') & fusion['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence=row['证据等级']
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
C.extend(ds_new)
for index,row in fusion[(fusion['FUSION']==mut) & fusion['Response_Type_C'].str.contains("耐药")].iterrows():
evidence=''
if row['标签']=='非适应症':
evidence='C'
else:
evidence=row['证据等级']
ds=row['药物中文名'].split(",")
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
D.extend(ds_new)
A=sorted(set(A),key=A.index)
B=sorted(set(B),key=B.index)
C=sorted(set(C),key=C.index)
D=sorted(set(D),key=D.index)
info['drug_A']="\n".join(A)
info['drug_B']="\n".join(B)
info['drug_C']="\n".join(C)
info['drug_D']="\n".join(D)
# info['drug_A']="\n".join(list(set(fusion['药物中文名'][(fusion['FUSION']==mut) & (fusion['标签']=='适应症') & fusion['Response_Type_C'].str.contains("敏感")].str.cat(sep=",").split(","))))
# info['drug_B']="\n".join(list(set(fusion['药物中文名'][(fusion['FUSION']==mut) & (fusion['标签']=='非适应症') & fusion['Response_Type_C'].str.contains("敏感")].str.cat(sep=",").split(","))))
# info['drug_C']="\n".join(list(set(fusion['药物中文名'][(fusion['FUSION']==mut) & (fusion['标签']=='.') & fusion['Response_Type_C'].str.contains("敏感")].str.cat(sep=",").split(","))))
# info['drug_D']="\n".join(list(set(fusion['药物中文名'][(fusion['FUSION']==mut) & fusion['Response_Type_C'].str.contains("耐药")].str.cat(sep=",").split(","))))
if not info['drug_A']:
info['drug_A']='/'
if not info['drug_B']:
info['drug_B']='/'
if not info['drug_C']:
info['drug_C']='/'
if not info['drug_D']:
info['drug_D']='/'
context['list1'].append(info)
rt[gene].append("".join([info['gene'],'(',info['freq'],')']))
##genefunc
context['genefunc'][mut]=genefunction[gene.upper()]
##clingenes
clingene1={}
clingene2={}
for index,row in fusion[(fusion['FUSION']==mut)].iterrows():
if row['标签']=='适应症' or row['证据等级']=='B':
clingene1['gene']= info['gene']
clingene1['freq']= info['freq']
(clingene1['transcript'],clingene1['exon'],clingene1['nacid'],clingene1['aacid'])=("/","/","/","/")
clingene1['muttype']= '融合'
context['clingene1'].append(clingene1)
else:
clingene2['gene']= info['gene']
clingene2['freq']= info['freq']
(clingene2['transcript'],clingene2['exon'],clingene2['nacid'],clingene2['aacid'])=("/","/","/","/")
clingene2['muttype']= '融合'
context['clingene2'].append(clingene2)
context['list3'][gene]="\n".join(rt[gene])
else:
fusion_size=0
##cnv processing
cnv_size = os.path.getsize(cnv_file)
if cnv_size>0:
cnv=pd.read_table(cnv_file,sep="\t")
cols=[index for index,row in cnv[cnv['可信']==0].iterrows()]
cnv.drop(cols,inplace=True)
genes=cnv['gene'].drop_duplicates()
if len(genes):
for gene in genes:
rt={}
rt[gene]=[]
cnvs=cnv['Gene_Symbol'][cnv['gene']==gene].drop_duplicates()
for mut in cnvs:
info={}
info['gene']=mut
copy=cnv['cn'][cnv['Gene_Symbol']==mut].reset_index(drop=True)[0]
if copy > 2:
info['p']='扩增'
else:
info['p']='缺失'
A=[]
B=[]
C=[]
D=[]
for index,row in cnv[(cnv['Gene_Symbol']==mut) & (cnv['标签']=='适应症') & cnv['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence='A'
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
A.extend(ds_new)
for index,row in cnv[(cnv['Gene_Symbol']==mut) & (cnv['标签']=='非适应症') & cnv['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence='C'
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
B.extend(ds_new)
for index,row in cnv[(cnv['Gene_Symbol']==mut) & (cnv['标签']=='.') & cnv['Response_Type_C'].str.contains("敏感")].iterrows():
ds=row['药物中文名'].split(",")
evidence=row['证据等级']
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
C.extend(ds_new)
for index,row in cnv[(cnv['Gene_Symbol']==mut) & cnv['Response_Type_C'].str.contains("耐药")].iterrows():
evidence=''
if row['标签']=='非适应症':
evidence='C'
else:
evidence=row['证据等级']
ds=row['药物中文名'].split(",")
ds_new=[str(x) + '【' + evidence + ' 级】' for x in ds]
D.extend(ds_new)
A=sorted(set(A),key=A.index)
B=sorted(set(B),key=B.index)
C=sorted(set(C),key=C.index)
D=sorted(set(D),key=D.index)
info['drug_A']="\n".join(A)
info['drug_B']="\n".join(B)
info['drug_C']="\n".join(C)
info['drug_D']="\n".join(D)
info['freq']=" ".join([str(copy),'拷贝'])
# info['drug_A']="\n".join(list(set(cnv['药物中文名'][(cnv['Gene_Symbol']==mut) & (cnv['标签']=='适应症') & cnv['Response_Type_C'].str.contains("敏感")].str.cat(sep=",").split(","))))
# info['drug_B']="\n".join(list(set(cnv['药物中文名'][(cnv['Gene_Symbol']==mut) & (cnv['标签']=='非适应症') & cnv['Response_Type_C'].str.contains("敏感")].str.cat(sep=",").split(","))))
# info['drug_C']="\n".join(list(set(cnv['药物中文名'][(cnv['Gene_Symbol']==mut) & (cnv['标签']=='.') & cnv['Response_Type_C'].str.contains("敏感")].str.cat(sep=",").split(","))))
# info['drug_D']="\n".join(list(set(cnv['药物中文名'][(cnv['Gene_Symbol']==mut) & cnv['Response_Type_C'].str.contains("耐药")].str.cat(sep=",").split(","))))
if not info['drug_A']:
info['drug_A']='/'
if not info['drug_B']:
info['drug_B']='/'
if not info['drug_C']:
info['drug_C']='/'
if not info['drug_D']:
info['drug_D']='/'
context['list1'].append(info)
info2={mut:[]}
for index,row in cnv[cnv['Gene_Symbol']==mut].iterrows():
info3={}
info3['drug']=row['药物中文名']
info3['effect']=row['Response_Type_C']
info3['tumor']=row['疾病中文名']
info3['evidence']=row['Evidence_Source_C']
info3['sig']=row['EfficacyEvidence']
info2[mut].append(info3)
##sensitive_resistant_drug
drugs=row['Drug'].replace(" + ",",")
drugs=drugs.split(",")
drugs_chinese=row['药物中文名'].replace(" + ",",")
drugs_chinese=drugs_chinese.split(",")
bool=0
for drug in drugs:
if drug.upper() in drug_disease.keys():
if re.search(r'敏感',row['Response_Type_C']):
sensitive_drug={}
sensitive_drug['name']=drugs_chinese[bool]
sensitive_drug['mechanism']=("\n".join([drug_disease[drug.upper()],drug_mechanism[drug.upper()]])).strip()
if sensitive_drug not in sensitive_resistant_drug[0]['drug']:
sensitive_resistant_drug[0]['drug'].append(sensitive_drug)
elif re.search(r'耐药',row['Response_Type_C']):
resistant_drug={}
resistant_drug['name']=drugs_chinese[bool]
resistant_drug['mechanism']=("\n".join([drug_disease[drug.upper()],drug_mechanism[drug.upper()]])).strip()
if resistant_drug not in sensitive_resistant_drug[1]['drug']:
sensitive_resistant_drug[1]['drug'].append(resistant_drug)
bool+=1
rt[gene].append("".join([info['p'],'(',info['freq'],')']))
context['list2'][" ".join([mut,info['p']])]=info2[mut]
##genefunc
context['genefunc'][" ".join([mut,info['p']])]=genefunction[mut.upper()]
##clingenes
clingene1={}
clingene2={}
for index,row in cnv[(cnv['Gene_Symbol']==mut)].iterrows():
if row['标签']=='适应症' or row['证据等级']=='B':
clingene1['gene']=info['gene']
clingene1['freq']= info['freq']
(clingene1['transcript'],clingene1['exon'],clingene1['nacid'],clingene1['aacid'])=("/","/","/","/")
clingene1['muttype']= info['p']
context['clingene1'].append(clingene1)
else:
clingene2['gene']=info['gene']
clingene2['freq']= info['freq']
(clingene2['transcript'],clingene2['exon'],clingene2['nacid'],clingene2['aacid'])=("/","/","/","/")
clingene2['muttype']= info['p']
context['clingene2'].append(clingene2)
context['list3'][gene]="\n".join(rt[gene])
else:
cnv_size=0
##msi
if Sample_type == 't':
msi_file=''.join([output_dir,'/MSI/',name,'.msi'])
msi=(open(msi_file,'r').readlines()[1]).split("\t")
context['msi_count']=msi[0]
context['msi_value']=round(float(msi[2].strip())/100,2)
if context['msi_value']>=0.3:
context['msi_result']='MSI-H'
context['msi_predict']='对免疫检查点抑制剂可能敏感'
else:
context['msi_result']='MSS'
context['msi_predict']='对免疫检查点抑制剂可能不敏感'
##MMR processing
mmr_file=''.join([output_dir,'/MMR/',name,"_mmr.txt"])
mmr_size = os.path.getsize(mmr_file)
mmr_result=0
mmr_result_summary=[]
if mmr_size>0:
mmr_fh=open(mmr_file,'r',encoding='utf-8').readlines()
for line in mmr_fh[1:]:
mmr={}
mmr['gene']= line.strip().split("\t")[0]
mmr['transcript'] = line.strip().split("\t")[1]
mmr['nacid'] = line.strip().split("\t")[2]
mmr['aacid'] = line.strip().split("\t")[3]
mmr['muttype'] = line.strip().split("\t")[5]
mmr['freq'] = line.strip().split("\t")[4]
mmr['sig'] = line.strip().split("\t")[6]
if mmr['sig'] != '意义未明突变':
mmr_result+=1
mmr_result_summary.append(mmr['gene'] + ' ' + mmr['aacid'])
context['mmr'].append(mmr)
context['mmr_result']=mmr_result
mmr_result_summary=' | '.join(mmr_result_summary)
if mmr_result_summary:
context['mmr_result_summary']=mmr_result_summary
context['mmr_predict']="对免疫检查点抑制剂可能敏感"
else:
context['mmr_result_summary']="未检测到相关基因突变"
context['mmr_predict']="对免疫检查点抑制剂可能不敏感"
##chemo
chemo_file=''.join([output_dir,'/chemo/',name,".drug.res.txt"])
chemo_fh=open(chemo_file,'r')
bool=0
chemos=chemo_fh.readlines()[1:]
chemo_result=0
recommend_drug=[]
normal_drug=[]
restrict_drug=[]
while bool<len(chemos):
chemo={}
lines=chemos[bool].strip().split("\t")
chemo['bool1']=bool+1
chemo['name1']=lines[0]
chemo['result1']=lines[4]
if lines[3] == '推荐':
# chemo['result1']=RichText('推荐使用', color='00FF00')
chemo_result+=1
recommend_drug.append(chemo['name1'])
elif lines[3] == '常规':
# chemo['result1']=RichText('常规使用')
normal_drug.append(chemo['name1'])
elif lines[3] == '谨慎':
# chemo['result1']=RichText('谨慎使用', color='FF0000')
restrict_drug.append(chemo['name1'])
if bool+1<len(chemos):
lines=chemos[bool+1].strip().split("\t")
chemo['bool2']=bool+2
chemo['name2']=lines[0]
chemo['result2']=lines[4]
if lines[3] == '推荐':
# chemo['result2']=RichText('推荐使用', color='00FF00')
chemo_result+=1
recommend_drug.append(chemo['name2'])
elif lines[3] == '常规':
# chemo['result2']=RichText('常规使用')
normal_drug.append(chemo['name2'])
elif lines[3] == '谨慎':
# chemo['result2']=RichText('谨慎使用', color='FF0000')
restrict_drug.append(chemo['name2'])
# print(chemo)
context['chemo'].append(chemo)
bool+=2
context['chemo_result']=chemo_result
chemo_detail=[]
chemo_detail_file=''.join([output_dir,'/chemo/',name,".drug.infos.txt"])
chemo_data=pd.read_table(chemo_detail_file,sep="\t")
chemo_drugs=chemo_data['药物'].drop_duplicates()
context['recommend_drug']=','.join(recommend_drug)
context['normal_drug']=','.join(normal_drug)
context['restrict_drug']=','.join(restrict_drug)
for drug in chemo_drugs:
c={}
c['drug']=drug
c['info']=[]
for index,row in chemo_data[chemo_data['药物'] == drug].iterrows():
d={}
d['gene']=row['检测基因']
d['site']=row['检测位点']
d['gt']=row['基因型']
d['level']=row['证据等级']
d['sig']=row['用药提示']
c['info'].append(d)
chemo_detail.append(c)
context['chemo_detail']=chemo_detail
context['sensitive_resistant_drug']=sensitive_resistant_drug
##chemotherapy combination regimens
chemo_comb_file=''.join([output_dir,'/chemo/',name,".chemo.comb.txt"])
chemo=pd.read_table(chemo_comb_file,sep="\t")
type=chemo['癌种'].drop_duplicates()
bool=-1
chemo_comb=[]
for i in type:
bool+=1
chemo_comb.append({'type':i,'drug':[]})
for index,row in chemo[chemo['癌种']==i].iterrows():
info={}
info['name']=row['用药方案']
info['abbr']=row['方案缩写']
info['sig']=row['临床提示']
chemo_comb[bool]['drug'].append(info)
context['chemo_comb']=chemo_comb
##hereditary cancer
if os.path.exists(''.join([output_dir,'/hereditary/',name,'.hereditary.pre.txt'])):
context['hereditary_cancer_1']=[]
context['hereditary_cancer_2']=[]
hereditary_file1=''.join([output_dir,'/hereditary/',name,".hereditary.txt"])
hereditary_file2=''.join([output_dir,'/hereditary/',name,".risk.txt"])
hereditary_file1_fh=open(hereditary_file1,'r')
hereditary_file2_fh=open(hereditary_file2,'r')
hereditary_result=0
hereditary_result_summary=[]
hereditary_disease=[]
if os.path.getsize(hereditary_file1)>0:
hereditary_file1_fh=open(hereditary_file1,'r')
for line in hereditary_file1_fh.readlines()[1:]:
hereditary_cancer_1={}
lines=line.strip().split("\t")
hereditary_cancer_1['gene']=lines[0]
hereditary_cancer_1['syndrome']=lines[1]
hereditary_cancer_1['hereditary_type']=lines[2]
hereditary_cancer_1['type']=lines[3]
hereditary_cancer_1['result']=lines[4]
hereditary_result+=len(lines[4].split(";"))
hereditary_result_summary.append(hereditary_cancer_1['gene'] + ' ' + hereditary_cancer_1['result'])
hereditary_disease.append(hereditary_cancer_1['syndrome'])
context['hereditary_cancer_1'].append(hereditary_cancer_1)
context['hereditary_result']=hereditary_result
if hereditary_result_summary:
context['hereditary_disease']=';'.join(hereditary_disease)
context['hereditary_result_summary']=' | '.join(hereditary_result_summary)
else:
context['hereditary_disease']='/'
context['hereditary_result_summary']='未检测到相关基因突变'
hereditary_risk=[]
bool=0
heres=hereditary_file2_fh.readlines()[1:]
while bool<len(heres):
hereditary_cancer_2={}
lines=(heres[bool]).strip().split("\t")
hereditary_cancer_2['type1']=lines[0]
if lines[1] == '偏高':
hereditary_risk.append(lines[0])
hereditary_cancer_2['risk1']=RichText('偏高', color='FF0000')
elif lines[1] == '同一般人群':
hereditary_cancer_2['risk1']=RichText('同一般人群')
if bool+1<len(heres):
lines=heres[bool+1].strip().split("\t")
hereditary_cancer_2['type2']=lines[0]
if lines[1] == '偏高':
hereditary_risk.append(lines[0])
hereditary_cancer_2['risk2']=RichText('偏高', color='FF0000')
elif lines[1] == '同一般人群':
hereditary_cancer_2['risk2']=RichText('同一般人群')
context['hereditary_cancer_2'].append(hereditary_cancer_2)
bool+=2
if hereditary_risk:
context['hereditary_risk']=','.join(hereditary_risk) + '风险可能较高'
else:
context['hereditary_risk']='风险同一般人群'
if len(context['hereditary_cancer_1'])==0:
context['hereditary_cancer_1']=[{'gene':'/','syndrome':'/','hereditary_type':'/','type':'/','result':'/'}]
##clinical drugs with potential benefit
total_drug_count=[]
for i in context['list1']:
if i['drug_A'] != '/':
for j in i['drug_A'].split("\n"):
if j not in total_drug_count:
total_drug_count.append(j)
if i['drug_B'] != '/':
for j in i['drug_B'].split("\n"):
if j not in total_drug_count:
total_drug_count.append(j)
if i['drug_C'] != '/':
for j in i['drug_C'].split("\n"):
if j not in total_drug_count:
total_drug_count.append(j)
context['total_drug_count']=len(total_drug_count)
##de-duplication of clinically significant variants
context['clingenes1']=[]
context['clingenes2']=[]
for i in context['clingene1']:
if i not in context['clingenes1']:
context['clingenes1'].append(i)
for i in context['clingene2']:
if i not in context['clingenes2'] and i not in context['clingenes1']:
context['clingenes2'].append(i)
##detected gene variants
context['total_mut_count']=len(context['clingenes1'])+len(context['clingenes2'])
##FDA/NMPA/NCCN-recommended gene tests for this cancer type
indication_fh=open(indication_file,'r',encoding='utf-8').readlines()
indication_genes=[]
for line in indication_fh[1:]:
indication={}
indication['gene']=line.strip().split("\t")[0]
indication['content']=line.strip().split("\t")[1]
if indication['gene'] in context['list3'].keys():
indication['result']=RichText(context['list3'][indication['gene']], color='FF0000')
else:
indication['result']='未检出变异'
context['indication'].append(indication)
##qc processing
qc_file=''.join([output_dir,'/qc/',name,'_qc.txt'])
qc=pd.read_table(qc_file,sep="\t",header=None,index_col=0,names=['A','B'])
Q30=qc.loc['Q30(%)','B']
if Q30>=85:
Q30_result='合格'
else:
Q30_result='警戒'
depth=qc.loc['mean_depth(dedup)','B']
if Sample_type == 'c':
if depth>=1000:
depth_result='合格'
else:
depth_result='警戒'
elif Sample_type == 't':
if depth>=500:
depth_result='合格'
else:
depth_result='警戒'
uniformity=qc.loc['coverage(>=0.2*meanx)','B']
if uniformity>=90:
uniformity_result='合格'
else:
uniformity_result='警戒'
if Q30_result=='合格' and depth_result=='合格' and uniformity_result=='合格':
context['qc_result']='合格'
else:
context['qc_result']='警戒'
context['Q30']=Q30
context['Q30_result']=Q30_result
context['depth']=depth
context['uniformity']=uniformity
context['depth_result']=depth_result
context['uniformity_result']=uniformity_result
##defaults for negative (empty) modules
##list1
if len(context['list1'])==0:
context['list1']=[{'gene':'/','freq':'/','drug_A':'/','drug_B':'/','drug_C':'/','drug_D':'/'}]
##list2
if len(context['list2'])==0:
context['list2']= {'/':[{'drug':'/','effect':'/','tumor':'/','evidence':'/','sig':'/'}]}
context['genefunc']['/']='/'
##clingenes,nonclingenes
if len(context['clingenes1'])==0:
context['clingenes1']=[{'gene':'/','transcript':'/','nacid':'/','aacid':'/','exon':'/','muttype':'/','freq':'/'}]
if len(context['clingenes2'])==0:
context['clingenes2']=[{'gene':'/','transcript':'/','nacid':'/','aacid':'/','exon':'/','muttype':'/','freq':'/'}]
if len(context['nonclingenes'])==0:
context['nonclingenes']=[{'gene':'/','transcript':'/','nacid':'/','aacid':'/','exon':'/','muttype':'/','freq':'/'}]
##sensitive_resistant_drug
if len(context['sensitive_resistant_drug'][0]['drug'])==0:
context['sensitive_resistant_drug'][0]['drug']=[{'name':'/','mechanism':'/'}]
if len(context['sensitive_resistant_drug'][1]['drug'])==0:
context['sensitive_resistant_drug'][1]['drug']=[{'name':'/','mechanism':'/'}]
##mmr
if len(context['mmr'])==0:
context['mmr']=[{'gene':'/','transcript':'/','nacid':'/','aacid':'/','muttype':'/','freq':'/','sig':'/'}]
##info
post_file=''.join([output_dir,'/qc/',name,'_post.json'])
sex='/'
age='/'
phone='/'
medical_history='/'
family_history='/'
sample_id=name
sample_type='/'
report_date=time.strftime("%Y-%m-%d", time.localtime())
arrival_date=report_date
cancer_type='/'
pathologic_diagnosis='/'
if os.path.isfile(post_file):
# if post_file.exists():
post_fh=open(post_file,'r')
post=json.load(post_fh)
name=post["data"][0]["name"]
sex=post["data"][0]["gender"]
age=post["data"][0]["age"]
medical_history=post["data"][0]["treatHistory"]
family_history=post["data"][0]["sickFamilyHistory"]
sample_id=post["data"][0]["barcode"]
# sample_id_control=post["data"][0]["barcode_N"]
sample_type=post["data"][0]["source"]
# sample_type_control=post["data"][0]["source_N"]
# sample_type_control = post["data"][0].get('source_N', '/')
arrival_date=post["data"][0]["receiveTime"].split(' ')[0]
cancer_type=post["data"][0]["zlType"]
pathologic_diagnosis=post["data"][0]["treatResult"]
context['info']={
'name':name,
'sex':sex,
'age':age,
'phone':phone,
'medical_history':medical_history,
'family_history':family_history,
'sample_id':sample_id,
'sample_type':sample_type,
'report_date':report_date,
'arrival_date':arrival_date,
'cancer_type':cancer_type,
'pathologic_diagnosis':pathologic_diagnosis}
context['report_time']=report_date
##template rendering
file_real = os.path.realpath(sys.argv[0])
Exe_Path = os.path.dirname(file_real)
report_template={'lung85gene':{'t':'lung85-tissue-oem.docx','c':'lung85-blood-oem.docx'},
'crc88gene':{'t':'CRC88-tissue-oem.docx','c':'CRC88-blood-oem.docx'}}
doc_full = os.path.join(Exe_Path, report_template[project][Sample_type])
doc = DocxTemplate(doc_full)
doc.render(context)
doc.save(report_file)
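For reference, the QC gate applied above condensed into one helper (a sketch; thresholds copied from the code: Q30 >= 85, dedup depth >= 1000x for cfDNA 'c' or 500x for tissue 't', coverage at >=0.2x mean depth >= 90%):

def qc_verdict(q30, depth, uniformity, sample_type):
    """Return '合格' (pass) only when every metric clears its threshold, else '警戒' (warning)."""
    depth_cutoff = 1000 if sample_type == 'c' else 500
    passed = q30 >= 85 and depth >= depth_cutoff and uniformity >= 90
    return '合格' if passed else '警戒'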

85
server.py 100644

@ -0,0 +1,85 @@
import json
import os
import socket
import struct
import sys
from datetime import datetime
from main import main
from tools.common import basedir
def recvdata(conn, path):
"""
接受文件
:param conn:
:param path:
:return:
"""
header_size = struct.unpack('i', conn.recv(4))[0]
header_bytes = conn.recv(header_size)
header_json = header_bytes.decode('utf-8')
header_dic = json.loads(header_json)
content_len = header_dic['contentlen']
content_name = header_dic['contentname']
recv_len = 0
filepath = os.path.join(path, '%s_%s' % (datetime.now().strftime("%m%d%H%M"), content_name))
file = open(filepath, 'wb')
while recv_len < content_len:
chunk = conn.recv(1024 * 1000)
file.write(chunk)
recv_len += len(chunk)
file.close()
return filepath
def senddata(conn, path, message=None):
name = os.path.basename(os.path.realpath(path))
if not message:
with open(path, 'rb') as file:
content = file.read()
headerdic = dict(
contentlen=len(content),
contentname=name
)
headerjson = json.dumps(headerdic)
headerbytes = headerjson.encode('utf-8')
headersize = len(headerbytes)
conn.send(struct.pack('i', headersize))
conn.send(headerbytes)
conn.sendall(content)
else:
headerdic = dict(
contentlen=len(path),
contentname='message'
)
headerjson = json.dumps(headerdic)
headerbytes = headerjson.encode('utf-8')
headersize = len(headerbytes)
conn.send(struct.pack('i', headersize))
conn.send(headerbytes)
conn.sendall(path.encode('utf-8'))
def server():
myserver = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
adrss = ("", 8190)
myserver.bind(adrss)
myserver.listen(5)
while True:
try:
myclient, addr = myserver.accept()
recv_content = recvdata(myclient, os.path.join(basedir, 'xlsx'))
outputpath = main(recv_content)
senddata(myclient, outputpath)
print('生成成功')
except Exception as e:
print(e, '有错误')
# continue
if __name__ == '__main__':
if len(sys.argv) > 1:
outputpath = main(sys.argv[1])
else:
server()
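server.py frames every transfer as a 4-byte struct('i') header length, a JSON header carrying contentlen/contentname, then the raw bytes. A client-side sketch of the same framing (port 8190 matches the code above; host and file names are placeholders), uploading a workbook and saving the returned report:

import json
import os
import socket
import struct

def send_file(sock, path):
    with open(path, 'rb') as fh:
        content = fh.read()
    header = json.dumps({'contentlen': len(content),
                         'contentname': os.path.basename(path)}).encode('utf-8')
    sock.send(struct.pack('i', len(header)))
    sock.send(header)
    sock.sendall(content)

def recv_file(sock, outpath):
    header_size = struct.unpack('i', sock.recv(4))[0]
    header = json.loads(sock.recv(header_size).decode('utf-8'))
    received = b''
    while len(received) < header['contentlen']:
        received += sock.recv(1024 * 1000)
    with open(outpath, 'wb') as fh:
        fh.write(received)

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
client.connect(('127.0.0.1', 8190))        # host is a placeholder
send_file(client, 'demo.check_new.xlsx')   # placeholder workbook name
recv_file(client, 'demo_report.docx')
client.close()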

1746
t.json 100644

File diff suppressed because it is too large.

Binary file not shown.

Binary file not shown.


3
tools/common.py 100644

@ -0,0 +1,3 @@
import os
basedir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
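basedir resolves to the project root (the parent of tools/); server.py joins it with 'xlsx' as the drop directory for uploaded workbooks. A quick check, for reference:

import os
from tools.common import basedir

print(basedir)                         # project root, i.e. the directory containing main.py
print(os.path.join(basedir, 'xlsx'))   # upload directory used by server.py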

457
tools/parsexlsx.py 100644

@ -0,0 +1,457 @@
import json
import re
import sys
import time
from collections import defaultdict
import pandas as pd
from tools.readxlsx import read
def tree():
return defaultdict(tree)
class BaseAssignment:
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.result = tree() # report context tree
self.signtb = set() # gene variants with definite or potential clinical significance
self.signdurg = set() # therapies with potential clinical benefit
self.drugs_type = dict()
class Parse(BaseAssignment):
def __init__(self, sampledata, *args, **kwargs):
super().__init__(*args, **kwargs)
self.sampledata = sampledata
def cms(self):
"""
Process the sample information sheet
"""
data = pd.DataFrame(self.sampledata['sample_info'])
if data.empty:
raise UserWarning('sample_info表为空生成报告失败')
data = data.applymap(
lambda x: str(x).replace('.', '/').replace('-', '/').replace('——', '/') if str(x) in ['.', '-', '——'] else x)
data_dict = data.to_dict('index')[0]
data_dict['receiveTime'] = re.split(' ', data_dict['receiveTime'])[0]
data_dict['reportTime'] = time.strftime("%Y-%m-%d", time.localtime())
self.result['c'] = data_dict
def target(self):
data = pd.DataFrame(self.sampledata['snvindel'])
res = []
if data.empty:
self.result['snvindel'] = res
return
data = data[data['Validated'] == 1].reset_index()
data['muttype'] = '/'
data.loc[data['ExonicFunc.refGene'].str.match('nonsynonymous SNV'), 'muttype'] = '错义突变'
data.loc[data['ExonicFunc.refGene'].str.match('^frameshift'), 'muttype'] = '移码突变'
data.loc[data['ExonicFunc.refGene'].str.match('^nonframeshift'), 'muttype'] = '非移码突变'
data.loc[data['ExonicFunc.refGene'].str.match('stopgain'), 'muttype'] = '提前终止'
# 拆分hgvs
data[['gene', 'transcript', 'exon', 'nacid', 'aacid']] = data['AAChange.refGene'].str.split(':', expand=True)
        # fall back to the nucleotide change when there is no amino-acid change
data['aacid'] = data['aacid'].fillna(data['nacid'])
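        # For example, a hypothetical annotation 'EGFR:NM_005228:exon19:c.2235_2249del:p.E746_A750del'
        # would split into gene='EGFR', transcript='NM_005228', exon='exon19',
        # nacid='c.2235_2249del', aacid='p.E746_A750del'.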
for alter, alter_data in data.groupby('AAChange.refGene'):
alter_data_need = alter_data[['gene', 'transcript', 'exon', 'nacid', 'aacid', 'mutant_frequency',
'AMP_mut_level', 'muttype', 'Gene_function']]
alter_res = alter_data_need.iloc[0].to_dict()
alter_res['drug_category'] = self._drug_category(alter_data)
drug_content = alter_data[
['DrugCn', 'Response_Type', 'Indication', 'Evidence_Source', 'Efficacy_Evidence']]
drug_content = drug_content[drug_content['DrugCn'] != '.']
alter_res['drug_content'] = drug_content.reset_index().to_dict('records')
alter_res['alter'] = alter
res.append(alter_res)
            # summary: collect alterations with AMP level I/II as clinically significant
if alter_res['AMP_mut_level'] in ['I', 'II']:
self.signtb.add(alter)
self.result['snvindel'] = res
def fusion(self):
data = pd.DataFrame(self.sampledata['fusion'])
res = []
if data.empty:
self.result['fusion'] = res
return
data = data[data['Validated'] == 1].reset_index()
for alter, alter_data in data.groupby('FUSION'):
alter_data_need = alter_data[['FUSION', 'FREQ1', 'AMP_mut_level', 'Gene_function']]
alter_res = alter_data_need.iloc[0].to_dict()
alter_res['drug_category'] = self._drug_category(alter_data)
drug_content = alter_data[
['DrugCn', 'Response_Type', 'Indication', 'Evidence_Source', 'Efficacy_Evidence']]
drug_content = drug_content[drug_content['DrugCn'] != '.']
alter_res['drug_content'] = drug_content.reset_index().to_dict('records')
alter_res['alter'] = '%s 融合' % (alter_res['FUSION'].replace('-', ':'))
res.append(alter_res)
            # summary: collect alterations with AMP level I/II as clinically significant
if alter_res['AMP_mut_level'] in ['I', 'II']:
self.signtb.add(alter)
self.result['fusion'] = res
def cnv(self):
data = pd.DataFrame(self.sampledata['cnv'])
res = []
if data.empty:
self.result['cnv'] = res
return
data = data[data['Validated'] == 1].reset_index()
for alter, alter_data in data.groupby('Gene_Symbol'):
alter_data_need = alter_data[['Gene_Symbol', 'Copy_number', 'AMP_mut_level', 'Gene_function']].reset_index()
alter_data_need['muttype'] = '缺失'
alter_data_need.loc[alter_data_need['Copy_number'] > 2, 'muttype'] = '扩增'
alter_res = alter_data_need.iloc[0].to_dict()
alter_res['drug_category'] = self._drug_category(alter_data)
drug_content = alter_data[
['DrugCn', 'Response_Type', 'Indication', 'Evidence_Source', 'Efficacy_Evidence']]
drug_content = drug_content[drug_content['DrugCn'] != '.']
alter_res['drug_content'] = drug_content.reset_index().to_dict('records')
alter_res['alter'] = '%s %s' % (alter, alter_res['muttype'])
res.append(alter_res)
            # summary: collect alterations with AMP level I/II as clinically significant
if alter_res['AMP_mut_level'] in ['I', 'II']:
self.signtb.add(alter)
self.result['cnv'] = res
def hotspot(self):
self._to_records('hotspot')
def met(self):
self._to_records('MET')
def longindel(self):
self._to_records('longindel')
def mmr(self):
data = pd.DataFrame(self.sampledata['MMR'])
result_summary = '未检测到相关基因突变'
predict = '对免疫检查点抑制剂可能不敏感'
mmr_num = 0
res = []
if not data.empty:
tmdf = data[['gene', 'p_change']].reset_index()
tmdf['result_summary'] = tmdf.apply(lambda x: '%s %s' % (x['gene'], x['p_change']), axis=1)
result_summary = ' | '.join(tmdf['result_summary'].to_list())
predict = '对免疫检查点抑制剂可能敏感'
mmr_num = len(data.index)
res = data.to_dict('records')
self.result['MMR'] = res
self.result['sum']['mmr'] = dict(
result_summary=result_summary,
predict=predict,
mmr_num=mmr_num
)
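        # e.g. two hypothetical MMR hits 'MLH1 p.V384D' and 'MSH2 p.P622L' would give
        # result_summary='MLH1 p.V384D | MSH2 p.P622L', predict='对免疫检查点抑制剂可能敏感', mmr_num=2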
def msi(self):
self._to_dicts('MSI')
# def chemo(self):
# data = pd.DataFrame(self.sampledata['chemo'])
#
# project = data['project'].to_list()[0]
#
# # 分类汇总 同位点,药物合并 drug.infos.txt
# drugrsid = data[['drugname', 'genename', 'rsid', 'result', 'level', 'tips', 'drugsort']]
# drugrsid = drugrsid.drop_duplicates()
# resdrugrsid = drugrsid.groupby(['drugname', 'genename', 'rsid', 'result', 'level', 'drugsort'])['tips'].agg(
# ','.join).reset_index()
# resdrugrsid.rename(columns=
# {'drugname': '药物', 'genename': '检测基因', 'rsid': '检测位点', 'result': '基因型',
# 'level': '证据等级', 'tips': '用药提示'},
# inplace=True)
# resdrugrsid = resdrugrsid.sort_values(by=['drugsort', '药物', '检测基因'])
# self.result['chemo']['druginfo'] = resdrugrsid.to_dict('records')
#
# # 药物 药物疗效 推荐程度合并 drug.res.txt
# drugtypesum = data[['drugname', 'drugtype', 'rsid', 'weights']]
# drugtypesum = drugtypesum.drop_duplicates()
# drugtyperes = list()
# drugsum = dict()
# for drug, drugdata in drugtypesum.groupby('drugname'):
# tipsnum = drugdata.groupby(['drugtype']).agg({'weights': 'sum'}).to_dict('index')
# sumlist = list()
# if 'LX' in tipsnum:
# LX = tipsnum['LX']['weights']
# if LX > 0:
# lxdes = '疗效较好'
# lxnum = 1
# elif LX == 0:
# lxdes = '疗效一般'
# lxnum = 0
# else:
# lxdes = '疗效较差'
# lxnum = -1
# sumlist.append(lxdes)
# else:
# LX = 0
# lxnum = 0
# if 'DF' in tipsnum:
# DF = tipsnum['DF']['weights']
# if DF > 0:
# dfdes = '毒副较低'
# dfnum = 1
# elif DF == 0:
# dfdes = '毒副一般'
# dfnum = 0
# else:
# dfdes = '毒副较高'
# dfnum = -1
# sumlist.append(dfdes)
# else:
# DF = 0
# dfnum = 0
#
# # 评价方式 疗效 1 0 -1, 毒副 1 0 -1 可形成9宫格
# sumnum = lxnum + dfnum
# if sumnum > 0:
# sumdes = '推荐'
# elif sumnum == 0:
# sumdes = '常规'
# else:
# sumdes = '谨慎'
#
# # 特别药物处理
# if (drug == "氟尿嘧啶" or drug == "卡培他滨") and DF < 0:
# sumdes = '谨慎'
#
# drugtyperes.append(dict(
# 药物名称=drug,
# 疗效=LX,
# 毒副=DF,
# 推荐程度=sumdes,
# 疗效和毒副总结=','.join(sumlist)
# ))
# drugsum[drug] = sumdes
#
# # 报告中展示药物有顺序
# drugsort = data[['drugname', 'drugsort']].drop_duplicates()
# drugsort_dict = drugsort.set_index('drugname')['drugsort'].to_dict()
# drugtyperes_sort = sorted(drugtyperes, key=lambda x: (
# drugsort_dict[x['药物名称']] if x['药物名称'] in drugsort_dict else 100, x['药物名称']))
#
# drugtyperes_sort_df = pd.DataFrame(drugtyperes_sort)
# self.result['chemo']['sum'] = drugtyperes_sort_df.groupby('推荐程度')['药物名称'].apply(','.join).to_dict()
# self.result['chemo']['drugres'] = drugtyperes_sort_df.to_dict('records')
#
# # 联合用药
# drug_combine_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'database',
# 'chemo_drug_combine.csv')
# drug_combine = pd.read_csv(drug_combine_path, sep='\t')
# drug_combine.fillna('.', inplace=True)
# drug_combine_data = drug_combine[drug_combine['source'].str.contains(project)]
# drug_combine_data = drug_combine_data.reset_index()
# if not drug_combine_data.empty:
# drug_combine_data['临床提示'] = drug_combine_data['用药方案'].apply(self._get_drug_plan, args=(drugsum,))
# self.result['chemo']['combine'] = drug_combine_data.groupby('癌种').apply(
# lambda group: group.set_index('癌种').to_dict('records')).to_dict()
# else:
# self.result['chemo']['combine'] = dict()
# self.result['sum']['chemo_drug_num'] = len(drugsum.keys())
def chemo(self):
chemo_res = self._to_records('chemo_res', need=True)
chemo_res_df = pd.DataFrame(chemo_res)
chemo_res_df.index = chemo_res_df.index + 1
chemo_res_df = chemo_res_df.reset_index()
self.result['chemo']['chemo_res'] = chemo_res_df.to_dict('records')
self.result['sum']['chemo']['drug_num'] = len(chemo_res)
self.result['sum']['chemo']['drug_category'] = pd.DataFrame(chemo_res).groupby('推荐程度')['药物名称'].apply(
','.join).to_dict()
chemo_comb = self._to_records('chemo_comb', need=True)
chemo_comb_res = dict()
if chemo_comb:
chemo_comb_res = pd.DataFrame(chemo_comb).groupby('癌种').apply(
lambda group: group.set_index('癌种').to_dict('records')).to_dict()
self.result['chemo']['chemo_comb'] = chemo_comb_res
chemo_info = self._to_records('chemo_info', need=True)
chemo_info_res = dict()
if chemo_info:
chemo_info_res = pd.DataFrame(chemo_info).groupby('药物').apply(
lambda group: group.set_index('药物').to_dict('records')).to_dict()
self.result['chemo']['chemo_info'] = chemo_info_res
def hcs(self):
self._to_records('HCS')
self.result['sum']['hcs']['num'] = len(self.result['HCS'])
def heredity(self):
"""
        Hereditary (germline) tumour results.
:return:
"""
hereditary = pd.DataFrame(self.sampledata['hereditary'])
result = '/'
disease = '/'
risk = '/'
if not hereditary.empty:
result = '|'.join(hereditary.apply(lambda x: '%s %s' % (x['基因'], x['检测结果']), axis=1).to_list())
disease = '|'.join(hereditary['遗传性肿瘤综合征'].to_list())
hereditary_risk = pd.DataFrame(self.sampledata['hereditary_risk'])
if not hereditary_risk.empty:
risk = ','.join(hereditary_risk[hereditary_risk['风险值'] == '偏高']['肿瘤类型'].to_list())
self.result['hereditary'] = hereditary.to_dict('records')
self.result['sum']['hereditary']['result'] = result
self.result['sum']['hereditary']['disease'] = disease
self.result['sum']['hereditary']['risk'] = risk
def qc(self):
# self._to_dicts('qc')
data = pd.DataFrame(self.sampledata['qc'])
res = {}
if not data.empty:
data.rename(columns={
'Q30(%)': 'q30',
'mean_depth(dedup)': 'depth',
'coverage(>=0.2*meanx)': 'coverage'
}, inplace=True)
res = data.to_dict('index')[0]
self.result['qc'] = res
def drugs(self):
data = pd.DataFrame(self.sampledata['drugs'])
res = {}
if not data.empty:
data = data.dropna()
data = data[data['drug_detail'] != '.']
res = data.set_index('drug_name')['drug_detail'].to_dict()
self.result['drugs']['drugs_detail'] = res
def indication(self):
self._to_records('indication')
def _to_records(self, sheetname, need=False):
"""
for many lines
:param sheetname:
:return:
"""
data = pd.DataFrame(self.sampledata[sheetname])
res = []
        if data.empty:
            self.result[sheetname] = res
            # also hand the empty list back to callers that asked for the records
            return res if need else None
res = data.to_dict('records')
if need:
return res
self.result[sheetname] = res
def _to_dicts(self, sheetname):
"""
for single line
:param sheetname:
:return:
"""
data = pd.DataFrame(self.sampledata[sheetname])
res = {}
if data.empty:
self.result[sheetname] = res
return
res = data.to_dict('index')[0]
self.result[sheetname] = res
def _drug_category(self, groupdata):
drug_category_res = dict()
for drug_category, drug_category_alter_data in groupdata.groupby('Drug_Category'):
if drug_category == '.':
continue
            # collect sensitive / potentially sensitive drugs (categories a, b and c)
if drug_category in ['a', 'b', 'c']:
                self.signdrug.update(set(drug_category_alter_data['DrugCn'].str.split(',').explode().tolist()))
            drug_category_alter_data['drugdes'] = drug_category_alter_data.apply(
                lambda x: '%s【%s 级】' % (x['DrugCn'], x['AMP_evidence_level']), axis=1)
drug_category_res[drug_category] = '\n'.join(drug_category_alter_data['drugdes'].to_list())
        # all drug information: split combination entries such as 'A + B' or 'A,B' into individual drugs
        groupdata['list_col'] = groupdata['DrugCn'].str.replace(' + ', '+', regex=False).str.split(r'[+,]')
exploded_df = groupdata.explode('list_col').reset_index()
exploded_df = exploded_df[(exploded_df['list_col'] != '.') & (exploded_df['list_col'] != '')]
exploded_dict = exploded_df.groupby('Response_Type')['list_col'].agg(lambda x: list(set(x))).to_dict()
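        # For example (hypothetical values): DrugCn entries '奥希替尼 + 吉非替尼' and '厄洛替尼' under
        # Response_Type 'Sensitive' become {'Sensitive': ['奥希替尼', '吉非替尼', '厄洛替尼']}
        # (element order may vary because of the set()).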
for drug_type in exploded_dict:
if drug_type in self.drugs_type:
self.drugs_type[drug_type].extend(exploded_dict[drug_type])
else:
self.drugs_type[drug_type] = exploded_dict[drug_type]
# for drugall in exploded_df['Drug_Detail'].to_list():
# for drug in drugall.split('|'):
# match = re.search(r'\[\[(.*?)]](.*?)$', drug)
# if match:
# self.drugs_record['drugs'].update({match.group(1).strip(): match.group(2).strip()})
return drug_category_res
@staticmethod
def _get_drug_plan(x, drugsum):
tlist = x.split('+')
tdeslist = list()
for tdes in tlist:
if tdes.strip() in drugsum:
t1_des = drugsum[tdes.strip()]
tdeslist.append(t1_des)
if '慎用' in tdeslist or '谨慎' in tdeslist:
return '慎用'
elif '推荐' in tdeslist:
return '推荐'
elif '常规' in tdeslist:
return '可选'
else:
return '可选'
def collect(self):
self.cms()
self.target()
self.fusion()
self.cnv()
self.hotspot()
self.met()
self.longindel()
self.mmr()
self.msi()
self.chemo()
self.hcs()
self.heredity()
self.qc()
self.indication()
self.drugs()
        # summary totals
self.result['sum']['signtb_num'] = len(self.signtb)
        self.result['sum']['signdrug_num'] = len(self.signdrug)
self.result['drugs']['drugs_type'] = {key: self.drugs_type[key] for key in sorted(self.drugs_type.keys())}
return self.result
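# Rough shape of the dict returned by Parse.collect() (keys taken from the methods above, values abridged):
#   {'c': {...}, 'snvindel': [...], 'fusion': [...], 'cnv': [...], 'hotspot': [...], 'MET': [...],
#    'longindel': [...], 'MMR': [...], 'MSI': {...},
#    'chemo': {'chemo_res': [...], 'chemo_comb': {...}, 'chemo_info': {...}},
#    'HCS': [...], 'hereditary': [...], 'qc': {...}, 'indication': [...],
#    'drugs': {'drugs_detail': {...}, 'drugs_type': {...}},
#    'sum': {'mmr': {...}, 'chemo': {...}, 'hcs': {...}, 'hereditary': {...},
#            'signtb_num': ..., 'signdrug_num': ...}}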
def run(path):
parse = Parse(read(path))
res = parse.collect()
resjson = json.dumps(res, indent=4, ensure_ascii=False)
    # explicit UTF-8 so the Chinese report text (ensure_ascii=False) is written correctly on any platform
    with open('t.json', 'w', encoding='utf-8') as f:
f.write(resjson)
return resjson
if __name__ == '__main__':
run(sys.argv[1])

27
tools/readxlsx.py 100644
View File

@ -0,0 +1,27 @@
import pandas as pd
import logging
import json
import sys
logger = logging.getLogger('main.sub')
def read(merge):
    df = pd.read_excel(merge, sheet_name=None)  # read every sheet into {sheet_name: DataFrame}
samplelist = df['sample_info']['sampleSn'].to_list()
if not samplelist:
logger.error('sample_info表为空读取excel信息失败')
raise UserWarning('sample_info表为空读取excel信息失败')
samdict = dict()
for name, contents in df.items():
if contents.empty:
samdict[name] = []
continue
contents.fillna('.', inplace=True)
samdict[name] = contents.to_dict('list')
return samdict
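# The returned dict maps every sheet name to a column-oriented dict, e.g. (hypothetical values)
#   {'sample_info': {'sampleSn': ['S001'], 'receiveTime': ['2023-07-01 09:00'], ...},
#    'snvindel': {...}, 'fusion': {...}, ...}
# Empty sheets are stored as [] so that downstream pd.DataFrame(...) calls simply produce empty frames.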
if __name__ == '__main__':
res = read(sys.argv[1])
print(res)

4617
tools/t.json 100644

File diff suppressed because it is too large
