#!/usr/bin/env python3 # -*- coding: UTF-8 -*- """ Created on: 2023-02-10 @author: cjs # 用途:处理放化疗信息,重新构建脚本处理流程 # 版本:1.0.0 # 表格注意事项,杂合子顺序无影响,即AG和GA是一样的 # 表格表头内容和顺序不能变更 # 每个RS信息的第一个信息必须是野生型信息,未检测出的位点都按野生型计算 # 最后编辑日期: 2023-02-10 # 1.0.1 2023-03-10 药物只有一种型(LX、DF)的检测时,只判断一种型 # 1.0.1 2023-03-10 药物的一个rs有多种型的检测,每种型都判读 """ from cjs_test.cjs_logger import Logger from openpyxl import load_workbook from glob import glob import datetime import traceback import os import sys import subprocess # 全局参数 Exe_Bin = '' Exe_Path = '' GLog = None StartTime = None # 命令参数 Output_dir = r'' Chemo_Path = '' Normal = '' Tumor = '' N_Bam = '' Bam_Path = '' Human_ref = '/dataseq/jmdna/database/genome/hg19/hg19.fa' Project = '' Pro_xlsx = '' # 运行变量 Drug_Dd = {} # 记录rs对药物的影响 Drug_Ls = [] # 继续处理药物的顺序 Rs_Dd = {} # 构建bed文件所需的信息 Rs_Gene = {} # rs对应的基因名称 Error_Ls = ['Error_Type\tError_info\n'] # 固定参数 MIN_DEPTH = 30 # 检出的最低深度,低于这个不算检出 MIN_RATE = 0.1 # 杂合的最低频率,低于这个补算杂合突变 RS_BED = 'rs.hg19.bed' # 表头信息可以左右平移,但是内容和顺序不能变动 HEAD_S = '基因名称' # 需要处理列的开始 HEAD_E = '证据等级' # 需要处理列的结束 SNP_TYPES = ['LX', 'DF'] # 只解析的类型 # 可以运行的项目字典信息 PROS_DD = {'160gene': '160.化疗位点注释及指导说明2023.02.02', '624gene': '624.化疗位点注释及指导说明2023.03.15.xlsx', '650gene': '650.panel化疗位点注释及指导说明.庞杰.20230403.xlsx'} def Exit_Print(pline=''): """显示错误的信息,退出脚本.""" print("Usage:") eline = "%s " % Exe_Bin eline += '-o Output_dir -p Project ' eline += '[--n normal --t tumor --r Human_ref --b bam]' print(eline) if len(pline) > 0: print(pline) if GLog: GLog.info('exit') GLog.close() sys.exit(0) # 处理运行参数 def Get_Opts(): """获取运行的环境变量.""" global Output_dir global Chemo_Path global Normal global Tumor global N_Bam global Bam_Path global Human_ref global Project global Pro_xlsx global Exe_Bin global Exe_Path global GLog global StartTime argvs = sys.argv file_real = os.path.realpath(argvs[0]) Exe_Path = os.path.dirname(file_real) Exe_Bin = os.path.basename(file_real) StartTime = datetime.datetime.now() ymd = StartTime.__format__('%Y%m%d_%H%M%S') pline = '' opt_normal = '' opt_tumor = '' opt_ref = '' N_Bam = '' if len(argvs) < 5: pline = '参数列表数量不对, %s' % argvs Exit_Print(pline=pline) else: # 参数解析 argv_index = 1 for argv in argvs[1:]: argv_index += 1 argv = argv.upper() # 必选参数获取 if argv == '-O': Output_dir = os.path.realpath(argvs[argv_index]) # 新建日志 if not os.path.exists(Output_dir): pline = '项目路径不存在: %s' % Output_dir Exit_Print(pline=pline) Chemo_Path = os.path.join(Output_dir, 'chemo') if not os.path.exists(Chemo_Path): os.makedirs(Chemo_Path) elif argv == '-P': Project = argvs[argv_index] if Project in PROS_DD: Pro_xlsx = os.path.join(Exe_Path, PROS_DD[Project]) else: pline = '无法分析的项目类型:%s' % Project Exit_Print(pline=pline) # 可选参数获得 elif argv == '--N': opt_normal = argvs[argv_index] elif argv == '--T': opt_tumor = argvs[argv_index] elif argv == '--R': opt_ref = argvs[argv_index] elif argv == '--B': N_Bam = argvs[argv_index] # 核对参数解析结果 if Output_dir == '': pline = '未指定项目路径' Exit_Print(pline=pline) # 解析noraml Bam_Path = os.path.join(Output_dir, 'alignment') bams = [] if N_Bam == '': if opt_normal == '': bams = glob('%s/*rmdup.bam' % (Bam_Path)) # 针对umi的项目,再寻找一次 if len(bams) == 0: bams = glob('%s/*consensus.mapped.bam' % (Bam_Path)) else: bams = glob('%s/*%s*rmdup.bam' % (Bam_Path, opt_normal)) if len(bams) == 0: bams = glob('%s/*%s*consensus.mapped.bam' % (Bam_Path, opt_normal)) if len(bams) == 0: if opt_normal == '': pline = '项目路径未发现rmdup.bam文件' else: pline = '项目路径未发现名称为:%s的相关rmdup.bam文件' % opt_normal Exit_Print(pline=pline) else: bams = [N_Bam] print('项目中的bams:') bam_size = 0 for bam in bams: temp_base = os.path.basename(bam) temp_size = os.path.getsize(bam) / 1024 / 1024 / 1024 print('bam_base:%s, bam_size:%0.2fGB' % (temp_base, temp_size)) if N_Bam == '': N_Bam = bam bam_size = temp_size else: if temp_size < bam_size: bam_size = temp_size N_Bam = bam # 选取文件大小最小的文件作为正常对照 bam_base = os.path.basename(N_Bam) print('使用最小的bam文件:%s' % bam_base) bam_name = bam_base.split('.')[0] Normal = bam_name # 输出文件tumor解析 if opt_tumor == '': Tumor = Normal else: # 如果有normal,变更为normal名称 # Tumor = opt_tumor if Normal != '': Tumor = Normal else: Tumor = opt_tumor if opt_ref == '': print('参数中未解析出--R的参考基因组,使用默认文件:%s' % Human_ref) else: Human_ref = opt_ref # 日志 log_base = '%s_%s.log' % (Normal, ymd) log_full = os.path.join(Chemo_Path, log_base) GLog = Logger(log_full, mode='w') GLog.info('argvs, %s' % ' '.join(argvs)) GLog.info('Normal:%s, Tumor:%s' % (Normal, Tumor)) GLog.info('start') def Check_Rs_Type(r_drug, r_rs, r_type, d_type): """检查杂合子,让杂合子对调后都有.""" global Drug_Dd d_value = Drug_Dd[r_drug][r_rs][r_type][d_type] n_type = '' infos = [] if r_type.find(r'/') > -1: infos = r_type.split(r'/') infos.insert(1, r'/') else: infos = list(r_type) for chr_str in infos: n_type = chr_str + n_type if n_type != r_type: if n_type not in Drug_Dd[r_drug][r_rs]: Drug_Dd[r_drug][r_rs][n_type] = {} # Drug_Dd[snp_drug][snp_rs].append(row_lines) Drug_Dd[r_drug][r_rs][n_type][d_type] = d_value def Check_Rs(): """检查项目所需的rs信息.""" global Rs_Dd global Drug_Ls global Drug_Dd global Rs_Gene bed_txt = os.path.join(Exe_Path, RS_BED) with open(bed_txt, 'r', encoding='utf8') as ff: for line in ff: line = line.replace('\r', '') line = line.replace('\n', '') if len(line) > 0 and not line.startswith('#'): lns = line.split('\t') rs_name = lns[3] if rs_name not in Rs_Dd: Rs_Dd[rs_name] = lns else: pline = '%s, rs有重复信息' % rs_name pline += '%s %s' % (Rs_Dd[rs_name], lns) Exit_Print(pline=pline) wb = load_workbook(Pro_xlsx, read_only=True, data_only=False) wb_sheets = wb.sheetnames sheet = wb_sheets[0] ws1 = wb[sheet] sheet_rows = [row for row in ws1.rows] # 获取所有行 # 表头处理 head_row = sheet_rows[0] head_lines = [] for cell in head_row: cell_str = str(cell.value) head_lines.append(cell_str) # print(head_lines) head_start = head_lines.index(HEAD_S) head_end = head_lines.index(HEAD_E) end_pos = 0 # 生成bed文件所需要的信息 bed_rss = [] bed_lines = [] error_rss = [] for row in sheet_rows[1:]: if end_pos == 1: break row_lines = [] for col_index in range(head_start, head_end + 1): col_str = str(row[col_index].value) if col_str == 'None': end_pos = 1 break row_lines.append(col_str) # 用药信息的字典构建 snp_rs = row_lines[1] snp_drug = row_lines[5] snp_gene = row_lines[0] if snp_rs in Rs_Dd: Rs_Gene[snp_rs] = snp_gene snp_infos = Rs_Dd[snp_rs] snp_chr = snp_infos[0] snp_start = snp_infos[1] snp_end = snp_infos[2] drug_type = row_lines[6].upper() # snp_type = row_lines[2].upper() # rs分型要不要统一大写,根据实际情况而定 snp_type = row_lines[2] snp_score = int(row_lines[3]) snp_res = row_lines[7] snp_level = row_lines[8] if snp_drug not in Drug_Dd: Drug_Dd[snp_drug] = {} Drug_Ls.append(snp_drug) if snp_rs not in Drug_Dd[snp_drug]: Drug_Dd[snp_drug][snp_rs] = {} if snp_type not in Drug_Dd[snp_drug][snp_rs]: Drug_Dd[snp_drug][snp_rs][snp_type] = {} # Drug_Dd[snp_drug][snp_rs].append(row_lines) Drug_Dd[snp_drug][snp_rs][snp_type][drug_type] = [snp_score, snp_res, snp_level] Check_Rs_Type(snp_drug, snp_rs, snp_type, drug_type) if snp_rs not in bed_rss: bed_rss.append(snp_rs) snp_line = '\t'.join([snp_chr, snp_start, snp_end, snp_rs]) bed_lines.append(snp_line + '\n') else: if snp_rs not in error_rss: error_rss.append(snp_rs) wb.close() if len(error_rss) > 0: pline = '%s, 在%s文件中没有信息' % ('\t'.join(error_rss), RS_BED) Exit_Print(pline=pline) # 写项目的bed文件 bed_txt = '%s.bed' % Normal bed_full = os.path.join(Chemo_Path, bed_txt) with open(bed_full, 'w', encoding='utf-8') as ff: ff.writelines(bed_lines) return bed_full def Samtools_mpileup(df_bed): """根据bed文件,分析文件中指定位置的突变信息,生成mpileup.""" bam_mpileup = '%s.chemo.mpileup' % Normal bam_mpileup_full = os.path.join(Chemo_Path, bam_mpileup) sam_cmd = 'samtools mpileup -Bq 20 -Q 20 -a -f %s -l %s %s -o %s' % ( Human_ref, df_bed, N_Bam, bam_mpileup_full) GLog.info('sam_cmd:%s' % sam_cmd) print('Samtools_mpileup:%s' % bam_mpileup_full) ps = subprocess.Popen(sam_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) read_bytes = b'' out_strs = '' while ps.poll() is None: out_std_b = ps.stdout.read(1) if out_std_b == b'\r' or out_std_b == b'\n': out_strs = read_bytes.decode().strip() if len(out_strs) > 0: GLog.info(out_strs) return 0 def VarScan_mpileup2cns(): """mpileup-->VCF.""" env_dist = os.environ # varscan 优先使用环境变量的版本 varscan_exe = env_dist.get('VARSCAN') if varscan_exe == '': varscan_exe = '/dataseq/jmdna/software/VarScan.v2.4.2.jar' bam_mpileup = '%s.chemo.mpileup' % Normal bam_mpileup_full = os.path.join(Chemo_Path, bam_mpileup) out_base = '%s.chemo.vcf' % Normal varscan_out = os.path.join(Chemo_Path, out_base) varscan_cmd = 'java -jar %s ' % varscan_exe varscan_cmd += 'mpileup2cns %s ' % bam_mpileup_full varscan_cmd += '--min-var-freq 0.1 --min-freq-for-hom 0.9 ' varscan_cmd += '--strand-filter 1 ' varscan_cmd += '--output-vcf 1 >%s' % varscan_out GLog.info('varscan_cmd:%s' % varscan_cmd) print('VarScan_mpileup2cns:%s' % varscan_out) ps = subprocess.Popen(varscan_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) read_bytes = b'' out_strs = '' while ps.poll() is None: out_std_b = ps.stdout.read(1) if out_std_b == b'\r' or out_std_b == b'\n': out_strs = read_bytes.decode().strip() if len(out_strs) > 0: GLog.info(out_strs) def Get_Drugs(): """处理VCF文件,核对结果,解析药物结果.""" # 生成vcf字典 vcf_dd = {} vcf_base = '%s.chemo.vcf' % Normal vcf_full = os.path.join(Chemo_Path, vcf_base) with open(vcf_full, 'r', encoding='utf-8') as ff: for line in ff: line = line.strip() if len(line) > 0 and not line.startswith('#'): lns = line.split('\t') snp_chr = lns[0] snp_end = lns[1] if snp_chr not in vcf_dd: vcf_dd[snp_chr] = {} vcf_dd[snp_chr][snp_end] = lns # 处理药物的信息 txt_head = '药物\t检测基因\t检测位点\t基因型\t证据等级\t用药提示\n' txt_res = [txt_head] drug_rd = {} err_rss = [] # 记录无法解析的rs信息 for drug in Drug_Ls: for snp_rs in Drug_Dd[drug]: snp_infos = Rs_Dd[snp_rs] snp_gene = Rs_Gene[snp_rs] snp_chr = snp_infos[0] snp_end = snp_infos[2] snp_ref = snp_infos[4] snp_alt = snp_infos[5] # 解析VCF结果 vcf_infos = vcf_dd.get(snp_chr).get(snp_end) vcf_mut_infos = vcf_infos[-1].split(':') vcf_alt = vcf_infos[3] vcf_mut = vcf_infos[4] vcf_rate = vcf_mut_infos[0] vcf_dep = 0 vcf_fre = 0 vcf_type = r'' if len(vcf_mut_infos) > 10: vcf_dep = int(vcf_mut_infos[1]) vcf_fre = float(vcf_mut_infos[6].replace(r'%', '')) / 100 if vcf_dep < MIN_DEPTH: vcf_type = r'/' # 判断VCF的检测结果 mut_len = len(vcf_mut) alt_len = len(vcf_alt) if vcf_type != r'/': # 位点正常检出 vcf_del = '' vcf_ins = '' if mut_len == 1 and alt_len == 1: # 单碱基处理 if vcf_mut == '.': # 野生型 vcf_type = '%s%s' % (vcf_alt, vcf_alt) else: if vcf_rate == r'1/1': # 纯合突变 vcf_type = '%s%s' % (vcf_mut, vcf_mut) elif vcf_rate == r'0/1': # 杂合突变 if vcf_fre < MIN_RATE: # 低频的杂合不算杂合 vcf_type = '%s%s' % (vcf_alt, vcf_alt) else: vcf_type = '%s%s' % (vcf_alt, vcf_mut) snp_alt = snp_alt.split(',')[0] # 去除单碱基多种突变类型当作ins的可能 # 针对del型的snp,野生型单独判断 if len(snp_ref) > len(snp_alt): snp_wide = snp_ref.replace(snp_alt, '', 1) if len(snp_wide) == 1: vcf_type = '%s%s' % (snp_wide, snp_wide) else: vcf_type = '%s/%s' % (snp_wide, snp_wide) # 针对ins型的snp,野生型单独判断 elif len(snp_ref) < len(snp_alt): vcf_type = '%s%s' % (snp_ref, snp_ref) else: # del型TC->T=C/DEL if alt_len > mut_len: vcf_del = vcf_alt.replace(vcf_mut, '', 1) if vcf_rate == r'1/1': # 纯合突变 vcf_type = "del/del" elif vcf_rate == r'0/1': # 杂合突变 if vcf_fre < MIN_RATE: # 低频的杂合不算杂合 vcf_type = "%s/%s" % (snp_ref, snp_ref) else: vcf_type = vcf_del + r'/del' # Ins型 elif alt_len < mut_len: vcf_ins = vcf_mut.replace(vcf_alt, '', 1) if vcf_rate == r'1/1': # 纯合突变 vcf_type = "ins%s/ins%s" % (vcf_ins, vcf_ins) elif vcf_rate == r'0/1': # 杂合突变 if vcf_fre < MIN_RATE: # 低频的杂合不算杂合 vcf_type = "%s/%s" % (snp_ref, snp_ref) else: vcf_type = "%s/ins%s" % (vcf_alt, vcf_ins) GLog.info('RS:%s, VCF:%s, VCF_Fre:%s' % ( snp_rs, vcf_type, vcf_fre)) # 个性化定制方案,针对多碱基突变的情况,每个rs号单独写 if snp_rs == 'rs3064744': # 只考虑3种型,其它默认野生型 # vcf_type:"C/insAT",vcf_ins:AT if len(vcf_ins) == 2: # 只考虑增加一个TA的情况 if vcf_rate == r'0/1': if vcf_fre < MIN_RATE: # 低频的杂合不算杂合 vcf_type = "TA[6]/TA[6]" else: vcf_type = "TA[6]/TA[7]" elif vcf_rate == r'1/1': vcf_type = "TA[7]/TA[7]" else: vcf_type = "TA[6]/TA[6]" else: # 野生型 vcf_type = "TA[6]/TA[6]" GLog.info('RS:%s, VCF:%s, VCF_Fre:%s' % ( snp_rs, vcf_type, vcf_fre)) # 650基因 elif snp_rs == 'rs45445694': # (CCGCGCCACTTGGCCTGCCTCCGTCCCG)3/(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2 if vcf_alt == 'A' and vcf_mut == '.': # 未突变 vcf_alt = '(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2' vcf_mut = '(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2' # 缺失的强制变更为野生型 elif vcf_alt == 'ACCGCGCCACTTGGCCTGCCTCCGTCCCG': if vcf_mut == 'A': vcf_alt = '(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2' vcf_mut = '(CCGCGCCACTTGGCCTGCCTCCGTCCCG)2' vcf_type = '%s/%s' % (vcf_alt, vcf_mut) # 判断有无检出 snp_res = '' drug_type_key = '' snp_score = '' if vcf_type != r'/': # 获取用药指导 if vcf_type not in Drug_Dd[drug][snp_rs]: if snp_rs not in err_rss: err_rss.append(snp_rs) eline = 'VCF结果无法解析\t%s,RS:%s' % (vcf_type, snp_rs) Error_Ls.append(eline + '\n') else: drug_type_dd = Drug_Dd[drug][snp_rs][vcf_type] drug_type_keys = list(drug_type_dd.keys()) for drug_type_key in drug_type_keys: drug_infos = drug_type_dd[drug_type_key] snp_score = drug_infos[0] snp_res = drug_infos[1] snp_level = drug_infos[-1] snp_line = '\t'.join( [drug, snp_gene, snp_rs, vcf_type, snp_level, snp_res]) txt_res.append(snp_line + '\n') # 药物的LX和DF结果汇总 if drug not in drug_rd: drug_rd[drug] = [] # 收集药物的结果信息 # '环磷酰胺': ['LX:1:疗效较好', 'LX:0:疗效居中', 'LX:-1:疗效较差'] drug_value = '%s:%s:%s' % (drug_type_key, snp_score, snp_res) drug_rd[drug].append(drug_value) else: # 未检出的也保留信息到结果文件,不是错误文件 fs_type = list(Drug_Dd[drug][snp_rs].keys())[0] Drug_Fsd = Drug_Dd[drug][snp_rs][fs_type] for fs_key in Drug_Fsd: drug_infos = Drug_Fsd[fs_key] snp_level = drug_infos[2] snp_res = r'%s,无法判读' % fs_key snp_line = '\t'.join( [drug, snp_gene, snp_rs, vcf_type, snp_level, snp_res]) txt_res.append(snp_line + '\n') # 药物的LX和DF结果汇总 if drug not in drug_rd: drug_rd[drug] = [] if len(Error_Ls) > 1: # print(Error_Ls) error_base = '%s.drug.erros.txt' % Tumor error_full = os.path.join(Chemo_Path, error_base) with open(error_full, 'w', encoding='utf8') as ff: ff.writelines(Error_Ls) # 生成文件 drug_base = '%s.drug.infos.txt' % Tumor drug_full = os.path.join(Chemo_Path, drug_base) with open(drug_full, 'w', encoding='utf8') as ff: # 把相同rs的结果汇总,例如一个rs即知道疗效又指导毒副,那就写一行 temp_drug = '' temp_gene = '' temp_rs = '' temp_line = '' line = '' for line in txt_res: lns = line.split('\t') line_drug = lns[0] line_gene = lns[1] line_rs = lns[2] if line_drug == temp_drug and \ line_gene == temp_gene and line_rs == temp_rs: temp_line = temp_line.strip() + ',' + lns[-1] else: temp_drug = line_drug temp_gene = line_gene temp_rs = line_rs ff.write(temp_line) temp_line = line ff.write(temp_line) # 解析是否推荐 head = '药物名称\t疗效\t毒副\t推荐程度\t疗效和毒副总结\n' drug_lines = [head] drug_base = '%s.drug.res.txt' % Tumor drug_full = os.path.join(Chemo_Path, drug_base) # print(drug_rd['米托坦']) for drug in Drug_Ls: drug_lx = 0 lx_find = 0 drug_dx = 0 dx_find = 0 drug_res = '' drug_infos = '' snp_res_ls = drug_rd[drug] if len(snp_res_ls) == 0: drug_line = '%s\t/\t/\t/\t/\n' % (drug) drug_lines.append(drug_line) continue for snp_res in snp_res_ls: snp_ress = snp_res.split(':') # 检出 if len(snp_ress) > 2: drug_type = snp_ress[0] drug_score = snp_ress[1] if drug_type.upper() == 'LX': drug_lx += int(drug_score) lx_find = 1 elif drug_type.upper() == 'DF': drug_dx += int(drug_score) dx_find = 1 # 疗效和毒副汇总评估 if drug_lx > 0: drug_infos = '疗效较好' elif drug_lx == 0 and lx_find == 1: drug_infos = '疗效一般' elif drug_lx < 0: drug_infos = '疗效较差' if drug_dx > 0: drug_infos += ',毒副较低' elif drug_dx == 0 and dx_find == 1: drug_infos += ',毒副一般' elif drug_dx < 0: drug_infos += ',毒副较高' if drug_infos.startswith(','): drug_infos = drug_infos.replace(',', '', 1) # print(drug, drug_lx, lx_find, drug_infos) # 最终用药判断 if drug_lx > 0: if drug_dx >= 0: drug_res = '推荐' else: drug_res = '常规' elif drug_lx == 0: if drug_dx > 0: drug_res = '推荐' elif drug_dx == 0: drug_res = '常规' else: drug_res = '谨慎' else: if drug_dx > 0: drug_res = '常规' else: drug_res = '谨慎' # 个性化判断的药物再单独处理一下 # 氟尿嘧啶、卡培他滨2种药疗效较好但毒副较高的结论为谨慎使用 if drug_dx < 0: if drug == "氟尿嘧啶" or drug == "卡培他滨": drug_res = '谨慎' # 只做单项药物,注释掉没有做的项目,不再默认0,用斜号标识 # if lx_find == 0: # drug_lx = r'/' # if dx_find == 0: # drug_dx = r'/' drug_line = '%s\t%s\t%s\t%s\t%s\n' % ( drug, drug_lx, drug_dx, drug_res, drug_infos) drug_lines.append(drug_line) with open(drug_full, 'w', encoding='utf8') as ff: ff.writelines(drug_lines) if __name__ == '__main__': Get_Opts() try: pass bed = Check_Rs() Samtools_mpileup(bed) VarScan_mpileup2cns() Get_Drugs() except BaseException: GLog.error(traceback.format_exc()) print(traceback.format_exc()) endtime = datetime.datetime.now() GLog.info('end') GLog.info('run time:%s seconds' % ((endtime - StartTime).seconds)) GLog.close()