#!/usr/bin/env python3 # -*- coding: UTF-8 -*- """ Created on: 2023-02-17 @author: cjs # 用途:批量查找项目的bam文件,批量进行chemo_panel2.py脚本测试 # 版本:0.0.1 # 最后编辑日期: 2023-02-17 """ from cjs_test.cjs_logger import Logger from multiprocessing import Process, Queue from glob import glob import subprocess import datetime import json import traceback import os import sys # 全局参数 Exe_Bin = '' Exe_Path = '' GLog = None Start_Time = None Bam_Dd = {} # 记录需要处理的bam路径 # 命令参数 Pro_Out = r'' # 固定参数 PS_MAX = 40 # 最大的进程数 def Exit_Print(pline=''): """显示错误的信息,退出脚本.""" print('%s -O Pro_Out' % Exe_Bin) if len(pline) > 0: print(pline) if GLog: GLog.info('exit') GLog.close() sys.exit(0) # 处理运行参数 def Get_Opts(): """获取运行的环境变量.""" global Exe_Bin global Exe_Path global GLog global Start_Time global Pro_Out file_real = os.path.realpath(sys.argv[0]) Exe_Path = os.path.dirname(file_real) Exe_Bin = os.path.basename(file_real) Start_Time = datetime.datetime.now() ymd = Start_Time.__format__('%Y%m%d_%H%M%S') # 运行参数数量检查 argvs = sys.argv[1:] argv_lens = len(argvs) if argv_lens < 2: err_line = '缺少运行的必要参数' Exit_Print(pline=err_line) # 获取参数 Pro_Out = '' argv_index = 0 for argv in argvs: argv_index += 1 if argv.startswith('-'): if argv.upper() == '-O': if argv_index < argv_lens: Pro_Out = argvs[argv_index] else: err_line = '%s,参数的值无法获取' % argv Exit_Print(pline=err_line) else: err_line = '无法识别的参数:%s' % argv Exit_Print(pline=err_line) if len(Pro_Out) == 0: err_line = '未能获取到正确的芯片号' Exit_Print(pline=err_line) # 开启日志 log_path = os.path.join(Exe_Path, 'logs', Exe_Bin) if not os.path.exists(log_path): os.makedirs(log_path) log_base = '%s_%s.log' % (Exe_Bin, ymd) log_full = os.path.join(log_path, log_base) GLog = Logger(log_full, mode='w') GLog.info('start') return 0 def Get_Bams(): global Bam_Dd for bam in glob('%s/**/*.rmdup.bam' % Pro_Out, recursive=True): bam_nor = "" bam_base = os.path.basename(bam) bam_dir = os.path.dirname(bam) pro_dir = os.path.dirname(bam_dir) # 寻找json for in_json in glob('%s/*.json' % pro_dir): with open(in_json, 'r', encoding='utf8') as ff: load_dict = json.load(ff) for dkey in load_dict: if dkey.upper().find("NORMAL") > -1: bam_nor = load_dict[dkey] break if bam_nor != "": if bam_base.find(bam_nor) > -1: # 只选取normal的bam if bam_nor not in Bam_Dd: Bam_Dd[bam_nor] = bam else: # 已经有的项目,选取bam文件大的处理 old_bam = Bam_Dd[bam_nor] # print("%s, 项目有重复的bam[%s, %s]" % (bam_nor, old_bam, bam)) old_size = os.path.getsize(old_bam) new_size = os.path.getsize(bam) if new_size > old_size: Bam_Dd[bam_nor] = bam def py_chemo(df_q): py_full = os.path.join(Exe_Path, "chemo_panel.py") while (1): py_ls = df_q.get() if py_ls is None: break py_normal = py_ls[0] py_bam = py_ls[1] py_ourdir = os.path.join(Exe_Path, "cheom_out", py_normal) if not os.path.exists(py_ourdir): os.makedirs(py_ourdir) py_cmd = "%s -o %s -p %s --n %s" % (py_full, py_ourdir, "650gene", py_normal) py_cmd += ' --b %s' % py_bam ps = subprocess.Popen(py_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) while ps.poll() is None: std_outs = ps.stdout.readline().decode().strip() if len(std_outs) > 0: GLog.info("%s->%s" % (py_normal, std_outs)) std_outs = ps.stdout.readline().decode().strip() if len(std_outs) > 0: GLog.info("%s->%s" % (py_normal, std_outs)) def Process_Chemo(): """多进程跑分析.""" ps_num = PS_MAX bam_num = len(Bam_Dd) if PS_MAX > bam_num: ps_num = bam_num # 安排好队列 qd = Queue() for bam_nor in Bam_Dd: bam = Bam_Dd[bam_nor] qd.put([bam_nor, bam]) # 新建输出文件夹 cheom_out = os.path.join(Exe_Path, "cheom_out") if not os.path.exists(cheom_out): os.makedirs(cheom_out) ps_ls = [] for i in range(ps_num): qd.put(None) chemo_ps = Process(target=py_chemo, args=(qd,), daemon=True) chemo_ps.start() ps_ls.append(chemo_ps) # 检测在运行的进程 temp_num = ps_num while (1): ps_runs = 0 for ps in ps_ls: if ps.is_alive(): ps_runs += 1 if ps_runs < temp_num: temp_num = ps_runs print('\r在运行的进程数:%02d' % temp_num, end='') if ps_runs == 0: break print('\n所有分析进程结束') if __name__ == '__main__': Get_Opts() Get_Bams() print(len(Bam_Dd)) try: Process_Chemo() except BaseException: GLog.error(traceback.format_exc()) print(traceback.format_exc()) endtime = datetime.datetime.now() GLog.info('end') GLog.info('run time:%s seconds' % ((endtime - Start_Time).seconds)) GLog.close()