pipeline/script/chemo/test_chemo.py

207 lines
5.9 KiB
Python
Raw Normal View History

2023-08-25 10:06:31 +08:00
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
Created on: 2023-02-17
@author: cjs
# 用途批量查找项目的bam文件批量进行chemo_panel2.py脚本测试
# 版本0.0.1
# 最后编辑日期: 2023-02-17
"""
from cjs_test.cjs_logger import Logger
from multiprocessing import Process, Queue
from glob import glob
import subprocess
import datetime
import json
import traceback
import os
import sys
# 全局参数
Exe_Bin = ''
Exe_Path = ''
GLog = None
Start_Time = None
Bam_Dd = {} # 记录需要处理的bam路径
# 命令参数
Pro_Out = r''
# 固定参数
PS_MAX = 40 # 最大的进程数
def Exit_Print(pline=''):
"""显示错误的信息,退出脚本."""
print('%s -O Pro_Out' % Exe_Bin)
if len(pline) > 0:
print(pline)
if GLog:
GLog.info('exit')
GLog.close()
sys.exit(0)
# 处理运行参数
def Get_Opts():
"""获取运行的环境变量."""
global Exe_Bin
global Exe_Path
global GLog
global Start_Time
global Pro_Out
file_real = os.path.realpath(sys.argv[0])
Exe_Path = os.path.dirname(file_real)
Exe_Bin = os.path.basename(file_real)
Start_Time = datetime.datetime.now()
ymd = Start_Time.__format__('%Y%m%d_%H%M%S')
# 运行参数数量检查
argvs = sys.argv[1:]
argv_lens = len(argvs)
if argv_lens < 2:
err_line = '缺少运行的必要参数'
Exit_Print(pline=err_line)
# 获取参数
Pro_Out = ''
argv_index = 0
for argv in argvs:
argv_index += 1
if argv.startswith('-'):
if argv.upper() == '-O':
if argv_index < argv_lens:
Pro_Out = argvs[argv_index]
else:
err_line = '%s,参数的值无法获取' % argv
Exit_Print(pline=err_line)
else:
err_line = '无法识别的参数:%s' % argv
Exit_Print(pline=err_line)
if len(Pro_Out) == 0:
err_line = '未能获取到正确的芯片号'
Exit_Print(pline=err_line)
# 开启日志
log_path = os.path.join(Exe_Path, 'logs', Exe_Bin)
if not os.path.exists(log_path):
os.makedirs(log_path)
log_base = '%s_%s.log' % (Exe_Bin, ymd)
log_full = os.path.join(log_path, log_base)
GLog = Logger(log_full, mode='w')
GLog.info('start')
return 0
def Get_Bams():
global Bam_Dd
for bam in glob('%s/**/*.rmdup.bam' % Pro_Out, recursive=True):
bam_nor = ""
bam_base = os.path.basename(bam)
bam_dir = os.path.dirname(bam)
pro_dir = os.path.dirname(bam_dir)
# 寻找json
for in_json in glob('%s/*.json' % pro_dir):
with open(in_json, 'r', encoding='utf8') as ff:
load_dict = json.load(ff)
for dkey in load_dict:
if dkey.upper().find("NORMAL") > -1:
bam_nor = load_dict[dkey]
break
if bam_nor != "":
if bam_base.find(bam_nor) > -1: # 只选取normal的bam
if bam_nor not in Bam_Dd:
Bam_Dd[bam_nor] = bam
else:
# 已经有的项目选取bam文件大的处理
old_bam = Bam_Dd[bam_nor]
# print("%s, 项目有重复的bam[%s, %s]" % (bam_nor, old_bam, bam))
old_size = os.path.getsize(old_bam)
new_size = os.path.getsize(bam)
if new_size > old_size:
Bam_Dd[bam_nor] = bam
def py_chemo(df_q):
py_full = os.path.join(Exe_Path, "chemo_panel.py")
while (1):
py_ls = df_q.get()
if py_ls is None:
break
py_normal = py_ls[0]
py_bam = py_ls[1]
py_ourdir = os.path.join(Exe_Path, "cheom_out", py_normal)
if not os.path.exists(py_ourdir):
os.makedirs(py_ourdir)
py_cmd = "%s -o %s -p %s --n %s" % (py_full, py_ourdir, "650gene",
py_normal)
py_cmd += ' --b %s' % py_bam
ps = subprocess.Popen(py_cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
shell=True)
while ps.poll() is None:
std_outs = ps.stdout.readline().decode().strip()
if len(std_outs) > 0:
GLog.info("%s->%s" % (py_normal, std_outs))
std_outs = ps.stdout.readline().decode().strip()
if len(std_outs) > 0:
GLog.info("%s->%s" % (py_normal, std_outs))
def Process_Chemo():
"""多进程跑分析."""
ps_num = PS_MAX
bam_num = len(Bam_Dd)
if PS_MAX > bam_num:
ps_num = bam_num
# 安排好队列
qd = Queue()
for bam_nor in Bam_Dd:
bam = Bam_Dd[bam_nor]
qd.put([bam_nor, bam])
# 新建输出文件夹
cheom_out = os.path.join(Exe_Path, "cheom_out")
if not os.path.exists(cheom_out):
os.makedirs(cheom_out)
ps_ls = []
for i in range(ps_num):
qd.put(None)
chemo_ps = Process(target=py_chemo, args=(qd,), daemon=True)
chemo_ps.start()
ps_ls.append(chemo_ps)
# 检测在运行的进程
temp_num = ps_num
while (1):
ps_runs = 0
for ps in ps_ls:
if ps.is_alive():
ps_runs += 1
if ps_runs < temp_num:
temp_num = ps_runs
print('\r在运行的进程数:%02d' % temp_num, end='')
if ps_runs == 0:
break
print('\n所有分析进程结束')
if __name__ == '__main__':
Get_Opts()
Get_Bams()
print(len(Bam_Dd))
try:
Process_Chemo()
except BaseException:
GLog.error(traceback.format_exc())
print(traceback.format_exc())
endtime = datetime.datetime.now()
GLog.info('end')
GLog.info('run time:%s seconds' % ((endtime - Start_Time).seconds))
GLog.close()