345 lines
14 KiB
Python
345 lines
14 KiB
Python
import copy
|
||
import os
|
||
import time
|
||
from collections import defaultdict
|
||
from datetime import datetime
|
||
|
||
import pandas as pd
|
||
|
||
from tools.common import basedir, log
|
||
|
||
|
||
class AutoLayout:
    """
    Automated sample layout: distribute sequencing libraries onto chips.

    Libraries are read from the "未测" sheet of the input workbook, ordered by
    priority level and date, and placed chip by chip. A library is accepted on
    the current chip only when every rule in ``judge_data`` passes; when no
    pending library fits, the next chip is opened.

    Sizes are sums of the ``data_needed`` column (the original comments refer
    to them in "G").
    """

    # Classifications whose combined size per chip is tracked and capped
    # separately from the overall chip size.
    SPECIAL_CLASSIFICATIONS = ('扩增子', '不平衡文库', '单细胞文库', '甲基化')
    # Maximum combined size of special classifications on one chip.
    # NOTE(review): the original comment said 200 while the code used 250;
    # the code value is kept — confirm which one is intended.
    SPECIAL_LIB_LIMIT = 250
    # Chip size above which the per-position base balance is enforced.
    # NOTE(review): the original comment said 1200 while the code used 800;
    # the code value is kept — confirm which one is intended.
    BALANCE_CHECK_MIN_SIZE = 800
    # Chips whose total data_needed is below this are returned as unassigned.
    MIN_CHIP_SIZE = 1500
    # Number of leading barcode characters inspected for base balance.
    BARCODE_LENGTH = 16
    # Bases evaluated at every barcode position (order fixes dict key order).
    BASES = ('A', 'T', 'C', 'G')

    def __init__(self, path, chipnum, output=basedir, data_limit=1750):
        """
        :param path: path of the input Excel workbook.
        :param chipnum: maximum number of chips to emit (converted with int()).
        :param output: base directory; results go to ``<output>/result``.
        :param data_limit: maximum total size allowed on a single chip.
        """
        self.path = path
        self.output = output
        self.chipnum = int(chipnum)
        self.data_limit = data_limit

        # chip name -> list of row records assigned to that chip
        self.index_assignments = defaultdict(list)
        # chip name -> accumulated data size
        self.chip_size = dict()
        # chip name -> chip type flag (not populated anywhere in this class)
        self.chip_type = dict()
        # chip name -> set of barcodes already on the chip
        self.chip_barcode_recode = defaultdict(set)
        # sheet name -> list of row records read from the workbook
        self.ori_data = self.read_excel()
        # 1-based number of the chip currently being filled
        self.loc_chip_num = 1
        # chip name -> set of customers on the chip
        self.chip_customer = defaultdict(set)
        # chip name -> set of library classifications on the chip
        self.chip_classification = defaultdict(set)
        # symmetric table (columns c1/c2) of mutually exclusive classifications
        self.rule = self.read_rule()
        # chip name -> accumulated size of SPECIAL_CLASSIFICATIONS libraries
        self.chip_speciallib_size = dict()
        self.logger = log(os.path.basename(f'{path}.txt'))
        # messages written to the "log" sheet of the result workbook
        self.return_log = list()
        # row records that could not be placed on any emitted chip
        self.no_assign_data = list()

    def read_excel(self):
        """
        Read every sheet of the input workbook.

        Missing cells are replaced with '.'.

        :return: dict mapping sheet name to a list of row dicts.
        """
        workbook = pd.read_excel(self.path, sheet_name=None)
        return {
            name: sheet.fillna('.').to_dict('records')
            for name, sheet in workbook.items()
        }

    def add_new_data(self, chipname, library_data, newer=True):
        """
        Put a library on a chip and update the per-chip bookkeeping.

        :param chipname: name of the target chip.
        :param library_data: library dict with the keys ``data``, ``size``,
            ``classification`` and ``customer``.
        :param newer: True when the chip is still blank (counters are
            initialised), False when it already holds data (counters are
            incremented).
        :return: None
        """
        size = library_data['size']
        # Fixed: the blank-chip branch used to test against the fused literal
        # '单细胞文库以及甲基化', so those libraries were not counted as special.
        special_size = size if library_data['classification'] in self.SPECIAL_CLASSIFICATIONS else 0

        self.index_assignments[chipname].extend(library_data['data'])
        self.chip_barcode_recode[chipname].update(
            {item['barcode'] for item in library_data['data']}
        )

        if newer:
            self.chip_size[chipname] = size
            self.chip_speciallib_size[chipname] = special_size
        else:
            self.chip_size[chipname] += size
            self.chip_speciallib_size[chipname] += special_size

        self.chip_customer[chipname].add(library_data['customer'])
        self.chip_classification[chipname].add(library_data['classification'])

    @staticmethod
    def _is_position_balanced(ratio):
        """
        Decide whether the base shares of one barcode position are acceptable.

        Bases missing from ``ratio`` are inserted with share 0 (the dict is
        modified in place). Each of A/T/C/G is put into one bucket by share:
        a >= 0.6, b in [0.2, 0.6), c in [0.15, 0.2), d in [0.1, 0.15),
        e in [0.08, 0.1), f < 0.08.
        """
        a = b = c = d = e = f = 0
        for base in AutoLayout.BASES:
            share = ratio.setdefault(base, 0)
            if share >= 0.6:
                a += 1
            elif share >= 0.2:
                b += 1
            elif share >= 0.15:
                c += 1
            elif share >= 0.1:
                d += 1
            elif share >= 0.08:
                e += 1
            elif share < 0.08:  # explicit test so NaN lands in no bucket
                f += 1

        return (
            b + c + d == 4
            or (f == 1 and a + b == 3)
            or (e == 1 and d == 1 and a + b + c == 2)
            or (e == 1 and a + b + c == 3)
        )

    def count_barcode_radio(self, data):
        """
        Compute, for each of the first 16 barcode positions, the share of
        ``data_needed`` carried by each base, and collect unbalanced positions.

        'N' bases are excluded from both numerator and denominator. Barcodes
        shorter than 16 characters simply contribute nothing at the positions
        they lack (previously this raised).

        :param data: iterable of row dicts with ``barcode`` and ``data_needed``.
        :return: (ratio_sites, is_not_balance_list) where ratio_sites maps
            position -> {base: share} and is_not_balance_list holds one
            message per unbalanced position.
        """
        ratio_sites = dict()
        is_not_balance_list = []

        df = pd.DataFrame(data)
        if df.empty:
            return ratio_sites, is_not_balance_list

        barcodes = df['barcode'].str.slice(0, self.BARCODE_LENGTH)
        total = df['data_needed'].sum()

        for i in range(self.BARCODE_LENGTH):
            # Sum data_needed per base at position i; rows without a character
            # at this position have a NaN key and are dropped by groupby.
            per_base = df['data_needed'].groupby(barcodes.str.get(i)).sum()

            # Exclude N from the statistics.
            if 'N' in per_base.index:
                base_n_size = per_base.loc['N']
                per_base = per_base.drop('N')
            else:
                base_n_size = 0

            ratio = (per_base / (total - base_n_size)).to_dict()
            ratio_sites[i] = ratio

            if not self._is_position_balanced(ratio):
                is_not_balance_list.append(
                    '第%s位置,算出结果为 %s' % (i, ratio)
                )
        return ratio_sites, is_not_balance_list

    def dec_barcode_radio(self, chipname):
        """
        Report base imbalance of a chip: append a message to ``return_log``
        and print it when any barcode position is unbalanced.
        """
        data = self.index_assignments[chipname]
        _, is_not_balance_list = self.count_barcode_radio(data)
        if is_not_balance_list:
            desc = '\n'.join(is_not_balance_list)
            self.return_log.append(f'芯片{chipname}有碱基不平衡:\n{desc}')
            print(f'芯片{chipname}有碱基不平衡:\n{desc}')

    @staticmethod
    def level(row):
        """
        Return the priority level of a row (smaller is placed earlier).

        2: "拆分方式" contains '极致'; 3: ``time`` is earlier than now;
        4: ``priority`` contains '加急'; 5: ``priority`` contains '补测';
        100: everything else.
        """
        # '极致周期' also contains '极致', so one containment test is enough.
        if '极致' in row['拆分方式']:
            return 2

        # Dates already in the past come next.
        if row['time'] < datetime.now():
            return 3

        if '加急' in row['priority']:
            return 4

        if '补测' in row['priority']:
            return 5

        return 100

    @staticmethod
    def read_rule():
        """
        Load the exclusive-classification table and make it symmetric.

        The workbook ``<basedir>/rule/exclusive_classfication.xlsx`` provides
        columns ``c1`` and ``c2``; the result also contains every pair with
        the two columns swapped.
        """
        df = pd.read_excel(os.path.join(basedir, 'rule', 'exclusive_classfication.xlsx'))
        swapped = pd.DataFrame({'c1': df['c2'], 'c2': df['c1']})
        return pd.concat([df, swapped]).reset_index()

    def use_rule(self, chipname, classfication):
        """
        Tell whether ``classfication`` is mutually exclusive with a library
        classification already present on the chip.

        Fixed: the lookup used to intersect with the chip's *customers*, so
        the exclusion table could only match by accident.

        :return: True when the library must not be placed on this chip.
        """
        exclusive_with = set(self.rule[self.rule['c1'] == classfication]['c2'])
        return bool(self.chip_classification[chipname].intersection(exclusive_with))

    def judge_data(self, chipname, library_data):
        """
        Decide whether a library may be added to an already started chip.

        Checks run from cheapest to most expensive and stop at the first
        failure; none of them modifies the chip.

        :param chipname: name of a chip that already holds data.
        :param library_data: library dict (``size``, ``classification``, ``data``).
        :return: True when every rule passes.
        """
        size = library_data['size']
        classification = library_data['classification']

        # Total chip size must stay within the configured limit.
        if self.chip_size[chipname] + size > self.data_limit:
            return False

        # No barcode may appear twice on one chip.
        new_barcodes = {item['barcode'] for item in library_data['data']}
        if self.chip_barcode_recode[chipname].intersection(new_barcodes):
            return False

        # Mutually exclusive classifications cannot share a chip.
        if self.use_rule(chipname, classification):
            return False

        # Special classifications have their own, smaller size budget.
        if (classification in self.SPECIAL_CLASSIFICATIONS
                and self.chip_speciallib_size[chipname] + size > self.SPECIAL_LIB_LIMIT):
            return False

        # Once the chip is sufficiently filled, adding the library must not
        # leave any barcode position unbalanced. count_barcode_radio copies
        # the rows into a DataFrame, so a shallow concatenation is enough.
        if self.chip_size[chipname] > self.BALANCE_CHECK_MIN_SIZE:
            combined = [*self.index_assignments[chipname], *library_data['data']]
            _, is_not_balance_list = self.count_barcode_radio(combined)
            if is_not_balance_list:
                return False

        return True

    def _collect_libraries(self):
        """
        Validate the "未测" sheet and group its rows into per-library dicts.

        Rows with a non-numeric ``data_needed`` or a non-date ``time`` are
        moved to ``no_assign_data`` with an explanatory ``note``.

        :return: list of library dicts sorted by (level, time).
        :raises UserWarning: when the sheet or a required column is missing.
        """
        if '未测' not in self.ori_data:
            raise UserWarning('提供excel没有 未测 sheet ,请核查!')
        table = pd.DataFrame(self.ori_data['未测'])

        need_col = ['#library', 'sublibrary', 'i5', 'i7', 'data_needed', 'real_data', 'customer',
                    'classification', 'priority', 'time', '拆分方式', 'barcode']
        # Keep the declared column order so the message is deterministic.
        unhave_col = [col for col in need_col if col not in table.columns]
        if unhave_col:
            unhave_fom = '; '.join(unhave_col)
            raise UserWarning(f'未测表里没有{unhave_fom} 表头,请核查!')

        numeric_values = pd.to_numeric(table['data_needed'], errors='coerce')
        time_values = pd.to_datetime(table['time'], errors='coerce')
        numeric_mask = numeric_values.notna()
        time_mask = time_values.notna()
        valid_mask = numeric_mask & time_mask

        table['note'] = ''
        table.loc[~numeric_mask, 'note'] = 'data_needed 列非数字'
        table.loc[~time_mask, 'note'] = 'time 列非日期'
        record_cols = need_col + ['note']

        self.no_assign_data.extend(table[~valid_mask].to_dict('records'))

        # Work on a real copy and store the converted values, so sizes are
        # summed numerically and dates compare as dates.
        valid = table[valid_mask].copy()
        if valid.empty:
            return []
        valid['data_needed'] = numeric_values[valid_mask]
        valid['time'] = time_values[valid_mask]
        valid['level'] = valid.apply(self.level, axis=1)

        libraries = []
        for library, library_df in valid.groupby('#library'):
            # Library-level attributes are taken from the library's first row.
            libraries.append(dict(
                library=library,
                size=library_df['data_needed'].sum(),
                split_method=library_df['拆分方式'].iloc[0],
                time=library_df['time'].iloc[0],
                level=library_df['level'].iloc[0],
                customer=library_df['customer'].iloc[0],
                classification=library_df['classification'].iloc[0],
                data=library_df[record_cols].to_dict('records'),
            ))
        return sorted(libraries, key=lambda x: (x['level'], x['time']))

    def assign_samples(self):
        """
        Place every valid library of the "未测" sheet onto chips.

        A blank chip always takes the highest-priority pending library.
        Otherwise the pending list is scanned in priority order and the first
        library accepted by ``judge_data`` is added; when none fits, the next
        chip is opened.

        :raises UserWarning: when the sheet or a required column is missing.
        """
        pending = self._collect_libraries()

        while pending:
            chipname = f'chip{self.loc_chip_num}'

            # A blank chip takes the next library unconditionally.
            if chipname not in self.index_assignments:
                self.add_new_data(chipname, pending.pop(0))
                continue

            # A chip already over the limit (oversized first library) is
            # closed right away; the chip counter advances exactly once.
            if self.chip_size[chipname] > self.data_limit:
                self.loc_chip_num += 1
                continue

            for position, candidate in enumerate(pending):
                if self.judge_data(chipname, candidate):
                    self.add_new_data(chipname, pending.pop(position), newer=False)
                    break
            else:
                # Nothing pending fits on this chip: move on to the next one.
                self.loc_chip_num += 1

    def assign_again(self):
        """Placeholder; currently does nothing."""
        pass

    def run(self):
        """
        Run the layout and write the result workbook.

        One sheet is written per accepted chip (prefixed with 'X' when any
        row's "拆分方式" contains '极致'), plus a "未测" sheet with everything
        left unassigned and, when there are messages, a "log" sheet. A chip
        is returned as unassigned when its total ``data_needed`` is below
        MIN_CHIP_SIZE or when ``chipnum`` chips have already been written.

        :return: path of the written workbook.
        """
        try:
            self.assign_samples()
        except Exception as e:
            # Top-level boundary: report the failure in the result workbook
            # and discard partial assignments.
            self.return_log.append(f'T7排样出错, 请联系!{e}')
            self.index_assignments = defaultdict(list)

        outputname = 'assignments_%s_%s' % (datetime.now().strftime("%m%d%H%M"), os.path.basename(self.path))
        result_dir = os.path.join(self.output, 'result')
        os.makedirs(result_dir, exist_ok=True)
        outputpath = os.path.join(result_dir, outputname)

        # The context manager saves and closes the file even if a sheet fails.
        with pd.ExcelWriter(outputpath) as writer:
            chip_loc = 1
            for chip_idx, chip_assignments in self.index_assignments.items():
                df = pd.DataFrame(chip_assignments)
                df['time'] = df['time'].dt.strftime('%Y-%m-%d')
                if df['data_needed'].sum() < self.MIN_CHIP_SIZE or chip_loc > self.chipnum:
                    self.no_assign_data.extend(df.to_dict('records'))
                    continue

                addname = 'X' if any('极致' in method for method in df['拆分方式'].values) else ''
                self.dec_barcode_radio(chip_idx)
                df.to_excel(writer, sheet_name=addname + chip_idx, index=False)
                chip_loc += 1

            no_assign_df = pd.DataFrame(self.no_assign_data)
            no_assign_df.to_excel(writer, sheet_name='未测', index=False)
            if self.return_log:
                pd.DataFrame(self.return_log).to_excel(writer, sheet_name='log', index=False)
        return outputpath
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual run against the bundled example workbook, with wall-clock timing.
    start_time = time.time()
    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'example', 'input排样表.xlsx')
    output_file = ''
    # Fixed: output_file ('') used to be passed positionally into the
    # `chipnum` slot, so int('') raised ValueError before anything ran.
    # NOTE(review): 1 is a placeholder chip count for this demo run — set it
    # to the number of chips actually available.
    chip_count = 1
    # An empty output_file falls back to the project base directory.
    layout = AutoLayout(filepath, chip_count, output=output_file or basedir)
    layout.run()
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"代码执行时间为:{execution_time} 秒")
|
||
|
||
# server()
|