layout/tools/t7.py

360 lines
15 KiB
Python
Raw Normal View History

2023-12-07 17:47:53 +08:00
import os
import time
2023-06-27 13:01:44 +08:00
from collections import defaultdict
from datetime import datetime
2023-12-07 17:47:53 +08:00
import pandas as pd
from tools.common import basedir, log
2023-06-27 13:01:44 +08:00
class AutoLayout:
    """
    Automatic sample layout: distributes sequencing libraries over chips.

    Libraries are read from the ``未测`` sheet of an Excel workbook, ordered by
    priority level and time, and packed chip by chip subject to a total size
    limit, barcode uniqueness, mutually exclusive classifications and a cap
    on "special" (unbalanced) libraries per chip.
    """

    # Classifications counted against the per-chip special-library cap.
    # '单细胞文库以及甲基化' is the combined label that an older code path
    # checked; it is kept so data using that label is still counted.
    # NOTE(review): confirm whether that combined label occurs in real data.
    SPECIAL_CLASSIFICATIONS = ('扩增子', '不平衡文库', '单细胞文库', '甲基化', '单细胞文库以及甲基化')
    # Maximum summed size of special libraries on one chip.
    SPECIAL_LIB_LIMIT = 200
    # A chip above this size is considered full and the layout moves on.
    FULL_CHIP_SIZE = 1500
    # Chips below this size are not written out; their libraries go back to 未测.
    MIN_CHIP_SIZE = 1400
    # Number of leading barcode characters inspected for base balance.
    BARCODE_LEN = 16

    def __init__(self, path, chipnum, output=basedir, data_limit=1520):
        """
        :param path: path of the input Excel workbook
        :param chipnum: maximum number of chips to write out (converted with int())
        :param output: base output directory; results go to ``<output>/result``
        :param data_limit: maximum summed ``data_needed`` allowed on one chip
        """
        self.path = path
        self.output = output
        self.chipnum = int(chipnum)
        self.data_limit = data_limit
        # chip name -> list of row dicts assigned to that chip
        self.index_assignments = defaultdict(list)
        # chip name -> summed data size on the chip
        self.chip_size = dict()
        # chip name -> chip type (not used by the code in this class)
        self.chip_type = dict()
        # chip name -> set of barcodes already on the chip
        self.chip_barcode_recode = defaultdict(set)
        # sheet name -> list of row dicts read from the workbook
        self.ori_data = self.read_excel()
        # number of the chip currently being filled
        self.loc_chip_num = 1
        # chip name -> set of customers on the chip
        self.chip_customer = defaultdict(set)
        # chip name -> set of library classifications on the chip
        self.chip_classification = defaultdict(set)
        # symmetric table of mutually exclusive classifications (columns c1, c2)
        self.rule = self.read_rule()
        # chip name -> summed size of special libraries on the chip
        self.chip_speciallib_size = dict()
        self.logger = log(os.path.basename(f'{path}.txt'))
        # messages written to the 'log' sheet of the result workbook
        self.return_log = list()
        # rows that could not be assigned; written back to the 未测 sheet
        self.no_assign_data = list()

    def read_excel(self):
        """
        Read every sheet of the input workbook.

        :return: dict mapping sheet name to a list of row dicts; missing cells
                 are replaced by '.'
        """
        sheets = pd.read_excel(self.path, None)
        ori_data = dict()
        for name, sheet in sheets.items():
            sheet.fillna('.', inplace=True)
            ori_data[name] = sheet.to_dict('records')
        return ori_data

    def add_new_data(self, chipname, library_data, newer=True):
        """
        Put a library on a chip and update the per-chip bookkeeping.

        :param chipname: name of the target chip
        :param library_data: library dict with keys 'data', 'size',
                             'classification' and 'customer'
        :param newer: True when this library opens the chip (counters are
                      initialised), False when it is added to an existing chip
        :return: None
        """
        rows = library_data['data']
        size = library_data['size']
        classification = library_data['classification']
        # The same classification list is used for new and existing chips so
        # the special-library counter starts from the right value.
        is_special = classification in self.SPECIAL_CLASSIFICATIONS

        self.index_assignments[chipname].extend(rows)
        self.chip_barcode_recode[chipname].update({item['barcode'] for item in rows})
        if newer:
            self.chip_size[chipname] = size
            self.chip_speciallib_size[chipname] = size if is_special else 0
        else:
            self.chip_size[chipname] += size
            if is_special:
                self.chip_speciallib_size[chipname] += size
        self.chip_customer[chipname].add(library_data['customer'])
        self.chip_classification[chipname].add(classification)

    def add_new_chip(self, library_data):
        """
        Place a library on the first chip after the current one that accepts it.

        Chips are tried in increasing number starting at ``loc_chip_num + 1``.
        A chip that does not exist yet always accepts the library; an existing
        chip accepts it when ``judge_data`` allows it, so the rules are the
        same as in ``assign_samples``.

        :param library_data: library dict as built by ``assign_samples``
        :return: None
        """
        library = library_data['library']
        chip_num_tmp = self.loc_chip_num
        while True:
            chip_num_tmp += 1
            chipname_tmp = f'chip{chip_num_tmp}'
            if chipname_tmp not in self.index_assignments:
                self.logger.error(f'{library} {chipname_tmp} 常规添加')
                self.add_new_data(chipname_tmp, library_data)
                break
            if self.judge_data(chipname_tmp, library_data):
                self.add_new_data(chipname_tmp, library_data, newer=False)
                break

    def dec_barcode_radio(self, chipname):
        """
        Check the base balance of the barcodes assigned to a chip.

        For each of the first 16 barcode positions the ``data_needed``-weighted
        share of A/T/C/G is computed, ignoring 'N'. Positions failing the
        balance rule are collected; when more than two positions fail, the
        messages are appended to ``return_log`` and printed.

        :param chipname: name of the chip to check
        :return: None
        """
        df = pd.DataFrame(self.index_assignments[chipname])
        df['barcode'] = df['barcode'].str.slice(0, self.BARCODE_LEN)
        columns = ['T' + str(x) for x in range(self.BARCODE_LEN)]
        # Splitting on '' yields one column per character plus an empty
        # column at each end, which iloc[:, 1:-1] removes.
        chars = df['barcode'].str.split('', expand=True).iloc[:, 1:-1].values
        barcode_df = pd.DataFrame(chars, columns=columns).join(df['data_needed'])
        total = barcode_df['data_needed'].sum()

        is_not_balance_list = []
        for i, column in enumerate(columns):
            col_df = barcode_df.groupby(column).agg({'data_needed': 'sum'})
            # 'N' is excluded from both numerator and denominator.
            if 'N' in col_df.index:
                base_N_size = col_df.loc['N', 'data_needed']
                col_df = col_df.drop('N')
            else:
                base_N_size = 0
            col_df['ratio'] = col_df['data_needed'] / (total - base_N_size)
            ratio = col_df['ratio'].to_dict()

            B, C, D = [], [], []
            for decbase in ['A', 'T', 'C', 'G']:
                if decbase not in ratio:
                    ratio[decbase] = 0
                share = ratio[decbase]
                if 0.2 <= share < 0.6:
                    B.append(decbase)
                if 0.08 <= share < 0.2:
                    C.append(decbase)
                # NOTE(review): this bucket overlaps B and C, so the
                # `len(D) == 1 and len(C) == 3` clause below can hardly be
                # true; the threshold may have been meant as `< 0.08`.
                # Behaviour is left unchanged until the rule is confirmed.
                if share <= 0.8:
                    D.append(decbase)
            balanced = (len(B) + len(C) == 4) or (len(D) == 1 and len(C) == 3)
            if not balanced:
                is_not_balance_list.append(
                    '%s%s位置,有碱基不平衡,算出结果为 %s' % (chipname, i, ratio)
                )

        if len(is_not_balance_list) > 2:
            self.return_log.append('有碱基不平衡性!')
            self.return_log.extend(is_not_balance_list)
            print('有碱基不平衡性!\n', '\n'.join(is_not_balance_list))

    @staticmethod
    def level(row):
        """
        Compute the priority level of a row (smaller is laid out earlier).

        :param row: mapping with keys '拆分方式', 'time' and 'priority'
        :return: 2 when '拆分方式' contains '极致', 3 when 'time' is in the past,
                 4 when 'priority' contains '加急', 5 when it contains '补测',
                 otherwise 100
        """
        # str() keeps the substring tests from failing on non-string cells.
        split_method = str(row['拆分方式'])
        priority = str(row['priority'])
        # Accept date strings as well as datetime objects.
        mytime = pd.to_datetime(row['time'])

        if '极致' in split_method:
            return 2
        if mytime < datetime.now():
            return 3
        if '加急' in priority:
            return 4
        if '补测' in priority:
            return 5
        return 100

    @staticmethod
    def read_rule():
        """
        Load the table of mutually exclusive classifications.

        The workbook ``rule/exclusive_classfication.xlsx`` under ``basedir`` is
        read and every (c1, c2) pair is also added as (c2, c1), so a lookup on
        column c1 finds the pair in either direction.

        :return: DataFrame with columns c1 and c2 (plus the reset index)
        """
        df = pd.read_excel(os.path.join(basedir, 'rule', 'exclusive_classfication.xlsx'))
        mirrored = pd.DataFrame()
        mirrored['c1'] = df['c2']
        mirrored['c2'] = df['c1']
        return pd.concat([df, mirrored]).reset_index()

    def use_rule(self, chipname, classfication):
        """
        Tell whether a classification is excluded by what is already on a chip.

        :param chipname: name of the chip to check
        :param classfication: classification of the library to be added
        :return: True when the chip holds a classification listed in the rule
                 table as exclusive with ``classfication``
        """
        exclusive_with = set(self.rule[self.rule['c1'] == classfication]['c2'])
        # The rule table holds classifications, so it is compared with the
        # classifications on the chip, not with its customers.
        return bool(self.chip_classification[chipname].intersection(exclusive_with))

    def judge_data(self, chipname, library_data):
        """
        Decide whether a library may be added to an existing chip.

        Every failed rule is logged, so all reasons appear in the log.

        :param chipname: name of an existing chip
        :param library_data: library dict with keys 'size', 'library',
                             'classification' and 'data'
        :return: True when no rule is violated
        """
        size = library_data['size']
        library = library_data['library']
        classification = library_data['classification']
        accepted = True

        # The chip must stay within the configured size limit.
        if self.chip_size[chipname] + size > self.data_limit:
            accepted = False
            self.logger.error(f'{library} {chipname} 文库相加大于设定限制')

        # No barcode may appear twice on a chip.
        new_barcodes = {item['barcode'] for item in library_data['data']}
        if self.chip_barcode_recode[chipname].intersection(new_barcodes):
            accepted = False
            self.logger.error(f'{library} {chipname} 文库有barcode重复')

        # Mutually exclusive classifications may not share a chip.
        if self.use_rule(chipname, classification):
            accepted = False
            self.logger.error(f'{library} {chipname} 有互斥单位')

        # Special libraries are capped per chip.
        if classification in self.SPECIAL_CLASSIFICATIONS \
                and self.chip_speciallib_size[chipname] + size > self.SPECIAL_LIB_LIMIT:
            accepted = False
            self.logger.error(f'{library} {chipname} 不平衡文库相加大于设定限制')

        return accepted

    def assign_samples(self):
        """
        Lay out all libraries of the ``未测`` sheet onto chips.

        Rows whose ``data_needed`` is not numeric or whose ``time`` is not a
        date are moved to ``no_assign_data`` with a note. The remaining rows
        are grouped by ``#library``, ordered by (level, time) and packed onto
        ``chip<N>`` starting from ``loc_chip_num``.

        :raises UserWarning: when the ``未测`` sheet or a required column is missing
        :return: None
        """
        if '未测' not in self.ori_data:
            raise UserWarning('提供excel没有 未测 sheet ,请核查!')
        ori_library_df = pd.DataFrame(self.ori_data['未测'])
        need_col = ['#library', 'sublibrary', 'i5', 'i7', 'data_needed', 'real_data', 'customer',
                    'classification', 'priority', 'time', '拆分方式', 'barcode'
                    ]
        unhave_col = set(need_col) - set(ori_library_df.columns)
        if unhave_col:
            # sorted() makes the message deterministic.
            unhave_fom = '; '.join(sorted(unhave_col))
            raise UserWarning(f'未测表里没有{unhave_fom} 表头,请核查!')

        parsed_time = pd.to_datetime(ori_library_df['time'], errors='coerce')
        numeric_mask = pd.to_numeric(ori_library_df['data_needed'], errors='coerce').notna()
        time_mask = parsed_time.notna()
        valid_mask = numeric_mask & time_mask
        ori_library_df['note'] = ''
        ori_library_df.loc[~numeric_mask, 'note'] = 'data_needed 列非数字'
        ori_library_df.loc[~time_mask, 'note'] = 'time 列非日期'
        need_col.append('note')
        self.no_assign_data.extend(ori_library_df[~valid_mask].to_dict('records'))

        # Keep only the valid rows; copy so the assignments below do not
        # write into a slice of the original frame.
        ori_library_df = ori_library_df[valid_mask].copy()
        if ori_library_df.empty:
            return
        # Store the coerced values so that numeric or date strings do not
        # break the size sum or the date comparison in level().
        ori_library_df['data_needed'] = pd.to_numeric(ori_library_df['data_needed'])
        ori_library_df['time'] = parsed_time[valid_mask]
        ori_library_df['level'] = ori_library_df.apply(self.level, axis=1)

        ori_library_data = [
            dict(
                library=library,
                size=library_df['data_needed'].sum(),
                split_method=library_df['拆分方式'].iloc[0],
                time=library_df['time'].iloc[0],
                level=library_df['level'].iloc[0],
                customer=library_df['customer'].iloc[0],
                classification=library_df['classification'].iloc[0],
                data=library_df[need_col].to_dict('records'),
            )
            for library, library_df in ori_library_df.groupby('#library')
        ]
        pending = sorted(ori_library_data, key=lambda x: (x['level'], x['time']))

        while pending:
            chipname = f'chip{self.loc_chip_num}'
            # An empty chip takes the highest-priority library unconditionally.
            if chipname not in self.index_assignments:
                self.add_new_data(chipname, pending.pop(0))
                continue

            # Otherwise take the first library, in priority order, that fits.
            fit_idx = next(
                (idx for idx, candidate in enumerate(pending)
                 if self.judge_data(chipname, candidate)),
                None,
            )
            if fit_idx is not None:
                self.add_new_data(chipname, pending.pop(fit_idx), newer=False)

            # Move to the next chip exactly once, either because nothing
            # fits any more or because the chip is full.
            if fit_idx is None or self.chip_size[chipname] > self.FULL_CHIP_SIZE:
                self.loc_chip_num += 1

    def assign_again(self):
        """Placeholder; does nothing."""
        pass

    def run(self):
        """
        Run the layout and write the result workbook.

        Chips holding at least ``MIN_CHIP_SIZE`` are written one sheet each,
        up to ``chipnum`` chips; the sheet name is prefixed with 'X' when the
        chip holds a library whose '拆分方式' contains '极致'. Everything else
        goes to the ``未测`` sheet, and collected messages go to a ``log`` sheet.

        :return: path of the written workbook
        """
        # The layout runs once, inside the guard, so an error is reported in
        # the log sheet and libraries are not assigned twice.
        try:
            self.assign_samples()
        except Exception as e:
            self.return_log.append(f'T7排样出错 请联系!{e}')
            self.index_assignments = {}

        outputname = 'assignments_%s_%s' % (datetime.now().strftime("%m%d%H%M"), os.path.basename(self.path))
        outputpath = os.path.join(self.output, 'result', outputname)
        os.makedirs(os.path.dirname(outputpath), exist_ok=True)

        # The context manager saves and closes the workbook even on error.
        with pd.ExcelWriter(outputpath) as writer:
            chip_loc = 1
            for chip_idx, chip_assignments in self.index_assignments.items():
                df = pd.DataFrame(chip_assignments)
                if df['data_needed'].sum() < self.MIN_CHIP_SIZE or chip_loc > self.chipnum:
                    self.no_assign_data.extend(chip_assignments)
                    continue
                # Substring match, consistent with level().
                if df['拆分方式'].astype(str).str.contains('极致').any():
                    addname = 'X'
                else:
                    addname = ''
                self.dec_barcode_radio(chip_idx)
                df.to_excel(writer, sheet_name=addname + chip_idx, index=False)
                chip_loc += 1
            pd.DataFrame(self.no_assign_data).to_excel(writer, sheet_name='未测', index=False)
            if self.return_log:
                pd.DataFrame(self.return_log).to_excel(writer, sheet_name='log', index=False)
        return outputpath
if __name__ == '__main__':
    # Manual run against the bundled example workbook, with timing.
    start_time = time.time()
    filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'example', 'input排样表.xlsx')
    # Output base directory; '' makes the result land in ./result.
    output_file = ''
    # AutoLayout's second positional parameter is chipnum, so the output
    # directory is passed by keyword.
    # NOTE(review): 1 is a placeholder chip count for this demo run.
    chipnum = 1
    layout = AutoLayout(filepath, chipnum, output=output_file)
    layout.run()
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"代码执行时间为:{execution_time}")