main
parent
c203913bd4
commit
2f07383922
10
T7_server.py
10
T7_server.py
|
|
@ -23,6 +23,8 @@ def recvdata(conn, path):
|
||||||
content_len = header_dic['contentlen']
|
content_len = header_dic['contentlen']
|
||||||
content_name = header_dic['contentname']
|
content_name = header_dic['contentname']
|
||||||
librarynum = header_dic['librarynum']
|
librarynum = header_dic['librarynum']
|
||||||
|
is_use_balance = header_dic['is_use_balance']
|
||||||
|
is_use_max = header_dic['is_use_max']
|
||||||
recv_len = 0
|
recv_len = 0
|
||||||
fielpath = os.path.join(path, '%s_%s' % (datetime.now().strftime("%m%d%H%M"), content_name))
|
fielpath = os.path.join(path, '%s_%s' % (datetime.now().strftime("%m%d%H%M"), content_name))
|
||||||
file = open(fielpath, 'wb')
|
file = open(fielpath, 'wb')
|
||||||
|
|
@ -31,7 +33,7 @@ def recvdata(conn, path):
|
||||||
file.write(correntrecv)
|
file.write(correntrecv)
|
||||||
recv_len += len(correntrecv)
|
recv_len += len(correntrecv)
|
||||||
file.close()
|
file.close()
|
||||||
return fielpath, librarynum
|
return fielpath, librarynum, is_use_balance, is_use_max
|
||||||
|
|
||||||
|
|
||||||
def senddata(conn, path, message=None):
|
def senddata(conn, path, message=None):
|
||||||
|
|
@ -77,8 +79,8 @@ def server():
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
myclient, adddr = myserver.accept()
|
myclient, adddr = myserver.accept()
|
||||||
recv_content, librarynum = recvdata(myclient, os.path.join(basedir, 'example'))
|
recv_content, librarynum, is_use_balance, is_use_max = recvdata(myclient, os.path.join(basedir, 'example'))
|
||||||
layout = T7(recv_content, librarynum)
|
layout = T7(recv_content, librarynum, is_use_balance, is_use_max)
|
||||||
outputpath = layout.run()
|
outputpath = layout.run()
|
||||||
senddata(myclient, outputpath)
|
senddata(myclient, outputpath)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -88,7 +90,7 @@ def server():
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) > 1:
|
if len(sys.argv) > 1:
|
||||||
layout = T7(sys.argv[1], sys.argv[2])
|
layout = T7(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
|
||||||
outputpath = layout.run()
|
outputpath = layout.run()
|
||||||
else:
|
else:
|
||||||
server()
|
server()
|
||||||
|
|
|
||||||
485
tools/t7.py
485
tools/t7.py
|
|
@ -5,7 +5,6 @@ from collections import defaultdict
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from deap import base, creator, tools, algorithms
|
|
||||||
|
|
||||||
from tools.common import basedir, log
|
from tools.common import basedir, log
|
||||||
|
|
||||||
|
|
@ -15,7 +14,55 @@ def format_date(date):
|
||||||
return date.strftime('%Y-%m-%d')
|
return date.strftime('%Y-%m-%d')
|
||||||
|
|
||||||
|
|
||||||
def count_barcode_radio(data):
|
class AutoLayout:
|
||||||
|
"""
|
||||||
|
自动化派样
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, path, librarynum, is_use_balance=1, is_use_max=0, output=basedir, data_limit=1750):
|
||||||
|
self.path = path
|
||||||
|
self.output = output
|
||||||
|
self.librarynum = int(librarynum)
|
||||||
|
self.data_limit = data_limit
|
||||||
|
|
||||||
|
self.index_assignments = defaultdict(list)
|
||||||
|
# 芯片数量量大小
|
||||||
|
self.chip_size = dict()
|
||||||
|
# 芯片是否极致
|
||||||
|
self.chip_type = dict()
|
||||||
|
# 芯片barcode
|
||||||
|
self.chip_barcode_recode = defaultdict(set)
|
||||||
|
# 芯片原始数据读取
|
||||||
|
self.ori_data = self.read_excel()
|
||||||
|
# 当前锚芯片
|
||||||
|
self.loc_chip_num = 1
|
||||||
|
# 芯片客户
|
||||||
|
self.chip_customer = defaultdict(set)
|
||||||
|
# 文库
|
||||||
|
self.chip_classification = defaultdict(set)
|
||||||
|
self.rule = self.read_rule()
|
||||||
|
|
||||||
|
# 不平衡文库
|
||||||
|
self.chip_speciallib_size = dict()
|
||||||
|
|
||||||
|
# 甲基化文库
|
||||||
|
self.chip_methylib_size = dict()
|
||||||
|
|
||||||
|
# Nextera 文库大小
|
||||||
|
self.chip_speciallib_nextera_size = dict()
|
||||||
|
# 华大 文库
|
||||||
|
self.chip_speciallib_huada_size = dict()
|
||||||
|
|
||||||
|
self.logger = log(os.path.basename(f'{path}.txt'))
|
||||||
|
self.return_log = list()
|
||||||
|
self.no_assign_data = list()
|
||||||
|
self.ori_lib_data = list()
|
||||||
|
self.need_cols = self.read_cols()
|
||||||
|
|
||||||
|
self.is_use_balance = is_use_balance
|
||||||
|
self.is_use_max = is_use_max
|
||||||
|
|
||||||
|
def count_barcode_radio(self, data):
|
||||||
df = pd.DataFrame(data)
|
df = pd.DataFrame(data)
|
||||||
ratio_sites = dict()
|
ratio_sites = dict()
|
||||||
is_not_balance_list = []
|
is_not_balance_list = []
|
||||||
|
|
@ -39,7 +86,7 @@ def count_barcode_radio(data):
|
||||||
col_df['ratio'] = (col_df['data_needed']) / (total - base_n_size)
|
col_df['ratio'] = (col_df['data_needed']) / (total - base_n_size)
|
||||||
ratio = col_df['ratio'].to_dict()
|
ratio = col_df['ratio'].to_dict()
|
||||||
ratio_sites[i] = ratio
|
ratio_sites[i] = ratio
|
||||||
A, B, C, D, E, F = list(), list(), list(), list(), list(), list()
|
A, B, C, D, E, F, G = list(), list(), list(), list(), list(), list(), list()
|
||||||
for decbase in ['A', 'T', 'C', 'G']:
|
for decbase in ['A', 'T', 'C', 'G']:
|
||||||
if decbase not in ratio:
|
if decbase not in ratio:
|
||||||
ratio[decbase] = 0
|
ratio[decbase] = 0
|
||||||
|
|
@ -56,151 +103,70 @@ def count_barcode_radio(data):
|
||||||
if ratio[decbase] < 0.08:
|
if ratio[decbase] < 0.08:
|
||||||
F.append(decbase)
|
F.append(decbase)
|
||||||
|
|
||||||
A_num, B_num, C_num, D_num, E_num, F_num = len(A), len(B), len(C), len(D), len(E), len(F)
|
# 新增一个碱基可行规则
|
||||||
|
if 0.125 <= ratio[decbase] <= 0.625:
|
||||||
|
G.append(decbase)
|
||||||
|
|
||||||
|
A_num, B_num, C_num, D_num, E_num, F_num, G_num = len(A), len(B), len(C), len(D), len(E), len(F), len(G)
|
||||||
if not ((B_num + C_num + D_num == 4) or (F_num == 1 and (A_num + B_num) == 3) or (
|
if not ((B_num + C_num + D_num == 4) or (F_num == 1 and (A_num + B_num) == 3) or (
|
||||||
E_num == 1 and D_num == 1 and (A_num + B_num + C_num) == 2) or (
|
E_num == 1 and D_num == 1 and (A_num + B_num + C_num) == 2) or (
|
||||||
E_num == 1 and (A_num + B_num + C_num) == 3)):
|
E_num == 1 and (A_num + B_num + C_num) == 3) or (
|
||||||
|
F_num == 1 and G_num == 3 and self.is_use_max)):
|
||||||
is_not_balance_list.append(
|
is_not_balance_list.append(
|
||||||
'第%s位置,算出结果为 %s' % (i, ratio)
|
'第%s位置,算出结果为 %s' % (i, ratio)
|
||||||
)
|
)
|
||||||
return ratio_sites, is_not_balance_list
|
return ratio_sites, is_not_balance_list
|
||||||
|
|
||||||
|
def dec_barcode_radio(self, chipname):
|
||||||
# 定义遗传算法
|
data = self.index_assignments[chipname]
|
||||||
class Ga:
|
ratio_sites, is_not_balance_list = self.count_barcode_radio(data)
|
||||||
"""
|
|
||||||
# 定义遗传算法
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, sheets):
|
|
||||||
self.sheets = sheets
|
|
||||||
|
|
||||||
# 定义个体的生成方式
|
|
||||||
def generate_individual(self):
|
|
||||||
individual = copy.deepcopy(self.sheets) # 初始解作为个体
|
|
||||||
return [individual]
|
|
||||||
|
|
||||||
# 定义评估函数
|
|
||||||
@staticmethod
|
|
||||||
def evaluate(individual):
|
|
||||||
total_data_needed_sum = 0
|
|
||||||
xchip = 0
|
|
||||||
try:
|
|
||||||
for sheetname, data in individual[0][0].items():
|
|
||||||
library_data = pd.DataFrame(data)
|
|
||||||
|
|
||||||
size = library_data['data_needed'].sum()
|
|
||||||
|
|
||||||
# 芯片大小不能超过设定限制
|
|
||||||
if size > 1700:
|
|
||||||
return (0, 100000, 100000)
|
|
||||||
|
|
||||||
# barcode有重复
|
|
||||||
if len(library_data['barcode'].values) < len(set(library_data['barcode'].values)):
|
|
||||||
return (0, 100000, 100000)
|
|
||||||
|
|
||||||
# 不平衡文库大于250G 不能添加
|
|
||||||
if library_data[library_data['is_balance_lib'] == '否']['data_needed'].sum() > 250:
|
|
||||||
return (0, 100000, 100000)
|
|
||||||
|
|
||||||
# 碱基不平衡不过不添加,保证前面的数据, 在数据达到1200G的时候开始
|
|
||||||
ratio_sites, is_not_balance_list = count_barcode_radio(library_data)
|
|
||||||
if is_not_balance_list:
|
if is_not_balance_list:
|
||||||
return (0, 100000, 100000)
|
desc = '\n'.join(is_not_balance_list)
|
||||||
|
self.return_log.append(f'芯片{chipname}有碱基不平衡:\n{desc}')
|
||||||
|
print(f'芯片{chipname}有碱基不平衡:\n{desc}')
|
||||||
|
|
||||||
if library_data[library_data['classification'].str.lower() == 'nextera']['data_needed'].sum() <= 50:
|
@staticmethod
|
||||||
return (0, 100000, 100000)
|
def level(row):
|
||||||
|
|
||||||
# 计算每个sheet的data_needed之和
|
today_date = datetime.now()
|
||||||
total_data_needed_sum += library_data['data_needed'].sum()
|
|
||||||
|
|
||||||
# 记录包含字母"A"的sheet数量
|
if 'nextera' in row['classification'].lower():
|
||||||
if any('极致' in value for value in library_data['split']):
|
return 10
|
||||||
xchip += 1
|
|
||||||
except Exception:
|
|
||||||
return (0, 100000, 100000)
|
|
||||||
|
|
||||||
# 返回一个适应度值,目标是最大化总的data_needed之和,最小化sheet的数量, 最少的极致芯片
|
if '华大' in row['classification']:
|
||||||
total_data_needed_sum, num_sheets, num_xchip = total_data_needed_sum, len(individual[0]), xchip
|
return 11
|
||||||
return total_data_needed_sum, num_sheets, num_xchip
|
|
||||||
|
|
||||||
def run(self):
|
if row['拆分方式'] == '极致周期' or '极致' in row['拆分方式']:
|
||||||
# 定义遗传算法的参数
|
return 20
|
||||||
pop_size = 50
|
|
||||||
cxpb = 0.7 # 交叉概率
|
|
||||||
mutpb = 0.2 # 变异概率
|
|
||||||
ngen = 100 # 迭代次数
|
|
||||||
|
|
||||||
# 初始化遗传算法工具箱
|
mytime = row['time']
|
||||||
creator.create("FitnessMax", base.Fitness, weights=(1.0, -1.0, -1.0,)) # 三个目标,一个最大化两个最小化
|
# 判断日期是之前的还是之后的
|
||||||
creator.create("Individual", list, fitness=creator.FitnessMax)
|
if mytime < today_date:
|
||||||
|
return 30
|
||||||
|
|
||||||
toolbox = base.Toolbox()
|
if '加急' in row['priority']:
|
||||||
|
return 40
|
||||||
|
|
||||||
# 结构初始化器
|
if '补测' in row['priority']:
|
||||||
toolbox.register("individual", tools.initRepeat, creator.Individual, self.generate_individual, n=3)
|
return 50
|
||||||
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
|
|
||||||
toolbox.register("evaluate", self.evaluate)
|
|
||||||
|
|
||||||
# 注册遗传算法所需的操作
|
else:
|
||||||
toolbox.register("mate", tools.cxTwoPoint)
|
return 1000
|
||||||
toolbox.register("mutate", tools.mutUniformInt, low=1, up=100, indpb=0.2)
|
|
||||||
toolbox.register("select", tools.selTournament, tournsize=3)
|
|
||||||
# 初始化种群
|
|
||||||
population = toolbox.population(n=pop_size)
|
|
||||||
|
|
||||||
# 运行遗传算法
|
@staticmethod
|
||||||
algorithms.eaMuPlusLambda(population, toolbox, mu=pop_size, lambda_=pop_size * 2, cxpb=cxpb, mutpb=mutpb,
|
def read_rule():
|
||||||
ngen=ngen, stats=None, halloffame=None)
|
df = pd.read_excel(os.path.join(basedir, 'rule', 'exclusive_classfication.xlsx'))
|
||||||
|
newdf = pd.DataFrame()
|
||||||
|
newdf['c1'] = df['c2']
|
||||||
|
newdf['c2'] = df['c1']
|
||||||
|
res = pd.concat([df, newdf])
|
||||||
|
return res.reset_index()
|
||||||
|
|
||||||
# 输出结果
|
@staticmethod
|
||||||
best_individual = tools.selBest(population, k=1)
|
def read_cols():
|
||||||
print(best_individual)
|
df = pd.read_excel(os.path.join(basedir, 'rule', 'columns.xlsx'))
|
||||||
optimized_sheets = best_individual[0] # 获取最优解
|
cols = list(df['cols'].values)
|
||||||
|
return cols
|
||||||
# 将优化后的结果输出
|
|
||||||
# for i, sheet in enumerate(optimized_sheets):
|
|
||||||
# sheet.to_excel(f'optimized_sheet_{i + 1}.xlsx', index=False)
|
|
||||||
return optimized_sheets
|
|
||||||
|
|
||||||
|
|
||||||
class AutoLayout:
|
|
||||||
"""
|
|
||||||
自动化派样
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, path, librarynum, output=basedir, data_limit=1750):
|
|
||||||
self.path = path
|
|
||||||
self.output = output
|
|
||||||
self.librarynum = int(librarynum)
|
|
||||||
self.data_limit = data_limit
|
|
||||||
|
|
||||||
self.index_assignments = defaultdict(list)
|
|
||||||
# 芯片数量量大小
|
|
||||||
self.chip_size = dict()
|
|
||||||
# 芯片是否极致
|
|
||||||
self.chip_type = dict()
|
|
||||||
# 芯片barcode
|
|
||||||
self.chip_barcode_recode = defaultdict(set)
|
|
||||||
# 芯片原始数据读取
|
|
||||||
self.ori_data = self.read_excel()
|
|
||||||
# 当前锚芯片
|
|
||||||
self.loc_chip_num = 1
|
|
||||||
# 芯片客户
|
|
||||||
self.chip_customer = defaultdict(set)
|
|
||||||
# 文库
|
|
||||||
self.chip_classification = defaultdict(set)
|
|
||||||
self.rule = self.read_rule()
|
|
||||||
# 甲基化文库不大于200,WGBS文库不大于200G
|
|
||||||
self.chip_speciallib_size = dict()
|
|
||||||
|
|
||||||
# Nextera 文库大小
|
|
||||||
self.chip_speciallib_nextera_size = dict()
|
|
||||||
|
|
||||||
self.logger = log(os.path.basename(f'{path}.txt'))
|
|
||||||
self.return_log = list()
|
|
||||||
self.no_assign_data = list()
|
|
||||||
self.need_cols = self.read_cols()
|
|
||||||
|
|
||||||
def read_excel(self):
|
def read_excel(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -233,122 +199,30 @@ class AutoLayout:
|
||||||
# if library_data['classification'] in ['扩增子', '不平衡文库', '单细胞文库以及甲基化']:
|
# if library_data['classification'] in ['扩增子', '不平衡文库', '单细胞文库以及甲基化']:
|
||||||
if library_data['is_balance_lib'] == '否':
|
if library_data['is_balance_lib'] == '否':
|
||||||
self.chip_speciallib_size[chipname] = library_data['size']
|
self.chip_speciallib_size[chipname] = library_data['size']
|
||||||
|
elif library_data['is_balance_lib'] == '甲基化':
|
||||||
|
self.chip_methylib_size[chipname] = library_data['size']
|
||||||
else:
|
else:
|
||||||
self.chip_speciallib_size[chipname] = 0
|
self.chip_speciallib_size[chipname] = 0
|
||||||
|
self.chip_methylib_size[chipname] = 0
|
||||||
if 'nextera' in library_data['classification'].lower():
|
if 'nextera' in library_data['classification'].lower():
|
||||||
self.chip_speciallib_nextera_size[chipname] = library_data['size']
|
self.chip_speciallib_nextera_size[chipname] = library_data['size']
|
||||||
else:
|
else:
|
||||||
self.chip_speciallib_nextera_size[chipname] = 0
|
self.chip_speciallib_nextera_size[chipname] = 0
|
||||||
|
if '华大' in library_data['classification']:
|
||||||
|
self.chip_speciallib_huada_size[chipname] = library_data['size']
|
||||||
|
else:
|
||||||
|
self.chip_speciallib_huada_size[chipname] = 0
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.chip_size[chipname] += library_data['size']
|
self.chip_size[chipname] += library_data['size']
|
||||||
if library_data['is_balance_lib'] == '否':
|
if library_data['is_balance_lib'] == '否':
|
||||||
self.chip_speciallib_size[chipname] += library_data['size']
|
self.chip_speciallib_size[chipname] += library_data['size']
|
||||||
|
if library_data['is_balance_lib'] == '甲基化':
|
||||||
|
self.chip_methylib_size[chipname] += library_data['size']
|
||||||
if 'nextera' in library_data['classification'].lower():
|
if 'nextera' in library_data['classification'].lower():
|
||||||
self.chip_speciallib_nextera_size[chipname] += library_data['size']
|
self.chip_speciallib_huada_size[chipname] += library_data['size']
|
||||||
|
if '华大' in library_data['classification']:
|
||||||
@staticmethod
|
self.chip_speciallib_huada_size[chipname] += library_data['size']
|
||||||
def count_barcode_radio(data):
|
|
||||||
df = pd.DataFrame(data)
|
|
||||||
ratio_sites = dict()
|
|
||||||
is_not_balance_list = []
|
|
||||||
if df.empty:
|
|
||||||
return ratio_sites, is_not_balance_list
|
|
||||||
|
|
||||||
df['barcode'] = df['barcode'].str.slice(0, 16)
|
|
||||||
barcode_df = pd.DataFrame(df['barcode'].str.split('', expand=True).iloc[:, 1:-1].values,
|
|
||||||
columns=['T' + str(x) for x in range(16)]).join(df['data_needed'])
|
|
||||||
total = barcode_df['data_needed'].sum()
|
|
||||||
|
|
||||||
for i in range(16):
|
|
||||||
column = 'T' + str(i)
|
|
||||||
col_df = barcode_df.groupby(column).agg({'data_needed': 'sum'})
|
|
||||||
# 去掉N计数
|
|
||||||
if 'N' in col_df.index:
|
|
||||||
base_n_size = col_df.loc['N', 'data_needed']
|
|
||||||
col_df = col_df.drop('N')
|
|
||||||
else:
|
|
||||||
base_n_size = 0
|
|
||||||
col_df['ratio'] = (col_df['data_needed']) / (total - base_n_size)
|
|
||||||
ratio = col_df['ratio'].to_dict()
|
|
||||||
ratio_sites[i] = ratio
|
|
||||||
A, B, C, D, E, F = list(), list(), list(), list(), list(), list()
|
|
||||||
for decbase in ['A', 'T', 'C', 'G']:
|
|
||||||
if decbase not in ratio:
|
|
||||||
ratio[decbase] = 0
|
|
||||||
if ratio[decbase] >= 0.6:
|
|
||||||
A.append(decbase)
|
|
||||||
if 0.2 <= ratio[decbase] < 0.6:
|
|
||||||
B.append(decbase)
|
|
||||||
if 0.15 <= ratio[decbase] < 0.2:
|
|
||||||
C.append(decbase)
|
|
||||||
if 0.1 <= ratio[decbase] < 0.15:
|
|
||||||
D.append(decbase)
|
|
||||||
if 0.08 <= ratio[decbase] < 0.1:
|
|
||||||
E.append(decbase)
|
|
||||||
if ratio[decbase] < 0.08:
|
|
||||||
F.append(decbase)
|
|
||||||
|
|
||||||
A_num, B_num, C_num, D_num, E_num, F_num = len(A), len(B), len(C), len(D), len(E), len(F)
|
|
||||||
if not ((B_num + C_num + D_num == 4) or (F_num == 1 and (A_num + B_num) == 3) or (
|
|
||||||
E_num == 1 and D_num == 1 and (A_num + B_num + C_num) == 2) or (
|
|
||||||
E_num == 1 and (A_num + B_num + C_num) == 3)):
|
|
||||||
is_not_balance_list.append(
|
|
||||||
'第%s位置,算出结果为 %s' % (i, ratio)
|
|
||||||
)
|
|
||||||
return ratio_sites, is_not_balance_list
|
|
||||||
|
|
||||||
def dec_barcode_radio(self, chipname):
|
|
||||||
data = self.index_assignments[chipname]
|
|
||||||
ratio_sites, is_not_balance_list = self.count_barcode_radio(data)
|
|
||||||
if is_not_balance_list:
|
|
||||||
desc = '\n'.join(is_not_balance_list)
|
|
||||||
self.return_log.append(f'芯片{chipname}有碱基不平衡:\n{desc}')
|
|
||||||
print(f'芯片{chipname}有碱基不平衡:\n{desc}')
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def level(row):
|
|
||||||
|
|
||||||
today_date = datetime.now()
|
|
||||||
|
|
||||||
# 将时间字符串转换为 datetime 对象
|
|
||||||
# mytime = datetime.strptime(row['time'], "%Y-%m-%d")
|
|
||||||
# mytime = row['time'].strftime("%Y-%m-%d")
|
|
||||||
|
|
||||||
if 'nextera' in row['classification'].lower():
|
|
||||||
return 1
|
|
||||||
|
|
||||||
if row['拆分方式'] == '极致周期' or '极致' in row['拆分方式']:
|
|
||||||
return 2
|
|
||||||
|
|
||||||
mytime = row['time']
|
|
||||||
# 判断日期是之前的还是之后的
|
|
||||||
if mytime < today_date:
|
|
||||||
return 3
|
|
||||||
|
|
||||||
if '加急' in row['priority']:
|
|
||||||
return 4
|
|
||||||
|
|
||||||
if '补测' in row['priority']:
|
|
||||||
return 5
|
|
||||||
|
|
||||||
else:
|
|
||||||
return 100
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def read_rule():
|
|
||||||
df = pd.read_excel(os.path.join(basedir, 'rule', 'exclusive_classfication.xlsx'))
|
|
||||||
newdf = pd.DataFrame()
|
|
||||||
newdf['c1'] = df['c2']
|
|
||||||
newdf['c2'] = df['c1']
|
|
||||||
res = pd.concat([df, newdf])
|
|
||||||
return res.reset_index()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def read_cols():
|
|
||||||
df = pd.read_excel(os.path.join(basedir, 'rule', 'columns.xlsx'))
|
|
||||||
cols = list(df['cols'].values)
|
|
||||||
return cols
|
|
||||||
|
|
||||||
def use_rule(self, chipname, classfication):
|
def use_rule(self, chipname, classfication):
|
||||||
may_classfic = set(self.rule[self.rule['c1'] == classfication]['c2'])
|
may_classfic = set(self.rule[self.rule['c1'] == classfication]['c2'])
|
||||||
|
|
@ -357,9 +231,10 @@ class AutoLayout:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def judge_data(self, chipname, library_data):
|
def judge_data(self, chipname, library_data):
|
||||||
|
"""
|
||||||
|
约束条件
|
||||||
|
"""
|
||||||
size = library_data['size']
|
size = library_data['size']
|
||||||
# customer = library_data['customer']
|
|
||||||
# library = library_data['library']
|
|
||||||
classification = library_data['classification']
|
classification = library_data['classification']
|
||||||
is_balance_lib = library_data['is_balance_lib']
|
is_balance_lib = library_data['is_balance_lib']
|
||||||
|
|
||||||
|
|
@ -383,9 +258,19 @@ class AutoLayout:
|
||||||
if is_balance_lib == '否' and self.chip_speciallib_size[chipname] + size > 250:
|
if is_balance_lib == '否' and self.chip_speciallib_size[chipname] + size > 250:
|
||||||
splibrary = False
|
splibrary = False
|
||||||
|
|
||||||
|
# 甲基化文库不能大于250G
|
||||||
|
spmethylibrary = True
|
||||||
|
if is_balance_lib == '甲基化' and self.chip_methylib_size[chipname] + size > 250:
|
||||||
|
spmethylibrary = False
|
||||||
|
|
||||||
|
# 不使用平衡文库
|
||||||
|
if not self.is_use_balance:
|
||||||
|
splibrary = True
|
||||||
|
spmethylibrary = True
|
||||||
|
|
||||||
# 碱基不平衡不过不添加,保证前面的数据, 在数据达到1200G的时候开始
|
# 碱基不平衡不过不添加,保证前面的数据, 在数据达到1200G的时候开始
|
||||||
base_balance = True
|
base_balance = True
|
||||||
if self.chip_size[chipname] > 800:
|
if self.chip_size[chipname] > 900:
|
||||||
current_data = copy.deepcopy(self.index_assignments[chipname])
|
current_data = copy.deepcopy(self.index_assignments[chipname])
|
||||||
new_data = library_data['data']
|
new_data = library_data['data']
|
||||||
current_data.extend(new_data)
|
current_data.extend(new_data)
|
||||||
|
|
@ -393,17 +278,21 @@ class AutoLayout:
|
||||||
if is_not_balance_list:
|
if is_not_balance_list:
|
||||||
base_balance = False
|
base_balance = False
|
||||||
|
|
||||||
if sizelimit and notrepeatbarcode and exclusive_classific and splibrary and base_balance:
|
if sizelimit and notrepeatbarcode and exclusive_classific and splibrary and base_balance and spmethylibrary:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def add_loc_num(self):
|
def add_loc_num(self):
|
||||||
# 有nextera文库 必须满足大于50G
|
"""
|
||||||
|
锚定芯片号增加
|
||||||
|
"""
|
||||||
|
# 有nextera, 华大文库 必须满足大于50G
|
||||||
chipname = f'chip{self.loc_chip_num}'
|
chipname = f'chip{self.loc_chip_num}'
|
||||||
nextera_size = self.chip_speciallib_nextera_size[chipname]
|
nextera_size = self.chip_speciallib_nextera_size[chipname]
|
||||||
if nextera_size > 50 or nextera_size == 0:
|
huada_size = self.chip_speciallib_huada_size[chipname]
|
||||||
self.loc_chip_num += 1
|
print(chipname, huada_size, nextera_size)
|
||||||
else:
|
flag = True
|
||||||
|
if 0 < nextera_size < 50:
|
||||||
# 有nextera文库,但是不满足50G 去除
|
# 有nextera文库,但是不满足50G 去除
|
||||||
nextary_barcode = set()
|
nextary_barcode = set()
|
||||||
no_nextary_data = list()
|
no_nextary_data = list()
|
||||||
|
|
@ -416,6 +305,26 @@ class AutoLayout:
|
||||||
self.index_assignments[chipname] = no_nextary_data
|
self.index_assignments[chipname] = no_nextary_data
|
||||||
self.chip_barcode_recode[chipname] -= nextary_barcode
|
self.chip_barcode_recode[chipname] -= nextary_barcode
|
||||||
self.chip_speciallib_nextera_size[chipname] = 0
|
self.chip_speciallib_nextera_size[chipname] = 0
|
||||||
|
self.chip_size[chipname] -= nextera_size
|
||||||
|
flag = False
|
||||||
|
if 0 < huada_size < 50:
|
||||||
|
# 有华大文库,但是不满足50G 去除
|
||||||
|
huada_barcode = set()
|
||||||
|
no_huada_data = list()
|
||||||
|
for libdata in self.index_assignments[chipname]:
|
||||||
|
if libdata['classification'] != '华大':
|
||||||
|
no_huada_data.append(libdata)
|
||||||
|
else:
|
||||||
|
self.no_assign_data.append(libdata)
|
||||||
|
huada_barcode.update(libdata['barcode'])
|
||||||
|
self.index_assignments[chipname] = no_huada_data
|
||||||
|
self.chip_barcode_recode[chipname] -= huada_barcode
|
||||||
|
self.chip_speciallib_huada_size[chipname] = 0
|
||||||
|
self.chip_size[chipname] -= huada_size
|
||||||
|
flag = False
|
||||||
|
if flag:
|
||||||
|
print(self.loc_chip_num)
|
||||||
|
self.loc_chip_num += 1
|
||||||
|
|
||||||
def assign_samples(self):
|
def assign_samples(self):
|
||||||
ori_library_data = list()
|
ori_library_data = list()
|
||||||
|
|
@ -424,44 +333,48 @@ class AutoLayout:
|
||||||
raise UserWarning('提供excel没有 未测 sheet ,请核查!')
|
raise UserWarning('提供excel没有 未测 sheet ,请核查!')
|
||||||
ori_library_df = pd.DataFrame(self.ori_data['未测'])
|
ori_library_df = pd.DataFrame(self.ori_data['未测'])
|
||||||
|
|
||||||
# need_col = ['status', '#library', 'sublibrary', 'i5', 'i7', 'data_needed', 'real_data', 'customer',
|
# 检查提供excel 是否有必须表头
|
||||||
# 'classification', 'priority', 'time', '拆分方式', 'barcode', 'is_balance_lib', '备注',
|
|
||||||
# 'TIPS1', 'TIPS2', 'TIPS3'
|
|
||||||
# ]
|
|
||||||
self.need_cols = self.read_cols()
|
|
||||||
get_col = set(ori_library_df.columns)
|
get_col = set(ori_library_df.columns)
|
||||||
unhave_col = set(self.need_cols) - get_col
|
unhave_col = set(self.need_cols) - get_col
|
||||||
|
|
||||||
if unhave_col:
|
if unhave_col:
|
||||||
unhave_fom = '; '.join(unhave_col)
|
unhave_from = '; '.join(unhave_col)
|
||||||
raise UserWarning(f'未测表里没有{unhave_fom} 表头,请核查!')
|
raise UserWarning(f'未测表里没有{unhave_from} 表头,请核查!')
|
||||||
|
|
||||||
|
# 数据标准格式
|
||||||
numeric_mask = pd.to_numeric(ori_library_df['data_needed'], errors='coerce').notna()
|
numeric_mask = pd.to_numeric(ori_library_df['data_needed'], errors='coerce').notna()
|
||||||
time_mask = pd.to_datetime(ori_library_df['time'], errors='coerce').notna()
|
time_mask = pd.to_datetime(ori_library_df['time'], errors='coerce').notna()
|
||||||
|
|
||||||
# 添加处理status列的逻辑
|
# 添加处理status列的逻辑
|
||||||
status_mask = ori_library_df['status'] == '暂不排样'
|
status_mask = ori_library_df['status'] == '暂不排样'
|
||||||
|
|
||||||
|
# 非正常barcode
|
||||||
|
barcode_mask = ori_library_df['barcode'].str.len() != 16
|
||||||
|
|
||||||
ori_library_df['note'] = ''
|
ori_library_df['note'] = ''
|
||||||
ori_library_df.loc[~numeric_mask, 'note'] = 'data_needed 列非数字'
|
ori_library_df.loc[~numeric_mask, 'note'] = 'data_needed 列非数字'
|
||||||
ori_library_df.loc[~time_mask, 'note'] = 'time 列非日期'
|
ori_library_df.loc[~time_mask, 'note'] = 'time 列非日期'
|
||||||
ori_library_df.loc[status_mask, 'note'] = '暂不排样'
|
ori_library_df.loc[status_mask, 'note'] = '暂不排样'
|
||||||
|
# ori_library_df.loc[barcode_mask, 'note'] = '非16位barcode'
|
||||||
|
|
||||||
# need_col.append('note')
|
no_ori_data = ori_library_df[~(numeric_mask & time_mask) | status_mask | barcode_mask]
|
||||||
|
|
||||||
no_ori_data = ori_library_df[~(numeric_mask & time_mask) | status_mask]
|
|
||||||
|
|
||||||
self.no_assign_data.extend(no_ori_data.to_dict('records'))
|
self.no_assign_data.extend(no_ori_data.to_dict('records'))
|
||||||
|
|
||||||
# 使用布尔索引筛选出不是数字和非日期的行
|
# 使用布尔索引筛选出不是数字和非日期的行,并且不是暂不排样的行
|
||||||
ori_library_df = ori_library_df[(numeric_mask & time_mask) & ~status_mask]
|
ori_library_df = ori_library_df[(numeric_mask & time_mask) & ~status_mask & ~barcode_mask]
|
||||||
|
|
||||||
|
# 某个客户的检测的数据超过1个T就单独处理
|
||||||
|
# summary = ori_library_df.groupby('customer').agg({'data_needed': 'sum'})
|
||||||
|
# print(summary)
|
||||||
|
|
||||||
|
# 时间格式化
|
||||||
|
ori_library_df['time'] = pd.to_datetime(ori_library_df['time'], errors='coerce')
|
||||||
ori_library_df['level'] = ori_library_df.apply(self.level, axis=1)
|
ori_library_df['level'] = ori_library_df.apply(self.level, axis=1)
|
||||||
|
|
||||||
# # 极致客户有重复的,把等级调到0,防止放到了最后,到了未测里
|
# 极致客户有重复的,把等级调到0,防止放到了最后,到了未测里
|
||||||
# duplicate_name = ori_library_df[ori_library_df['level'] == 2].duplicated(subset='barcode', keep=False)
|
ori_library_df.loc[
|
||||||
# # 将 'level' 列的值改为 0
|
(ori_library_df.duplicated(subset='barcode')) & (ori_library_df['level'] == 20), 'level'] = 19
|
||||||
# ori_library_df.loc[duplicate_name, 'level'] = 0
|
|
||||||
|
|
||||||
for library, library_df in ori_library_df.groupby('#library'):
|
for library, library_df in ori_library_df.groupby('#library'):
|
||||||
|
|
||||||
|
|
@ -470,10 +383,10 @@ class AutoLayout:
|
||||||
# 文库内部有重复
|
# 文库内部有重复
|
||||||
if len(library_df['barcode'].values) > len(set(library_df['barcode'].values)):
|
if len(library_df['barcode'].values) > len(set(library_df['barcode'].values)):
|
||||||
library_df['note'] = '文库内部有重复'
|
library_df['note'] = '文库内部有重复'
|
||||||
library_df.loc[:, 'time'] = library_df['time'].apply(format_date)
|
|
||||||
self.no_assign_data.extend(library_df.to_dict('records'))
|
self.no_assign_data.extend(library_df.to_dict('records'))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# 拆分处理
|
||||||
flag = False
|
flag = False
|
||||||
if size > (self.data_limit) / 2:
|
if size > (self.data_limit) / 2:
|
||||||
library_df['data_needed'] = library_df['data_needed'] / 2
|
library_df['data_needed'] = library_df['data_needed'] / 2
|
||||||
|
|
@ -491,6 +404,7 @@ class AutoLayout:
|
||||||
data=library_df[self.need_cols].to_dict('records')
|
data=library_df[self.need_cols].to_dict('records')
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# 拆分对半
|
||||||
if flag:
|
if flag:
|
||||||
self.return_log.append(f'文库{library} 已做拆分处理, 请注意!!! ')
|
self.return_log.append(f'文库{library} 已做拆分处理, 请注意!!! ')
|
||||||
ori_library_data.append(dict(
|
ori_library_data.append(dict(
|
||||||
|
|
@ -504,43 +418,40 @@ class AutoLayout:
|
||||||
classification=library_df['classification'].values[0],
|
classification=library_df['classification'].values[0],
|
||||||
data=library_df[self.need_cols].to_dict('records')
|
data=library_df[self.need_cols].to_dict('records')
|
||||||
))
|
))
|
||||||
ori_sort_data = sorted(ori_library_data, key=lambda x: (x['level'], x['time'], -x['size']))
|
self.ori_lib_data = sorted(ori_library_data, key=lambda x: (x['level'], x['time']))
|
||||||
|
|
||||||
i = 0
|
# self.ori_lib_data = ori_sort_data
|
||||||
while ori_sort_data:
|
|
||||||
library_data = ori_sort_data[0]
|
n = 1
|
||||||
|
while self.ori_lib_data:
|
||||||
|
n += 1
|
||||||
|
print(n)
|
||||||
|
library_data = self.ori_lib_data[0]
|
||||||
chipname = f'chip{self.loc_chip_num}'
|
chipname = f'chip{self.loc_chip_num}'
|
||||||
|
|
||||||
# 空白芯片直接添加
|
# 空白芯片直接添加
|
||||||
if chipname not in self.index_assignments:
|
if chipname not in self.index_assignments:
|
||||||
self.add_new_data(chipname, library_data)
|
self.add_new_data(chipname, library_data)
|
||||||
ori_sort_data.remove(library_data)
|
self.ori_lib_data.remove(library_data)
|
||||||
i += 1
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 判断条件
|
# 判断条件
|
||||||
if self.judge_data(chipname, library_data):
|
if self.judge_data(chipname, library_data):
|
||||||
self.add_new_data(chipname, library_data, newer=False)
|
self.add_new_data(chipname, library_data, newer=False)
|
||||||
ori_sort_data.remove(library_data)
|
self.ori_lib_data.remove(library_data)
|
||||||
i += 1
|
|
||||||
else:
|
else:
|
||||||
for j in range(len(ori_sort_data)):
|
for j in range(len(self.ori_lib_data)):
|
||||||
newlibrary_data = ori_sort_data[j]
|
newlibrary_data = self.ori_lib_data[j]
|
||||||
if self.judge_data(chipname, newlibrary_data):
|
if self.judge_data(chipname, newlibrary_data):
|
||||||
ori_sort_data.remove(newlibrary_data)
|
self.ori_lib_data.remove(newlibrary_data)
|
||||||
i += 1
|
|
||||||
self.add_new_data(chipname, newlibrary_data, newer=False)
|
self.add_new_data(chipname, newlibrary_data, newer=False)
|
||||||
break
|
break
|
||||||
j += 1
|
j += 1
|
||||||
else:
|
else:
|
||||||
# self.loc_chip_num += 1
|
|
||||||
self.add_loc_num()
|
|
||||||
if self.chip_size[chipname] > self.data_limit:
|
|
||||||
# self.loc_chip_num += 1
|
|
||||||
self.add_loc_num()
|
self.add_loc_num()
|
||||||
|
|
||||||
def assign_again(self):
|
if self.chip_size[chipname] > self.data_limit:
|
||||||
pass
|
self.add_loc_num()
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
# self.assign_samples()
|
# self.assign_samples()
|
||||||
|
|
@ -553,9 +464,6 @@ class AutoLayout:
|
||||||
outputpath = os.path.join(self.output, 'result', outputname)
|
outputpath = os.path.join(self.output, 'result', outputname)
|
||||||
writer = pd.ExcelWriter(outputpath)
|
writer = pd.ExcelWriter(outputpath)
|
||||||
|
|
||||||
# ga = Ga(sheets=self.index_assignments)
|
|
||||||
# self.index_assignments = ga.run()
|
|
||||||
|
|
||||||
chip_loc = 1
|
chip_loc = 1
|
||||||
librarynum = 0
|
librarynum = 0
|
||||||
for chip_idx, chip_assignments in self.index_assignments.items():
|
for chip_idx, chip_assignments in self.index_assignments.items():
|
||||||
|
|
@ -564,15 +472,21 @@ class AutoLayout:
|
||||||
df = pd.DataFrame(chip_assignments)
|
df = pd.DataFrame(chip_assignments)
|
||||||
df['time'] = df['time'].dt.strftime('%Y-%m-%d')
|
df['time'] = df['time'].dt.strftime('%Y-%m-%d')
|
||||||
|
|
||||||
if df['data_needed'].sum() < 1600 or librarynum > self.librarynum:
|
|
||||||
df['note'] = '排样数据量不足1600或者排样管数超标'
|
|
||||||
self.no_assign_data.extend(df.to_dict('records'))
|
|
||||||
continue
|
|
||||||
librarynum += len(set(df['#library'].values))
|
|
||||||
if [method for method in df['拆分方式'].values if '极致' in method]:
|
if [method for method in df['拆分方式'].values if '极致' in method]:
|
||||||
addname = 'X'
|
addname = 'X'
|
||||||
else:
|
else:
|
||||||
addname = ''
|
addname = ''
|
||||||
|
|
||||||
|
if df['data_needed'].sum() < 1600 and not addname:
|
||||||
|
df['note'] = '排样数据量不足1600G'
|
||||||
|
self.no_assign_data.extend(df.to_dict('records'))
|
||||||
|
continue
|
||||||
|
if librarynum > self.librarynum:
|
||||||
|
df['note'] = '排样管数超标'
|
||||||
|
self.no_assign_data.extend(df.to_dict('records'))
|
||||||
|
continue
|
||||||
|
librarynum += len(set(df['#library'].values))
|
||||||
|
|
||||||
self.dec_barcode_radio(chip_idx)
|
self.dec_barcode_radio(chip_idx)
|
||||||
chipname = addname + chip_idx
|
chipname = addname + chip_idx
|
||||||
|
|
||||||
|
|
@ -588,8 +502,10 @@ class AutoLayout:
|
||||||
res_df = pd.concat([df, df_sum], axis=1)
|
res_df = pd.concat([df, df_sum], axis=1)
|
||||||
res_df.to_excel(writer, sheet_name=chipname, index=False)
|
res_df.to_excel(writer, sheet_name=chipname, index=False)
|
||||||
chip_loc += 1
|
chip_loc += 1
|
||||||
|
# self.no_assign_data.extend(self.diffic_assign_data)
|
||||||
no_assign_df = pd.DataFrame(self.no_assign_data)
|
no_assign_df = pd.DataFrame(self.no_assign_data)
|
||||||
no_assign_df = no_assign_df.applymap(lambda x: format_date(x) if isinstance(x, pd.Timestamp) else x)
|
no_assign_df = no_assign_df.applymap(lambda x: format_date(x) if isinstance(x, pd.Timestamp) else x)
|
||||||
|
if not no_assign_df.empty:
|
||||||
no_assign_df = no_assign_df[self.need_cols]
|
no_assign_df = no_assign_df[self.need_cols]
|
||||||
no_assign_df.to_excel(writer, sheet_name='未测', index=False)
|
no_assign_df.to_excel(writer, sheet_name='未测', index=False)
|
||||||
if self.return_log:
|
if self.return_log:
|
||||||
|
|
@ -601,7 +517,6 @@ class AutoLayout:
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'example', 'input排样表.xlsx')
|
filepath = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'example', 'input排样表.xlsx')
|
||||||
# excel_file = 'example/input排样表.xlsx'
|
|
||||||
output_file = ''
|
output_file = ''
|
||||||
layout = AutoLayout(filepath, output_file)
|
layout = AutoLayout(filepath, output_file)
|
||||||
layout.run()
|
layout.run()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue