diff --git a/tools/t7.py b/tools/t7.py index a6e3c1e..fe14f2e 100644 --- a/tools/t7.py +++ b/tools/t7.py @@ -1,7 +1,7 @@ import copy import os import time -from collections import defaultdict +from collections import defaultdict, Counter from datetime import datetime import pandas as pd @@ -132,27 +132,27 @@ class AutoLayout: today_date = datetime.now() if 'nextera' in row['classification'].lower(): - return 10 + return 1000 if '华大' in row['classification']: - return 11 + return 1100 if row['拆分方式'] == '极致周期' or '极致' in row['拆分方式']: - return 20 + return 2000 + + if '加急' in row['priority']: + return 3000 + + if '补测' in row['priority']: + return 4000 mytime = row['time'] # 判断日期是之前的还是之后的 if mytime < today_date: - return 30 - - if '加急' in row['priority']: - return 40 - - if '补测' in row['priority']: - return 50 + return 5000 else: - return 1000 + return 100000 @staticmethod def read_rule(): @@ -190,6 +190,49 @@ class AutoLayout: ori_data[name] = sheet.to_dict('records') return ori_data + def combinations_same_barcode(self): + """ + barcode 有重复的极致样本 进行排列组合,汇集成新的可能性 + """ + # 筛选有重复的行 + + # same_barcode_data = [data for data in self.ori_lib_data if data['level'] == 1900] + # same_barcode_sorted = sorted(same_barcode_data, key=lambda x: (-x['size'])) + # + # same_barcode_dict = dict() + # for index, data in enumerate(same_barcode_sorted): + # same_barcode_dict[data['library']] = data['level'] + index + 1 + # correct_data = list() + # for data in self.ori_lib_data: + # if data in same_barcode_sorted: + # data['level'] = same_barcode_dict[data['library']] + # correct_data.append(data) + # self.ori_lib_data = correct_data + + same_barcode_df = pd.DataFrame( + [spdata for data in self.ori_lib_data if data['level'] == 1900 for spdata in data['data']]) + + # 按照 'barcode' 列进行分组 + if same_barcode_df.empty: + return + grouped = same_barcode_df.groupby('barcode') + + # 获取具有重复的 'barcode' 分组 + duplicate_groups = grouped.filter(lambda x: len(x) > 1) + + # 提取这些分组,计算文库重复次数 + grouped_names = duplicate_groups.groupby('barcode')['#library'].apply(list).reset_index() + random_list = list(set(tuple(sublst) for sublst in list(grouped_names['#library']))) + new_lst = [spdata for data in random_list for spdata in data] + counts = Counter(new_lst) + + correct_data = list() + for data in self.ori_lib_data: + if data['library'] in counts: + data['level'] -= counts[data['library']] + correct_data.append(data) + self.ori_lib_data = correct_data + def add_new_data(self, chipname, library_data, newer=True): """ 增加新数据到已知芯片上 @@ -241,7 +284,8 @@ class AutoLayout: return False def use_rule_exclusive_customer(self, chipname, customer): - may_classfic = set(self.rule_exclusive_customer[self.rule_exclusive_customer['customer1'] == customer]['customer2']) + may_classfic = set( + self.rule_exclusive_customer[self.rule_exclusive_customer['customer1'] == customer]['customer2']) if self.chip_customer[chipname].intersection(may_classfic): return True return False @@ -285,8 +329,9 @@ class AutoLayout: splibrary = False # 甲基化文库不能大于250G + # 甲基化更改成100G spmethylibrary = True - if is_balance_lib == '甲基化' and self.chip_methylib_size[chipname] + size > 250: + if is_balance_lib == '甲基化' and self.chip_methylib_size[chipname] + size > 100: spmethylibrary = False # 不使用平衡文库 @@ -304,7 +349,12 @@ class AutoLayout: if is_not_balance_list: base_balance = False - if sizelimit and notrepeatbarcode and exclusive_classific and exclusive_customer and splibrary and base_balance and spmethylibrary: + # 华大的文库不能超过限制的一半 + use_huada = True + if self.chip_speciallib_huada_size[chipname] > self.data_limit / 2: + use_huada = False + + if sizelimit and notrepeatbarcode and exclusive_classific and exclusive_customer and splibrary and base_balance and spmethylibrary and use_huada: return True return False @@ -312,7 +362,7 @@ class AutoLayout: """ 锚定芯片号增加 """ - # 有nextera, 华大文库 必须满足大于50G + # 有nextera, 华大文库 必须满足大于50G 到了芯片结算 chipname = f'chip{self.loc_chip_num}' nextera_size = self.chip_speciallib_nextera_size[chipname] huada_size = self.chip_speciallib_huada_size[chipname] @@ -351,7 +401,7 @@ class AutoLayout: self.loc_chip_num += 1 def assign_samples(self): - ori_library_data = list() + # ori_library_data = list() if '未测' not in self.ori_data.keys(): raise UserWarning('提供excel没有 未测 sheet ,请核查!') @@ -396,9 +446,10 @@ class AutoLayout: ori_library_df['time'] = pd.to_datetime(ori_library_df['time'], errors='coerce') ori_library_df['level'] = ori_library_df.apply(self.level, axis=1) - # 极致客户有重复的,把等级调到0,防止放到了最后,到了未测里 - ori_library_df.loc[ - (ori_library_df.duplicated(subset='barcode')) & (ori_library_df['level'] == 20), 'level'] = 19 + # 极致客户有重复的,把等级调到19,防止放到了最后,到了未测里 + must_lib_df = ori_library_df[ori_library_df['level'] == 2000] + must_lib = set(must_lib_df[must_lib_df.duplicated(subset='barcode', keep=False)]['#library'].to_list()) + ori_library_df.loc[ori_library_df['#library'].isin(must_lib), 'level'] = 1900 for library, library_df in ori_library_df.groupby('#library'): @@ -410,13 +461,23 @@ class AutoLayout: self.no_assign_data.extend(library_df.to_dict('records')) continue - # 拆分处理 - flag = False + # 拆分处理 分为了2个大文库 if size > (self.data_limit) / 2: library_df['data_needed'] = library_df['data_needed'] / 2 - flag = True + self.return_log.append(f'文库{library} 已做拆分处理, 请注意!!! ') + self.ori_lib_data.append(dict( + library=library, + is_balance_lib=library_df['is_balance_lib'].values[0], + size=library_df['data_needed'].sum(), + split_method=library_df['拆分方式'].values[0], + time=library_df['time'].values[0], + level=library_df['level'].values[0], + customer=library_df['customer'].values[0], + classification=library_df['classification'].values[0], + data=library_df[self.need_cols].to_dict('records') + )) - ori_library_data.append(dict( + self.ori_lib_data.append(dict( library=library, is_balance_lib=library_df['is_balance_lib'].values[0], size=library_df['data_needed'].sum(), @@ -428,23 +489,9 @@ class AutoLayout: data=library_df[self.need_cols].to_dict('records') )) - # 拆分对半 - if flag: - self.return_log.append(f'文库{library} 已做拆分处理, 请注意!!! ') - ori_library_data.append(dict( - library=library, - is_balance_lib=library_df['is_balance_lib'].values[0], - size=library_df['data_needed'].sum(), - split_method=library_df['拆分方式'].values[0], - time=library_df['time'].values[0], - level=library_df['level'].values[0], - customer=library_df['customer'].values[0], - classification=library_df['classification'].values[0], - data=library_df[self.need_cols].to_dict('records') - )) - self.ori_lib_data = sorted(ori_library_data, key=lambda x: (x['level'], x['time'])) + self.combinations_same_barcode() + self.ori_lib_data = sorted(self.ori_lib_data, key=lambda x: (x['level'], x['time'])) - # self.ori_lib_data = ori_sort_data while self.ori_lib_data: library_data = self.ori_lib_data[0] chipname = f'chip{self.loc_chip_num}' @@ -474,6 +521,7 @@ class AutoLayout: self.add_loc_num() def run(self): + # print('# 测试代码') # self.assign_samples() try: self.assign_samples() @@ -522,7 +570,7 @@ class AutoLayout: res_df = pd.concat([df, df_sum], axis=1) res_df.to_excel(writer, sheet_name=chipname, index=False) chip_loc += 1 - # self.no_assign_data.extend(self.diffic_assign_data) + no_assign_df = pd.DataFrame(self.no_assign_data) no_assign_df = no_assign_df.applymap(lambda x: format_date(x) if isinstance(x, pd.Timestamp) else x) if not no_assign_df.empty: @@ -543,5 +591,3 @@ if __name__ == '__main__': end_time = time.time() execution_time = end_time - start_time print(f"代码执行时间为:{execution_time} 秒") - - # server()