diff --git a/tools/t7.py b/tools/t7.py index bb93621..ebf9ca6 100644 --- a/tools/t7.py +++ b/tools/t7.py @@ -26,6 +26,8 @@ class AutoLayout: self.librarynum = int(librarynum) self.data_limit = data_limit self.data_lower = data_lower + self.get_col = list() + self.items = list() # 芯片原始数据读取 self.ori_data = self.read_excel() @@ -54,7 +56,7 @@ class AutoLayout: self.rule = self.read_rule() self.rule_exclusive_customer = self.read_rule_exclusive_customer() - # 子文库名称 + # subsamplename self.chip_sublib = defaultdict(set) # 不平衡文库 @@ -93,11 +95,12 @@ class AutoLayout: 原始数据处理 :return: """ - merge = pd.read_excel(self.path, None) - ori_data = dict() - for name, sheet in merge.items(): - sheet.fillna('', inplace=True) - ori_data[name] = sheet.to_dict('records') + # 获取表头备注 + nrow = pd.read_excel(self.path, nrows=1) + self.items = nrow.to_dict('records') + merge = pd.read_excel(self.path, skiprows=[1]) + merge.fillna('', inplace=True) + ori_data = merge.to_dict('records') return ori_data @staticmethod @@ -125,26 +128,26 @@ class AutoLayout: if df.empty: return ratio_sites, is_not_balance_list s, e = 0, 16 - if maxt == 'i7': + if maxt == 'indexi7': s, e = 8, 16 - if maxt == 'i5': + if maxt == 'indexi5': s, e = 0, 8 num = e - s - df['barcode'] = df['barcode'].str.slice(s, e) - barcode_df = pd.DataFrame(df['barcode'].str.split('', expand=True).iloc[:, 1:-1].values, - columns=['T' + str(x) for x in range(num)]).join(df['data_needed']) - total = barcode_df['data_needed'].sum() + df['indexi5i7'] = df['indexi5i7'].str.slice(s, e) + barcode_df = pd.DataFrame(df['indexi5i7'].str.split('', expand=True).iloc[:, 1:-1].values, + columns=['T' + str(x) for x in range(num)]).join(df['orderdatavolume']) + total = barcode_df['orderdatavolume'].sum() for i in range(num): column = 'T' + str(i) - col_df = barcode_df.groupby(column).agg({'data_needed': 'sum'}) + col_df = barcode_df.groupby(column).agg({'orderdatavolume': 'sum'}) # 去掉N计数 if 'N' in col_df.index: - base_n_size = col_df.loc['N', 'data_needed'] + base_n_size = col_df.loc['N', 'orderdatavolume'] col_df = col_df.drop('N') else: base_n_size = 0 - col_df['ratio'] = (col_df['data_needed']) / (total - base_n_size) + col_df['ratio'] = (col_df['orderdatavolume']) / (total - base_n_size) ratio = col_df['ratio'].to_dict() ratio_sites[i] = ratio A, B, C, D, E, F, G = list(), list(), list(), list(), list(), list(), list() @@ -191,25 +194,19 @@ class AutoLayout: today_date = datetime.now() - if 'nextera' in row['classification'].lower(): + if 'nextera' in row['librarystructure'].lower(): return 1000 - if '华大' in row['classification']: + if '华大' in row['librarystructure']: return 1100 - if '超加急' in row['priority']: - return 1500 - - if row['拆分方式'] == '极致周期' or '极致' in row['拆分方式']: + if row['cycletype'] == '极致周期' or '极致' in row['cycletype']: return 2000 - if '加急' in row['priority']: + if row['retestflag'] == '是': return 3000 - if '补测' in row['priority']: - return 4000 - - mytime = row['time'] + mytime = row['receivedtime'] # 判断日期是之前的还是之后的 if mytime < today_date: return 5000 @@ -224,17 +221,17 @@ class AutoLayout: same_barcode_df = pd.DataFrame( [spdata for data in self.ori_lib_data if data['level'] == 1900 for spdata in data['data']]) - # 按照 'barcode' 列进行分组 + # 按照 'indexi5i7' 列进行分组 if same_barcode_df.empty: return - grouped = same_barcode_df.groupby('barcode') + grouped = same_barcode_df.groupby('indexi5i7') - # 获取具有重复的 'barcode' 分组 + # 获取具有重复的 'indexi5i7' 分组 duplicate_groups = grouped.filter(lambda x: len(x) > 1) # 提取这些分组,计算文库重复次数 - grouped_names = duplicate_groups.groupby('barcode')['#library'].apply(list).reset_index() - random_list = list(set(tuple(sublst) for sublst in list(grouped_names['#library']))) + grouped_names = duplicate_groups.groupby('indexi5i7')['samplename'].apply(list).reset_index() + random_list = list(set(tuple(sublst) for sublst in list(grouped_names['samplename']))) new_lst = [spdata for data in random_list for spdata in data] counts = Counter(new_lst) @@ -255,17 +252,17 @@ class AutoLayout: """ self.index_assignments[chipname].extend(library_data['data']) - self.chip_barcode_recode[chipname].update({item['barcode'] for item in library_data['data']}) - self.chip_barcodei7_recode[chipname].update({item['i7'] for item in library_data['data']}) - self.chip_barcodei5_recode[chipname].update({item['i5'] for item in library_data['data']}) + self.chip_barcode_recode[chipname].update({item['indexi5i7'] for item in library_data['data']}) + self.chip_barcodei7_recode[chipname].update({item['indexi7'] for item in library_data['data']}) + self.chip_barcodei5_recode[chipname].update({item['indexi5'] for item in library_data['data']}) # 华大的 文库 i7 不能重复,添加N+i7 if '华大' in library_data['classification']: - self.chip_barcode_recode[chipname].update({'N' * 8 + item['i7'] for item in library_data['data']}) - # self.chip_barcode_recode[chipname].update({item['i5'] + 'N' * 8 for item in library_data['data']}) + self.chip_barcode_recode[chipname].update({'N' * 8 + item['indexi7'] for item in library_data['data']}) + # self.chip_barcode_recode[chipname].update({item['indexi5'] + 'N' * 8 for item in library_data['data']}) # 子文库 - self.chip_sublib[chipname].update({item['sublibrary'] for item in library_data['data']}) + self.chip_sublib[chipname].update({item['subsamplename'] for item in library_data['data']}) self.chip_customer[chipname].add(library_data['customer']) self.chip_classification[chipname].add(library_data['classification']) @@ -273,13 +270,11 @@ class AutoLayout: if newer: self.chip_size[chipname] = library_data['size'] self.chip_size_N[chipname] = 0 - if 'N' in library_data['data'][0]['barcode']: - # print(library_data['data'][0]['barcode']) + if 'N' in library_data['data'][0]['indexi5i7']: self.chip_size_N[chipname] = library_data['size'] - # if library_data['classification'] in ['扩增子', '不平衡文库', '单细胞文库以及甲基化']: if library_data['is_balance_lib'] == '否': self.chip_speciallib_size[chipname] = library_data['size'] - elif library_data['is_balance_lib'] == '甲基化': + elif '甲基化' in library_data['classification']: self.chip_methylib_size[chipname] = library_data['size'] else: self.chip_speciallib_size[chipname] = 0 @@ -297,15 +292,13 @@ class AutoLayout: self.chip_size[chipname] += library_data['size'] if library_data['is_balance_lib'] == '否': self.chip_speciallib_size[chipname] += library_data['size'] - if library_data['is_balance_lib'] == '甲基化': + if '甲基化' in library_data['classification']: self.chip_methylib_size[chipname] += library_data['size'] if 'nextera' in library_data['classification'].lower(): self.chip_speciallib_nextera_size[chipname] += library_data['size'] if '华大' in library_data['classification']: self.chip_speciallib_huada_size[chipname] += library_data['size'] - - if 'N' in library_data['data'][0]['barcode']: - # print(library_data['data'][0]['barcode']) + if 'N' in library_data['data'][0]['indexi5i7']: self.chip_size_N[chipname] += library_data['size'] def use_rule_exclusive_classfication(self, chipname, classfication): @@ -331,7 +324,7 @@ class AutoLayout: """ size = library_data['size'] size_N = 0 - if 'N' in library_data['data'][0]['barcode']: + if 'N' in library_data['data'][0]['indexi5i7']: size_N = library_data['size'] classification = library_data['classification'] customer = library_data['customer'] @@ -346,11 +339,11 @@ class AutoLayout: # barcode有重复 notrepeatbarcode = True - if self.chip_barcode_recode[chipname].intersection({item['barcode'] for item in library_data['data']}) or \ + if self.chip_barcode_recode[chipname].intersection({item['indexi5i7'] for item in library_data['data']}) or \ self.chip_barcode_recode[chipname].intersection( - {'N' * 8 + item['i7'] for item in library_data['data']}) or \ + {'N' * 8 + item['indexi7'] for item in library_data['data']}) or \ self.chip_barcode_recode[chipname].intersection( - {item['i5'] + 'N' * 8 for item in library_data['data']}): + {item['indexi5'] + 'N' * 8 for item in library_data['data']}): notrepeatbarcode = False # print(chipname, library, 'barcode有重复') @@ -411,15 +404,15 @@ class AutoLayout: base_balance = True notrepeatbarcode = True if self.chip_barcodei7_recode[chipname].intersection( - {item['i7'] for item in library_data['data']}) and max_barcode == 'i7': + {item['indexi7'] for item in library_data['data']}) and max_barcode == 'indexi7': notrepeatbarcode = False if self.chip_barcodei5_recode[chipname].intersection( - {item['i5'] for item in library_data['data']}) and max_barcode == 'i5': + {item['indexi5'] for item in library_data['data']}) and max_barcode == 'indexi5': notrepeatbarcode = False # 是个N的取消 - if ('N' * 8 in {item['i5'] for item in library_data['data']}) and max_barcode == 'i5': + if ('N' * 8 in {item['indexi5'] for item in library_data['data']}) and max_barcode == 'indexi5': notrepeatbarcode = False - if ('N' * 8 in {item['i7'] for item in library_data['data']}) and max_barcode == 'i7': + if ('N' * 8 in {item['indexi7'] for item in library_data['data']}) and max_barcode == 'indexi7': notrepeatbarcode = False if self.chip_size[chipname] > 900: current_data = copy.deepcopy(self.index_assignments[chipname]) @@ -431,7 +424,7 @@ class AutoLayout: # 子文库名不能重复 notrepeatsublib = True - if self.chip_sublib[chipname].intersection({item['sublibrary'] for item in library_data['data']}): + if self.chip_sublib[chipname].intersection({item['subsamplename'] for item in library_data['data']}): notrepeatsublib = False if sizelimit and notrepeatbarcode and \ @@ -464,7 +457,7 @@ class AutoLayout: no_nextary_data.append(libdata) else: self.no_assign_data.append(libdata) - nextary_barcode.update(libdata['barcode']) + nextary_barcode.update(libdata['indexi5i7']) self.index_assignments[chipname] = no_nextary_data self.chip_barcode_recode[chipname] -= nextary_barcode self.chip_speciallib_nextera_size[chipname] = 0 @@ -479,7 +472,7 @@ class AutoLayout: no_huada_data.append(libdata) else: self.no_assign_data.append(libdata) - huada_barcode.update(libdata['barcode']) + huada_barcode.update(libdata['indexi5i7']) self.index_assignments[chipname] = no_huada_data self.chip_barcode_recode[chipname] -= huada_barcode self.chip_speciallib_huada_size[chipname] = 0 @@ -490,61 +483,52 @@ class AutoLayout: def assign_samples(self): - if '未测' not in self.ori_data.keys(): - raise UserWarning('提供excel没有 未测 sheet ,请核查!') - ori_library_df = pd.DataFrame(self.ori_data['未测']) + # if '未测' not in self.ori_data.keys(): + # raise UserWarning('提供excel没有 未测 sheet ,请核查!') + ori_library_df = pd.DataFrame(self.ori_data) - # 检查提供excel 是否有必须表头 - get_col = set(ori_library_df.columns) - unhave_col = set(self.need_cols) - get_col - - if unhave_col: - unhave_from = '; '.join(unhave_col) - raise UserWarning(f'未测表里没有 {unhave_from} 表头,请核查!') + # # 检查提供excel 是否有必须表头 + # get_col = set(ori_library_df.columns) + # unhave_col = set(self.need_cols) - get_col + # + # if unhave_col: + # unhave_from = '; '.join(unhave_col) + # raise UserWarning(f'未测表里没有 {unhave_from} 表头,请核查!') # 数据标准格式 - numeric_mask = pd.to_numeric(ori_library_df['data_needed'], errors='coerce').notna() - time_mask = pd.to_datetime(ori_library_df['time'], errors='coerce').notna() - - # 添加处理status列的逻辑 - status_mask = ori_library_df['status'] == '暂不排样' + numeric_mask = pd.to_numeric(ori_library_df['orderdatavolume'], errors='coerce').notna() + time_mask = pd.to_datetime(ori_library_df['receivedtime'], errors='coerce').notna() # 非正常barcode - barcode_mask = ori_library_df['barcode'].str.len() != 16 + barcode_mask = ori_library_df['indexi5i7'].str.len() != 16 ori_library_df['note'] = '' ori_library_df.loc[~numeric_mask, 'note'] = 'data_needed 列非数字' ori_library_df.loc[~time_mask, 'note'] = 'time 列非日期' - ori_library_df.loc[status_mask, 'note'] = '暂不排样' ori_library_df.loc[barcode_mask, 'note'] = '非16位barcode' - - no_ori_data = ori_library_df[~(numeric_mask & time_mask) | status_mask | barcode_mask] - - # 某个客户的检测的数据超过1个T就单独处理 - # summary = ori_library_df.groupby('customer').agg({'data_needed': 'sum'}) - # print(summary) + no_ori_data = ori_library_df[~(numeric_mask & time_mask) | barcode_mask] self.no_assign_data.extend(no_ori_data.to_dict('records')) # 使用布尔索引筛选出不是数字和非日期的行,并且不是暂不排样的行, 以及非16位置barcode - ori_library_df = ori_library_df[(numeric_mask & time_mask) & ~status_mask & ~barcode_mask] + ori_library_df = ori_library_df[(numeric_mask & time_mask) & ~barcode_mask] # 时间格式化 - ori_library_df['time'] = pd.to_datetime(ori_library_df['time'], errors='coerce') + ori_library_df['receivedtime'] = pd.to_datetime(ori_library_df['receivedtime'], errors='coerce') ori_library_df['level'] = ori_library_df.apply(self.level, axis=1) # 极致客户有重复的,把等级调到1900,防止放到了最后,到了未测里 must_lib_df = ori_library_df[ori_library_df['level'] == 2000] - must_lib = set(must_lib_df[must_lib_df.duplicated(subset='barcode', keep=False)]['#library'].to_list()) - ori_library_df.loc[ori_library_df['#library'].isin(must_lib), 'level'] = 1900 + must_lib = set(must_lib_df[must_lib_df.duplicated(subset='indexi5i7', keep=False)]['samplename'].to_list()) + ori_library_df.loc[ori_library_df['samplename'].isin(must_lib), 'level'] = 1900 - for library, library_df in ori_library_df.groupby('#library'): + for library, library_df in ori_library_df.groupby('samplename'): - size = library_df['data_needed'].sum() - is_balance_lib = library_df['is_balance_lib'].values[0] + size = library_df['orderdatavolume'].sum() + is_balance_lib = library_df['librarybalancedflag'].values[0] # 文库内部有重复 - if len(library_df['barcode'].values) > len(set(library_df['barcode'].values)): + if len(library_df['indexi5i7'].values) > len(set(library_df['indexi5i7'].values)): library_df['note'] = '文库内部有重复' self.no_assign_data.extend(library_df.to_dict('records')) continue @@ -552,53 +536,53 @@ class AutoLayout: # 不平衡文库 大于250G 的数据 先进行拆分 if is_balance_lib == '否' and size > 250: self.return_log.append(f'文库{library} 是不平衡文库, 数据为{size}, 大于250G, 已做拆分处理, 请注意!!! ') - data_needed = library_df['data_needed'].copy() + data_needed = library_df['orderdatavolume'].copy() for num in range(int(size), 0, -200): addnum = 200 if num <= 200: addnum = num - library_df['data_needed'] = (addnum / size) * data_needed + library_df['orderdatavolume'] = (addnum / size) * data_needed self.ori_lib_data.append(dict( library=library, - is_balance_lib=library_df['is_balance_lib'].values[0], - size=library_df['data_needed'].sum(), - split_method=library_df['拆分方式'].values[0], - time=library_df['time'].values[0], + is_balance_lib=library_df['librarybalancedflag'].values[0], + size=library_df['orderdatavolume'].sum(), + split_method=library_df['cycletype'].values[0], + time=library_df['receivedtime'].values[0], level=1950, - customer=library_df['customer'].values[0], - classification=library_df['classification'].values[0], - data=library_df[self.need_cols].to_dict('records') + customer=library_df['companynamea'].values[0], + classification=library_df['librarystructure'].values[0], + data=library_df.to_dict('records') )) self.split_lib.add(library) continue # 拆分处理 分为了2个大文库 if size > self.data_limit / 2: - library_df['data_needed'] = library_df['data_needed'] / 2 + library_df['orderdatavolume'] = library_df['orderdatavolume'] / 2 self.return_log.append(f'文库{library} 已做拆分处理, 请注意!!! ') self.ori_lib_data.append(dict( library=library, - is_balance_lib=library_df['is_balance_lib'].values[0], - size=library_df['data_needed'].sum(), - split_method=library_df['拆分方式'].values[0], - time=library_df['time'].values[0], + is_balance_lib=library_df['librarybalancedflag'].values[0], + size=library_df['orderdatavolume'].sum(), + split_method=library_df['cycletype'].values[0], + time=library_df['receivedtime'].values[0], level=library_df['level'].values[0], - customer=library_df['customer'].values[0], - classification=library_df['classification'].values[0], - data=library_df[self.need_cols].to_dict('records') + customer=library_df['companynamea'].values[0], + classification=library_df['librarystructure'].values[0], + data=library_df.to_dict('records') )) self.ori_lib_data.append(dict( library=library, - is_balance_lib=library_df['is_balance_lib'].values[0], - size=library_df['data_needed'].sum(), - split_method=library_df['拆分方式'].values[0], - time=library_df['time'].values[0], + is_balance_lib=library_df['librarybalancedflag'].values[0], + size=library_df['orderdatavolume'].sum(), + split_method=library_df['cycletype'].values[0], + time=library_df['receivedtime'].values[0], level=library_df['level'].values[0], - customer=library_df['customer'].values[0], - classification=library_df['classification'].values[0], - data=library_df[self.need_cols].to_dict('records') + customer=library_df['companynamea'].values[0], + classification=library_df['librarystructure'].values[0], + data=library_df.to_dict('records') )) self.combinations_same_barcode() @@ -643,30 +627,31 @@ class AutoLayout: if not chip_assignments: continue df = pd.DataFrame(chip_assignments) - if df['data_needed'].sum() < self.data_lower: + if df['orderdatavolume'].sum() < self.data_lower: left_data.extend(chip_assignments) no_need_chipname.append(chip_idx) for chip_idx in no_need_chipname: del self.index_assignments[chip_idx] - + if not left_data: + return ori_library_df = pd.DataFrame(left_data) ori_library_df['level'] = ori_library_df.apply(self.level, axis=1) ori_lib_data = list() - for library, library_df in ori_library_df.groupby('#library'): + for library, library_df in ori_library_df.groupby('samplename'): level = library_df['level'].values[0] if library in self.split_lib: level = 1950 ori_lib_data.append(dict( library=library, - is_balance_lib=library_df['is_balance_lib'].values[0], - size=library_df['data_needed'].sum(), - split_method=library_df['拆分方式'].values[0], - time=library_df['time'].values[0], + is_balance_lib=library_df['librarybalancedflag'].values[0], + size=library_df['orderdatavolume'].sum(), + split_method=library_df['cycletype'].values[0], + time=library_df['receivedtime'].values[0], level=level, - customer=library_df['customer'].values[0], - classification=library_df['classification'].values[0], - data=library_df[self.need_cols].to_dict('records') + customer=library_df['companynamea'].values[0], + classification=library_df['librarystructure'].values[0], + data=library_df.to_dict('records') )) ori_lib_data = sorted(ori_lib_data, key=lambda x: (x['level'], x['time'], -x['size'])) @@ -700,14 +685,14 @@ class AutoLayout: self.add_loc_num(chipname) def run(self): - # print('# 测试代码') - # self.assign_samples() - # self.assign_again() + print('# 测试代码') + self.assign_samples() + self.assign_again_size() try: self.assign_samples() self.assign_again_size() - # self.assign_again_size(max_barcode='i7') - # self.assign_again_size(max_barcode='i5') + # self.assign_again_size(max_barcode='indexi7') + # self.assign_again_size(max_barcode='indexi5') except Exception as e: self.return_log.append(f'T7排样出错, 请联系!{e}') self.index_assignments = {} @@ -721,18 +706,15 @@ class AutoLayout: if not chip_assignments: continue df = pd.DataFrame(chip_assignments) - df['time'] = df['time'].dt.strftime('%Y-%m-%d') + df['receivedtime'] = df['receivedtime'].dt.strftime('%Y-%m-%d') - if [method for method in df['拆分方式'].values if '极致' in method]: + if [method for method in df['cycletype'].values if '极致' in method]: addname = 'X' else: addname = '' - other_name = '' - # if 'chipB' in chip_idx and df['barcode'].duplicated().any(): - # other_name = '_i7' - if df['data_needed'].sum() < (self.data_lower - 50) and not addname: + if df['orderdatavolume'].sum() < (self.data_lower - 50) and not addname: df['note'] = f'排样数据量不足{self.data_lower - 50}G' self.no_assign_data.extend(df.to_dict('records')) continue @@ -740,32 +722,26 @@ class AutoLayout: df['note'] = '排样管数超标' self.no_assign_data.extend(df.to_dict('records')) continue - librarynum += len(set(df['#library'].values)) + librarynum += len(set(df['samplename'].values)) self.dec_barcode_radio(chip_idx) chipname = addname + chip_idx + other_name - sum_list = list() - for library, library_df in df.groupby('#library'): - sum_list.append(dict( - 二次拆分=library, - 客户=library_df['customer'].values[0], - 类型=library_df['classification'].values[0], - 打折前=library_df['data_needed'].sum() - )) - df_sum = pd.DataFrame(sum_list) - res_df = pd.concat([df, df_sum], axis=1) - res_df.to_excel(writer, sheet_name=chipname, index=False) + df = pd.concat([pd.DataFrame(self.items), df]).reset_index(drop=True) + + df.to_excel(writer, sheet_name=chipname, index=False) chip_loc += 1 no_assign_df = pd.DataFrame(self.no_assign_data) - no_assign_df = no_assign_df.applymap(lambda x: format_date(x) if isinstance(x, pd.Timestamp) else x) - no_assign_df_not_balance = ','.join(set([lib for lib in no_assign_df['#library'] if lib in self.split_lib])) - if no_assign_df_not_balance: - self.return_log.append(f'文库{no_assign_df_not_balance}有做不平衡文库拆分处理,并且没有排完,请核查!') if not no_assign_df.empty: - no_assign_df = no_assign_df[self.need_cols] - no_assign_df.to_excel(writer, sheet_name='未测', index=False) + no_assign_df = no_assign_df.applymap(lambda x: format_date(x) if isinstance(x, pd.Timestamp) else x) + no_assign_df_not_balance = ','.join(set([lib for lib in no_assign_df['samplename'] if lib in self.split_lib])) + if no_assign_df_not_balance: + self.return_log.append(f'文库{no_assign_df_not_balance}有做不平衡文库拆分处理,并且没有排完,请核查!') + # if not no_assign_df.empty: + # no_assign_df = no_assign_df[self.need_cols] + no_assign_df = pd.concat([pd.DataFrame(self.items), no_assign_df]).reset_index(drop=True) + no_assign_df.to_excel(writer, sheet_name='未测', index=False) if self.return_log: pd.DataFrame(self.return_log).to_excel(writer, sheet_name='log', index=False) writer.close()