From 0f0e2f97688adafe755ce4007a60ab943505de21 Mon Sep 17 00:00:00 2001 From: chaopower Date: Wed, 20 Dec 2023 17:06:48 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=A2=B1=E5=9F=BA=E4=B8=8D?= =?UTF-8?q?=E5=B9=B3=E8=A1=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/t7.py | 112 ++++++++++++++++++++++------------------------------ 1 file changed, 48 insertions(+), 64 deletions(-) diff --git a/tools/t7.py b/tools/t7.py index d63cd6b..e3697cb 100644 --- a/tools/t7.py +++ b/tools/t7.py @@ -1,3 +1,4 @@ +import copy import os import time from collections import defaultdict @@ -13,7 +14,7 @@ class AutoLayout: 自动化派样 """ - def __init__(self, path, chipnum, output=basedir, data_limit=1800): + def __init__(self, path, chipnum, output=basedir, data_limit=1750): self.path = path self.output = output self.chipnum = int(chipnum) @@ -78,48 +79,18 @@ class AutoLayout: self.chip_customer[chipname].add(library_data['customer']) self.chip_classification[chipname].add(library_data['classification']) - # def add_new_chip(self, library_data): - # """ - # 要新增到芯片上的数据 - # :param library_data: - # :return: - # """ - # chip_num_tmp = self.loc_chip_num - # while True: - # chip_num_tmp += 1 - # chipname_tmp = f'chip{chip_num_tmp}' - # library = library_data['library'] - # if chipname_tmp not in self.index_assignments: - # self.logger.error(f'{library} {chipname_tmp} 常规添加') - # self.add_new_data(chipname_tmp, library_data) - # break - # else: - # is_same_barcode = self.chip_barcode_recode[chipname_tmp].intersection( - # {item['barcode'] for item in library_data['data']}) - # # 没有从重复的index,并且也不互斥的 - # if ((self.chip_size[chipname_tmp] + library_data['size']) > self.data_limit): - # self.logger.error(f'{library} {chipname_tmp} 文库相加大于设定限制') - # if ((self.chip_speciallib_size[chipname_tmp] + library_data['size']) >= 200): - # self.logger.error(f'{library} {chipname_tmp} 不平衡文库相加大于设定限制') - # if is_same_barcode: - # self.logger.error(f'{library} {chipname_tmp} 文库有barcode重复') - # if self.use_rule(chipname_tmp, library_data['customer']): - # self.logger.error(f'{library} {chipname_tmp} 有互斥单位') - # if ((self.chip_size[chipname_tmp] + library_data['size']) <= self.data_limit) \ - # and ((self.chip_speciallib_size[chipname_tmp] + library_data['size']) < 200) \ - # and (not is_same_barcode) \ - # and (not self.use_rule(chipname_tmp, library_data['customer'])): - # self.add_new_data(chipname_tmp, library_data, newer=False) - # break - - def dec_barcode_radio(self, chipname): - data = self.index_assignments[chipname] + def count_barcode_radio(self, data): df = pd.DataFrame(data) + ratio_sites = dict() + is_not_balance_list = [] + if df.empty: + return ratio_sites, is_not_balance_list + df['barcode'] = df['barcode'].str.slice(0, 16) barcode_df = pd.DataFrame(df['barcode'].str.split('', expand=True).iloc[:, 1:-1].values, columns=['T' + str(x) for x in range(16)]).join(df['data_needed']) total = barcode_df['data_needed'].sum() - is_not_balance_list = [] + for i in range(16): column = 'T' + str(i) col_df = barcode_df.groupby(column).agg({'data_needed': 'sum'}) @@ -129,12 +100,10 @@ class AutoLayout: col_df = col_df.drop('N') else: base_N_size = 0 - col_df['ratio'] = (col_df['data_needed']) / (total - base_N_size) - # is_need_base = col_df.index[col_df['ratio'] < 0.088] - - A, B, C, D = list(), list(), list(), list(), ratio = col_df['ratio'].to_dict() + ratio_sites[i] = ratio + A, B, C, D, E, F = list(), list(), list(), list(), list(), list() for decbase in ['A', 'T', 'C', 'G']: if decbase not in ratio: ratio[decbase] = 0 @@ -142,19 +111,31 @@ class AutoLayout: A.append(decbase) if 0.2 <= ratio[decbase] < 0.6: B.append(decbase) - if 0.08 <= ratio[decbase] < 0.2: + if 0.15 <= ratio[decbase] < 0.2: C.append(decbase) - if ratio[decbase] <= 0.8: + if 0.1 <= ratio[decbase] < 0.15: D.append(decbase) - if not ((len(B) + len(C) == 4) or (len(D) == 1 and len(C) == 3)): - is_not_balance_list.append( - '%s 第%s位置,有碱基不平衡,算出结果为 %s' % (chipname, i, ratio) - ) + if 0.08 <= ratio[decbase] < 0.1: + E.append(decbase) + if ratio[decbase] < 0.08: + F.append(decbase) - if len(is_not_balance_list): - self.return_log.append('有碱基不平衡性!') - self.return_log.extend(is_not_balance_list) - print('有碱基不平衡性!\n', '\n'.join(is_not_balance_list)) + A_num, B_num, C_num, D_num, E_num, F_num = len(A), len(B), len(C), len(D), len(E), len(F) + if not ((B_num + C_num + D_num == 4) or (F_num == 1 and (A_num + B_num) == 3) or ( + E_num == 1 and D_num == 1 and (A_num + B_num + C_num) == 2) or ( + E_num == 1 and (A_num + B_num + C_num) == 3)): + is_not_balance_list.append( + '第%s位置,算出结果为 %s' % (i, ratio) + ) + return ratio_sites, is_not_balance_list + + def dec_barcode_radio(self, chipname): + data = self.index_assignments[chipname] + ratio_sites, is_not_balance_list = self.count_barcode_radio(data) + if is_not_balance_list: + desc = '\n'.join(is_not_balance_list) + self.return_log.append(f'芯片{chipname}有碱基不平衡:\n{desc}') + print(f'芯片{chipname}有碱基不平衡:\n{desc}') @staticmethod def level(row): @@ -206,31 +187,34 @@ class AutoLayout: sizelimit = True if self.chip_size[chipname] + size > self.data_limit: sizelimit = False - self.logger.error(f'{library} {chipname} 文库相加大于设定限制') + # barcode有重复 notrepeatbarcode = True if self.chip_barcode_recode[chipname].intersection({item['barcode'] for item in library_data['data']}): notrepeatbarcode = False - self.logger.error(f'{library} {chipname} 文库有barcode重复') - # # 互斥的客户 - # exclusivecostom = True - # if self.use_rule(chipname, customer): - # exclusivecostom = False - # self.logger.error(f'{library} {chipname} 有互斥单位') # 互斥的文库 exclusive_classific = True if self.use_rule(chipname, classification): exclusive_classific = False - self.logger.error(f'{library} {chipname} 有互斥单位') # 不平衡文库大于200G 不能添加 splibrary = True if classification in ['扩增子', '不平衡文库', '单细胞文库', '甲基化'] \ and self.chip_speciallib_size[chipname] + size > 250: splibrary = False - self.logger.error(f'{library} {chipname} 不平衡文库相加大于设定限制') - if sizelimit and notrepeatbarcode and exclusive_classific and splibrary: + + # 碱基不平衡不过不添加,保证前面的数据, 在数据达到1200G的时候开始 + base_balance = True + if self.chip_size[chipname] > 800: + current_data = copy.deepcopy(self.index_assignments[chipname]) + new_data = library_data['data'] + current_data.extend(new_data) + ratio_sites, is_not_balance_list = self.count_barcode_radio(current_data) + if is_not_balance_list: + base_balance = False + + if sizelimit and notrepeatbarcode and exclusive_classific and splibrary and base_balance: return True return False @@ -314,6 +298,7 @@ class AutoLayout: pass def run(self): + # self.assign_samples() try: self.assign_samples() except Exception as e: @@ -328,7 +313,7 @@ class AutoLayout: df = pd.DataFrame(chip_assignments) df['time'] = df['time'].dt.strftime('%Y-%m-%d') if df['data_needed'].sum() < 1500 or chip_loc > self.chipnum: - self.no_assign_data.extend(chip_assignments) + self.no_assign_data.extend(df.to_dict('records')) continue if [method for method in df['拆分方式'].values if '极致' in method]: addname = 'X' @@ -338,7 +323,6 @@ class AutoLayout: df.to_excel(writer, sheet_name=addname + chip_idx, index=False) chip_loc += 1 no_assign_df = pd.DataFrame(self.no_assign_data) - # no_assign_df['time'] = no_assign_df['time'].dt.strftime('%Y-%m-%d') no_assign_df.to_excel(writer, sheet_name='未测', index=False) if self.return_log: pd.DataFrame(self.return_log).to_excel(writer, sheet_name='log', index=False)