增加碱基不平衡

main
chaopower 2023-12-20 17:06:48 +08:00
parent 40204a8837
commit 0f0e2f9768
1 changed files with 48 additions and 64 deletions

View File

@ -1,3 +1,4 @@
import copy
import os import os
import time import time
from collections import defaultdict from collections import defaultdict
@ -13,7 +14,7 @@ class AutoLayout:
自动化派样 自动化派样
""" """
def __init__(self, path, chipnum, output=basedir, data_limit=1800): def __init__(self, path, chipnum, output=basedir, data_limit=1750):
self.path = path self.path = path
self.output = output self.output = output
self.chipnum = int(chipnum) self.chipnum = int(chipnum)
@ -78,48 +79,18 @@ class AutoLayout:
self.chip_customer[chipname].add(library_data['customer']) self.chip_customer[chipname].add(library_data['customer'])
self.chip_classification[chipname].add(library_data['classification']) self.chip_classification[chipname].add(library_data['classification'])
# def add_new_chip(self, library_data): def count_barcode_radio(self, data):
# """
# 要新增到芯片上的数据
# :param library_data:
# :return:
# """
# chip_num_tmp = self.loc_chip_num
# while True:
# chip_num_tmp += 1
# chipname_tmp = f'chip{chip_num_tmp}'
# library = library_data['library']
# if chipname_tmp not in self.index_assignments:
# self.logger.error(f'{library} {chipname_tmp} 常规添加')
# self.add_new_data(chipname_tmp, library_data)
# break
# else:
# is_same_barcode = self.chip_barcode_recode[chipname_tmp].intersection(
# {item['barcode'] for item in library_data['data']})
# # 没有从重复的index,并且也不互斥的
# if ((self.chip_size[chipname_tmp] + library_data['size']) > self.data_limit):
# self.logger.error(f'{library} {chipname_tmp} 文库相加大于设定限制')
# if ((self.chip_speciallib_size[chipname_tmp] + library_data['size']) >= 200):
# self.logger.error(f'{library} {chipname_tmp} 不平衡文库相加大于设定限制')
# if is_same_barcode:
# self.logger.error(f'{library} {chipname_tmp} 文库有barcode重复')
# if self.use_rule(chipname_tmp, library_data['customer']):
# self.logger.error(f'{library} {chipname_tmp} 有互斥单位')
# if ((self.chip_size[chipname_tmp] + library_data['size']) <= self.data_limit) \
# and ((self.chip_speciallib_size[chipname_tmp] + library_data['size']) < 200) \
# and (not is_same_barcode) \
# and (not self.use_rule(chipname_tmp, library_data['customer'])):
# self.add_new_data(chipname_tmp, library_data, newer=False)
# break
def dec_barcode_radio(self, chipname):
data = self.index_assignments[chipname]
df = pd.DataFrame(data) df = pd.DataFrame(data)
ratio_sites = dict()
is_not_balance_list = []
if df.empty:
return ratio_sites, is_not_balance_list
df['barcode'] = df['barcode'].str.slice(0, 16) df['barcode'] = df['barcode'].str.slice(0, 16)
barcode_df = pd.DataFrame(df['barcode'].str.split('', expand=True).iloc[:, 1:-1].values, barcode_df = pd.DataFrame(df['barcode'].str.split('', expand=True).iloc[:, 1:-1].values,
columns=['T' + str(x) for x in range(16)]).join(df['data_needed']) columns=['T' + str(x) for x in range(16)]).join(df['data_needed'])
total = barcode_df['data_needed'].sum() total = barcode_df['data_needed'].sum()
is_not_balance_list = []
for i in range(16): for i in range(16):
column = 'T' + str(i) column = 'T' + str(i)
col_df = barcode_df.groupby(column).agg({'data_needed': 'sum'}) col_df = barcode_df.groupby(column).agg({'data_needed': 'sum'})
@ -129,12 +100,10 @@ class AutoLayout:
col_df = col_df.drop('N') col_df = col_df.drop('N')
else: else:
base_N_size = 0 base_N_size = 0
col_df['ratio'] = (col_df['data_needed']) / (total - base_N_size) col_df['ratio'] = (col_df['data_needed']) / (total - base_N_size)
# is_need_base = col_df.index[col_df['ratio'] < 0.088]
A, B, C, D = list(), list(), list(), list(),
ratio = col_df['ratio'].to_dict() ratio = col_df['ratio'].to_dict()
ratio_sites[i] = ratio
A, B, C, D, E, F = list(), list(), list(), list(), list(), list()
for decbase in ['A', 'T', 'C', 'G']: for decbase in ['A', 'T', 'C', 'G']:
if decbase not in ratio: if decbase not in ratio:
ratio[decbase] = 0 ratio[decbase] = 0
@ -142,19 +111,31 @@ class AutoLayout:
A.append(decbase) A.append(decbase)
if 0.2 <= ratio[decbase] < 0.6: if 0.2 <= ratio[decbase] < 0.6:
B.append(decbase) B.append(decbase)
if 0.08 <= ratio[decbase] < 0.2: if 0.15 <= ratio[decbase] < 0.2:
C.append(decbase) C.append(decbase)
if ratio[decbase] <= 0.8: if 0.1 <= ratio[decbase] < 0.15:
D.append(decbase) D.append(decbase)
if not ((len(B) + len(C) == 4) or (len(D) == 1 and len(C) == 3)): if 0.08 <= ratio[decbase] < 0.1:
is_not_balance_list.append( E.append(decbase)
'%s%s位置,有碱基不平衡,算出结果为 %s' % (chipname, i, ratio) if ratio[decbase] < 0.08:
) F.append(decbase)
if len(is_not_balance_list): A_num, B_num, C_num, D_num, E_num, F_num = len(A), len(B), len(C), len(D), len(E), len(F)
self.return_log.append('有碱基不平衡性!') if not ((B_num + C_num + D_num == 4) or (F_num == 1 and (A_num + B_num) == 3) or (
self.return_log.extend(is_not_balance_list) E_num == 1 and D_num == 1 and (A_num + B_num + C_num) == 2) or (
print('有碱基不平衡性!\n', '\n'.join(is_not_balance_list)) E_num == 1 and (A_num + B_num + C_num) == 3)):
is_not_balance_list.append(
'%s位置,算出结果为 %s' % (i, ratio)
)
return ratio_sites, is_not_balance_list
def dec_barcode_radio(self, chipname):
data = self.index_assignments[chipname]
ratio_sites, is_not_balance_list = self.count_barcode_radio(data)
if is_not_balance_list:
desc = '\n'.join(is_not_balance_list)
self.return_log.append(f'芯片{chipname}有碱基不平衡:\n{desc}')
print(f'芯片{chipname}有碱基不平衡:\n{desc}')
@staticmethod @staticmethod
def level(row): def level(row):
@ -206,31 +187,34 @@ class AutoLayout:
sizelimit = True sizelimit = True
if self.chip_size[chipname] + size > self.data_limit: if self.chip_size[chipname] + size > self.data_limit:
sizelimit = False sizelimit = False
self.logger.error(f'{library} {chipname} 文库相加大于设定限制')
# barcode有重复 # barcode有重复
notrepeatbarcode = True notrepeatbarcode = True
if self.chip_barcode_recode[chipname].intersection({item['barcode'] for item in library_data['data']}): if self.chip_barcode_recode[chipname].intersection({item['barcode'] for item in library_data['data']}):
notrepeatbarcode = False notrepeatbarcode = False
self.logger.error(f'{library} {chipname} 文库有barcode重复')
# # 互斥的客户
# exclusivecostom = True
# if self.use_rule(chipname, customer):
# exclusivecostom = False
# self.logger.error(f'{library} {chipname} 有互斥单位')
# 互斥的文库 # 互斥的文库
exclusive_classific = True exclusive_classific = True
if self.use_rule(chipname, classification): if self.use_rule(chipname, classification):
exclusive_classific = False exclusive_classific = False
self.logger.error(f'{library} {chipname} 有互斥单位')
# 不平衡文库大于200G 不能添加 # 不平衡文库大于200G 不能添加
splibrary = True splibrary = True
if classification in ['扩增子', '不平衡文库', '单细胞文库', '甲基化'] \ if classification in ['扩增子', '不平衡文库', '单细胞文库', '甲基化'] \
and self.chip_speciallib_size[chipname] + size > 250: and self.chip_speciallib_size[chipname] + size > 250:
splibrary = False splibrary = False
self.logger.error(f'{library} {chipname} 不平衡文库相加大于设定限制')
if sizelimit and notrepeatbarcode and exclusive_classific and splibrary: # 碱基不平衡不过不添加,保证前面的数据, 在数据达到1200G的时候开始
base_balance = True
if self.chip_size[chipname] > 800:
current_data = copy.deepcopy(self.index_assignments[chipname])
new_data = library_data['data']
current_data.extend(new_data)
ratio_sites, is_not_balance_list = self.count_barcode_radio(current_data)
if is_not_balance_list:
base_balance = False
if sizelimit and notrepeatbarcode and exclusive_classific and splibrary and base_balance:
return True return True
return False return False
@ -314,6 +298,7 @@ class AutoLayout:
pass pass
def run(self): def run(self):
# self.assign_samples()
try: try:
self.assign_samples() self.assign_samples()
except Exception as e: except Exception as e:
@ -328,7 +313,7 @@ class AutoLayout:
df = pd.DataFrame(chip_assignments) df = pd.DataFrame(chip_assignments)
df['time'] = df['time'].dt.strftime('%Y-%m-%d') df['time'] = df['time'].dt.strftime('%Y-%m-%d')
if df['data_needed'].sum() < 1500 or chip_loc > self.chipnum: if df['data_needed'].sum() < 1500 or chip_loc > self.chipnum:
self.no_assign_data.extend(chip_assignments) self.no_assign_data.extend(df.to_dict('records'))
continue continue
if [method for method in df['拆分方式'].values if '极致' in method]: if [method for method in df['拆分方式'].values if '极致' in method]:
addname = 'X' addname = 'X'
@ -338,7 +323,6 @@ class AutoLayout:
df.to_excel(writer, sheet_name=addname + chip_idx, index=False) df.to_excel(writer, sheet_name=addname + chip_idx, index=False)
chip_loc += 1 chip_loc += 1
no_assign_df = pd.DataFrame(self.no_assign_data) no_assign_df = pd.DataFrame(self.no_assign_data)
# no_assign_df['time'] = no_assign_df['time'].dt.strftime('%Y-%m-%d')
no_assign_df.to_excel(writer, sheet_name='未测', index=False) no_assign_df.to_excel(writer, sheet_name='未测', index=False)
if self.return_log: if self.return_log:
pd.DataFrame(self.return_log).to_excel(writer, sheet_name='log', index=False) pd.DataFrame(self.return_log).to_excel(writer, sheet_name='log', index=False)