#!/usr/bin/env python3
"""Sample-contamination monitoring from VCF allele frequencies.

Single-probe mode counts SNV allele frequencies that fall outside an
expected heterozygous band; paired mode compares somatic vs. germline
genotype judgements at shared BED positions.
"""
import argparse
import os

import pandas as pd


def _read_vcf(vcf):
    """Split a VCF file into header lines ('#'-prefixed) and data lines."""
    header, data = [], []
    with open(vcf, 'r') as handle:
        for line in handle:
            (header if line.startswith("#") else data).append(line)
    return header, data


def _load_bed(bed):
    """Read a 4-column BED-like table; return (chrom column, end column)."""
    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
    return list(df_position[0]), list(df_position[2])


def _index_snvs(vcf_data):
    """Index SNV data lines (single-base REF and ALT) by (chrom, pos) strings.

    One O(vcf) pass instead of the original rescan of every VCF line for
    every BED row; file order is preserved within each key.
    """
    index = {}
    for line in vcf_data:
        fields = line.split()
        if len(fields[3]) < 2 and len(fields[4]) < 2:
            index.setdefault((fields[0], fields[1]), []).append(line)
    return index


def _parse_freq(freq_str):
    """Parse an allele-frequency string that may look like '45.5%' or '0.455'."""
    if freq_str.endswith("%"):
        return float(freq_str[:-1]) / 100
    return float(freq_str)


def single_monitoring(name, vcf, bed, freq_range, output_dir):
    """Single-sample contamination check (probes 160/17).

    Selects BED-listed SNVs whose sample allele frequency (index 4 of the
    sample FORMAT column) lies in [0.1, 0.9], writes them with the VCF
    header to <output_dir>/<name>_select.vcf, then counts how many fall
    inside/outside freq_range and writes a one-row summary to
    <output_dir>/<name>_pollution.csv.
    """
    if output_dir:
        # Robustness: make sure the destination directory exists.
        os.makedirs(output_dir, exist_ok=True)
    vcf_header, vcf_data = _read_vcf(vcf)
    chr_list, position_list = _load_bed(bed)
    snv_index = _index_snvs(vcf_data)

    result_data, p_value_list = [], []
    for chrom, pos in zip(chr_list, position_list):
        for line in snv_index.get((str(chrom), str(pos)), []):
            p_value = _parse_freq(line.split()[9].split(":")[4])
            if 0.1 <= p_value <= 0.9:
                result_data.append(line)
                p_value_list.append(p_value)

    select_vcf = os.path.join(output_dir, f'{name}_select.vcf')
    with open(select_vcf, 'w') as output:
        output.writelines(vcf_header)
        output.writelines(result_data)

    count_normal = sum(1 for p in p_value_list
                       if freq_range[0] <= p <= freq_range[1])
    count_exception = len(p_value_list) - count_normal
    count_all = count_normal + count_exception
    # Fraction of selected sites outside the expected frequency band.
    z_score = count_exception / count_all if count_all else 0

    res = dict(
        barcode=name,
        count_normal=count_normal,
        count_exception=count_exception,
        z_score=z_score,
    )
    # BUG FIX: the summary CSV previously went to the current working
    # directory (os.path.join was called with a single argument, dropping
    # output_dir); it now honours output_dir like the select VCF above.
    pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution.csv'),
                               sep="\t", index=False)


# Filter a VCF by the small BED
def select_position(vcf, bed, matched_file, unmatched_file):
    """Filter a VCF down to SNVs located at the BED positions.

    Matched lines (with the original VCF header) go to matched_file; BED
    positions with no SNV hit produce a '.'-padded placeholder row in
    unmatched_file. Returns (matched_file, unmatched_file).
    """
    vcf_header, vcf_data = _read_vcf(vcf)
    chr_list, position_list = _load_bed(bed)
    snv_index = _index_snvs(vcf_data)

    result_data, unmatched_data = [], []
    for chrom, pos in zip(chr_list, position_list):
        hits = snv_index.get((str(chrom), str(pos)), [])
        if not hits:
            # No SNV at this BED position: keep a placeholder row so the
            # later merge still covers every position.
            unmatched_data.append(f"{chrom}\t{pos}\t.\t.\t.\t.\t.\n")
        result_data.extend(hits)

    with open(matched_file, 'w') as output:
        output.writelines(vcf_header)
        output.writelines(result_data)
    with open(unmatched_file, 'w') as unmatched_output:
        unmatched_output.writelines(unmatched_data)
    return matched_file, unmatched_file


# Process the somatic / germline VCF into the target columns
def process_judge_vcf(input_vcf, output_vcf):
    """Reduce VCF data lines to CHROM, POS, REF, ALT, AF, judge, ID columns.

    The judge value b is 0 (AF < 0.1), 0.5 (0.1 <= AF <= 0.9) or
    1 (AF > 0.9). The AF column itself is copied through verbatim.
    Returns output_vcf.
    """
    with open(input_vcf, 'r') as input_file, open(output_vcf, 'w') as output_file:
        for line in input_file:
            if line.startswith("#"):
                continue
            fields = line.strip().split('\t')
            info = fields[9].split(":")
            # BUG FIX: parse percent-style frequencies ('45.5%') the same
            # way the other functions in this file do, instead of letting
            # float() raise; the closed if/elif/else also guarantees b is
            # always bound.
            percentage = _parse_freq(info[4])
            if percentage < 0.1:
                b = 0
            elif percentage > 0.9:
                b = 1
            else:
                b = 0.5
            # Build the reduced row.
            new_line = '\t'.join([fields[0], fields[1], fields[3], fields[4],
                                  info[4], str(b), info[2]])
            output_file.write(new_line + '\n')
    return output_vcf


def merge_and_sort_files(matched_file, unmatched_file, output_file):
    """Concatenate the matched and unmatched tables, sort by chromosome
    then position, and write them tab-separated to output_file.

    pandas raises EmptyDataError on a zero-byte file, so each input is
    only read when it actually has content (the original crashed when the
    matched table was empty).
    """
    frames = [pd.read_csv(path, sep='\t', header=None)
              for path in (matched_file, unmatched_file)
              if os.stat(path).st_size > 0]
    if not frames:
        # Both inputs empty: emit an empty output file.
        open(output_file, 'w').close()
        return output_file
    combined_df = pd.concat(frames)
    combined_df.sort_values(by=[0, 1]).to_csv(output_file, sep='\t',
                                              header=False, index=False)
    return output_file


# Merge somatic and germline tables
def merge_and_compare_files(somatic_file, germline_file, output_merged_file, output_final_file):
    """Join somatic and germline rows line-by-line, then flag agreement.

    Each merged row is the tab-joined somatic and germline rows; the final
    file appends 'yes'/'no' depending on whether the two judge columns
    (field 5 of each 7-column half) match. NOTE(review): zip() truncates
    to the shorter file — both inputs are expected to cover the same BED
    positions. Returns (output_merged_file, output_final_file).
    """
    with open(somatic_file, 'r') as somatic, open(germline_file, 'r') as germline:
        merged_lines = [f"{somatic_line.strip()}\t{germline_line.strip()}"
                        for somatic_line, germline_line in zip(somatic, germline)]

    with open(output_merged_file, 'w') as output_file:
        output_file.write('\n'.join(merged_lines))

    # Compare the two judge columns and append the comparison result.
    with open(output_merged_file, 'r') as merged, open(output_final_file, 'w') as final_output:
        for line in merged:
            fields = line.strip().split('\t')
            comparison_result = "yes" if fields[5] == fields[12] else "no"
            final_output.write(f"{line.strip()}\t{comparison_result}\n")
    return output_merged_file, output_final_file


# Filter the VCF by the large BED for the cnvkit plot
def select_cnvkit_vcf(vcf, bed, output_file):
    """Select SNVs at the large-BED positions with AF in [0.1, 0.9] and
    write them (with the VCF header) to output_file for cnvkit plotting.
    Returns output_file.
    """
    vcf_header, vcf_data = _read_vcf(vcf)
    chr_list, position_list = _load_bed(bed)
    snv_index = _index_snvs(vcf_data)

    result_data = []
    for chrom, pos in zip(chr_list, position_list):
        for line in snv_index.get((str(chrom), str(pos)), []):
            if 0.1 <= _parse_freq(line.split()[9].split(":")[4]) <= 0.9:
                result_data.append(line)

    with open(output_file, 'w') as output:
        output.writelines(vcf_header)
        output.writelines(result_data)
    return output_file
def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed, output_dir):
    """Paired (somatic/germline) contamination check for probes 682/624.

    Pipeline: filter both VCFs to the ref_bed SNVs, convert each matched
    file to CHROM/POS/REF/ALT/AF/judge/ID rows, merge matched+unmatched
    rows per sample sorted by position, then compare the somatic and
    germline judge columns into <name>_contaminate_result.txt. Also
    writes <name>_select_cnvkit.vcf (germline SNVs filtered by
    cnvkit_ref_bed). Intermediate files are removed at the end.
    """
    if output_dir:
        # Robustness: make sure the destination directory exists before
        # any of the steps below try to write into it.
        os.makedirs(output_dir, exist_ok=True)

    def _out(fname):
        """Build a path inside output_dir."""
        return os.path.join(output_dir, fname)

    # Somatic: filter by the reference BED.
    somatic_matched = _out(f'{name}_somatic_matched.vcf')
    somatic_unmatched = _out(f'{name}_somatic_unmatched.vcf')
    select_position(somatic_vcf, ref_bed, somatic_matched, somatic_unmatched)

    # Germline: filter by the reference BED.
    germline_matched = _out(f'{name}_germline_matched.vcf')
    germline_unmatched = _out(f'{name}_germline_unmatched.vcf')
    select_position(germline_vcf, ref_bed, germline_matched, germline_unmatched)

    # Convert allele frequencies into 0 / 0.5 / 1 judgements.
    somatic_judged = _out(f'{name}_somatic_matched_add_judge.vcf')
    process_judge_vcf(somatic_matched, somatic_judged)
    germline_judged = _out(f'{name}_germline_matched_add_judge.vcf')
    process_judge_vcf(germline_matched, germline_judged)

    # Merge the matched and unmatched rows back together, sorted by
    # chromosome/position. (Filename normalized from the original
    # 'germline__merged' double underscore; the file is deleted below.)
    somatic_merged = _out(f'{name}_somatic_merged.vcf')
    merge_and_sort_files(somatic_judged, somatic_unmatched, somatic_merged)
    germline_merged = _out(f'{name}_germline_merged.vcf')
    merge_and_sort_files(germline_judged, germline_unmatched, germline_merged)

    # Compare somatic vs germline judgements into the final result file.
    result_pro_file = _out(f'{name}_result_pro.txt')
    result_file = _out(f'{name}_contaminate_result.txt')
    merge_and_compare_files(somatic_merged, germline_merged, result_pro_file, result_file)

    # Select the plotting VCF for cnvkit from the germline sample.
    select_cnvkit_vcf(germline_vcf, cnvkit_ref_bed, _out(f'{name}_select_cnvkit.vcf'))

    # Remove intermediate files; only the comparison result and the
    # cnvkit selection are kept.
    for tmp in (somatic_matched, somatic_unmatched,
                germline_matched, germline_unmatched,
                somatic_judged, germline_judged,
                somatic_merged, germline_merged,
                result_pro_file):
        os.remove(tmp)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Pollution Process Script")
    parser.add_argument('-n', '--name', help="Name for sample", required=True)
    parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True)
    # Typo fixes in user-facing help/error text: 'prbe' -> 'probe'.
    parser.add_argument('-v', '--vcf',
                        help="raw vcf for probe 160 or 17 ; somatic vcf for probe 682 or 624",
                        required=True)
    parser.add_argument('-v2', '--vcf2', help="germline vcf; required when probe 682 or 624",
                        default='', required=False, nargs='?')
    parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required when probe 682 or 624")
    parser.add_argument('-p', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True)
    parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
    args = parser.parse_args()

    bed_path = os.path.realpath(args.ref_bed)
    print(f'污染检测使用ref_bed: {bed_path}')

    # Expected heterozygous-frequency band per single-sample probe panel.
    FREQ_RANGES = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}

    probe = args.probe
    if probe in ('160', '17'):
        single_monitoring(args.name, args.vcf, bed_path, FREQ_RANGES[probe], args.output_dir)
    elif probe in ('682', '624'):
        if not args.vcf2:
            parser.error('--vcf2 is required in probe 682 or 624')
        if not args.cnvkit_bed:
            parser.error('--cnvkit_bed is required in probe 682 or 624')
        cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
        print(f'污染检测使用cnvkit_bed: {cnvkit_bed_path}')
        # BUG FIX: pass the resolved cnvkit bed path; the original printed
        # the realpath but then passed the raw args.cnvkit_bed.
        paired_monitoring(args.name, args.vcf, args.vcf2, bed_path,
                          cnvkit_bed_path, args.output_dir)
    else:
        parser.error('probe error. 682, 624, 160, 17 for now')