污染处理

master
chaopower 2024-01-19 17:56:25 +08:00
parent 73e90f8a5b
commit 1922d14b76
12 changed files with 9104 additions and 9072 deletions

View File

@ -4,51 +4,82 @@ import argparse
import os
import pandas as pd
import pysam
def single_monitoring(name, vcf, bed, freq_range, output_dir):
vcf_header = [] # 用于存储VCF文件头部分
vcf_data = [] # 用于存储筛选后的VCF数据
p_value_list = [] # 用于存储符合条件的 p-value
def load_bed_regions(bed_file):
bed_regions = set()
if bed_file:
with open(bed_file, 'r') as file:
for line in file:
parts = line.strip().split('\t')
if len(parts) >= 3:
chrom, start, end = parts[0], int(parts[1]), int(parts[2])
bed_regions.add((chrom, start, end))
return bed_regions
# 按行处理,保存为列表
with open(vcf, 'r') as vcf_file:
for line in vcf_file:
if line.startswith("#"):
vcf_header.append(line)
def build_record_dict(vcf_file):
records = {}
for record in vcf_file:
key = (record.contig, record.pos)
records[key] = record
return records
def simplify_genotype(freq):
if freq < 0.1:
return '0/0'
elif 0.1 <= freq <= 0.9:
return '0/1'
else:
vcf_data.append(line)
return '1/1'
def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
"""
17 160 单样本处理
rawvcf 对应bed 挑选出 位点计算位点是否在给定范围内计算比例得到z_socre
"""
# bed 处理
df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
position_list = list(df_position[2])
chr_list = list(df_position[0])
result_data = []
for i in range(len(position_list)):
filtered_lines = [line for line in vcf_data if
line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
line.split()[3]) < 2 and len(line.split()[4]) < 2]
for line in filtered_lines:
p_value_str = line.split()[9].split(":")[4]
p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
if 0.1 <= p_value <= 0.9:
result_data.append(line)
p_value_list.append(p_value)
select_vcf = os.path.join(output_dir, f'{name}_select.vcf')
with open(select_vcf, 'w') as output:
for header_line in vcf_header:
output.write(header_line)
for data_line in result_data:
output.write(data_line)
bed_regions = load_bed_regions(bed_file)
vcf = pysam.VariantFile(vcf_file)
res_pos = list()
count_normal = 0
count_exception = 0
for p_value in p_value_list:
if freq_range[0] <= p_value <= freq_range[1]:
for record in vcf:
contig = record.contig
pos = record.pos
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
if freq < 0.1 or freq > 0.9:
continue
# 检查记录是否是点突变
if len(record.ref) == 1 and len(record.alts[0]) == 1:
# 在bed 区间内
for (bed_chrom, bed_start, bed_end) in bed_regions:
if contig == bed_chrom and bed_start < pos <= bed_end:
if freq_range[0] <= freq <= freq_range[1]:
count_normal += 1
res = 'yes'
else:
count_exception += 1
res = 'no'
res_pos.append(dict(
chr=contig,
pos=pos,
ref=ref,
alt=alt,
freq=freq,
res=res
))
break
count_all = count_exception + count_normal
if count_all == 0:
z_score = 0
@ -61,212 +92,160 @@ def single_monitoring(name, vcf, bed, freq_range, output_dir):
count_exception=count_exception,
z_score=z_score
)
pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution.csv'), sep="\t", index=False)
pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution_sum.txt'), sep="\t", index=False)
res_pos_df = pd.DataFrame(res_pos)
if not res_pos_df.empty:
res_pos_df.rename(columns={
'res': f'res{freq_range}',
}, inplace=True)
res_pos_df.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
# 根据小bed筛选vcf
def select_position(vcf, bed, matched_file, unmatched_file):
vcf_header = [] # 用于存储VCF文件头部分
vcf_data = [] # 用于存储筛选后的VCF数据
def paired_monitor(name, vcf_file, bed_file, cnvkit_bed, output_dir):
"""
682双样本处理
"""
bed_regions = load_bed_regions(bed_file)
cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
vcf = pysam.VariantFile(vcf_file)
records_dict = build_record_dict(vcf)
res_pos = list()
# 按行处理,保存为列表
with open(vcf, 'r') as vcf_file:
for line in vcf_file:
if line.startswith("#"):
vcf_header.append(line)
else:
vcf_data.append(line)
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
res = 'no'
if key in records_dict:
record = records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0]
ref_normal = ref
alt_normal = alt
df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
position_list = list(df_position[2])
chr_list = list(df_position[0])
result_data = []
unmatched_data = [] # 用于存储未匹配的数据
genotype = simplify_genotype(freq)
genotype_normal = simplify_genotype(freq_normal)
for i in range(len(position_list)):
filtered_lines = [line for line in vcf_data if
line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
line.split()[3]) < 2 and len(line.split()[4]) < 2]
if not filtered_lines: # 如果没有匹配的点,添加到未匹配数据列表
unmatched_data.append(f"{chr_list[i]}\t{position_list[i]}\t.\t.\t.\t.\t.\n")
result_data.extend(filtered_lines)
if genotype == genotype_normal:
res = 'yes'
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
res=res
))
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
with open(matched_file, 'w') as output:
for header_line in vcf_header:
output.write(header_line)
for data_line in result_data:
output.write(data_line)
output_tumor_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
output_normal_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w')
for line in str(vcf.header).strip().split('\n'):
output_tumor_vcf.write(line + '\n') if line.startswith('##') else output_tumor_vcf.write(
'\t'.join(line.split('\t')[:-1]) + '\n')
output_normal_vcf.write(line + '\n') if line.startswith('##') else output_normal_vcf.write(
'\t'.join(line.split('\t')[:-2] + line.split('\t')[-1:]) + '\n')
with open(unmatched_file, 'w') as unmatched_output:
unmatched_output.writelines(unmatched_data)
for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
key = (bed_chrom, bed_end)
if key in records_dict:
record = records_dict[key]
output_tumor_vcf.write('\t'.join(str(record).split('\t')[:-1]) + '\n')
output_normal_vcf.write('\t'.join(str(record).split('\t')[:-2] + str(record).split('\t')[-1:]) + '\n')
return matched_file, unmatched_file
output_tumor_vcf.close()
output_normal_vcf.close()
# 处理体系、胚系的vcf得到目标信息
def process_judge_vcf(input_vcf, output_vcf):
with open(input_vcf, 'r') as input_file, open(output_vcf, 'w') as output_file:
for line in input_file:
if not line.startswith("#"):
fields = line.strip().split('\t')
def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, bed_file, cnvkit_bed, output_dir):
"""
624双样本处理
"""
bed_regions = load_bed_regions(bed_file)
tumor_vcf = pysam.VariantFile(tumor_vcf_file)
normal_vcf = pysam.VariantFile(normal_vcf_file)
vcf_format = fields[8].split(":")
vcf_normal = fields[-1].split(":")
vcf_info = dict(zip(vcf_format, vcf_normal))
af = vcf_info['AF']
vd = vcf_info['VD']
# info = fields[9].split(":")
# percentage = float(info[6])
percentage = float(af)
if 0.1 <= percentage <= 0.9:
b = 0.5
elif percentage < 0.1:
b = 0
elif percentage > 0.9:
b = 1
tumor_records_dict = build_record_dict(tumor_vcf)
normal_records_dict = build_record_dict(normal_vcf)
# 构建新的行数据
new_line = '\t'.join([fields[0], fields[1], fields[3], fields[4], af, str(b), vd])
output_file.write(new_line + '\n')
return output_vcf
res_pos = list()
# 遍历 bed
for (bed_chrom, bed_start, bed_end) in bed_regions:
key = (bed_chrom, bed_end)
ref = '.'
alt = '.'
freq = '.'
genotype = '.'
ref_normal = '.'
alt_normal = '.'
freq_normal = '.'
genotype_normal = '.'
res = 'no'
def merge_and_sort_files(matched_file, unmatched_file, output_file):
# 检查 unmatched_file 是否为空
if os.stat(unmatched_file).st_size == 0:
# 对 matched_file 进行排序并写入 output_file
matched_df = pd.read_csv(matched_file, sep='\t', header=None)
sorted_df = matched_df.sort_values(by=[0, 1])
sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
return output_file
if key in tumor_records_dict:
record = tumor_records_dict[key]
ref = record.ref
alt = record.alts[0]
freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype = simplify_genotype(freq)
if key in normal_records_dict:
record = normal_records_dict[key]
ref_normal = record.ref
alt_normal = record.alts[0]
freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0]
genotype_normal = simplify_genotype(freq_normal)
if genotype == genotype_normal:
res = 'yes'
res_pos.append(dict(
chr=bed_chrom,
pos=bed_end,
ref=ref,
alt=alt,
freq=freq,
genotype=genotype,
ref_normal=ref_normal,
alt_normal=alt_normal,
freq_normal=freq_normal,
genotype_normal=genotype_normal,
res=res
))
# 如果 unmatched_file 不为空,继续合并和排序操作
if os.stat(matched_file).st_size == 0:
matched_df = pd.DataFrame()
else:
matched_df = pd.read_csv(matched_file, sep='\t', header=None)
unmatched_df = pd.read_csv(unmatched_file, sep='\t', header=None)
res_pos_df = pd.DataFrame(res_pos)
res_pos_df_sort = res_pos_df.sort_values(by='chr')
res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)
# 合并数据帧
combined_df = pd.concat([matched_df, unmatched_df])
output_tumor_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w',
header=tumor_vcf.header)
output_normal_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w',
header=normal_vcf.header)
# 根据第一列和第二列排序
sorted_df = combined_df.sort_values(by=[0, 1])
# 将排序后的数据写入输出文件
sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
return output_file
# 合并体系,胚系
def merge_and_compare_files(somatic_file, germline_file, output_merged_file, output_final_file):
# 合并两个文件
with open(somatic_file, 'r') as somatic, open(germline_file, 'r') as germline:
merged_lines = [f"{somatic_line.strip()}\t{germline_line.strip()}" for somatic_line, germline_line in
zip(somatic, germline)]
# 将合并后的数据写入输出文件
with open(output_merged_file, 'w') as output_file:
output_file.write('\n'.join(merged_lines))
# 比较两列数据并添加比较结果列
with open(output_merged_file, 'r') as merged, open(output_final_file, 'w') as final_output:
for line in merged:
fields = line.strip().split('\t')
if fields[5] == fields[12]:
comparison_result = "yes"
else:
comparison_result = "no"
final_output.write(f"{line.strip()}\t{comparison_result}\n")
return output_merged_file, output_final_file
# 根据大bed筛选vcf作cnvkit的图
def select_cnvkit_vcf(vcf, bed, output_file):
vcf_header = [] # 用于存储VCF文件头部分
vcf_data = [] # 用于存储筛选后的VCF数据
p_value_list = [] # 用于存储符合条件的 p-value
# 按行处理,保存为列表
with open(vcf, 'r') as vcf_file:
for line in vcf_file:
if line.startswith("#"):
vcf_header.append(line)
else:
vcf_data.append(line)
df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
position_list = list(df_position[2])
chr_list = list(df_position[0])
result_data = []
for i in range(len(position_list)):
filtered_lines = [line for line in vcf_data if
line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
line.split()[3]) < 2 and len(line.split()[4]) < 2]
for line in filtered_lines:
# p_value_str = line.split()[9].split(":")[6]
# p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
vcf_format = line.split('\t')[8].split(":")
vcf_normal = line.split('\t')[-1].split(":")
vcf_info = dict(zip(vcf_format, vcf_normal))
p_value = float(vcf_info['AF'])
if 0.1 <= p_value <= 0.9:
result_data.append(line)
with open(output_file, 'w') as output:
for header_line in vcf_header:
output.write(header_line)
for data_line in result_data:
output.write(data_line)
return output_file
def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed, output_dir):
# 处理体系根据bed筛选
select_position_output_file1 = os.path.join(output_dir, f'{name}_somatic_matched.vcf')
select_position_output_file2 = os.path.join(output_dir, f'{name}_somatic_unmatched.vcf')
somatic_matched_file, somatic_unmatched_file = select_position(somatic_vcf, ref_bed, select_position_output_file1,
select_position_output_file2)
# 处理胚系根据bed筛选
select_position_output_file3 = os.path.join(output_dir, f'{name}_germline_matched.vcf')
select_position_output_file4 = os.path.join(output_dir, f'{name}_germline_unmatched.vcf')
germline_matched_file, germline_unmatched_file = select_position(germline_vcf, ref_bed,
select_position_output_file3,
select_position_output_file4)
# 处理体系,数值转换
process_judge_vcf_file1 = os.path.join(output_dir, f'{name}_somatic_matched_add_judge.vcf')
somatic_matched_add_judge_file = process_judge_vcf(somatic_matched_file, process_judge_vcf_file1)
# 处理胚系,数值转换
process_judge_vcf_file2 = os.path.join(output_dir, f'{name}_germline_matched_add_judge.vcf')
germline_matched_add_judge_file = process_judge_vcf(germline_matched_file, process_judge_vcf_file2)
# 合并体系将匹配到的和未匹配到bed的的合并
merge_and_sort_files_file1 = os.path.join(output_dir, f'{name}_somatic_merged.vcf')
somatic_merged_file = merge_and_sort_files(somatic_matched_add_judge_file, somatic_unmatched_file,
merge_and_sort_files_file1)
# 合并胚系将匹配到的和未匹配到bed的的合并
merge_and_sort_files_file2 = os.path.join(output_dir, f'{name}_germline__merged.vcf')
Germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, germline_unmatched_file,
merge_and_sort_files_file2)
# 合并胚系,体系,将体系,胚系两个合并文件再合并
result_pro_file = os.path.join(output_dir, f'{name}_result_pro.txt')
result_file = os.path.join(output_dir, f'{name}_contaminate_result.txt')
merge_and_compare_files(somatic_merged_file, Germline_merged_file, result_pro_file, result_file)
##筛选作图vcf
cnvkit_output_file = os.path.join(output_dir, f'{name}_select_cnvkit.vcf')
select_cnvkit_vcf(germline_vcf, cnvkit_ref_bed, cnvkit_output_file)
##删除中间文件
os.remove(select_position_output_file1)
os.remove(select_position_output_file2)
os.remove(select_position_output_file3)
os.remove(select_position_output_file4)
os.remove(process_judge_vcf_file1)
os.remove(process_judge_vcf_file2)
os.remove(merge_and_sort_files_file1)
os.remove(merge_and_sort_files_file2)
os.remove(result_pro_file)
cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
key = (bed_chrom, bed_end)
if key in normal_records_dict:
output_normal_vcf.write(normal_records_dict[key])
if key in tumor_records_dict:
output_tumor_vcf.write(tumor_records_dict[key])
output_tumor_vcf.close()
output_normal_vcf.close()
if __name__ == '__main__':
@ -276,7 +255,8 @@ if __name__ == '__main__':
parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True)
parser.add_argument('-v', '--vcf', help="raw vcf for prbe 160 or 17 ; somatic vcf for prbe 682 or 624",
required=True)
parser.add_argument('-v2', '--vcf2', help="germline vcf; required when prbe 682 or 624", default='', required=False, nargs='?')
parser.add_argument('-v2', '--vcf2', help="germline vcf; required when prbe 682 or 624", default='', required=False,
nargs='?')
parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required when prbe 682 or 624")
parser.add_argument('-p', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True)
parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
@ -288,15 +268,18 @@ if __name__ == '__main__':
if probe == '160' or probe == '17':
freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe)
single_monitoring(args.name, args.vcf, bed_path, freq_range, args.output_dir)
elif probe == '682' or probe == '624':
if not args.vcf2:
parser.error('--vcf2 is required in prbe 682 or 624')
single_monitor(args.name, args.vcf, bed_path, freq_range, args.output_dir)
elif probe == '682':
if not args.cnvkit_bed:
parser.error('--cnvkit_bed is required in prbe 682 or 624')
parser.error('--cnvkit_bed is required in probe 624')
cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
print(f'污染检测使用cnvkit_bed: {cnvkit_bed_path}')
paired_monitoring(args.name, args.vcf, args.vcf2, bed_path, args.cnvkit_bed, args.output_dir)
paired_monitor(args.name, args.vcf, bed_path, args.cnvkit_bed, args.output_dir)
elif probe == '624':
if not args.vcf2:
parser.error('--vcf2 is required in probe 624')
if not args.cnvkit_bed:
parser.error('--cnvkit_bed is required in probe 624')
paired_monitor_vcf(args.name, args.vcf, args.vcf2, bed_path, args.cnvkit_bed, args.output_dir)
else:
parser.error('probe error. 682, 624, 160, 17 for now')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,128 +1,128 @@
chr1 2488153 2488153
chr1 23885599 23885599
chr1 65321388 65321388
chr1 120612006 120612006
chr1 193094375 193094375
chr1 226570840 226570840
chr2 47693959 47693959
chr2 48030838 48030838
chr2 61749716 61749716
chr2 141032088 141032088
chr2 215645464 215645464
chr3 30686414 30686414
chr3 47125385 47125385
chr3 134644636 134644636
chr3 142281612 142281612
chr3 192053274 192053274
chr4 55599436 55599436
chr4 143007419 143007419
chr4 153252061 153252061
chr4 185310218 185310218
chr4 25666099 25666099
chr5 35857177 35857177
chr5 38955796 38955796
chr5 39074296 39074296
chr5 67522722 67522722
chr5 112162854 112162854
chr5 149435759 149435759
chr5 161119125 161119125
chr5 170819887 170819887
chr5 180046344 180046344
chr6 30858857 30858857
chr6 32797876 32797876
chr6 36645696 36645696
chr6 117678083 117678083
chr6 117714346 117714346
chr7 2946461 2946461
chr7 2962753 2962753
chr7 87138645 87138645
chr7 151970931 151970931
chr8 90955624 90955624
chr8 90970935 90970935
chr8 90995019 90995019
chr8 68864728 68864728
chr8 38322346 38322346
chr8 145742879 145742879
chr9 5081780 5081780
chr9 8389364 8389364
chr9 93657761 93657761
chr9 98209594 98209594
chr9 21991923 21991923
chr9 37020622 37020622
chr9 21975017 21975017
chr10 43600689 43600689
chr10 89720907 89720907
chr10 104596981 104596981
chr10 104849468 104849468
chr10 104855656 104855656
chr10 123239112 123239112
chr11 32410774 32410774
chr11 69462910 69462910
chr11 94197260 94197260
chr11 94225807 94225807
chr11 94225920 94225920
chr11 125497466 125497466
chr12 1040373 1040373
chr12 4553383 4553383
chr12 6711147 6711147
chr12 21331625 21331625
chr12 49427652 49427652
chr12 49444545 49444545
chr12 121416622 121416622
chr12 133202215 133202215
chr13 21562948 21562948
chr13 28609825 28609825
chr13 32936646 32936646
chr13 113889474 113889474
chr13 113907391 113907391
chr14 105239894 105239894
chr14 105246325 105246325
chr14 38064215 38064215
chr14 20820537 20820537
chr14 35871217 35871217
chr15 41865488 41865488
chr15 51502986 51502986
chr15 51529112 51529112
chr15 99456253 99456253
chr15 99478225 99478225
chr15 51558731 51558731
chr16 68857441 68857441
chr16 359953 359953
chr16 89805977 89805977
chr16 89838078 89838078
chr16 89857964 89857964
chr17 7983969 7983969
chr17 29486152 29486152
chr17 29508775 29508775
chr17 29546175 29546175
chr17 29559932 29559932
chr17 29653293 29653293
chr17 37879762 37879762
chr17 62007498 62007498
chr17 78919558 78919558
chr19 3110349 3110349
chr19 3119239 3119239
chr19 4101062 4101062
chr19 11136215 11136215
chr19 15289613 15289613
chr19 15295134 15295134
chr19 41725271 41725271
chr20 43956527 43956527
chr20 43956636 43956636
chr20 43958850 43958850
chr20 43958872 43958872
chr20 54959296 54959296
chr20 54961541 54961541
chr21 37518706 37518706
chr21 39752673 39752673
chr21 39753375 39753375
chr22 41568480 41568480
chr22 30038152 30038152
chr22 30079213 30079213
chrX 76937963 76937963
chrX 39922359 39922359
chrX 39932907 39932907
chrX 44938563 44938563
chrX 100608191 100608191
chrX 100611285 100611285
chrX 76940534 76940534
chr1 2488152 2488153
chr1 23885598 23885599
chr1 65321387 65321388
chr1 120612005 120612006
chr1 193094374 193094375
chr1 226570839 226570840
chr2 47693958 47693959
chr2 48030837 48030838
chr2 61749715 61749716
chr2 141032087 141032088
chr2 215645463 215645464
chr3 30686413 30686414
chr3 47125384 47125385
chr3 134644635 134644636
chr3 142281611 142281612
chr3 192053273 192053274
chr4 55599435 55599436
chr4 143007418 143007419
chr4 153252060 153252061
chr4 185310217 185310218
chr4 25666098 25666099
chr5 35857176 35857177
chr5 38955795 38955796
chr5 39074295 39074296
chr5 67522721 67522722
chr5 112162853 112162854
chr5 149435758 149435759
chr5 161119124 161119125
chr5 170819886 170819887
chr5 180046343 180046344
chr6 30858856 30858857
chr6 32797875 32797876
chr6 36645695 36645696
chr6 117678082 117678083
chr6 117714345 117714346
chr7 2946460 2946461
chr7 2962752 2962753
chr7 87138644 87138645
chr7 151970930 151970931
chr8 90955623 90955624
chr8 90970934 90970935
chr8 90995018 90995019
chr8 68864727 68864728
chr8 38322345 38322346
chr8 145742878 145742879
chr9 5081779 5081780
chr9 8389363 8389364
chr9 93657760 93657761
chr9 98209593 98209594
chr9 21991922 21991923
chr9 37020621 37020622
chr9 21975016 21975017
chr10 43600688 43600689
chr10 89720906 89720907
chr10 104596980 104596981
chr10 104849467 104849468
chr10 104855655 104855656
chr10 123239111 123239112
chr11 32410773 32410774
chr11 69462909 69462910
chr11 94197259 94197260
chr11 94225806 94225807
chr11 94225919 94225920
chr11 125497465 125497466
chr12 1040372 1040373
chr12 4553382 4553383
chr12 6711146 6711147
chr12 21331624 21331625
chr12 49427651 49427652
chr12 49444544 49444545
chr12 121416621 121416622
chr12 133202214 133202215
chr13 21562947 21562948
chr13 28609824 28609825
chr13 32936645 32936646
chr13 113889473 113889474
chr13 113907390 113907391
chr14 105239893 105239894
chr14 105246324 105246325
chr14 38064214 38064215
chr14 20820536 20820537
chr14 35871216 35871217
chr15 41865487 41865488
chr15 51502985 51502986
chr15 51529111 51529112
chr15 99456252 99456253
chr15 99478224 99478225
chr15 51558730 51558731
chr16 68857440 68857441
chr16 359952 359953
chr16 89805976 89805977
chr16 89838077 89838078
chr16 89857963 89857964
chr17 7983968 7983969
chr17 29486151 29486152
chr17 29508774 29508775
chr17 29546174 29546175
chr17 29559931 29559932
chr17 29653292 29653293
chr17 37879761 37879762
chr17 62007497 62007498
chr17 78919557 78919558
chr19 3110348 3110349
chr19 3119238 3119239
chr19 4101061 4101062
chr19 11136214 11136215
chr19 15289612 15289613
chr19 15295133 15295134
chr19 41725270 41725271
chr20 43956526 43956527
chr20 43956635 43956636
chr20 43958849 43958850
chr20 43958871 43958872
chr20 54959295 54959296
chr20 54961540 54961541
chr21 37518705 37518706
chr21 39752672 39752673
chr21 39753374 39753375
chr22 41568479 41568480
chr22 30038151 30038152
chr22 30079212 30079213
chrX 76937962 76937963
chrX 39922358 39922359
chrX 39932906 39932907
chrX 44938562 44938563
chrX 100608190 100608191
chrX 100611284 100611285
chrX 76940533 76940534

File diff suppressed because it is too large Load Diff

View File

@ -1,128 +1,128 @@
chr1 2488153 2488153
chr1 23885599 23885599
chr1 65321388 65321388
chr1 120612006 120612006
chr1 193094375 193094375
chr1 226570840 226570840
chr2 47693959 47693959
chr2 48030838 48030838
chr2 61749716 61749716
chr2 141032088 141032088
chr2 215645464 215645464
chr3 30686414 30686414
chr3 47125385 47125385
chr3 134644636 134644636
chr3 142281612 142281612
chr3 192053274 192053274
chr4 55599436 55599436
chr4 143007419 143007419
chr4 153252061 153252061
chr4 185310218 185310218
chr4 25666099 25666099
chr5 35857177 35857177
chr5 38955796 38955796
chr5 39074296 39074296
chr5 67522722 67522722
chr5 112162854 112162854
chr5 149435759 149435759
chr5 161119125 161119125
chr5 170819887 170819887
chr5 180046344 180046344
chr6 30858857 30858857
chr6 32797876 32797876
chr6 36645696 36645696
chr6 117678083 117678083
chr6 117714346 117714346
chr7 2946461 2946461
chr7 2962753 2962753
chr7 87138645 87138645
chr7 151970931 151970931
chr8 90955624 90955624
chr8 90970935 90970935
chr8 90995019 90995019
chr8 68864728 68864728
chr8 38322346 38322346
chr8 145742879 145742879
chr9 5081780 5081780
chr9 8389364 8389364
chr9 93657761 93657761
chr9 98209594 98209594
chr9 21991923 21991923
chr9 37020622 37020622
chr9 21975017 21975017
chr10 43600689 43600689
chr10 89720907 89720907
chr10 104596981 104596981
chr10 104849468 104849468
chr10 104855656 104855656
chr10 123239112 123239112
chr11 32410774 32410774
chr11 69462910 69462910
chr11 94197260 94197260
chr11 94225807 94225807
chr11 94225920 94225920
chr11 125497466 125497466
chr12 1040373 1040373
chr12 4553383 4553383
chr12 6711147 6711147
chr12 21331625 21331625
chr12 49427652 49427652
chr12 49444545 49444545
chr12 121416622 121416622
chr12 133202215 133202215
chr13 21562948 21562948
chr13 28609825 28609825
chr13 32936646 32936646
chr13 113889474 113889474
chr13 113907391 113907391
chr14 105239894 105239894
chr14 105246325 105246325
chr14 38064215 38064215
chr14 20820537 20820537
chr14 35871217 35871217
chr15 41865488 41865488
chr15 51502986 51502986
chr15 51529112 51529112
chr15 99456253 99456253
chr15 99478225 99478225
chr15 51558731 51558731
chr16 68857441 68857441
chr16 359953 359953
chr16 89805977 89805977
chr16 89838078 89838078
chr16 89857964 89857964
chr17 7983969 7983969
chr17 29486152 29486152
chr17 29508775 29508775
chr17 29546175 29546175
chr17 29559932 29559932
chr17 29653293 29653293
chr17 37879762 37879762
chr17 62007498 62007498
chr17 78919558 78919558
chr19 3110349 3110349
chr19 3119239 3119239
chr19 4101062 4101062
chr19 11136215 11136215
chr19 15289613 15289613
chr19 15295134 15295134
chr19 41725271 41725271
chr20 43956527 43956527
chr20 43956636 43956636
chr20 43958850 43958850
chr20 43958872 43958872
chr20 54959296 54959296
chr20 54961541 54961541
chr21 37518706 37518706
chr21 39752673 39752673
chr21 39753375 39753375
chr22 41568480 41568480
chr22 30038152 30038152
chr22 30079213 30079213
chrX 76937963 76937963
chrX 39922359 39922359
chrX 39932907 39932907
chrX 44938563 44938563
chrX 100608191 100608191
chrX 100611285 100611285
chrX 76940534 76940534
chr1 2488152 2488153
chr1 23885598 23885599
chr1 65321387 65321388
chr1 120612005 120612006
chr1 193094374 193094375
chr1 226570839 226570840
chr2 47693958 47693959
chr2 48030837 48030838
chr2 61749715 61749716
chr2 141032087 141032088
chr2 215645463 215645464
chr3 30686413 30686414
chr3 47125384 47125385
chr3 134644635 134644636
chr3 142281611 142281612
chr3 192053273 192053274
chr4 55599435 55599436
chr4 143007418 143007419
chr4 153252060 153252061
chr4 185310217 185310218
chr4 25666098 25666099
chr5 35857176 35857177
chr5 38955795 38955796
chr5 39074295 39074296
chr5 67522721 67522722
chr5 112162853 112162854
chr5 149435758 149435759
chr5 161119124 161119125
chr5 170819886 170819887
chr5 180046343 180046344
chr6 30858856 30858857
chr6 32797875 32797876
chr6 36645695 36645696
chr6 117678082 117678083
chr6 117714345 117714346
chr7 2946460 2946461
chr7 2962752 2962753
chr7 87138644 87138645
chr7 151970930 151970931
chr8 90955623 90955624
chr8 90970934 90970935
chr8 90995018 90995019
chr8 68864727 68864728
chr8 38322345 38322346
chr8 145742878 145742879
chr9 5081779 5081780
chr9 8389363 8389364
chr9 93657760 93657761
chr9 98209593 98209594
chr9 21991922 21991923
chr9 37020621 37020622
chr9 21975016 21975017
chr10 43600688 43600689
chr10 89720906 89720907
chr10 104596980 104596981
chr10 104849467 104849468
chr10 104855655 104855656
chr10 123239111 123239112
chr11 32410773 32410774
chr11 69462909 69462910
chr11 94197259 94197260
chr11 94225806 94225807
chr11 94225919 94225920
chr11 125497465 125497466
chr12 1040372 1040373
chr12 4553382 4553383
chr12 6711146 6711147
chr12 21331624 21331625
chr12 49427651 49427652
chr12 49444544 49444545
chr12 121416621 121416622
chr12 133202214 133202215
chr13 21562947 21562948
chr13 28609824 28609825
chr13 32936645 32936646
chr13 113889473 113889474
chr13 113907390 113907391
chr14 105239893 105239894
chr14 105246324 105246325
chr14 38064214 38064215
chr14 20820536 20820537
chr14 35871216 35871217
chr15 41865487 41865488
chr15 51502985 51502986
chr15 51529111 51529112
chr15 99456252 99456253
chr15 99478224 99478225
chr15 51558730 51558731
chr16 68857440 68857441
chr16 359952 359953
chr16 89805976 89805977
chr16 89838077 89838078
chr16 89857963 89857964
chr17 7983968 7983969
chr17 29486151 29486152
chr17 29508774 29508775
chr17 29546174 29546175
chr17 29559931 29559932
chr17 29653292 29653293
chr17 37879761 37879762
chr17 62007497 62007498
chr17 78919557 78919558
chr19 3110348 3110349
chr19 3119238 3119239
chr19 4101061 4101062
chr19 11136214 11136215
chr19 15289612 15289613
chr19 15295133 15295134
chr19 41725270 41725271
chr20 43956526 43956527
chr20 43956635 43956636
chr20 43958849 43958850
chr20 43958871 43958872
chr20 54959295 54959296
chr20 54961540 54961541
chr21 37518705 37518706
chr21 39752672 39752673
chr21 39753374 39753375
chr22 41568479 41568480
chr22 30038151 30038152
chr22 30079212 30079213
chrX 76937962 76937963
chrX 39922358 39922359
chrX 39932906 39932907
chrX 44938562 44938563
chrX 100608190 100608191
chrX 100611284 100611285
chrX 76940533 76940534

View File

@ -1,128 +1,128 @@
chr1 2488153 2488153
chr1 23885599 23885599
chr1 65321388 65321388
chr1 120612006 120612006
chr1 193094375 193094375
chr1 226570840 226570840
chr2 47693959 47693959
chr2 48030838 48030838
chr2 61749716 61749716
chr2 141032088 141032088
chr2 215645464 215645464
chr3 30686414 30686414
chr3 47125385 47125385
chr3 134644636 134644636
chr3 142281612 142281612
chr3 192053274 192053274
chr4 55599436 55599436
chr4 143007419 143007419
chr4 153252061 153252061
chr4 185310218 185310218
chr4 25666099 25666099
chr5 35857177 35857177
chr5 38955796 38955796
chr5 39074296 39074296
chr5 67522722 67522722
chr5 112162854 112162854
chr5 149435759 149435759
chr5 161119125 161119125
chr5 170819887 170819887
chr5 180046344 180046344
chr6 30858857 30858857
chr6 32797876 32797876
chr6 36645696 36645696
chr6 117678083 117678083
chr6 117714346 117714346
chr7 2946461 2946461
chr7 2962753 2962753
chr7 87138645 87138645
chr7 151970931 151970931
chr8 90955624 90955624
chr8 90970935 90970935
chr8 90995019 90995019
chr8 68864728 68864728
chr8 38322346 38322346
chr8 145742879 145742879
chr9 5081780 5081780
chr9 8389364 8389364
chr9 93657761 93657761
chr9 98209594 98209594
chr9 21991923 21991923
chr9 37020622 37020622
chr9 21975017 21975017
chr10 43600689 43600689
chr10 89720907 89720907
chr10 104596981 104596981
chr10 104849468 104849468
chr10 104855656 104855656
chr10 123239112 123239112
chr11 32410774 32410774
chr11 69462910 69462910
chr11 94197260 94197260
chr11 94225807 94225807
chr11 94225920 94225920
chr11 125497466 125497466
chr12 1040373 1040373
chr12 4553383 4553383
chr12 6711147 6711147
chr12 21331625 21331625
chr12 49427652 49427652
chr12 49444545 49444545
chr12 121416622 121416622
chr12 133202215 133202215
chr13 21562948 21562948
chr13 28609825 28609825
chr13 32936646 32936646
chr13 113889474 113889474
chr13 113907391 113907391
chr14 105239894 105239894
chr14 105246325 105246325
chr14 38064215 38064215
chr14 20820537 20820537
chr14 35871217 35871217
chr15 41865488 41865488
chr15 51502986 51502986
chr15 51529112 51529112
chr15 99456253 99456253
chr15 99478225 99478225
chr15 51558731 51558731
chr16 68857441 68857441
chr16 359953 359953
chr16 89805977 89805977
chr16 89838078 89838078
chr16 89857964 89857964
chr17 7983969 7983969
chr17 29486152 29486152
chr17 29508775 29508775
chr17 29546175 29546175
chr17 29559932 29559932
chr17 29653293 29653293
chr17 37879762 37879762
chr17 62007498 62007498
chr17 78919558 78919558
chr19 3110349 3110349
chr19 3119239 3119239
chr19 4101062 4101062
chr19 11136215 11136215
chr19 15289613 15289613
chr19 15295134 15295134
chr19 41725271 41725271
chr20 43956527 43956527
chr20 43956636 43956636
chr20 43958850 43958850
chr20 43958872 43958872
chr20 54959296 54959296
chr20 54961541 54961541
chr21 37518706 37518706
chr21 39752673 39752673
chr21 39753375 39753375
chr22 41568480 41568480
chr22 30038152 30038152
chr22 30079213 30079213
chrX 76937963 76937963
chrX 39922359 39922359
chrX 39932907 39932907
chrX 44938563 44938563
chrX 100608191 100608191
chrX 100611285 100611285
chrX 76940534 76940534
chr1 2488152 2488153
chr1 23885598 23885599
chr1 65321387 65321388
chr1 120612005 120612006
chr1 193094374 193094375
chr1 226570839 226570840
chr2 47693958 47693959
chr2 48030837 48030838
chr2 61749715 61749716
chr2 141032087 141032088
chr2 215645463 215645464
chr3 30686413 30686414
chr3 47125384 47125385
chr3 134644635 134644636
chr3 142281611 142281612
chr3 192053273 192053274
chr4 55599435 55599436
chr4 143007418 143007419
chr4 153252060 153252061
chr4 185310217 185310218
chr4 25666098 25666099
chr5 35857176 35857177
chr5 38955795 38955796
chr5 39074295 39074296
chr5 67522721 67522722
chr5 112162853 112162854
chr5 149435758 149435759
chr5 161119124 161119125
chr5 170819886 170819887
chr5 180046343 180046344
chr6 30858856 30858857
chr6 32797875 32797876
chr6 36645695 36645696
chr6 117678082 117678083
chr6 117714345 117714346
chr7 2946460 2946461
chr7 2962752 2962753
chr7 87138644 87138645
chr7 151970930 151970931
chr8 90955623 90955624
chr8 90970934 90970935
chr8 90995018 90995019
chr8 68864727 68864728
chr8 38322345 38322346
chr8 145742878 145742879
chr9 5081779 5081780
chr9 8389363 8389364
chr9 93657760 93657761
chr9 98209593 98209594
chr9 21991922 21991923
chr9 37020621 37020622
chr9 21975016 21975017
chr10 43600688 43600689
chr10 89720906 89720907
chr10 104596980 104596981
chr10 104849467 104849468
chr10 104855655 104855656
chr10 123239111 123239112
chr11 32410773 32410774
chr11 69462909 69462910
chr11 94197259 94197260
chr11 94225806 94225807
chr11 94225919 94225920
chr11 125497465 125497466
chr12 1040372 1040373
chr12 4553382 4553383
chr12 6711146 6711147
chr12 21331624 21331625
chr12 49427651 49427652
chr12 49444544 49444545
chr12 121416621 121416622
chr12 133202214 133202215
chr13 21562947 21562948
chr13 28609824 28609825
chr13 32936645 32936646
chr13 113889473 113889474
chr13 113907390 113907391
chr14 105239893 105239894
chr14 105246324 105246325
chr14 38064214 38064215
chr14 20820536 20820537
chr14 35871216 35871217
chr15 41865487 41865488
chr15 51502985 51502986
chr15 51529111 51529112
chr15 99456252 99456253
chr15 99478224 99478225
chr15 51558730 51558731
chr16 68857440 68857441
chr16 359952 359953
chr16 89805976 89805977
chr16 89838077 89838078
chr16 89857963 89857964
chr17 7983968 7983969
chr17 29486151 29486152
chr17 29508774 29508775
chr17 29546174 29546175
chr17 29559931 29559932
chr17 29653292 29653293
chr17 37879761 37879762
chr17 62007497 62007498
chr17 78919557 78919558
chr19 3110348 3110349
chr19 3119238 3119239
chr19 4101061 4101062
chr19 11136214 11136215
chr19 15289612 15289613
chr19 15295133 15295134
chr19 41725270 41725271
chr20 43956526 43956527
chr20 43956635 43956636
chr20 43958849 43958850
chr20 43958871 43958872
chr20 54959295 54959296
chr20 54961540 54961541
chr21 37518705 37518706
chr21 39752672 39752673
chr21 39753374 39753375
chr22 41568479 41568480
chr22 30038151 30038152
chr22 30079212 30079213
chrX 76937962 76937963
chrX 39922358 39922359
chrX 39932906 39932907
chrX 44938562 44938563
chrX 100608190 100608191
chrX 100611284 100611285
chrX 76940533 76940534

View File

@ -190,8 +190,10 @@ workflow pipeline {
output_dir=workdir,
probe=probe,
raw_vcf=call_mutation.raw_vcf,
somatic_vcf=call_mutation.somatic_vcf,
germline_vcf=call_mutation.germline_vcf
initial_vcf=call_mutation.initial_vcf,
normal_vcf=call_mutation.normal_vcf,
cnv_cnr=call_cnv.cnv_cnr,
cnv_cns=call_cnv.cnv_cns
}
call postprocess.call_postprocess as call_postprocess {

View File

@ -75,6 +75,8 @@ task mutation_calling_umi {
-e 'INFO/AF[0] >= 0.1' \
-o ${output_dir}/mutation/${name}.snp_indel.germline.vcf
cp ${output_dir}/mutation/${name}.1r.vcf ${output_dir}/mutation/${name}.initial.vcf
>>>
output {
@ -165,8 +167,13 @@ task mutation_calling_umi_control {
${output_dir}/mutation/${name}.raw.addtagmsi.addmutalt.snp_indel.vcf
cp ${output_dir}/mutation/${name}.raw.addtagmsi.addmutalt.snp_indel.vcf ${output_dir}/mutation/${name}.snp_indel.somatic.vcf
cp ${output_dir}/mutation/${name}.normal.vcf ${output_dir}/mutation/${name}.snp_indel.germline.vcf
# cp ${output_dir}/mutation/${name}.normal.vcf ${output_dir}/mutation/${name}.snp_indel.germline.vcf
vcf_filter.py \
-i ${output_dir}/mutation/${name}.normal.vcf \
-e 'INFO/AF[0] >= 0.1' \
-o ${output_dir}/mutation/${name}.snp_indel.germline.vcf
cp ${output_dir}/mutation/${name}.1r.vcf ${output_dir}/mutation/${name}.initial.vcf
>>>
output {
@ -221,6 +228,8 @@ task mutation_calling_tissue {
-e 'INFO/AF[0] > 0.1' \
-o ${output_dir}/mutation/${name}.snp_indel.germline.vcf
cp ${output_dir}/mutation/${name}.raw.vcf ${output_dir}/mutation/${name}.initial.vcf
>>>
output {
@ -274,6 +283,8 @@ task mutation_calling_tissue_control {
-o ${output_dir}/mutation/${name}.snp_indel.germline.vcf \
-e 'INFO/STATUS="Germline" && FORMAT/AF[0] > 0.1 && FORMAT/AF[1] > 0.1'
cp ${output_dir}/mutation/${name}.raw.vcf ${output_dir}/mutation/${name}.initial.vcf
>>>
output {
@ -824,5 +835,7 @@ workflow call_mutation {
String somatic_filter = "${output_dir}/mutation/${tumor}.snp_indel.somatic.hg19_multianno.filter.txt"
String germline_filter = "${output_dir}/mutation/${tumor}.snp_indel.germline.hg19_multianno.filter.txt"
String hotspot_filter = "${output_dir}/mutation/${tumor}.snp_indel.hotspot.hg19_multianno.filter.txt"
String initial_vcf = "${output_dir}/mutation/${tumor}.initial.vcf"
String normal_vcf = "${output_dir}/mutation/${tumor}.normal.vcf"
}
}

View File

@ -151,6 +151,8 @@ workflow call_cnv {
output {
String cnv_filter = "${output_dir}/cnv/${tumor}.rmdup.cns.filter.txt"
String cnv_cnr = "${output_dir}/cnv/${tumor}.rmdup.cnr"
String cnv_cns = "${output_dir}/cnv/${tumor}.rmdup.cns"
}
}

View File

@ -21,6 +21,25 @@ task run_pollution {
-c $PUBLIC/pollution/${probe}_contaminate_cnvkit.bed
}
output {
String cnvkit_tumor_vcf = "${output_dir}/pollution/${name}_cnvkit_tumor.vcf"
String cnvkit_normal_vcf = "${output_dir}/pollution/${name}_cnvkit_normal.vcf"
}
}
task run_generate_png {
String name
String probe
String cnvkit_tumor_vcf
String cnvkit_normal_vcf
String cnv_cnr
String cnv_cns
String output_dir
command {
cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_tumor_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_tumor.png
cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_normal_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_normal.png
}
}
workflow call_pollution {
@ -32,8 +51,10 @@ workflow call_pollution {
String output_dir
String probe
String raw_vcf
String somatic_vcf
String germline_vcf
String initial_vcf
String normal_vcf
String cnv_cnr
String cnv_cns
if (run) {
if (defined(normal)) {
@ -42,8 +63,19 @@ workflow call_pollution {
name=tumor,
output_dir=output_dir,
probe=probe,
vcf=somatic_vcf,
vcf2=germline_vcf
vcf=initial_vcf,
vcf2=normal_vcf
}
call run_generate_png {
input:
name=tumor,
probe=probe,
cnvkit_tumor_vcf=run_pollution_paired.cnvkit_tumor_vcf,
cnvkit_normal_vcf=run_pollution_paired.cnvkit_normal_vcf,
cnv_cnr=cnv_cnr,
cnv_cns=cnv_cns,
output_dir=output_dir
}
}
@ -53,12 +85,12 @@ workflow call_pollution {
name=tumor,
output_dir=output_dir,
probe=probe,
vcf=raw_vcf
vcf=initial_vcf
}
}
}
output {
String pollution_res = "${output_dir}/pollution/${tumor}_pollution.csv"
String pollution_res = "${output_dir}/pollution/${tumor}_pollution.txt"
}
}