污染处理

2024-01-19 17:56:25 +08:00 · 2024-01-19 17:56:25 +08:00 · 1922d14b76
parent 73e90f8a5b
commit 1922d14b76
12 changed files with 9104 additions and 9072 deletions
--- a/codes/pollution.py
+++ b/codes/pollution.py
@ -4,51 +4,82 @@ import argparse
 import os

 import pandas as pd
+import pysam


-def single_monitoring(name, vcf, bed, freq_range, output_dir):
-    vcf_header = []  # 用于存储VCF文件头部分
-    vcf_data = []  # 用于存储筛选后的VCF数据
-    p_value_list = []  # 用于存储符合条件的 p-value
+def load_bed_regions(bed_file):
+    bed_regions = set()
+    if bed_file:
+        with open(bed_file, 'r') as file:
+            for line in file:
+                parts = line.strip().split('\t')
+                if len(parts) >= 3:
+                    chrom, start, end = parts[0], int(parts[1]), int(parts[2])
+                    bed_regions.add((chrom, start, end))
+    return bed_regions

-    # 按行处理，保存为列表
-    with open(vcf, 'r') as vcf_file:
-        for line in vcf_file:
-            if line.startswith("#"):
-                vcf_header.append(line)
+
+def build_record_dict(vcf_file):
+    records = {}
+    for record in vcf_file:
+        key = (record.contig, record.pos)
+        records[key] = record
+    return records
+
+
+def simplify_genotype(freq):
+    if freq < 0.1:
+        return '0/0'
+    elif 0.1 <= freq <= 0.9:
+        return '0/1'
    else:
-                vcf_data.append(line)
+        return '1/1'
+
+
+def single_monitor(name, vcf_file, bed_file, freq_range, output_dir):
+    """
+    17， 160 单样本处理
+    rawvcf 对应bed 挑选出 位点，计算位点是否在给定范围内，计算比例得到z_socre
+    """

    # bed 处理
-    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
-    position_list = list(df_position[2])
-    chr_list = list(df_position[0])
-    result_data = []
-
-    for i in range(len(position_list)):
-        filtered_lines = [line for line in vcf_data if
-                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
-                              line.split()[3]) < 2 and len(line.split()[4]) < 2]
-        for line in filtered_lines:
-            p_value_str = line.split()[9].split(":")[4]
-            p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
-            if 0.1 <= p_value <= 0.9:
-                result_data.append(line)
-                p_value_list.append(p_value)
-    select_vcf = os.path.join(output_dir, f'{name}_select.vcf')
-    with open(select_vcf, 'w') as output:
-        for header_line in vcf_header:
-            output.write(header_line)
-        for data_line in result_data:
-            output.write(data_line)
+    bed_regions = load_bed_regions(bed_file)

+    vcf = pysam.VariantFile(vcf_file)
+    res_pos = list()
    count_normal = 0
    count_exception = 0
-    for p_value in p_value_list:
-        if freq_range[0] <= p_value <= freq_range[1]:
+
+    for record in vcf:
+        contig = record.contig
+        pos = record.pos
+        ref = record.ref
+        alt = record.alts[0]
+        freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
+
+        if freq < 0.1 or freq > 0.9:
+            continue
+
+        # 检查记录是否是点突变
+        if len(record.ref) == 1 and len(record.alts[0]) == 1:
+            # 在bed 区间内
+            for (bed_chrom, bed_start, bed_end) in bed_regions:
+                if contig == bed_chrom and bed_start < pos <= bed_end:
+                    if freq_range[0] <= freq <= freq_range[1]:
                        count_normal += 1
+                        res = 'yes'
                    else:
                        count_exception += 1
+                        res = 'no'
+                    res_pos.append(dict(
+                        chr=contig,
+                        pos=pos,
+                        ref=ref,
+                        alt=alt,
+                        freq=freq,
+                        res=res
+                    ))
+                    break
    count_all = count_exception + count_normal
    if count_all == 0:
        z_score = 0
@ -61,212 +92,160 @@ def single_monitoring(name, vcf, bed, freq_range, output_dir):
        count_exception=count_exception,
        z_score=z_score
    )
-    pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution.csv'), sep="\t", index=False)
+    pd.DataFrame([res]).to_csv(os.path.join(output_dir, f'{name}_pollution_sum.txt'), sep="\t", index=False)
+
+    res_pos_df = pd.DataFrame(res_pos)
+    if not res_pos_df.empty:
+        res_pos_df.rename(columns={
+            'res': f'res{freq_range}',
+        }, inplace=True)
+        res_pos_df.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)


-# 根据小bed筛选vcf
-def select_position(vcf, bed, matched_file, unmatched_file):
-    vcf_header = []  # 用于存储VCF文件头部分
-    vcf_data = []  # 用于存储筛选后的VCF数据
+def paired_monitor(name, vcf_file, bed_file, cnvkit_bed, output_dir):
+    """
+    682双样本处理
+    """
+    bed_regions = load_bed_regions(bed_file)
+    cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
+    vcf = pysam.VariantFile(vcf_file)
+    records_dict = build_record_dict(vcf)
+    res_pos = list()

-    # 按行处理，保存为列表
-    with open(vcf, 'r') as vcf_file:
-        for line in vcf_file:
-            if line.startswith("#"):
-                vcf_header.append(line)
-            else:
-                vcf_data.append(line)
+    # 遍历 bed
+    for (bed_chrom, bed_start, bed_end) in bed_regions:
+        key = (bed_chrom, bed_end)
+        ref = '.'
+        alt = '.'
+        freq = '.'
+        genotype = '.'
+        ref_normal = '.'
+        alt_normal = '.'
+        freq_normal = '.'
+        genotype_normal = '.'
+        res = 'no'
+        if key in records_dict:
+            record = records_dict[key]
+            ref = record.ref
+            alt = record.alts[0]
+            freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
+            freq_normal = record.samples.get(record.samples.keys()[1]).get("AF")[0]
+            ref_normal = ref
+            alt_normal = alt

-    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
-    position_list = list(df_position[2])
-    chr_list = list(df_position[0])
-    result_data = []
-    unmatched_data = []  # 用于存储未匹配的数据
+            genotype = simplify_genotype(freq)
+            genotype_normal = simplify_genotype(freq_normal)

-    for i in range(len(position_list)):
-        filtered_lines = [line for line in vcf_data if
-                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
-                              line.split()[3]) < 2 and len(line.split()[4]) < 2]
-        if not filtered_lines:  # 如果没有匹配的点，添加到未匹配数据列表
-            unmatched_data.append(f"{chr_list[i]}\t{position_list[i]}\t.\t.\t.\t.\t.\n")
-        result_data.extend(filtered_lines)
+        if genotype == genotype_normal:
+            res = 'yes'
+        res_pos.append(dict(
+            chr=bed_chrom,
+            pos=bed_end,
+            ref=ref,
+            alt=alt,
+            freq=freq,
+            genotype=genotype,
+            ref_normal=ref_normal,
+            alt_normal=alt_normal,
+            freq_normal=freq_normal,
+            genotype_normal=genotype_normal,
+            res=res
+        ))
+    res_pos_df = pd.DataFrame(res_pos)
+    res_pos_df_sort = res_pos_df.sort_values(by='chr')
+    res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)

-    with open(matched_file, 'w') as output:
-        for header_line in vcf_header:
-            output.write(header_line)
-        for data_line in result_data:
-            output.write(data_line)
+    output_tumor_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w')
+    output_normal_vcf = open(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w')
+    for line in str(vcf.header).strip().split('\n'):
+        output_tumor_vcf.write(line + '\n') if line.startswith('##') else output_tumor_vcf.write(
+            '\t'.join(line.split('\t')[:-1]) + '\n')
+        output_normal_vcf.write(line + '\n') if line.startswith('##') else output_normal_vcf.write(
+            '\t'.join(line.split('\t')[:-2] + line.split('\t')[-1:]) + '\n')

-    with open(unmatched_file, 'w') as unmatched_output:
-        unmatched_output.writelines(unmatched_data)
+    for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
+        key = (bed_chrom, bed_end)
+        if key in records_dict:
+            record = records_dict[key]
+            output_tumor_vcf.write('\t'.join(str(record).split('\t')[:-1]) + '\n')
+            output_normal_vcf.write('\t'.join(str(record).split('\t')[:-2] + str(record).split('\t')[-1:]) + '\n')

-    return matched_file, unmatched_file
+    output_tumor_vcf.close()
+    output_normal_vcf.close()


-# 处理体系、胚系的vcf，得到目标信息
-def process_judge_vcf(input_vcf, output_vcf):
-    with open(input_vcf, 'r') as input_file, open(output_vcf, 'w') as output_file:
-        for line in input_file:
-            if not line.startswith("#"):
-                fields = line.strip().split('\t')
+def paired_monitor_vcf(name, tumor_vcf_file, normal_vcf_file, bed_file, cnvkit_bed, output_dir):
+    """
+    624双样本处理
+    """
+    bed_regions = load_bed_regions(bed_file)
+    tumor_vcf = pysam.VariantFile(tumor_vcf_file)
+    normal_vcf = pysam.VariantFile(normal_vcf_file)

-                vcf_format = fields[8].split(":")
-                vcf_normal = fields[-1].split(":")
-                vcf_info = dict(zip(vcf_format, vcf_normal))
-                af = vcf_info['AF']
-                vd = vcf_info['VD']
-                # info = fields[9].split(":")
-                # percentage = float(info[6])
-                percentage = float(af)
-                if 0.1 <= percentage <= 0.9:
-                    b = 0.5
-                elif percentage < 0.1:
-                    b = 0
-                elif percentage > 0.9:
-                    b = 1
+    tumor_records_dict = build_record_dict(tumor_vcf)
+    normal_records_dict = build_record_dict(normal_vcf)

-                # 构建新的行数据
-                new_line = '\t'.join([fields[0], fields[1], fields[3], fields[4], af, str(b), vd])
-                output_file.write(new_line + '\n')
-        return output_vcf
+    res_pos = list()

+    # 遍历 bed
+    for (bed_chrom, bed_start, bed_end) in bed_regions:
+        key = (bed_chrom, bed_end)
+        ref = '.'
+        alt = '.'
+        freq = '.'
+        genotype = '.'
+        ref_normal = '.'
+        alt_normal = '.'
+        freq_normal = '.'
+        genotype_normal = '.'
+        res = 'no'

-def merge_and_sort_files(matched_file, unmatched_file, output_file):
-    # 检查 unmatched_file 是否为空
-    if os.stat(unmatched_file).st_size == 0:
-        # 对 matched_file 进行排序并写入 output_file
-        matched_df = pd.read_csv(matched_file, sep='\t', header=None)
-        sorted_df = matched_df.sort_values(by=[0, 1])
-        sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
-        return output_file
+        if key in tumor_records_dict:
+            record = tumor_records_dict[key]
+            ref = record.ref
+            alt = record.alts[0]
+            freq = record.samples.get(record.samples.keys()[0]).get("AF")[0]
+            genotype = simplify_genotype(freq)
+        if key in normal_records_dict:
+            record = normal_records_dict[key]
+            ref_normal = record.ref
+            alt_normal = record.alts[0]
+            freq_normal = record.samples.get(record.samples.keys()[0]).get("AF")[0]
+            genotype_normal = simplify_genotype(freq_normal)
+        if genotype == genotype_normal:
+            res = 'yes'
+        res_pos.append(dict(
+            chr=bed_chrom,
+            pos=bed_end,
+            ref=ref,
+            alt=alt,
+            freq=freq,
+            genotype=genotype,
+            ref_normal=ref_normal,
+            alt_normal=alt_normal,
+            freq_normal=freq_normal,
+            genotype_normal=genotype_normal,
+            res=res
+        ))

-    # 如果 unmatched_file 不为空，继续合并和排序操作
-    if os.stat(matched_file).st_size == 0:
-        matched_df = pd.DataFrame()
-    else:
-        matched_df = pd.read_csv(matched_file, sep='\t', header=None)
-    unmatched_df = pd.read_csv(unmatched_file, sep='\t', header=None)
+    res_pos_df = pd.DataFrame(res_pos)
+    res_pos_df_sort = res_pos_df.sort_values(by='chr')
+    res_pos_df_sort.to_csv(os.path.join(output_dir, f'{name}_pollution_pos.txt'), sep="\t", index=False)

-    # 合并数据帧
-    combined_df = pd.concat([matched_df, unmatched_df])
+    output_tumor_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_tumor.vcf'), 'w',
+                                         header=tumor_vcf.header)
+    output_normal_vcf = pysam.VariantFile(os.path.join(output_dir, f'{name}_cnvkit_normal.vcf'), 'w',
+                                          header=normal_vcf.header)

-    # 根据第一列和第二列排序
-    sorted_df = combined_df.sort_values(by=[0, 1])
-
-    # 将排序后的数据写入输出文件
-    sorted_df.to_csv(output_file, sep='\t', header=False, index=False)
-    return output_file
-
-
-# 合并体系，胚系
-def merge_and_compare_files(somatic_file, germline_file, output_merged_file, output_final_file):
-    # 合并两个文件
-    with open(somatic_file, 'r') as somatic, open(germline_file, 'r') as germline:
-        merged_lines = [f"{somatic_line.strip()}\t{germline_line.strip()}" for somatic_line, germline_line in
-                        zip(somatic, germline)]
-
-    # 将合并后的数据写入输出文件
-    with open(output_merged_file, 'w') as output_file:
-        output_file.write('\n'.join(merged_lines))
-
-    # 比较两列数据并添加比较结果列
-    with open(output_merged_file, 'r') as merged, open(output_final_file, 'w') as final_output:
-        for line in merged:
-            fields = line.strip().split('\t')
-            if fields[5] == fields[12]:
-                comparison_result = "yes"
-            else:
-                comparison_result = "no"
-            final_output.write(f"{line.strip()}\t{comparison_result}\n")
-    return output_merged_file, output_final_file
-
-
-# 根据大bed筛选vcf，作cnvkit的图
-def select_cnvkit_vcf(vcf, bed, output_file):
-    vcf_header = []  # 用于存储VCF文件头部分
-    vcf_data = []  # 用于存储筛选后的VCF数据
-    p_value_list = []  # 用于存储符合条件的 p-value
-
-    # 按行处理，保存为列表
-    with open(vcf, 'r') as vcf_file:
-        for line in vcf_file:
-            if line.startswith("#"):
-                vcf_header.append(line)
-            else:
-                vcf_data.append(line)
-
-    df_position = pd.read_csv(bed, sep='\t', header=None, names=range(4))
-    position_list = list(df_position[2])
-    chr_list = list(df_position[0])
-    result_data = []
-    for i in range(len(position_list)):
-        filtered_lines = [line for line in vcf_data if
-                          line.split()[1] == str(position_list[i]) and line.split()[0] == str(chr_list[i]) and len(
-                              line.split()[3]) < 2 and len(line.split()[4]) < 2]
-        for line in filtered_lines:
-            # p_value_str = line.split()[9].split(":")[6]
-            # p_value = float(p_value_str[:-1]) / 100 if p_value_str[-1] == "%" else float(p_value_str)
-
-            vcf_format = line.split('\t')[8].split(":")
-            vcf_normal = line.split('\t')[-1].split(":")
-            vcf_info = dict(zip(vcf_format, vcf_normal))
-            p_value = float(vcf_info['AF'])
-
-            if 0.1 <= p_value <= 0.9:
-                result_data.append(line)
-
-    with open(output_file, 'w') as output:
-        for header_line in vcf_header:
-            output.write(header_line)
-        for data_line in result_data:
-            output.write(data_line)
-
-    return output_file
-
-
-def paired_monitoring(name, somatic_vcf, germline_vcf, ref_bed, cnvkit_ref_bed, output_dir):
-    # 处理体系，根据bed筛选
-    select_position_output_file1 = os.path.join(output_dir, f'{name}_somatic_matched.vcf')
-    select_position_output_file2 = os.path.join(output_dir, f'{name}_somatic_unmatched.vcf')
-    somatic_matched_file, somatic_unmatched_file = select_position(somatic_vcf, ref_bed, select_position_output_file1,
-                                                                   select_position_output_file2)
-    # 处理胚系，根据bed筛选
-    select_position_output_file3 = os.path.join(output_dir, f'{name}_germline_matched.vcf')
-    select_position_output_file4 = os.path.join(output_dir, f'{name}_germline_unmatched.vcf')
-    germline_matched_file, germline_unmatched_file = select_position(germline_vcf, ref_bed,
-                                                                     select_position_output_file3,
-                                                                     select_position_output_file4)
-    # 处理体系，数值转换
-    process_judge_vcf_file1 = os.path.join(output_dir, f'{name}_somatic_matched_add_judge.vcf')
-    somatic_matched_add_judge_file = process_judge_vcf(somatic_matched_file, process_judge_vcf_file1)
-    # 处理胚系，数值转换
-    process_judge_vcf_file2 = os.path.join(output_dir, f'{name}_germline_matched_add_judge.vcf')
-    germline_matched_add_judge_file = process_judge_vcf(germline_matched_file, process_judge_vcf_file2)
-    # 合并体系，将匹配到的和未匹配到bed的的合并
-    merge_and_sort_files_file1 = os.path.join(output_dir, f'{name}_somatic_merged.vcf')
-    somatic_merged_file = merge_and_sort_files(somatic_matched_add_judge_file, somatic_unmatched_file,
-                                               merge_and_sort_files_file1)
-    # 合并胚系，将匹配到的和未匹配到bed的的合并
-    merge_and_sort_files_file2 = os.path.join(output_dir, f'{name}_germline__merged.vcf')
-    Germline_merged_file = merge_and_sort_files(germline_matched_add_judge_file, germline_unmatched_file,
-                                                merge_and_sort_files_file2)
-    # 合并胚系，体系，将体系，胚系两个合并文件再合并
-    result_pro_file = os.path.join(output_dir, f'{name}_result_pro.txt')
-    result_file = os.path.join(output_dir, f'{name}_contaminate_result.txt')
-    merge_and_compare_files(somatic_merged_file, Germline_merged_file, result_pro_file, result_file)
-    ##筛选作图vcf
-    cnvkit_output_file = os.path.join(output_dir, f'{name}_select_cnvkit.vcf')
-    select_cnvkit_vcf(germline_vcf, cnvkit_ref_bed, cnvkit_output_file)
-    ##删除中间文件
-    os.remove(select_position_output_file1)
-    os.remove(select_position_output_file2)
-    os.remove(select_position_output_file3)
-    os.remove(select_position_output_file4)
-    os.remove(process_judge_vcf_file1)
-    os.remove(process_judge_vcf_file2)
-    os.remove(merge_and_sort_files_file1)
-    os.remove(merge_and_sort_files_file2)
-    os.remove(result_pro_file)
+    cnvkit_bed_regions = load_bed_regions(cnvkit_bed)
+    for (bed_chrom, bed_start, bed_end) in cnvkit_bed_regions:
+        key = (bed_chrom, bed_end)
+        if key in normal_records_dict:
+            output_normal_vcf.write(normal_records_dict[key])
+        if key in tumor_records_dict:
+            output_tumor_vcf.write(tumor_records_dict[key])
+    output_tumor_vcf.close()
+    output_normal_vcf.close()


 if __name__ == '__main__':
@ -276,7 +255,8 @@ if __name__ == '__main__':
    parser.add_argument('-b', '--ref_bed', help="ref_bed", required=True)
    parser.add_argument('-v', '--vcf', help="raw vcf for prbe 160 or 17 ; somatic vcf for prbe 682 or 624",
                        required=True)
-    parser.add_argument('-v2', '--vcf2', help="germline vcf; required when prbe 682 or 624", default='', required=False, nargs='?')
+    parser.add_argument('-v2', '--vcf2', help="germline vcf; required when prbe 682 or 624", default='', required=False,
+                        nargs='?')
    parser.add_argument('-c', '--cnvkit_bed', help="cnvkit_bed; required when prbe 682 or 624")
    parser.add_argument('-p', '--probe', help="probe, 682, 624, 160, 17 for now ", required=True)
    parser.add_argument('-o', '--output_dir', help="Output directory, default ./", default='')
@ -288,15 +268,18 @@ if __name__ == '__main__':

    if probe == '160' or probe == '17':
        freq_range = {"17": [0.3452, 0.6512], "160": [0.2930, 0.6753]}.get(probe)
-        single_monitoring(args.name, args.vcf, bed_path, freq_range, args.output_dir)
-    elif probe == '682' or probe == '624':
-        if not args.vcf2:
-            parser.error('--vcf2 is required in prbe 682 or 624')
+        single_monitor(args.name, args.vcf, bed_path, freq_range, args.output_dir)
+    elif probe == '682':
        if not args.cnvkit_bed:
-            parser.error('--cnvkit_bed is required in prbe 682 or 624')
+            parser.error('--cnvkit_bed is required in probe 624')
        cnvkit_bed_path = os.path.realpath(args.cnvkit_bed)
        print(f'污染检测使用cnvkit_bed: {cnvkit_bed_path}')
-        paired_monitoring(args.name, args.vcf, args.vcf2, bed_path, args.cnvkit_bed, args.output_dir)
+        paired_monitor(args.name, args.vcf, bed_path, args.cnvkit_bed, args.output_dir)
+    elif probe == '624':
+        if not args.vcf2:
+            parser.error('--vcf2 is required in probe 624')
+        if not args.cnvkit_bed:
+            parser.error('--cnvkit_bed is required in probe 624')
+        paired_monitor_vcf(args.name, args.vcf, args.vcf2, bed_path, args.cnvkit_bed, args.output_dir)
    else:
        parser.error('probe error. 682, 624, 160, 17 for now')
-
--- a/codes/public/pollution/160_contaminate_ref.bed
+++ b/codes/public/pollution/160_contaminate_ref.bed
--- a/codes/public/pollution/17_contaminate_ref.bed
+++ b/codes/public/pollution/17_contaminate_ref.bed
--- a/codes/public/pollution/624_contaminate_cnvkit.bed
+++ b/codes/public/pollution/624_contaminate_cnvkit.bed
--- a/codes/public/pollution/624_contaminate_ref.bed
+++ b/codes/public/pollution/624_contaminate_ref.bed
@ -1,128 +1,128 @@
-chr1	2488153	2488153
-chr1	23885599	23885599
-chr1	65321388	65321388
-chr1	120612006	120612006
-chr1	193094375	193094375
-chr1	226570840	226570840
-chr2	47693959	47693959
-chr2	48030838	48030838
-chr2	61749716	61749716
-chr2	141032088	141032088
-chr2	215645464	215645464
-chr3	30686414	30686414
-chr3	47125385	47125385
-chr3	134644636	134644636
-chr3	142281612	142281612
-chr3	192053274	192053274
-chr4	55599436	55599436
-chr4	143007419	143007419
-chr4	153252061	153252061
-chr4	185310218	185310218
-chr4	25666099	25666099
-chr5	35857177	35857177
-chr5	38955796	38955796
-chr5	39074296	39074296
-chr5	67522722	67522722
-chr5	112162854	112162854
-chr5	149435759	149435759
-chr5	161119125	161119125
-chr5	170819887	170819887
-chr5	180046344	180046344
-chr6	30858857	30858857
-chr6	32797876	32797876
-chr6	36645696	36645696
-chr6	117678083	117678083
-chr6	117714346	117714346
-chr7	2946461	2946461
-chr7	2962753	2962753
-chr7	87138645	87138645
-chr7	151970931	151970931
-chr8	90955624	90955624
-chr8	90970935	90970935
-chr8	90995019	90995019
-chr8	68864728	68864728
-chr8	38322346	38322346
-chr8	145742879	145742879
-chr9	5081780	5081780
-chr9	8389364	8389364
-chr9	93657761	93657761
-chr9	98209594	98209594
-chr9	21991923	21991923
-chr9	37020622	37020622
-chr9	21975017	21975017
-chr10	43600689	43600689
-chr10	89720907	89720907
-chr10	104596981	104596981
-chr10	104849468	104849468
-chr10	104855656	104855656
-chr10	123239112	123239112
-chr11	32410774	32410774
-chr11	69462910	69462910
-chr11	94197260	94197260
-chr11	94225807	94225807
-chr11	94225920	94225920
-chr11	125497466	125497466
-chr12	1040373	1040373
-chr12	4553383	4553383
-chr12	6711147	6711147
-chr12	21331625	21331625
-chr12	49427652	49427652
-chr12	49444545	49444545
-chr12	121416622	121416622
-chr12	133202215	133202215
-chr13	21562948	21562948
-chr13	28609825	28609825
-chr13	32936646	32936646
-chr13	113889474	113889474
-chr13	113907391	113907391
-chr14	105239894	105239894
-chr14	105246325	105246325
-chr14	38064215	38064215
-chr14	20820537	20820537
-chr14	35871217	35871217
-chr15	41865488	41865488
-chr15	51502986	51502986
-chr15	51529112	51529112
-chr15	99456253	99456253
-chr15	99478225	99478225
-chr15	51558731	51558731
-chr16	68857441	68857441
-chr16	359953	359953
-chr16	89805977	89805977
-chr16	89838078	89838078
-chr16	89857964	89857964
-chr17	7983969	7983969
-chr17	29486152	29486152
-chr17	29508775	29508775
-chr17	29546175	29546175
-chr17	29559932	29559932
-chr17	29653293	29653293
-chr17	37879762	37879762
-chr17	62007498	62007498
-chr17	78919558	78919558
-chr19	3110349	3110349
-chr19	3119239	3119239
-chr19	4101062	4101062
-chr19	11136215	11136215
-chr19	15289613	15289613
-chr19	15295134	15295134
-chr19	41725271	41725271
-chr20	43956527	43956527
-chr20	43956636	43956636
-chr20	43958850	43958850
-chr20	43958872	43958872
-chr20	54959296	54959296
-chr20	54961541	54961541
-chr21	37518706	37518706
-chr21	39752673	39752673
-chr21	39753375	39753375
-chr22	41568480	41568480
-chr22	30038152	30038152
-chr22	30079213	30079213
-chrX	76937963	76937963
-chrX	39922359	39922359
-chrX	39932907	39932907
-chrX	44938563	44938563
-chrX	100608191	100608191
-chrX	100611285	100611285
-chrX	76940534	76940534
+chr1	2488152	2488153
+chr1	23885598	23885599
+chr1	65321387	65321388
+chr1	120612005	120612006
+chr1	193094374	193094375
+chr1	226570839	226570840
+chr2	47693958	47693959
+chr2	48030837	48030838
+chr2	61749715	61749716
+chr2	141032087	141032088
+chr2	215645463	215645464
+chr3	30686413	30686414
+chr3	47125384	47125385
+chr3	134644635	134644636
+chr3	142281611	142281612
+chr3	192053273	192053274
+chr4	55599435	55599436
+chr4	143007418	143007419
+chr4	153252060	153252061
+chr4	185310217	185310218
+chr4	25666098	25666099
+chr5	35857176	35857177
+chr5	38955795	38955796
+chr5	39074295	39074296
+chr5	67522721	67522722
+chr5	112162853	112162854
+chr5	149435758	149435759
+chr5	161119124	161119125
+chr5	170819886	170819887
+chr5	180046343	180046344
+chr6	30858856	30858857
+chr6	32797875	32797876
+chr6	36645695	36645696
+chr6	117678082	117678083
+chr6	117714345	117714346
+chr7	2946460	2946461
+chr7	2962752	2962753
+chr7	87138644	87138645
+chr7	151970930	151970931
+chr8	90955623	90955624
+chr8	90970934	90970935
+chr8	90995018	90995019
+chr8	68864727	68864728
+chr8	38322345	38322346
+chr8	145742878	145742879
+chr9	5081779	5081780
+chr9	8389363	8389364
+chr9	93657760	93657761
+chr9	98209593	98209594
+chr9	21991922	21991923
+chr9	37020621	37020622
+chr9	21975016	21975017
+chr10	43600688	43600689
+chr10	89720906	89720907
+chr10	104596980	104596981
+chr10	104849467	104849468
+chr10	104855655	104855656
+chr10	123239111	123239112
+chr11	32410773	32410774
+chr11	69462909	69462910
+chr11	94197259	94197260
+chr11	94225806	94225807
+chr11	94225919	94225920
+chr11	125497465	125497466
+chr12	1040372	1040373
+chr12	4553382	4553383
+chr12	6711146	6711147
+chr12	21331624	21331625
+chr12	49427651	49427652
+chr12	49444544	49444545
+chr12	121416621	121416622
+chr12	133202214	133202215
+chr13	21562947	21562948
+chr13	28609824	28609825
+chr13	32936645	32936646
+chr13	113889473	113889474
+chr13	113907390	113907391
+chr14	105239893	105239894
+chr14	105246324	105246325
+chr14	38064214	38064215
+chr14	20820536	20820537
+chr14	35871216	35871217
+chr15	41865487	41865488
+chr15	51502985	51502986
+chr15	51529111	51529112
+chr15	99456252	99456253
+chr15	99478224	99478225
+chr15	51558730	51558731
+chr16	68857440	68857441
+chr16	359952	359953
+chr16	89805976	89805977
+chr16	89838077	89838078
+chr16	89857963	89857964
+chr17	7983968	7983969
+chr17	29486151	29486152
+chr17	29508774	29508775
+chr17	29546174	29546175
+chr17	29559931	29559932
+chr17	29653292	29653293
+chr17	37879761	37879762
+chr17	62007497	62007498
+chr17	78919557	78919558
+chr19	3110348	3110349
+chr19	3119238	3119239
+chr19	4101061	4101062
+chr19	11136214	11136215
+chr19	15289612	15289613
+chr19	15295133	15295134
+chr19	41725270	41725271
+chr20	43956526	43956527
+chr20	43956635	43956636
+chr20	43958849	43958850
+chr20	43958871	43958872
+chr20	54959295	54959296
+chr20	54961540	54961541
+chr21	37518705	37518706
+chr21	39752672	39752673
+chr21	39753374	39753375
+chr22	41568479	41568480
+chr22	30038151	30038152
+chr22	30079212	30079213
+chrX	76937962	76937963
+chrX	39922358	39922359
+chrX	39932906	39932907
+chrX	44938562	44938563
+chrX	100608190	100608191
+chrX	100611284	100611285
+chrX	76940533	76940534
--- a/codes/public/pollution/682_contaminate_cnvkit.bed
+++ b/codes/public/pollution/682_contaminate_cnvkit.bed
--- a/codes/public/pollution/682_contaminate_ref.bed
+++ b/codes/public/pollution/682_contaminate_ref.bed
@ -1,128 +1,128 @@
-chr1	2488153	2488153
-chr1	23885599	23885599
-chr1	65321388	65321388
-chr1	120612006	120612006
-chr1	193094375	193094375
-chr1	226570840	226570840
-chr2	47693959	47693959
-chr2	48030838	48030838
-chr2	61749716	61749716
-chr2	141032088	141032088
-chr2	215645464	215645464
-chr3	30686414	30686414
-chr3	47125385	47125385
-chr3	134644636	134644636
-chr3	142281612	142281612
-chr3	192053274	192053274
-chr4	55599436	55599436
-chr4	143007419	143007419
-chr4	153252061	153252061
-chr4	185310218	185310218
-chr4	25666099	25666099
-chr5	35857177	35857177
-chr5	38955796	38955796
-chr5	39074296	39074296
-chr5	67522722	67522722
-chr5	112162854	112162854
-chr5	149435759	149435759
-chr5	161119125	161119125
-chr5	170819887	170819887
-chr5	180046344	180046344
-chr6	30858857	30858857
-chr6	32797876	32797876
-chr6	36645696	36645696
-chr6	117678083	117678083
-chr6	117714346	117714346
-chr7	2946461	2946461
-chr7	2962753	2962753
-chr7	87138645	87138645
-chr7	151970931	151970931
-chr8	90955624	90955624
-chr8	90970935	90970935
-chr8	90995019	90995019
-chr8	68864728	68864728
-chr8	38322346	38322346
-chr8	145742879	145742879
-chr9	5081780	5081780
-chr9	8389364	8389364
-chr9	93657761	93657761
-chr9	98209594	98209594
-chr9	21991923	21991923
-chr9	37020622	37020622
-chr9	21975017	21975017
-chr10	43600689	43600689
-chr10	89720907	89720907
-chr10	104596981	104596981
-chr10	104849468	104849468
-chr10	104855656	104855656
-chr10	123239112	123239112
-chr11	32410774	32410774
-chr11	69462910	69462910
-chr11	94197260	94197260
-chr11	94225807	94225807
-chr11	94225920	94225920
-chr11	125497466	125497466
-chr12	1040373	1040373
-chr12	4553383	4553383
-chr12	6711147	6711147
-chr12	21331625	21331625
-chr12	49427652	49427652
-chr12	49444545	49444545
-chr12	121416622	121416622
-chr12	133202215	133202215
-chr13	21562948	21562948
-chr13	28609825	28609825
-chr13	32936646	32936646
-chr13	113889474	113889474
-chr13	113907391	113907391
-chr14	105239894	105239894
-chr14	105246325	105246325
-chr14	38064215	38064215
-chr14	20820537	20820537
-chr14	35871217	35871217
-chr15	41865488	41865488
-chr15	51502986	51502986
-chr15	51529112	51529112
-chr15	99456253	99456253
-chr15	99478225	99478225
-chr15	51558731	51558731
-chr16	68857441	68857441
-chr16	359953	359953
-chr16	89805977	89805977
-chr16	89838078	89838078
-chr16	89857964	89857964
-chr17	7983969	7983969
-chr17	29486152	29486152
-chr17	29508775	29508775
-chr17	29546175	29546175
-chr17	29559932	29559932
-chr17	29653293	29653293
-chr17	37879762	37879762
-chr17	62007498	62007498
-chr17	78919558	78919558
-chr19	3110349	3110349
-chr19	3119239	3119239
-chr19	4101062	4101062
-chr19	11136215	11136215
-chr19	15289613	15289613
-chr19	15295134	15295134
-chr19	41725271	41725271
-chr20	43956527	43956527
-chr20	43956636	43956636
-chr20	43958850	43958850
-chr20	43958872	43958872
-chr20	54959296	54959296
-chr20	54961541	54961541
-chr21	37518706	37518706
-chr21	39752673	39752673
-chr21	39753375	39753375
-chr22	41568480	41568480
-chr22	30038152	30038152
-chr22	30079213	30079213
-chrX	76937963	76937963
-chrX	39922359	39922359
-chrX	39932907	39932907
-chrX	44938563	44938563
-chrX	100608191	100608191
-chrX	100611285	100611285
-chrX	76940534	76940534
+chr1	2488152	2488153
+chr1	23885598	23885599
+chr1	65321387	65321388
+chr1	120612005	120612006
+chr1	193094374	193094375
+chr1	226570839	226570840
+chr2	47693958	47693959
+chr2	48030837	48030838
+chr2	61749715	61749716
+chr2	141032087	141032088
+chr2	215645463	215645464
+chr3	30686413	30686414
+chr3	47125384	47125385
+chr3	134644635	134644636
+chr3	142281611	142281612
+chr3	192053273	192053274
+chr4	55599435	55599436
+chr4	143007418	143007419
+chr4	153252060	153252061
+chr4	185310217	185310218
+chr4	25666098	25666099
+chr5	35857176	35857177
+chr5	38955795	38955796
+chr5	39074295	39074296
+chr5	67522721	67522722
+chr5	112162853	112162854
+chr5	149435758	149435759
+chr5	161119124	161119125
+chr5	170819886	170819887
+chr5	180046343	180046344
+chr6	30858856	30858857
+chr6	32797875	32797876
+chr6	36645695	36645696
+chr6	117678082	117678083
+chr6	117714345	117714346
+chr7	2946460	2946461
+chr7	2962752	2962753
+chr7	87138644	87138645
+chr7	151970930	151970931
+chr8	90955623	90955624
+chr8	90970934	90970935
+chr8	90995018	90995019
+chr8	68864727	68864728
+chr8	38322345	38322346
+chr8	145742878	145742879
+chr9	5081779	5081780
+chr9	8389363	8389364
+chr9	93657760	93657761
+chr9	98209593	98209594
+chr9	21991922	21991923
+chr9	37020621	37020622
+chr9	21975016	21975017
+chr10	43600688	43600689
+chr10	89720906	89720907
+chr10	104596980	104596981
+chr10	104849467	104849468
+chr10	104855655	104855656
+chr10	123239111	123239112
+chr11	32410773	32410774
+chr11	69462909	69462910
+chr11	94197259	94197260
+chr11	94225806	94225807
+chr11	94225919	94225920
+chr11	125497465	125497466
+chr12	1040372	1040373
+chr12	4553382	4553383
+chr12	6711146	6711147
+chr12	21331624	21331625
+chr12	49427651	49427652
+chr12	49444544	49444545
+chr12	121416621	121416622
+chr12	133202214	133202215
+chr13	21562947	21562948
+chr13	28609824	28609825
+chr13	32936645	32936646
+chr13	113889473	113889474
+chr13	113907390	113907391
+chr14	105239893	105239894
+chr14	105246324	105246325
+chr14	38064214	38064215
+chr14	20820536	20820537
+chr14	35871216	35871217
+chr15	41865487	41865488
+chr15	51502985	51502986
+chr15	51529111	51529112
+chr15	99456252	99456253
+chr15	99478224	99478225
+chr15	51558730	51558731
+chr16	68857440	68857441
+chr16	359952	359953
+chr16	89805976	89805977
+chr16	89838077	89838078
+chr16	89857963	89857964
+chr17	7983968	7983969
+chr17	29486151	29486152
+chr17	29508774	29508775
+chr17	29546174	29546175
+chr17	29559931	29559932
+chr17	29653292	29653293
+chr17	37879761	37879762
+chr17	62007497	62007498
+chr17	78919557	78919558
+chr19	3110348	3110349
+chr19	3119238	3119239
+chr19	4101061	4101062
+chr19	11136214	11136215
+chr19	15289612	15289613
+chr19	15295133	15295134
+chr19	41725270	41725271
+chr20	43956526	43956527
+chr20	43956635	43956636
+chr20	43958849	43958850
+chr20	43958871	43958872
+chr20	54959295	54959296
+chr20	54961540	54961541
+chr21	37518705	37518706
+chr21	39752672	39752673
+chr21	39753374	39753375
+chr22	41568479	41568480
+chr22	30038151	30038152
+chr22	30079212	30079213
+chrX	76937962	76937963
+chrX	39922358	39922359
+chrX	39932906	39932907
+chrX	44938562	44938563
+chrX	100608190	100608191
+chrX	100611284	100611285
+chrX	76940533	76940534
--- a/codes/public/pollution/contaminate_650.bed
+++ b/codes/public/pollution/contaminate_650.bed
@ -1,128 +1,128 @@
-chr1	2488153	2488153
-chr1	23885599	23885599
-chr1	65321388	65321388
-chr1	120612006	120612006
-chr1	193094375	193094375
-chr1	226570840	226570840
-chr2	47693959	47693959
-chr2	48030838	48030838
-chr2	61749716	61749716
-chr2	141032088	141032088
-chr2	215645464	215645464
-chr3	30686414	30686414
-chr3	47125385	47125385
-chr3	134644636	134644636
-chr3	142281612	142281612
-chr3	192053274	192053274
-chr4	55599436	55599436
-chr4	143007419	143007419
-chr4	153252061	153252061
-chr4	185310218	185310218
-chr4	25666099	25666099
-chr5	35857177	35857177
-chr5	38955796	38955796
-chr5	39074296	39074296
-chr5	67522722	67522722
-chr5	112162854	112162854
-chr5	149435759	149435759
-chr5	161119125	161119125
-chr5	170819887	170819887
-chr5	180046344	180046344
-chr6	30858857	30858857
-chr6	32797876	32797876
-chr6	36645696	36645696
-chr6	117678083	117678083
-chr6	117714346	117714346
-chr7	2946461	2946461
-chr7	2962753	2962753
-chr7	87138645	87138645
-chr7	151970931	151970931
-chr8	90955624	90955624
-chr8	90970935	90970935
-chr8	90995019	90995019
-chr8	68864728	68864728
-chr8	38322346	38322346
-chr8	145742879	145742879
-chr9	5081780	5081780
-chr9	8389364	8389364
-chr9	93657761	93657761
-chr9	98209594	98209594
-chr9	21991923	21991923
-chr9	37020622	37020622
-chr9	21975017	21975017
-chr10	43600689	43600689
-chr10	89720907	89720907
-chr10	104596981	104596981
-chr10	104849468	104849468
-chr10	104855656	104855656
-chr10	123239112	123239112
-chr11	32410774	32410774
-chr11	69462910	69462910
-chr11	94197260	94197260
-chr11	94225807	94225807
-chr11	94225920	94225920
-chr11	125497466	125497466
-chr12	1040373	1040373
-chr12	4553383	4553383
-chr12	6711147	6711147
-chr12	21331625	21331625
-chr12	49427652	49427652
-chr12	49444545	49444545
-chr12	121416622	121416622
-chr12	133202215	133202215
-chr13	21562948	21562948
-chr13	28609825	28609825
-chr13	32936646	32936646
-chr13	113889474	113889474
-chr13	113907391	113907391
-chr14	105239894	105239894
-chr14	105246325	105246325
-chr14	38064215	38064215
-chr14	20820537	20820537
-chr14	35871217	35871217
-chr15	41865488	41865488
-chr15	51502986	51502986
-chr15	51529112	51529112
-chr15	99456253	99456253
-chr15	99478225	99478225
-chr15	51558731	51558731
-chr16	68857441	68857441
-chr16	359953	359953
-chr16	89805977	89805977
-chr16	89838078	89838078
-chr16	89857964	89857964
-chr17	7983969	7983969
-chr17	29486152	29486152
-chr17	29508775	29508775
-chr17	29546175	29546175
-chr17	29559932	29559932
-chr17	29653293	29653293
-chr17	37879762	37879762
-chr17	62007498	62007498
-chr17	78919558	78919558
-chr19	3110349	3110349
-chr19	3119239	3119239
-chr19	4101062	4101062
-chr19	11136215	11136215
-chr19	15289613	15289613
-chr19	15295134	15295134
-chr19	41725271	41725271
-chr20	43956527	43956527
-chr20	43956636	43956636
-chr20	43958850	43958850
-chr20	43958872	43958872
-chr20	54959296	54959296
-chr20	54961541	54961541
-chr21	37518706	37518706
-chr21	39752673	39752673
-chr21	39753375	39753375
-chr22	41568480	41568480
-chr22	30038152	30038152
-chr22	30079213	30079213
-chrX	76937963	76937963
-chrX	39922359	39922359
-chrX	39932907	39932907
-chrX	44938563	44938563
-chrX	100608191	100608191
-chrX	100611285	100611285
-chrX	76940534	76940534
+chr1	2488152	2488153
+chr1	23885598	23885599
+chr1	65321387	65321388
+chr1	120612005	120612006
+chr1	193094374	193094375
+chr1	226570839	226570840
+chr2	47693958	47693959
+chr2	48030837	48030838
+chr2	61749715	61749716
+chr2	141032087	141032088
+chr2	215645463	215645464
+chr3	30686413	30686414
+chr3	47125384	47125385
+chr3	134644635	134644636
+chr3	142281611	142281612
+chr3	192053273	192053274
+chr4	55599435	55599436
+chr4	143007418	143007419
+chr4	153252060	153252061
+chr4	185310217	185310218
+chr4	25666098	25666099
+chr5	35857176	35857177
+chr5	38955795	38955796
+chr5	39074295	39074296
+chr5	67522721	67522722
+chr5	112162853	112162854
+chr5	149435758	149435759
+chr5	161119124	161119125
+chr5	170819886	170819887
+chr5	180046343	180046344
+chr6	30858856	30858857
+chr6	32797875	32797876
+chr6	36645695	36645696
+chr6	117678082	117678083
+chr6	117714345	117714346
+chr7	2946460	2946461
+chr7	2962752	2962753
+chr7	87138644	87138645
+chr7	151970930	151970931
+chr8	90955623	90955624
+chr8	90970934	90970935
+chr8	90995018	90995019
+chr8	68864727	68864728
+chr8	38322345	38322346
+chr8	145742878	145742879
+chr9	5081779	5081780
+chr9	8389363	8389364
+chr9	93657760	93657761
+chr9	98209593	98209594
+chr9	21991922	21991923
+chr9	37020621	37020622
+chr9	21975016	21975017
+chr10	43600688	43600689
+chr10	89720906	89720907
+chr10	104596980	104596981
+chr10	104849467	104849468
+chr10	104855655	104855656
+chr10	123239111	123239112
+chr11	32410773	32410774
+chr11	69462909	69462910
+chr11	94197259	94197260
+chr11	94225806	94225807
+chr11	94225919	94225920
+chr11	125497465	125497466
+chr12	1040372	1040373
+chr12	4553382	4553383
+chr12	6711146	6711147
+chr12	21331624	21331625
+chr12	49427651	49427652
+chr12	49444544	49444545
+chr12	121416621	121416622
+chr12	133202214	133202215
+chr13	21562947	21562948
+chr13	28609824	28609825
+chr13	32936645	32936646
+chr13	113889473	113889474
+chr13	113907390	113907391
+chr14	105239893	105239894
+chr14	105246324	105246325
+chr14	38064214	38064215
+chr14	20820536	20820537
+chr14	35871216	35871217
+chr15	41865487	41865488
+chr15	51502985	51502986
+chr15	51529111	51529112
+chr15	99456252	99456253
+chr15	99478224	99478225
+chr15	51558730	51558731
+chr16	68857440	68857441
+chr16	359952	359953
+chr16	89805976	89805977
+chr16	89838077	89838078
+chr16	89857963	89857964
+chr17	7983968	7983969
+chr17	29486151	29486152
+chr17	29508774	29508775
+chr17	29546174	29546175
+chr17	29559931	29559932
+chr17	29653292	29653293
+chr17	37879761	37879762
+chr17	62007497	62007498
+chr17	78919557	78919558
+chr19	3110348	3110349
+chr19	3119238	3119239
+chr19	4101061	4101062
+chr19	11136214	11136215
+chr19	15289612	15289613
+chr19	15295133	15295134
+chr19	41725270	41725271
+chr20	43956526	43956527
+chr20	43956635	43956636
+chr20	43958849	43958850
+chr20	43958871	43958872
+chr20	54959295	54959296
+chr20	54961540	54961541
+chr21	37518705	37518706
+chr21	39752672	39752673
+chr21	39753374	39753375
+chr22	41568479	41568480
+chr22	30038151	30038152
+chr22	30079212	30079213
+chrX	76937962	76937963
+chrX	39922358	39922359
+chrX	39932906	39932907
+chrX	44938562	44938563
+chrX	100608190	100608191
+chrX	100611284	100611285
+chrX	76940533	76940534
--- a/pipeline.wdl
+++ b/pipeline.wdl
@ -190,8 +190,10 @@ workflow pipeline {
            output_dir=workdir,
            probe=probe,
            raw_vcf=call_mutation.raw_vcf,
-            somatic_vcf=call_mutation.somatic_vcf,
-            germline_vcf=call_mutation.germline_vcf
+            initial_vcf=call_mutation.initial_vcf,
+            normal_vcf=call_mutation.normal_vcf,
+            cnv_cnr=call_cnv.cnv_cnr,
+            cnv_cns=call_cnv.cnv_cns
    }

    call postprocess.call_postprocess as call_postprocess {
--- a/wdl/call_mutation.wdl
+++ b/wdl/call_mutation.wdl
@ -75,6 +75,8 @@ task mutation_calling_umi {
        -e 'INFO/AF[0] >= 0.1' \
        -o ${output_dir}/mutation/${name}.snp_indel.germline.vcf

+        cp ${output_dir}/mutation/${name}.1r.vcf ${output_dir}/mutation/${name}.initial.vcf
+
    >>>

    output {
@ -165,8 +167,13 @@ task mutation_calling_umi_control {
        ${output_dir}/mutation/${name}.raw.addtagmsi.addmutalt.snp_indel.vcf

        cp ${output_dir}/mutation/${name}.raw.addtagmsi.addmutalt.snp_indel.vcf ${output_dir}/mutation/${name}.snp_indel.somatic.vcf
-        cp ${output_dir}/mutation/${name}.normal.vcf ${output_dir}/mutation/${name}.snp_indel.germline.vcf
+        # cp ${output_dir}/mutation/${name}.normal.vcf ${output_dir}/mutation/${name}.snp_indel.germline.vcf
+        vcf_filter.py \
+        -i ${output_dir}/mutation/${name}.normal.vcf \
+        -e 'INFO/AF[0] >= 0.1' \
+        -o ${output_dir}/mutation/${name}.snp_indel.germline.vcf

+        cp ${output_dir}/mutation/${name}.1r.vcf ${output_dir}/mutation/${name}.initial.vcf
    >>>

    output {
@ -221,6 +228,8 @@ task mutation_calling_tissue {
        -e 'INFO/AF[0] > 0.1' \
        -o ${output_dir}/mutation/${name}.snp_indel.germline.vcf

+        cp ${output_dir}/mutation/${name}.raw.vcf ${output_dir}/mutation/${name}.initial.vcf
+
    >>>

    output {
@ -274,6 +283,8 @@ task mutation_calling_tissue_control {
        -o ${output_dir}/mutation/${name}.snp_indel.germline.vcf \
        -e 'INFO/STATUS="Germline" && FORMAT/AF[0] > 0.1 && FORMAT/AF[1] > 0.1'

+        cp ${output_dir}/mutation/${name}.raw.vcf ${output_dir}/mutation/${name}.initial.vcf
+
    >>>

    output {
@ -824,5 +835,7 @@ workflow call_mutation {
        String somatic_filter = "${output_dir}/mutation/${tumor}.snp_indel.somatic.hg19_multianno.filter.txt"
        String germline_filter = "${output_dir}/mutation/${tumor}.snp_indel.germline.hg19_multianno.filter.txt"
        String hotspot_filter = "${output_dir}/mutation/${tumor}.snp_indel.hotspot.hg19_multianno.filter.txt"
+        String initial_vcf = "${output_dir}/mutation/${tumor}.initial.vcf"
+        String normal_vcf = "${output_dir}/mutation/${tumor}.normal.vcf"
    }
 }
--- a/wdl/cnv.wdl
+++ b/wdl/cnv.wdl
@ -151,6 +151,8 @@ workflow call_cnv {

    output {
        String cnv_filter = "${output_dir}/cnv/${tumor}.rmdup.cns.filter.txt"
+        String cnv_cnr = "${output_dir}/cnv/${tumor}.rmdup.cnr"
+        String cnv_cns = "${output_dir}/cnv/${tumor}.rmdup.cns"
    }
 }

--- a/wdl/pollution.wdl
+++ b/wdl/pollution.wdl
@ -21,6 +21,25 @@ task run_pollution {
        -c $PUBLIC/pollution/${probe}_contaminate_cnvkit.bed

    }
+    output {
+        String cnvkit_tumor_vcf = "${output_dir}/pollution/${name}_cnvkit_tumor.vcf"
+        String cnvkit_normal_vcf = "${output_dir}/pollution/${name}_cnvkit_normal.vcf"
+    }
+}
+
+task run_generate_png {
+    String name
+    String probe
+    String cnvkit_tumor_vcf
+    String cnvkit_normal_vcf
+    String cnv_cnr
+    String cnv_cns
+    String output_dir
+
+    command {
+        cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_tumor_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_tumor.png
+        cnvkit.py scatter ${cnv_cnr} -s ${cnv_cns} -v ${cnvkit_normal_vcf} -o ${output_dir}/pollution/${name}_pollution_cnvkit_normal.png
+    }
 }

 workflow call_pollution {
@ -32,8 +51,10 @@ workflow call_pollution {
    String output_dir
    String probe
    String raw_vcf
-    String somatic_vcf
-    String germline_vcf
+    String initial_vcf
+    String normal_vcf
+    String cnv_cnr
+    String cnv_cns

    if (run) {
        if (defined(normal)) {
@ -42,8 +63,19 @@ workflow call_pollution {
                    name=tumor,
                    output_dir=output_dir,
                    probe=probe,
-                    vcf=somatic_vcf,
-                    vcf2=germline_vcf
+                    vcf=initial_vcf,
+                    vcf2=normal_vcf
+            }
+            call run_generate_png {
+                input:
+                    name=tumor,
+                    probe=probe,
+                    cnvkit_tumor_vcf=run_pollution_paired.cnvkit_tumor_vcf,
+                    cnvkit_normal_vcf=run_pollution_paired.cnvkit_normal_vcf,
+                    cnv_cnr=cnv_cnr,
+                    cnv_cns=cnv_cns,
+                    output_dir=output_dir
+
            }
        }

@ -53,12 +85,12 @@ workflow call_pollution {
                    name=tumor,
                    output_dir=output_dir,
                    probe=probe,
-                    vcf=raw_vcf
+                    vcf=initial_vcf
            }
        }
    }

    output {
-        String pollution_res = "${output_dir}/pollution/${tumor}_pollution.csv"
+        String pollution_res = "${output_dir}/pollution/${tumor}_pollution.txt"
    }
 }