#!/usr/bin/env python3 # -*-coding:utf-8-*- import json import os import re import sys import numpy as np import pandas as pd if len(sys.argv) != 4: print("usage:python3 qc.py fastp_json_file bamdst_out output ") sys.exit() fastp_json = sys.argv[1] bamdst_out = sys.argv[2] out = open(sys.argv[3], 'w') with open(fastp_json, 'r') as file: fastp = json.load(file) out.write(''.join(["raw_reads\t", str(fastp["summary"]["before_filtering"]["total_reads"]), "\n"])) out.write(''.join(["raw_bases\t", str(fastp["summary"]["before_filtering"]["total_bases"]), "\n"])) out.write(''.join(["clean_reads\t", str(fastp["summary"]["after_filtering"]["total_reads"]), "\n"])) out.write(''.join(["clean_bases\t", str(fastp["summary"]["after_filtering"]["total_bases"]), "\n"])) out.write(''.join(["clean_reads_rate(%)\t", str('%.2f' % ( fastp["summary"]["after_filtering"]["total_reads"] / fastp["summary"]["before_filtering"][ "total_reads"] * 100)), "\n"])) out.write(''.join(["Q20(%)\t", str('%.2f' % (fastp["summary"]["before_filtering"]["q20_rate"] * 100)), "\n"])) out.write(''.join(["Q30(%)\t", str('%.2f' % (fastp["summary"]["before_filtering"]["q30_rate"] * 100)), "\n"])) with open(os.path.join(bamdst_out, 'coverage.report'), 'r') as file2: bamdst = {} for line in file2: if not re.match(r'^#', line): lines = line.strip().split('\t') bamdst[lines[0]] = lines[1] out.write(''.join(["mapped_reads\t", str(bamdst["[Total] Mapped Reads"]), "\n"])) out.write(''.join(["mapped_rate(%)\t", str('%.2f' % ( int(bamdst["[Total] Mapped Reads"]) / fastp["summary"]["after_filtering"]["total_reads"] * 100)), "\n"])) out.write(''.join(["dup_reads\t", str(bamdst["[Total] PCR duplicate reads"]), "\n"])) out.write(''.join(["dup_rate(%)\t", str('%.2f' % ( int(bamdst["[Total] PCR duplicate reads"]) / int(bamdst["[Total] Mapped Reads"]) * 100)), "\n"])) out.write(''.join(["probe_bed_size\t", str(bamdst["[Target] Len of region"]), "\n"])) out.write(''.join(["target_reads\t", str(bamdst["[Target] Target Reads"]), "\n"])) out.write(''.join(["capture_rate(reads)\t", str('%.2f' % (int(bamdst["[Target] Target Reads"]) / int(bamdst["[Total] Mapped Reads"]) * 100)), "\n"])) out.write(''.join(["mean_depth_raw\t", str(bamdst["[Target] Average depth"]), "\n"])) out.write(''.join(["mean_depth(dedup)\t", str(bamdst["[Target] Average depth(rmdup)"]), "\n"])) out.write(''.join(["coverage(>0x)\t", str(bamdst["[Target] Coverage (>0x)"].replace('%', '')), "\n"])) out.write(''.join(["coverage(>10x)\t", str(bamdst["[Target] Coverage (>=10x)"].replace('%', '')), "\n"])) uniformity = round((float(bamdst["[Target] Average depth"])) * 0.2) b = os.popen("sed -n %s'p' %s | cut -f5" % (uniformity, os.path.join(bamdst_out, 'depth_distribution.plot'))) b = float(b.read().strip()) * 100 out.write(''.join(["coverage(>=0.2*meanx)\t", str('%.2f' % (b)), "\n"])) depth_tsv_df = pd.read_csv(os.path.join(bamdst_out, 'depth.tsv.gz'), compression='gzip', sep="\t") coverage_gtoe_80 = round(np.percentile(np.array(depth_tsv_df['Rmdup depth']), 20)) out.write(''.join(["coverage(>=80%)\t", str(coverage_gtoe_80), "\n"])) out.close()