67 lines
3.3 KiB
Python
67 lines
3.3 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
#-*-coding:utf-8-*-
|
||
|
|
import sys
|
||
|
|
import json
|
||
|
|
import re
|
||
|
|
import os
|
||
|
|
import pandas as pd
|
||
|
|
from pandas import DataFrame,Series
|
||
|
|
import numpy as np
|
||
|
|
from gender_detect import main
|
||
|
|
|
||
|
|
# Build the per-sample QC summary table <output_dir>/qc/<sample_name>_qc.txt
# from the fastp JSON report, the bamdst coverage report, the bamdst depth
# distribution file and the per-position depth table written under
# <output_dir>/qc/<sample_name>_bamdst/.

_USAGE = ("usage:python3 qc.py fastp_json_file bamdst_coverage_file "
          "bamdst_depth_distribution.plot output_dir sample_name tumor")

# Six positional arguments are read below (sys.argv[1] .. sys.argv[6]), so
# argv must contain at least seven entries including the script name.
if len(sys.argv) < 7:
    # sys.exit(<str>) prints the message to stderr and exits with status 1,
    # so a calling pipeline can detect the usage error.
    sys.exit(_USAGE)

fastp_json = sys.argv[1]
bamdst_txt = sys.argv[2]
bamdst_depth = sys.argv[3]
out_dir = sys.argv[4]
name = sys.argv[5]
tumor = sys.argv[6]


def _write_metric(handle, label, value):
    """Write one ``label<TAB>value`` line to the open QC report *handle*."""
    handle.write(''.join([label, "\t", str(value), "\n"]))


def _percent(numerator, denominator):
    """Return ``numerator / denominator`` as a percentage with two decimals."""
    return '%.2f' % (numerator / denominator * 100)


def _load_bamdst_report(path):
    """Parse a tab-separated ``key<TAB>value`` report into a dict of strings.

    Lines starting with ``#`` are skipped. Lines that do not contain at least
    two tab-separated fields (for example blank lines) are skipped as well
    instead of raising ``IndexError``.
    """
    report = {}
    with open(path, 'r') as handle:
        for line in handle:
            if line.startswith('#'):
                continue
            fields = line.strip().split('\t')
            if len(fields) < 2:
                continue
            report[fields[0]] = fields[1]
    return report


def _read_field(path, line_number, column):
    """Return tab-separated field *column* of line *line_number* as a float.

    Both *line_number* and *column* are 1-based. This replaces the former
    ``sed -n <N>p <file> | cut -f<column>`` shell pipeline, so the path is
    never passed through a shell.

    Raises:
        ValueError: if the requested line or field does not exist, or the
            field is not a number.
    """
    if line_number >= 1:
        with open(path, 'r') as handle:
            for current, row in enumerate(handle, start=1):
                if current != line_number:
                    continue
                fields = row.rstrip('\n').split('\t')
                if len(fields) < column:
                    break
                return float(fields[column - 1].strip())
    raise ValueError(
        "cannot read field %d of line %d in %s" % (column, line_number, path)
    )


# The report is opened first (as before) so a missing <output_dir>/qc
# directory fails immediately; the context manager guarantees the handle is
# closed even when one of the later parsing steps raises.
with open(''.join([out_dir, "/qc/", name, "_qc.txt"]), 'w') as out:
    # --- read-level metrics from the fastp JSON report ---
    with open(fastp_json, 'r') as file:
        fastp = json.load(file)
    before = fastp["summary"]["before_filtering"]
    after = fastp["summary"]["after_filtering"]

    _write_metric(out, "raw_reads", before["total_reads"])
    _write_metric(out, "raw_bases", before["total_bases"])
    _write_metric(out, "clean_reads", after["total_reads"])
    _write_metric(out, "clean_bases", after["total_bases"])
    _write_metric(out, "clean_reads_rate(%)",
                  _percent(after["total_reads"], before["total_reads"]))
    # Q20/Q30 are taken from the pre-filtering summary and scaled by 100.
    _write_metric(out, "Q20(%)", '%.2f' % (before["q20_rate"] * 100))
    _write_metric(out, "Q30(%)", '%.2f' % (before["q30_rate"] * 100))

    # --- mapping / capture metrics from the bamdst coverage report ---
    bamdst = _load_bamdst_report(bamdst_txt)

    mapped_reads = bamdst["[Total] Mapped Reads"]
    _write_metric(out, "mapped_reads", mapped_reads)
    _write_metric(out, "mapped_rate(%)",
                  _percent(int(mapped_reads), after["total_reads"]))

    dup_reads = bamdst["[Total] PCR duplicate reads"]
    _write_metric(out, "dup_reads", dup_reads)
    _write_metric(out, "dup_rate(%)",
                  _percent(int(dup_reads), int(mapped_reads)))

    _write_metric(out, "probe_bed_size", bamdst["[Target] Len of region"])

    target_reads = bamdst["[Target] Target Reads"]
    _write_metric(out, "target_reads", target_reads)
    _write_metric(out, "capture_rate(reads)",
                  _percent(int(target_reads), int(mapped_reads)))

    _write_metric(out, "mean_depth_raw", bamdst["[Target] Average depth"])
    _write_metric(out, "mean_depth(dedup)",
                  bamdst["[Target] Average depth(rmdup)"])
    # Coverage values are written without their trailing percent sign.
    _write_metric(out, "coverage(>0x)",
                  bamdst["[Target] Coverage (>0x)"].replace('%', ''))
    _write_metric(out, "coverage(>10x)",
                  bamdst["[Target] Coverage (>=10x)"].replace('%', ''))

    # --- uniformity: value looked up at 20% of the mean depth ---
    # The rounded threshold is used as a 1-based line number into the depth
    # distribution file, and the fifth field of that line is scaled by 100.
    # NOTE(review): if the first line of that file describes depth 0, line N
    # describes depth N-1 -- confirm against the bamdst output format.
    uniformity = round((float(bamdst["[Target] Average depth"])) * 0.2)
    fraction = _read_field(bamdst_depth, uniformity, 5)
    _write_metric(out, "coverage(>=0.2*meanx)", '%.2f' % (fraction * 100))

    # --- 20th percentile of the 'Rmdup depth' column of the depth table ---
    depth_tsv = ''.join((out_dir, '/qc/', name, '_bamdst/depth.tsv.gz'))
    depth_tsv_df = pd.read_csv(depth_tsv, compression='gzip', sep="\t")
    coverage_gtoe_80 = round(
        np.percentile(np.array(depth_tsv_df['Rmdup depth']), 20)
    )
    _write_metric(out, "coverage(>=80%)", coverage_gtoe_80)

    # Result of the imported gender_detect.main helper, written as-is.
    _write_metric(out, "gender", main(out_dir, tumor))
|