From df0416f9094714c30d98686ef3bba5c34ea86600 Mon Sep 17 00:00:00 2001 From: Mick Watson Date: Thu, 19 Oct 2017 09:30:49 +0100 Subject: [PATCH] Added GC filtering functionality --- README.rst | 1 + nanofilt/NanoFilt.py | 18 ++++++++++++++++-- nanofilt/__pycache__/NanoFilt.cpython-34.pyc | Bin 0 -> 4582 bytes nanofilt/__pycache__/__init__.cpython-34.pyc | Bin 0 -> 210 bytes nanofilt/__pycache__/version.cpython-34.pyc | Bin 0 -> 185 bytes 5 files changed, 17 insertions(+), 2 deletions(-) create mode 120000 README.rst mode change 100755 => 100644 nanofilt/NanoFilt.py create mode 100644 nanofilt/__pycache__/NanoFilt.cpython-34.pyc create mode 100644 nanofilt/__pycache__/__init__.cpython-34.pyc create mode 100644 nanofilt/__pycache__/version.cpython-34.pyc diff --git a/README.rst b/README.rst new file mode 120000 index 0000000..42061c0 --- /dev/null +++ b/README.rst @@ -0,0 +1 @@ +README.md \ No newline at end of file diff --git a/nanofilt/NanoFilt.py b/nanofilt/NanoFilt.py old mode 100755 new mode 100644 index 607eb5a..b848c5f --- a/nanofilt/NanoFilt.py +++ b/nanofilt/NanoFilt.py @@ -42,7 +42,7 @@ def main(): def get_args(): parser = ArgumentParser( - description="Perform quality and or length filtering of Nanopore fastq data on stdin.") + description="Perform quality and/or length and/or GC filtering of Nanopore fastq data on stdin.") parser.add_argument("-v", "--version", help="Print version and exit.", action="version", @@ -63,6 +63,14 @@ def get_args(): help="Filter on a minimum average read quality score", default=0, type=int) + parser.add_argument("--minGC", + help="Sequences must have GC content >= to this. Float between 0.0 and 1.0. Ignored if using summary file.", + default=0.0, + type=float) + parser.add_argument("--maxGC", + help="Sequences must have GC content <= to this. Float between 0.0 and 1.0. Ignored if using summary file.", + default=1.0, + type=float) parser.add_argument("-s", "--summary", help="Use summary file for quality scores") parser.add_argument("--readtype", @@ -80,7 +88,13 @@ def filter_stream(fq, args): ''' minlen = args.length + int(args.headcrop or 0) - (int(args.tailcrop or 0)) for rec in SeqIO.parse(fq, "fastq"): - if aveQual(rec.letter_annotations["phred_quality"]) > args.quality and len(rec) > minlen: + # assume nominal gc + gc = 0.50 + if (args.minGC > 0.0 or args.maxGC < 1.0): + # one of the GC arguments has been set, we need to calcualte GC + gc = (rec.seq.upper().count("C") + rec.seq.upper().count("G")) / len(rec) + + if aveQual(rec.letter_annotations["phred_quality"]) > args.quality and len(rec) > minlen and gc >= args.minGC and gc <= args.maxGC: print(rec[args.headcrop:args.tailcrop].format("fastq"), end="") diff --git a/nanofilt/__pycache__/NanoFilt.cpython-34.pyc b/nanofilt/__pycache__/NanoFilt.cpython-34.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60c212c5a02d93b25f3bf3204f4d163e6b224ec7 GIT binary patch literal 4582 zcmb_fOK%*<5w4kia=4Tz*|AJNTb5;MVeOIT?}K=A^?VmQ;gJJdXu z-8~|?CQnKZIp;?Nx#XNfe#rm^g5(lk0_W;ezUrCfLyCeNGUW75zpAUNzxt~BpPeP| ziT{tUzdTR$cRKO7`2Gy9{tJqbXn@k8(52X+#HE2t4%cfG*C?sepkCJN6gMbo(x6$^ z8x*%_(6V!z6t^i^pus}f*P_^?WRV7oWxY*thms{4ETO(Yp+~>Lx@Ra{q_AVZ&r-NV z;TikAOd+bv_WK;k^Aw(=r!J{KlRQJuiJTM~H)yka-v2!|<81~ini?_6lo&;^kt#~Z z!cRkCRFouK$wuPA9{NME;iuU&Q&JciOwugQ#*x$_&M-(xKh%Biiz5+%pNcIh!Yq|y zYhMI@9OSWYSYRR{JJ4cx62($zgSouWH4i%YtU`)$&t$wWa&5H{I!_W`?X!kb_*#r* zDwS_!$fEst%MVzqQr?!YC02_uv+un^p)3`#Ovh$ocj`Zbu(7mz5cBC`C->vX?DxH| z=7u=ArIE@KVJ6s{97kzt)nucgMdBAW^Q}Cn@Q6(#>;!GXD7_|jRb*f|lVS5=mKzMZ zpBl`=hR_JJ?De#2mZj`C#w!JwKVksJVs(ZhGT9okdg83#tRP3--q|aUksJ^ZE9t`=Yla zuCIw+EPlL(pWXyp3{*Bnl}F+iEw`)H!Sr%)dGi5~Yvm2j?UWW6F z#%Ur`)8FzF(eu88XHnNF&R}b)8IJPQ?iDSKO}TUb0eAgOjdPyz&{tZj4K%fIieKW$ z%86MZwcadIgOiYGmrP-Qbqz@cDF^O!IUT$YHbVY;;>ido0b+S7TkYjUUBqsG?Otx_G_xx}k;o zaNb(e`?_dYxD-p(i1I|Sn>x$-BdKNR{KLalxd)##dNoL%C95(FBe^QKR>vyKr+RhQ zHyXPhtLbX8I~+%5l5ef1@M9Kw{K1}Goo!K1{3yN1M(Lm+X90i9cwciB*pN=y&qIT? z-yK*J_A9yYIXOpG1QmvLW}L7|AqcN+{}0$+l|%AZs$k<*bd#Vy{h|l3V&orykSRNm!#O{f_N^ z_vP;QD16_J5YX9~@9Ls$`oK=QMB#@NqP$GuM>J*wl){?~z0K~GjbbsB+Hx|`I(zg` zDo}4SXCrprRrvccdvWdF-4omm>N(=+k}pEv_yUw?;nbhGy=Sw9UaxY9qCS!Fbavq( zJDaE)El0?`$n=Y5$uvc)>Y6Riu_C@Xgsgbh?)6G_X74kEKvP)DFG(OzN>(ig6s=H> z{5&?_vF{aivp+X-%O zhqqi&`mFz-^gI;>s*yr-U{S2?tkvs**Y4e&jle>=Ou;ZB$+Zy^%;X~oGUjk2ZvT|& z*i0hb7vhUJ^UV#E8qIRuKfL{ zI9|o_VQOGT_&fATFIf|P4?ShqsJ%XNBMCmq`=IJ!YkvA;jiL5;TCw7tOGv<-3w zNnza=8e8S+i`;?kw;>gg(sb-Vq@`_I;aO1o46mZ@keQD?aI2Psxh78=JfNz!=qaKw z#-C+bEWx#9jehIW$~IyFHiTr`=ETn($Jx0<_D;$zGgeIVWQ&<_^fH&7 z&KN;ceJjJV3;@U}Ugi6L{q%?gs*mqKEu6cBb8oOToq#%q)h3D>#1ypAJy$eK&DFaA zPSKtxW@-UdRpYjXMf0Fx$qYZtEJaProuZLXr&1M-Aj{zjH3(7+zrp!f8klJ4r)g%a z!|I|nU%6rJq29s3ZliEVJL)XYX+k^%3~DfX;f{kBo#zY&eT7%^2Q^lm6~O;J_b2Wz z9mP#Y4!~}^i1#jD{Wlb4ih(N-(hbJM+!FxpGS?V0H~?^D0vuJHbzlUZu)jxpJ13k0 zxMgQ}!p;C7GOU}l!V-Zy;B!o|BoN1 zA5+}``W~ro3GF~LL)WA2rD{>=hkxhm)S+Kj*!`V_1k)|wC{9x@-#m`)JWRj|Uw-@` z$oWciglnk{l5iufqx{d3Wof&ZOZ1yAS4{abN{JD$GXKb|ybaoD-Pt_KaqCer5!7plE46zi%! zP!ZC1# z&T;d@G>w913k>RUvA0DcjtO_U2gZ4a74m?~4mC11v)7DGXRrD8b6sjJNzQ)Bz zT=coP!iDu{cDthe3%UQfQkhbmuorbs+`4tO%A?pBi#i{B(JcAz1-mPDE{Ruja!-x7 zop+Ivt~eh%pEyehU$(z!4~Kj!7!JFy6|Lbg%!1*t@P@dYnOw=?Q1NLEYIkrKwItI< zj4h)k`>)k&&}Ov3;**k9S{3-LJqF8M3Zd8Q4@_QGKUq9uhRT~Z(zi}D5NM8Q$H lXx~b*FpuT!H&D}Wp=i3x2+7L`#`d??@jBhkxz2LO`8RP*v~K_a literal 0 HcmV?d00001 diff --git a/nanofilt/__pycache__/__init__.cpython-34.pyc b/nanofilt/__pycache__/__init__.cpython-34.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41261115efeb77280c49f329bcf8c17619bbcc1c GIT binary patch literal 210 zcmaFI!^?Fj>_wy^0|UcjAcg}bAj<)Wi$#D$3J@_enWF{L+GA{qn?;VxS@EMFsk~_wy)0|UcjAcg}*Aj<)Wi@AVA3IjtFkYr>C)?}(;HPkcKGw{=7yu}?K zUzS=_oSB~&AHR~JhzY0?O#JfKPpv4(FDfb4PtGmN)lW@H$xPKxP0~*<$}cS_)-O*i zDF*6EFDlT_EssynEXgQM($7oG%TLS9DM2uxX6qGH-r}&y%}*)KNwotxs~CtG0AkuS A9smFU literal 0 HcmV?d00001