From f7db587c2cacc500feaffc8d795bdf3c70a25d03 Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Mon, 10 Nov 2025 19:45:16 -0800 Subject: [PATCH 1/3] Added Nextflow license to Metagenomics 3rd-party licenses --- .../Metagenomics_3rd_Party_Software.md | 1 + .../Nextflow_LICENSE.pdf | Bin 0 -> 41180 bytes 2 files changed, 1 insertion(+) create mode 100644 3rd_Party_Licenses/Metagenomics_3rd_Party_Software_Licenses/Nextflow_LICENSE.pdf diff --git a/3rd_Party_Licenses/Metagenomics_3rd_Party_Software.md b/3rd_Party_Licenses/Metagenomics_3rd_Party_Software.md index 88378014..d0195887 100644 --- a/3rd_Party_Licenses/Metagenomics_3rd_Party_Software.md +++ b/3rd_Party_Licenses/Metagenomics_3rd_Party_Software.md @@ -22,3 +22,4 @@ |Snakemake|[The MIT License (MIT) Copyright (c) 2012-2019 Johannes Köster](Metagenomics_3rd_Party_Software_Licenses/License_Snakemake_6.7.0_documentation.pdf)|[https://snakemake.readthedocs.io/en/stable/project_info/license.html](https://snakemake.readthedocs.io/en/stable/project_info/license.html)|Copyright (c) 2012-2019 Johannes Köster Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:| |genelab-utils|[GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007](Metagenomics_3rd_Party_Software_Licenses/genelab-utils_LICENSE.pdf)|[https://github.com/AstrobioMike/GeneLab-utils/blob/main/LICENSE](https://github.com/AstrobioMike/GeneLab-utils/blob/main/LICENSE)|Copyright (C) 2007 Free Software Foundation, Inc Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.| |R|[GNU GENERAL PUBLIC LICENSE Version 2, June 1991, and Version 3, 29 June 2007](Metagenomics_3rd_Party_Software_Licenses/R_GPL-2_and_GPL-3_LICENSES.pdf)|[https://www.r-project.org/Licenses/](https://www.r-project.org/Licenses/)|Version 2: Copyright (C) 1989, 1991 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.; Version 3: Copyright (C) 2007 Free Software Foundation, Inc. http://fsf.org/ Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.| +|Nextflow|[Apache License Version 2.0, January 2004](Metagenomics_3rd_Party_Software_Licenses/Nextflow_LICENSE.pdf)|[https://github.com/nextflow-io/nextflow/blob/master/COPYING](https://github.com/nextflow-io/nextflow/blob/master/COPYING)| Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.|
\ No newline at end of file
diff --git a/3rd_Party_Licenses/Metagenomics_3rd_Party_Software_Licenses/Nextflow_LICENSE.pdf b/3rd_Party_Licenses/Metagenomics_3rd_Party_Software_Licenses/Nextflow_LICENSE.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..f4af319abab3906e40efb2d1f11b26dc295cfb37
GIT binary patch
[41180-byte binary literal for Nextflow_LICENSE.pdf omitted]

From 05bb937bcda1d8e64826640f269c84383d85e4e7 Mon Sep 17 00:00:00 2001
From: Jihan Yehia
Date: Fri, 21 Nov 2025 14:13:37 -0800
Subject: [PATCH 2/3] NF_EstHostReads documentation

---
 .../GL-DPPD-7109-A.md                         | 169 +++++++++++++
 .../Estimate_host_reads_in_raw_data/README.md |   9 +-
 .../NF_MGEstHostReads/CHANGELOG.md            |  29 +++
 .../NF_MGEstHostReads/README.md               | 236 ++++++++++++++++++
 .../Workflow_Documentation/README.md          |  13 +-
 5 files changed, 446 insertions(+), 10 deletions(-)
 create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md
 create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/CHANGELOG.md
 create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/README.md

diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md b/Metagenomics/Estimate_host_reads_in_raw_data/Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md
new file mode 100644
index 00000000..8838d7bb
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md
@@ -0,0 +1,169 @@
+# GeneLab tracking of host reads in metagenomic datasets
+
+> **In order to provide an estimate of host DNA in metagenomic datasets that are sequenced from host-derived samples, datasets are screened against an appropriate reference genome using [kraken2](https://github.com/DerrickWood/kraken2/wiki). Reads are not removed from the dataset, but the percentage of detected host reads is reported.**
+
+---
+
+**Date:** November X, 2025
+**Revision:** A
+**Document Number:** GL-DPPD-7109
+
+**Submitted by:**
+Jihan Yehia (GeneLab Data Processing Team)
+
+**Approved by:**
+Samrawit Gebre (OSDR Project Manager)
+Danielle Lopez (OSDR Deputy Project Manager)
+Jonathan Galazka (OSDR Project Scientist)
+Amanda Saravia-Butler (GeneLab Science Lead)
+Barbara Novak (GeneLab Data Processing Lead)
+
+## Updates from previous revision
+* Updated kraken2 from version 2.1.1 to 2.1.6
+* In [Step 1](#1-build-kraken2-database-of-host-genome), used kraken2's `k2` wrapper script for `download-taxonomy` because the script supports HTTPS downloads, as mentioned [here](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#introducing-k2)
+
+---
+
+# Table of contents
+
+- [**Software used**](#software-used)
+- [**General processing overview with example commands**](#general-processing-overview-with-example-commands)
+  - [**1. Build kraken2 database of host genome**](#1-build-kraken2-database-of-host-genome)
+  - [**2. Identify host-classified reads**](#2-identify-host-classified-reads)
+    - [Example if paired-end reads](#example-if-paired-end-reads)
+    - [Example if single-end reads](#example-if-single-end-reads)
+  - [**3. Generate a summary report**](#3-generate-a-summary-report)
+
+---
+
+# Software used
+
+|Program|Version|Relevant Links|
+|:------|:-----:|------:|
+|kraken2|2.1.6|[https://github.com/DerrickWood/kraken2/wiki](https://github.com/DerrickWood/kraken2/wiki)|
+
+---
+
+# General processing overview with example commands
+
+## 1. Build kraken2 database of host genome
+Which database to build depends on the appropriate host genome for the dataset. This example uses the mouse genome ([GRCm39 | GCF_000001635.27](https://www.ncbi.nlm.nih.gov/assembly/GCF_000001635.27)).
+> **Note:** It is recommended to use NCBI with kraken2 because sequences not downloaded from NCBI may require explicit assignment of taxonomy information before they can be used to build the database, as mentioned [here](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown).
+
+```bash
+# downloading and decompressing reference genome
+wget -q https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz
+gunzip GCF_000001635.27_GRCm39_genomic.fna.gz
+
+
+# building kraken2 database
+k2 download-taxonomy --db kraken2-mouse-db/
+kraken2-build --add-to-library GCF_000001635.27_GRCm39_genomic.fna --no-masking --db kraken2-mouse-db/
+kraken2-build --build --db kraken2-mouse-db/ --threads 30 --no-masking
+kraken2-build --clean --db kraken2-mouse-db/
+```
+
+**Parameter Definitions:**
+
+* `download-taxonomy` - downloads taxonomic mapping information via the [k2 wrapper script](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#introducing-k2)
+* `--add-to-library` - adds the fasta file to the library of sequences being included
+* `--db` - specifies the directory we are putting the database in
+* `--threads` - specifies the number of threads to use
+* `--no-masking` - prevents [masking](https://github.com/DerrickWood/kraken2/wiki/Manual#masking-of-low-complexity-sequences) of low-complexity sequences
+* `--build` - specifies to construct the kraken2-formatted database
+* `--clean` - specifies to remove unnecessary intermediate files
+
+**Input data:**
+
+* None
+
+**Output data:**
+
+* kraken2 database files (hash.k2d, opts.k2d, and taxo.k2d)
+* reference genome used (*.fna)
+
+---
+
+## 2. Identify host-classified reads
+
+### Example if paired-end reads
+
+```bash
+kraken2 --db kraken2-mouse-db --gzip-compressed --threads 4 --use-names --paired \
+        --output sample-1-kraken2-output.txt --report sample-1-kraken2-report.tsv Sample-1_R1.fastq.gz Sample-1_R2.fastq.gz
+```
+
+**Parameter Definitions:**
+
+* `--db` - specifies the directory holding the kraken2 database files created in step 1
+* `--gzip-compressed` - specifies the input fastq files are gzip-compressed
+* `--threads` - specifies the number of threads to use
+* `--use-names` - specifies adding taxa names in addition to taxids
+* `--paired` - specifies input reads are paired-end
+* `--output` - specifies the name of the kraken2 read-based output file (one line per read)
+* `--report` - specifies the name of the kraken2 report output file (one line per taxon, with the number of reads assigned to it)
+* last two positional arguments are the input read files
+
+**Input data:**
+
+* Sample-1_R1.fastq.gz (gzipped forward-reads fastq file)
+* Sample-1_R2.fastq.gz (gzipped reverse-reads fastq file)
+
+**Output data:**
+
+* sample-1-kraken2-output.txt (kraken2 read-based output file (one line per read))
+* sample-1-kraken2-report.tsv (kraken2 report output file (one line per taxon, with the number of reads assigned to it))
+
+### Example if single-end reads
+
+```bash
+kraken2 --db kraken2-mouse-db --gzip-compressed --threads 4 --use-names \
+        --output sample-1-kraken2-output.txt --report sample-1-kraken2-report.tsv Sample-1.fastq.gz
+```
+
+**Parameter Definitions:**
+
+* `--db` - specifies the directory holding the kraken2 database files created in step 1
+* `--gzip-compressed` - specifies the input fastq files are gzip-compressed
+* `--threads` - specifies the number of threads to use
+* `--use-names` - specifies adding taxa names in addition to taxids
+* `--output` - specifies the name of the kraken2 read-based output file (one line per read)
+* `--report` - specifies the name of the kraken2 report output file (one line per taxon, with the number of reads assigned to it)
+* last positional argument is the input read file
+
+**Input data:**
+
+* Sample-1.fastq.gz (gzipped reads fastq file)
+
+**Output data:**
+
+* sample-1-kraken2-output.txt (kraken2 read-based output file (one line per read))
+* sample-1-kraken2-report.tsv (kraken2 report output file (one line per taxon, with the number of reads assigned to it))
+
+---
+
+## 3. Generate a summary report
+This step uses standard Unix command-line utilities (e.g. `wc`, `grep`, `bc`, and `printf`).
+
+```bash
+total_fragments=$(wc -l sample-1-kraken2-output.txt | sed 's/^ *//' | cut -f 1 -d " ")
+
+fragments_classified=$(grep -w -c "^C" sample-1-kraken2-output.txt)
+
+perc_host=$(printf "%.2f\n" $(echo "scale=4; ${fragments_classified} / ${total_fragments} * 100" | bc -l))
+
+cat <( printf "Sample_ID\tTotal_fragments\tTotal_host_fragments\tPercent_host\n" ) \
+    <( printf "Sample-1\t${total_fragments}\t${fragments_classified}\t${perc_host}\n" ) > Host-read-count-summary.tsv
+```
+
+**Input data:**
+
+* sample-1-kraken2-output.txt (kraken2 read-based output file (one line per read))
+* sample-1-kraken2-report.tsv (kraken2 report output file (one line per taxon, with the number of reads assigned to it))
+
+**Output data:**
+
+* Host-read-count-summary.tsv (a tab-separated file with 4 columns: "Sample\_ID", "Total\_fragments", "Total\_host\_fragments", "Percent\_host")
+*Note: The percent host reads estimated for each sample is provided in the assay table on the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).*
+
+---
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/README.md b/Metagenomics/Estimate_host_reads_in_raw_data/README.md
index de5cc6d5..380cdf3c 100644
--- a/Metagenomics/Estimate_host_reads_in_raw_data/README.md
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/README.md
@@ -1,6 +1,6 @@
-# GeneLab pipeline for estimating host reads in Illumina metagenomics sequencing data
+# GeneLab pipeline for estimating host reads in metagenomics sequencing data
 
-> **The document [`GL-DPPD-7109.md`](Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109.md) holds an overview and example commands for how GeneLab identifies and provides an estimate of host DNA in Illumina metagenomics sequencing datasets that are sequenced from host-derived samples. See the [Repository Links](#repository-links) descriptions below for more information. The percentage of detected host reads and a GeneLab host read estimation summary are provided for each GLDS dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
+> **The document [`GL-DPPD-7109-A.md`](Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md) holds an overview and example commands for how GeneLab identifies and provides an estimate of host DNA in metagenomics sequencing datasets that are sequenced from host-derived samples. See the [Repository Links](#repository-links) descriptions below for more information. The percentage of detected host reads and a GeneLab host read estimation summary are provided for each GLDS dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
 >
 > Note: The exact host read identification commands and MGEstHostReads version used for specific GLDS datasets can be found in the *_processing_info.tar file under "Study Files" for each respective GLDS dataset in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).
 
@@ -10,7 +10,7 @@
 
 * [**Pipeline_GL-DPPD-7109_Versions**](Pipeline_GL-DPPD-7109_Versions)
 
-  - Contains the current and previous GeneLab pipeline for identifying host reads in Illumina metagenomics sequencing data (MGEstHostReads) versions documentation
+  - Contains the current and previous GeneLab pipeline for identifying host reads in metagenomics sequencing data (MGEstHostReads) versions documentation
 
 * [**Workflow_Documentation**](Workflow_Documentation)
 
@@ -19,4 +19,5 @@
 
 ---
 **Developed and maintained by:**
-Michael D. Lee (Mike.Lee@nasa.gov)
+Michael D. Lee (Mike.Lee@nasa.gov)
+Jihan Yehia (jihan.yehia@nasa.gov)
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/CHANGELOG.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/CHANGELOG.md
new file mode 100644
index 00000000..366166b0
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/CHANGELOG.md
@@ -0,0 +1,29 @@
+# Workflow change log
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+
+## [1.0.0](https://github.com/nasa/GeneLab_Data_Processing/tree/master/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads)
+
+### Changed
+- Update to the latest pipeline version [GL-DPPD-7109-A](../../Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md)
+of the GeneLab Estimate-Host-Reads consensus processing pipeline.
+- Pipeline implementation as a Nextflow workflow [NF_MGEstHostReads](./) rather than Snakemake as in
+previous workflow versions.
+
+### Added
+- Option to pull datasets directly from the OSDR using dp_tools
+- Option to build the kraken2 database from scratch using the host organism's information pulled from [hosts.csv](workflow_code/assets/hosts.csv)
+- Create protocol.txt as an output file describing workflow methods
+
+### Removed
+- kraken2-mouse-db/ is no longer needed as part of the workflow files (the database location can now be explicitly set, or the database is built from scratch if it doesn't already exist)
+
+<br>
+
+---
+
+> ***Note:** The change log of the Snakemake workflow (SW_MGEstHostReads) associated with the previous version of the GeneLab Estimate-Host-Reads pipeline, [GL-DPPD-7109](../../Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109.md), can be found [here](../SW_MGEstHostReads/CHANGELOG.md)*
\ No newline at end of file
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/README.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/README.md
new file mode 100644
index 00000000..01780789
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/README.md
@@ -0,0 +1,236 @@
+# NF_MGEstHostReads Workflow Information and Usage Instructions
+
+
+## General workflow info
+The current GeneLab Host Estimation pipeline for metagenomics sequencing (MGEstHostReads), [GL-DPPD-7109-A.md](../../Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md), is implemented as a [Nextflow](https://www.nextflow.io/docs/stable/index.html) DSL2 workflow and utilizes [Singularity](https://docs.sylabs.io/guides/3.10/user-guide/introduction.html) containers or [conda](https://docs.conda.io/en/latest/) environments to install/run all tools. This workflow (NF_MGEstHostReads) is run using the command line interface (CLI) of any Unix-based system. While knowledge of creating or modifying Nextflow workflows is not required to run the workflow as-is, the [Nextflow documentation](https://www.nextflow.io/docs/stable/index.html) is a useful resource for users who wish to modify and/or extend the workflow.
+
+<br>
+
+## Utilizing the Workflow
+
+1. [Install Nextflow, Singularity, and Conda](#1-install-nextflow-singularity-and-conda)
+   1a. [Install Nextflow and Conda](#1a-install-nextflow-and-conda)
+   1b. [Install Singularity](#1b-install-singularity)
+
+2. [Download the Workflow Files](#2-download-the-workflow-files)
+
+3. [Fetch Singularity Images](#3-fetch-singularity-images)
+
+4. [Run the Workflow](#4-run-the-workflow)
+   4a. [Approach 1: Start with OSD and GLDS accession as input](#4a-approach-1-start-with-osd-and-glds-accession-as-input)
+   4b. [Approach 2: Start with a sample ID list as input](#4b-approach-2-start-with-a-sample-id-list-as-input)
+   4c. [Modify parameters and compute resources in the Nextflow config file](#4c-modify-parameters-and-compute-resources-in-the-nextflow-config-file)
+
+5. [Workflow Outputs](#5-workflow-outputs)
+   5a. [Main outputs](#5a-main-outputs)
+   5b. [Resource logs](#5b-resource-logs)
+
+<br>
+
+---
+
+### 1. Install Nextflow, Singularity, and Conda
+
+#### 1a. Install Nextflow and Conda
+
+Nextflow can be installed either through the [Anaconda bioconda channel](https://anaconda.org/bioconda/nextflow) or as documented on the [Nextflow documentation page](https://www.nextflow.io/docs/latest/getstarted.html).
+
+> Note: If you wish to install conda, we recommend installing a Miniforge version appropriate for your system, as documented on the [conda-forge website](https://conda-forge.org/download/), where you can find basic binaries for most systems. More detailed Miniforge documentation is available in the [Miniforge GitHub repository](https://github.com/conda-forge/miniforge).
+>
+> Once conda is installed on your system, you can install the latest version of Nextflow by running the following commands:
+>
+> ```bash
+> conda install -c bioconda nextflow
+> nextflow self-update
+> ```
+> You may also install [mamba](https://mamba.readthedocs.io/en/latest/index.html) first, which is a faster implementation of conda and can be used as a drop-in replacement:
+> ```bash
+> conda install -c conda-forge mamba
+> ```
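+
+> As a quick sanity check (assuming the installs above completed without errors), you can confirm both tools are available on your `PATH` by printing their versions:
+>
+> ```bash
+> nextflow -version   # prints the installed Nextflow version info
+> conda --version     # prints the installed conda version
+> ```
+
+<br>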
+
+#### 1b. Install Singularity
+
+Singularity is a container platform that allows usage of containerized software. This enables the GeneLab workflow to retrieve and use all software required for processing without the need to install the software directly on the user's system.
+
+We recommend installing Singularity at a system-wide level as per the associated [documentation](https://docs.sylabs.io/guides/3.10/admin-guide/admin_quickstart.html).
+
+> Note: Singularity is also available through the [Anaconda conda-forge channel](https://anaconda.org/conda-forge/singularity).
+
+> Note: Alternatively, Docker can be used in place of Singularity. To get started with Docker, see the [Docker CE installation documentation](https://docs.docker.com/engine/install/).
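+
+> As above, you can verify that a working container runtime is available before proceeding:
+>
+> ```bash
+> singularity --version   # or: docker --version, if using Docker instead
+> ```
+
+<br>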
+
+---
+
+### 2. Download the Workflow Files
+
+All files required for utilizing the NF_MGEstHostReads GeneLab workflow for estimating host reads in metagenomics sequencing data are in the [workflow_code](workflow_code) directory. To get a copy of the latest *NF_MGEstHostReads* version onto your system, download the zip file from the release page and unzip it by running the following commands:
+
+```bash
+wget <NF_MGEstHostReads_1.0.0.zip release URL>
+unzip NF_MGEstHostReads_1.0.0.zip && cd NF_MGEstHostReads_1.0.0
+```
+
+<br>
+
+---
+
+### 3. Fetch Singularity Images
+
+Although Nextflow can fetch Singularity images from a URL, doing so may cause issues as detailed [here](https://github.com/nextflow-io/nextflow/issues/1210).
+
+To avoid such issues, the required Singularity images can be manually fetched as follows before running the workflow:
+
+```bash
+mkdir -p singularity
+cd singularity
+
+# Pull required containers
+singularity pull kraken2_2.1.6.img docker://quay.io/biocontainers/kraken2:2.1.6--pl5321h077b44d_0
+singularity pull dp_tools.img docker://quay.io/nasa_genelab/dp_tools:1.3.8
+
+cd ..
+```
+
+Once complete, a `singularity` folder containing the Singularity images will be created. Run the following command to export this folder as a Nextflow configuration environment variable to ensure Nextflow can locate the fetched images:
+
+```bash
+export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity
+```
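+
+> Note: This `export` only applies to your current shell session. If you expect to run the workflow in future sessions or inside scheduled (e.g. Slurm) jobs, one option is to append the setting to your shell startup file, for example:
+>
+> ```bash
+> # append the cache dir setting (with the resolved absolute path) to ~/.bashrc
+> echo "export NXF_SINGULARITY_CACHEDIR=$(pwd)/singularity" >> ~/.bashrc
+> ```
+
+<br>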
+
+---
+
+### 4. Run the Workflow
+
+> ***Note:** All the commands in this step assume that the workflow will be run from within the `NF_MGEstHostReads_1.0.0` directory that was downloaded in [step 2](#2-download-the-workflow-files) above. They may also be run from a different location by providing the full path to the main.nf workflow file in the `NF_MGEstHostReads_1.0.0` directory.*
+
+
+This workflow can be run in two different ways, as shown in the two approaches below. The user can choose to provide both the OSD and GeneLab accession numbers of an [OSDR](https://osdr.nasa.gov/bio/repo/) dataset of interest, as illustrated in [Approach 1](#4a-approach-1-start-with-osd-and-glds-accession-as-input). Another option, depicted in [Approach 2](#4b-approach-2-start-with-a-sample-id-list-as-input), is providing the path to a text file containing a single-column list of unique sample identifiers, an example of which is shown [here](workflow_code/unique-sample-IDs.txt), along with the path to the input data (raw reads of samples).
+
+Both approaches also require providing the root directory where kraken2 reference databases are (or will be) stored. The workflow assumes databases follow the naming convention `kraken2-<host>-db`. If a database for a specified host is not found in the provided root directory, the workflow automatically builds one from scratch and saves it in the same directory using that naming convention.
+
+In cases where the workflow will build the kraken2 database from scratch, it is important to ensure that the host organism's details are present in the hosts.csv table [here](workflow_code/assets/hosts.csv). If not, they should be appended to the table in the following format: `name,species,refseq_ID,genome_build,FASTA_URL` (see the example entry below).
+
+Alternatively, a pre-built database can be manually downloaded and unpacked into the root directory, provided it follows the same naming convention. One example is described in the [reference database info page](https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/SW_MGEstHostReads/reference-database-info.md), which describes how the mouse database was generated for a previous version of this workflow and how to obtain it for reuse.
+
+> Note: Nextflow commands use both single hyphen arguments (e.g. -profile) that denote general Nextflow arguments and double hyphen arguments (e.g. --osd) that denote workflow-specific parameters. Take care to use the proper number of hyphens for each argument.
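+
+> For illustration, with the default host (`mouse`), the workflow looks for a database directory named `kraken2-mouse-db` inside the provided root directory. A hypothetical `hosts.csv` entry for the mouse reference genome used in [GL-DPPD-7109-A](../../Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md) would look like the following (the header row shows the expected columns):
+>
+> ```
+> name,species,refseq_ID,genome_build,FASTA_URL
+> mouse,Mus musculus,GCF_000001635.27,GRCm39,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz
+> ```
+
+<br>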
+
+#### 4a. Approach 1: Start with OSD and GLDS accession as input
+
+```bash
+nextflow run main.nf \
+    -resume \
+    -profile singularity \
+    --ref_dbs_Dir <path/to/kraken2-databases> \
+    --osd OSD-465 \
+    --glds GLDS-465
+```
+
+<br>
+
+#### 4b. Approach 2: Start with a sample ID list as input
+
+```bash
+nextflow run main.nf \
+    -resume \
+    -profile singularity \
+    --ref_dbs_Dir <path/to/kraken2-databases> \
+    --sample_id_list unique-sample-IDs.txt \
+    --reads_dir <path/to/reads/dir>
+```
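+
+> For example, if `unique-sample-IDs.txt` contained the two hypothetical sample IDs `Sample-1` and `Sample-2`, then with the default paired-end suffix parameters (described below) the workflow would expect to find the following files in the provided reads directory:
+>
+> ```bash
+> cat unique-sample-IDs.txt
+> ## Sample-1
+> ## Sample-2
+>
+> ls <path/to/reads/dir>
+> ## Sample-1_R1_raw.fastq.gz  Sample-1_R2_raw.fastq.gz
+> ## Sample-2_R1_raw.fastq.gz  Sample-2_R2_raw.fastq.gz
+> ```
+
+<br>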
+
+
+**Required Parameters For All Approaches:**
+
+* `main.nf` - Instructs Nextflow to run the NF_MGEstHostReads workflow. If running in a directory other than `NF_MGEstHostReads_1.0.0`, replace with the full path to the NF_MGEstHostReads main.nf workflow file.
+* `-resume` - Resumes workflow execution using previously cached results
+* `-profile` - Specifies the configuration profile(s) to load (multiple options can be provided as a comma-separated list)
+  * Software environment profile options (choose one):
+    * `singularity` - instructs Nextflow to use Singularity container environments
+    * `docker` - instructs Nextflow to use Docker container environments
+    * `conda` - instructs Nextflow to use conda environments via the conda package manager
+    * `mamba` - instructs Nextflow to use conda environments via the mamba package manager
+  * Other option (can be combined with the software environment option above using a comma, e.g. `-profile slurm,singularity`):
+    * `slurm` - instructs Nextflow to use the [Slurm cluster management and job scheduling system](https://slurm.schedmd.com/overview.html) to schedule and run the jobs on a Slurm HPC cluster
+* `--ref_dbs_Dir` - Specifies the path to the directory where kraken2 databases are or will be stored
+
+
+**Additional Required Parameters For Approach 1:**
+
+* `--osd` - The OSD accession number specifying the [OSDR](https://osdr.nasa.gov/bio/repo/) dataset to process, e.g. OSD-465 (type: string, default: null)
+* `--glds` - The GLDS accession number specifying the [OSDR](https://osdr.nasa.gov/bio/repo/) dataset to process, e.g. GLDS-465 (type: string, default: null)
+  > *Note: Not all datasets have the same OSD and GLDS number, so make sure the correct OSD and GLDS numbers are specified*
+
+
+**Additional Required Parameters For Approach 2:**
+
+* `--sample_id_list` - path to a single-column file with unique sample identifiers (type: string, default: null)
+  > *Note: An example of this file is provided in the [workflow_code](workflow_code) directory [here](workflow_code/unique-sample-IDs.txt).*
+* `--reads_dir` - path to the raw reads directory (type: string, default: null)
+
+
+**Optional Parameters For All Approaches:**
+> *Note: See `nextflow run -h` and [Nextflow's CLI run command documentation](https://nextflow.io/docs/latest/cli.html#run) for more options and details on how to run Nextflow.*
+
+
+* `--single_suffix` - raw reads suffix that follows the unique part of sample names (type: string, default: "_raw.fastq.gz")
+* `--R1_suffix` - raw forward reads suffix that follows the unique part of sample names (type: string, default: "_R1_raw.fastq.gz")
+* `--R2_suffix` - raw reverse reads suffix that follows the unique part of sample names (type: string, default: "_R2_raw.fastq.gz")
+* `--host` - host organism, should match the entry provided under the `name` column in [hosts.csv](workflow_code/assets/hosts.csv) (type: string, default: "mouse")
+
+
+
+**Optional Parameters For Approach 1:**
+
+* `--runsheet_path` - specifies the path to a local runsheet; if not provided, a runsheet is automatically generated using OSDR metadata (type: string, default: null)
+* `--isa_archive_path` - specifies the path to a local ISA archive; if not provided, ISA.zip is automatically generated using OSDR metadata (type: string, default: null)
+
+
+**Optional Parameters For Approach 2:**
+
+* `--is_single` - whether data is single-end (type: boolean, default: false)
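+
+> As a combined, hypothetical example, a single-end dataset with reads named like `Sample-1.fastq.gz` could be processed on a Slurm cluster by mixing the parameters above as follows:
+>
+> ```bash
+> nextflow run main.nf \
+>     -resume \
+>     -profile slurm,singularity \
+>     --ref_dbs_Dir <path/to/kraken2-databases> \
+>     --sample_id_list unique-sample-IDs.txt \
+>     --reads_dir <path/to/reads/dir> \
+>     --is_single true \
+>     --single_suffix ".fastq.gz" \
+>     --host mouse
+> ```
+
+<br>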
+
+#### 4c. Modify parameters and compute resources in the Nextflow config file
+
+All parameters and workflow resources can also be specified directly in the [nextflow.config](./workflow_code/nextflow.config) file. For detailed instructions on how to modify and set parameters in the config file, see the [Nextflow configuration documentation](https://www.nextflow.io/docs/latest/config.html).
+
+Once you've downloaded the workflow template, you can edit your downloaded version of the [nextflow.config](workflow_code/nextflow.config) file as needed: parameters are set in the `params` scope and cpu/memory requirements in the `process` scope. Modify these to match the dataset you want to process and the system you're using for processing.
+
+
+---
+
+### 5. Workflow Outputs
+
+#### 5a. Main Outputs
+
+The outputs from this pipeline are documented in the [GL-DPPD-7109-A](https://github.com/nasa/GeneLab_Data_Processing/blob/master/Metagenomics/Estimate_host_reads_in_raw_data/Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md) processing protocol.
+
+
+The workflow also outputs the following:
+  - processing_info/protocol.txt (a text file describing the methods used by the workflow)
+
+#### 5b. Resource Logs
+
+Standard Nextflow resource usage logs are also produced as follows:
+
+**Nextflow Resource Usage Logs**
+  - processing_info/execution_report_{timestamp}.html (an HTML report that includes metrics about the workflow execution, including computational resources and exact workflow process commands)
+  - processing_info/execution_timeline_{timestamp}.html (an HTML timeline for all processes executed in the workflow)
+  - processing_info/execution_trace_{timestamp}.txt (a tab-separated execution tracing file that contains machine-readable information about each process executed in the workflow, including submission time, start time, completion time, and cpu and memory usage; see the parsing sketch below)
+
+> Further details about these logs can also be found within [this Nextflow documentation page](https://www.nextflow.io/docs/latest/tracing.html#execution-report).
+
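+Since the execution trace is a plain tab-separated table, it can be loaded for quick inspection with standard tools. The following is only a sketch: the timestamped file name is hypothetical, and the exact trace columns (e.g. `name`, `status`, `realtime`) depend on your Nextflow version and trace settings.
+
+```python
+import pandas as pd
+
+# Hypothetical file name; the {timestamp} portion differs for every run
+trace = pd.read_csv(
+    "processing_info/execution_trace_2026-02-11_13-22-48.txt", sep="\t"
+)
+
+# Inspect which trace fields were recorded, then summarize process status
+print(trace.columns.tolist())
+print(trace["status"].value_counts())
+```
+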
+
+---
\ No newline at end of file
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/README.md b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/README.md
index 193e482a..f09829d0 100644
--- a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/README.md
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/README.md
@@ -1,13 +1,14 @@
-# GeneLab Workflow Information for Estimating Host Reads in Illumina Metagenomics Seq Data
+# GeneLab Workflow Information for Estimating Host Reads in Metagenomics Seq Data
 
-> **GeneLab has wrapped each step of the estimating host DNA in Illumina metagenomics sequencing data pipeline (MGEstHostReads) into a workflow. The table below lists (and links to) each MGEstHostReads pipeline version and the corresponding workflow subdirectory, the current MGEstHostReads pipeline/workflow implementation is indicated. The workflow subdirectory contains information about the workflow along with instructions for installation and usage. Exact workflow run info and MGEstHostReads version used to process specific datasets that have been released are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
+> **GeneLab has wrapped each step of the pipeline for estimating host DNA in metagenomics sequencing data (MGEstHostReads) into a workflow. The table below lists (and links to) each MGEstHostReads pipeline version and the corresponding workflow subdirectory; the current MGEstHostReads pipeline/workflow implementation is indicated. The workflow subdirectory contains information about the workflow along with instructions for installation and usage. Exact workflow run info and the MGEstHostReads version used to process specific datasets that have been released are provided with their processed data in the [Open Science Data Repository (OSDR)](https://osdr.nasa.gov/bio/repo/).**
 
 ## MGEstHostReads Pipeline Version and Corresponding Workflow
 
-|Pipeline Version|Current Workflow Version (for respective pipeline version)|
-|:---------------|:---------------------------------------------------------|
-|*[GL-DPPD-7109.md](../Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109.md)|[1.0.0](SW_MGEstHostReads)|
+|Pipeline Version|Current Workflow Version (for respective pipeline version)|Nextflow Version|
+|:---------------|:---------------------------------------------------------|:---------------|
+|*[GL-DPPD-7109-A.md](../Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109-A.md)|[NF_MGEstHostReads_1.0.0](NF_MGEstHostReads)|25.04.6|
+|[GL-DPPD-7109.md](../Pipeline_GL-DPPD-7109_Versions/GL-DPPD-7109.md)|[SW_MGEstHostReads_1.0.0](SW_MGEstHostReads)|N/A (Snakemake v7.26.0)|
 
 *Current GeneLab Pipeline/Workflow Implementation
 
-> See the [workflow change log](SW_MGEstHostReads/CHANGELOG.md) to access previous workflow versions and view all changes associated with each version update.
+> See the [workflow change log](NF_MGEstHostReads/CHANGELOG.md) to access previous workflow versions and view all changes associated with each version update.
From f11d4f9c8c363d4ba95f0bd1d6edd78b71627089 Mon Sep 17 00:00:00 2001 From: Jihan Yehia Date: Wed, 11 Feb 2026 13:22:48 -0800 Subject: [PATCH 3/3] Add workflow_code/ --- .../workflow_code/assets/hosts.csv | 5 + .../__init__.py | 9 + .../dp_tools__metagenomics_estHost/checks.py | 1537 +++++++++++++++++ .../config.yaml | 150 ++ .../protocol.py | 997 +++++++++++ .../dp_tools__metagenomics_estHost/schemas.py | 62 + .../workflow_code/bin/generate_protocol.sh | 12 + .../NF_MGEstHostReads/workflow_code/main.nf | 156 ++ .../workflow_code/modules/copy_reads.nf | 22 + .../workflow_code/modules/fetch_isa.nf | 14 + .../modules/generate_protocol.nf | 18 + .../workflow_code/modules/isa_to_runsheet.nf | 19 + .../workflow_code/modules/kraken2.nf | 26 + .../workflow_code/modules/kraken2_db.nf | 26 + .../workflow_code/modules/summary.nf | 22 + .../workflow_code/modules/utils.nf | 17 + .../workflow_code/nextflow.config | 110 ++ .../workflow_code/parse_runsheet.nf | 80 + 18 files changed, 3282 insertions(+) create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/assets/hosts.csv create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/__init__.py create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/checks.py create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/config.yaml create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/protocol.py create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/schemas.py create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/generate_protocol.sh create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/main.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/copy_reads.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/fetch_isa.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/generate_protocol.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/isa_to_runsheet.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2_db.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/summary.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/utils.nf create mode 100644 Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/nextflow.config create mode 100644 
Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/parse_runsheet.nf diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/assets/hosts.csv b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/assets/hosts.csv new file mode 100644 index 00000000..031f97b2 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/assets/hosts.csv @@ -0,0 +1,5 @@ +name,species,refseq,genome,fasta +mouse,Mus musculus,GCF_000001635.27,GRCm39,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/635/GCF_000001635.27_GRCm39/GCF_000001635.27_GRCm39_genomic.fna.gz +arabidopsis,Arabidopsis thaliana,GCF_000001735.4,TAIR10,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz +lettuce,Lactuca sativa,GCF_002870075.4,Lsat_Salinas_v11,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/870/075/GCF_002870075.4_Lsat_Salinas_v11/GCF_002870075.4_Lsat_Salinas_v11_genomic.fna.gz +tomato,Solanum lycopersicum,GCF_036512215.1,SLM_r2.1,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/036/512/215/GCF_036512215.1_SLM_r2.1/GCF_036512215.1_SLM_r2.1_cds_from_genomic.fna.gz \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/__init__.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/__init__.py new file mode 100644 index 00000000..5faa427c --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/__init__.py @@ -0,0 +1,9 @@ +from pathlib import Path + +# Import for access at the module level +from . import checks +from . import protocol +from . import schemas + +# Set config path +config = Path(__file__).parent / "config.yaml" \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/checks.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/checks.py new file mode 100644 index 00000000..885d2160 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/checks.py @@ -0,0 +1,1537 @@ +from collections import defaultdict +import copy +import enum +import gzip +import itertools +import logging +import math +from pathlib import Path +from statistics import mean +import string +import subprocess +from typing import Callable, Dict, Union +from importlib.metadata import files + +import pandas as pd + +from dp_tools.core.entity_model import Dataset, Sample, multiqc_run_to_dataframes + +log = logging.getLogger(__name__) + +from dp_tools.core.check_model import FlagCode, FlagEntry, FlagEntryWithOutliers + + +def r_style_make_names(s: str) -> str: + """Recreates R's make.names function for individual strings. + This function is often used to create syntactically valid names in R which are then saved in R outputs. 
+ Source: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/make.names + + Args: + s (str): A string to convert + + Returns: + str: A string converted in the same way as R's make.names function + """ + EXTRA_WHITELIST_CHARACTERS = "_ΩπϴλθijkuΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρστυφχψω_µ" # Note: there are two "μμ" like characters one is greek letter mu, the other is the micro sign + VALID_CHARACTERS = string.ascii_letters + string.digits + "." + EXTRA_WHITELIST_CHARACTERS + REPLACEMENT_CHAR = "." + new_string_chars = list() + for char in s: + if char in VALID_CHARACTERS: + new_string_chars.append(char) + else: + new_string_chars.append(REPLACEMENT_CHAR) + return "".join(new_string_chars) + + +# adapted from reference: https://stackoverflow.com/questions/56048627/round-floats-in-a-nested-dictionary-recursively +# used to round values for easier to read messages +def formatfloat(x): + return "%.3g" % float(x) + + +def pformat(original_dictionary, function): + dictionary = copy.deepcopy( + original_dictionary + ) # we don't want to override original values + if isinstance(dictionary, dict): + new_dict = dict() + for k, v in dictionary.items(): + new_dict[k] = function(v) if isinstance(v, float) else pformat(v, function) + return new_dict + return dictionary + + +def convert_nan_to_zero(input: Dict[str, Union[float, int]]) -> Dict: + """Convert any Nan into zero""" + output = dict() + for key, value in input.items(): + output[key] = value if not math.isnan(value) else 0 + return output + + +## Functions that use the following syntax to merge values from general stats: +# "stat1 + stat2" should search and sum the stats +# TODO: refine dict typehint +def stat_string_to_value(stat_string: str, mqcData: dict) -> float: + """ "stat1 + stat2" should search and sum the stats""" + sum = float(0) + direct_keys = stat_string.split(" + ") + for direct_key in direct_keys: + print(direct_key) + sum += mqcData[direct_key] + return float(sum) + + +## Dataframe and Series specific helper functions +def nonNull(df: pd.DataFrame) -> bool: + # negation since it checks if any are null + return ~df.isnull().any(axis=None) + + +def nonNegative(df: pd.DataFrame) -> bool: + """This ignores null values, use nonNull to validate that condition""" + return ((df >= 0) | (df.isnull())).all(axis=None) + + +def onlyAllowedValues(df: pd.DataFrame, allowed_values: list) -> bool: + """This ignores null values, use nonNull to validate that condition""" + return ((df.isin(allowed_values)) | (df.isnull())).all(axis=None) + + +def check_forward_and_reverse_reads_counts_match( + sample: Sample, reads_key_1: str, reads_key_2: str +) -> FlagEntry: + # data specific preprocess + count_fwd_reads = float( + sample.compile_multiqc_data([reads_key_1])["general_stats"]["FastQC"][ + "total_sequences" + ] + ) + count_rev_reads = float( + sample.compile_multiqc_data([reads_key_2])["general_stats"]["FastQC"][ + "total_sequences" + ] + ) + + # check logic + if count_fwd_reads == count_rev_reads: + code = FlagCode.GREEN + message = ( + f"Forward and reverse read counts match at " + f"{int(count_rev_reads)} sequences " + ) + else: + code = FlagCode.HALT + message = ( + f"Forward and reverse read counts do not " + f"match: forward_Count:{int(count_fwd_reads)}, " + f"reverse_Count:{int(count_rev_reads)}" + ) + + return {"code": code, "message": message} + + +def check_file_exists(file: Path) -> FlagEntry: + # check logic + if file.is_file(): + code = FlagCode.GREEN + message = f"File exists: {file.name} " + else: + code = 
FlagCode.HALT
+        message = f"Missing file: {file.name} expected at {str(file)} "
+
+    return {"code": code, "message": message}
+
+
+def check_fastqgz_file_contents(file: Path, count_lines_to_check: int) -> FlagEntry:
+    """Check fastqgz by:
+    1. Decompressing as a stream of lines.
+    2. Affirming expected headers (every 4th line) look correct.
+
+    :param file: Input fastqGZ file path
+    :type file: Path
+    :param count_lines_to_check: Maximum number of lines to check. Setting this to a negative value will remove the limit
+    :type count_lines_to_check: int
+    :return: A required fields-only flag entry dictionary
+    :rtype: FlagEntry
+    """
+
+    lines_with_issues: list[int] = list()
+
+    # check logic
+    # truncated files raise EOFError
+    # catch this as HALT
+    try:
+        with gzip.open(file, "rb") as f:
+            for i, byte_line in enumerate(f):
+                # stop once the configured line limit is reached
+                if i + 1 == count_lines_to_check:
+                    log.debug(
+                        f"Reached {count_lines_to_check} lines, ending line check"
+                    )
+                    break
+
+                line = byte_line.decode()
+                # every fourth line should be an identifier
+                expected_identifier_line = i % 4 == 0
+                # check if line is actually an identifier line
+                if expected_identifier_line and line[0] != "@":
+                    lines_with_issues.append(i + 1)
+                # update every 2,000,000 reads
+                if i % 2_000_000 == 0:
+                    log.debug(f"Checked {i} lines for {file}")
+
+        if lines_with_issues:
+            code = FlagCode.HALT
+            message = (
+                f"The following decompressed fastqGZ lines have issues: {lines_with_issues}"
+            )
+        else:
+            code = FlagCode.GREEN
+            message = f"First {count_lines_to_check} lines checked found no issues. This means header lines were identifiable and no decompression errors occurred."
+    except (EOFError, gzip.BadGzipFile):
+        code = FlagCode.HALT
+        message = (
+            "Error during decompression, likely a compression or truncation issue."
+        )
+
+    return {"code": code, "message": message}
+
+def check_gzip_file_integrity(file: Path, gzip_bin: Path = Path("gzip")) -> FlagEntry:
+    """ Check gzip file integrity using 'gzip -t' as per https://www.gnu.org/software/gzip/manual/gzip.html """
+    # Note: 'gzip -t' reports problems on stderr and via a non-zero exit code,
+    # so those are checked rather than stdout (which is empty either way)
+    output = subprocess.run(
+        [str(gzip_bin), "-t", str(file)], capture_output=True
+    )
+    stderr_string = output.stderr.decode()
+    if output.returncode == 0:
+        code = FlagCode.GREEN
+        message = f"Gzip integrity test raised no issues"
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"Gzip integrity test failed on this file with output: {stderr_string}"
+        )
+    return {"code": code, "message": message}
+
+def check_bam_file_integrity(
+    file: Path, samtools_bin: Path = Path("samtools")
+) -> FlagEntry:
+    """Uses http://www.htslib.org/doc/samtools-quickcheck.html"""
+    # data specific preprocess
+
+    # check logic
+    output = subprocess.run(
+        [str(samtools_bin), "quickcheck", "-v", str(file)], capture_output=True
+    )
+    stdout_string = output.stdout.decode()
+    if stdout_string == "":
+        code = FlagCode.GREEN
+        message = f"Samtools quickcheck raised no issues"
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"Samtools quickcheck failed on this file with output: {stdout_string}"
+        )
+    return {"code": code, "message": message}
+
+
+def check_thresholds(
+    multiqc_inputs: list[Path], mqc_key: str, stat_string: str, thresholds: list[dict]
+) -> FlagEntry:
+    # data specific preprocess
+    data = multiqc_run_to_dataframes(multiqc_inputs)
+    value = stat_string_to_value(stat_string, data["general_stats"][mqc_key])
+
+    # check logic
+    # Assuming GREEN unless reassigned
+    code = FlagCode.GREEN
+    for threshold in thresholds:
+        match threshold["type"]:
+            case "lower":
+                if value < threshold["value"]:
+                    code = (
+                        FlagCode[threshold["code"]]
+                        if code < FlagCode[threshold["code"]]
+                        else code
+                    )
+
+    if code == FlagCode.GREEN:
+        message = f"Value: ({value}) did not breach any configured thresholds"
+    else:
+        message = f"Value: ({value}) breached configured thresholds"
+    return {"code": code, "message": message}
+
+
+def check_metadata_attributes_exist(
+    dataset: Dataset, expected_attrs: list[str]
+) -> FlagEntry:
+    missing_metadata_fields = list(set(expected_attrs) - set(dataset.metadata))
+
+    # check if any missing_metadata_fields are present
+    # check logic
+    if not missing_metadata_fields:
+        code = FlagCode.GREEN
+        message = f"All expected metadata keys found: Expected {expected_attrs}, Found {set(dataset.metadata)}"
+    else:
+        code = FlagCode.HALT
+        message = f"Missing dataset metadata (source from Runsheet): {missing_metadata_fields}"
+    return {"code": code, "message": message}
+
+
+def check_for_outliers(
+    dataset: Dataset,
+    data_asset_keys: list[str],
+    mqc_module: str,
+    mqc_plot: str,
+    mqc_keys: list[str],
+    thresholds: list[dict],
+) -> FlagEntryWithOutliers:
+    # assume code is GREEN until outliers detected
+    code = FlagCode.GREEN
+    # dataframe extraction
+    compiled_mqc_data = dataset.compile_multiqc_data(data_asset_keys=data_asset_keys)
+
+    if mqc_plot == "general_stats":
+        df = compiled_mqc_data["general_stats"][mqc_module]
+    else:
+        df = compiled_mqc_data["plots"][mqc_module][mqc_plot]
+
+    def default_to_regular(d):
+        if isinstance(d, defaultdict):
+            d = {k: default_to_regular(v) for k, v in d.items()}
+        return d
+
+    # track for outliers
+    outliers: dict[str, dict[str, dict[str, str]]] = defaultdict(
+        lambda: defaultdict(dict)
+    )
+
+    # override if mqc_keys is a special value
+    if mqc_keys == ["_ALL"]:
+        mqc_keys = df.columns
+
+    for mqc_key in mqc_keys:
+        for threshold in thresholds:
+            if threshold["middle_fcn"] == "mean":
+                middle = df[mqc_key].mean()
+            elif threshold["middle_fcn"] == "median":
+                middle = df[mqc_key].median()
+            else:
+                raise ValueError(
+                    f"Cannot compute middle from supplied middle_fcn name: {threshold['middle_fcn']}. Must supply either 'median' or 'mean'"
+                )
+
+            # bail if standard deviation == 0
+            # e.g. if all values are identical (and thus there are no outliers)
+            if df[mqc_key].std() == 0:
+                continue
+
+            # compute difference
+            df_diffs = df[mqc_key] - middle
+
+            # compute as number of standard deviations
+            df_diffs_in_std = df_diffs / df[mqc_key].std()
+
+            # add to outlier tracker if over the threshold
+            for key, value in df_diffs_in_std.iteritems():
+                # if an outlier
+                if abs(value) > threshold["stdev_threshold"]:
+                    # track it
+                    outliers[key][mqc_module][mqc_key] = value
+                    # elevate code if current code is lower severity
+                    if code < FlagCode[threshold["code"]]:
+                        code = FlagCode[threshold["code"]]
+
+    # convert defaultdict to regular for all reporting
+    outliers = default_to_regular(outliers)
+    # check logic
+    if code == FlagCode.GREEN:
+        message = f"No outliers found for {mqc_keys} in {mqc_plot} part of {mqc_module} multiQC module"
+    else:
+        message = (
+            f"Outliers found in {mqc_module} multiQC module as follows: {outliers}"
+        )
+    return {"code": code, "message": message, "outliers": outliers}
+
+
+def _check_expected_files_exist(
+    input_dir: Path, expected_extensions: list[str], parent_dir_is_filename: bool = True
+):
+    if parent_dir_is_filename:
+        fname = input_dir.name
+    expected_files = [input_dir / f"{fname}{ext}" for ext in expected_extensions]
+    missing_files = list()
+    for expected_file in expected_files:
+        if not expected_file.is_file():
+            missing_files.append(str(expected_file))
+
+    expected_file_str = [str(f) for f in expected_files]
+    return missing_files, expected_file_str
+
+
+def check_genebody_coverage_output(input_dir: Path):
+    EXPECTED_EXTENSIONS = [
+        ".geneBodyCoverage.r",
+        ".geneBodyCoverage.txt",
+        ".geneBodyCoverage.curves.pdf",
+    ]
+
+    missing_files, expected_file_str = _check_expected_files_exist(
+        input_dir, expected_extensions=EXPECTED_EXTENSIONS
+    )
+
+    if not missing_files:
+        code = FlagCode.GREEN
+        message = f"All output from geneBody coverage found: {expected_file_str}"
+    else:
+        code = FlagCode.HALT
+        message = f"Missing output from geneBody coverage: {missing_files}. Expected: {expected_file_str}"
+    return {"code": code, "message": message}
+
+
+def check_inner_distance_output(input_dir: Path):
+    EXPECTED_EXTENSIONS = [
+        ".inner_distance_plot.r",
+        ".inner_distance_freq.txt",
+        ".inner_distance.txt",
+        ".inner_distance_plot.pdf",
+    ]
+
+    missing_files, expected_file_str = _check_expected_files_exist(
+        input_dir, expected_extensions=EXPECTED_EXTENSIONS
+    )
+
+    if not missing_files:
+        code = FlagCode.GREEN
+        message = f"All output from inner distance found: {expected_file_str}"
+    else:
+        code = FlagCode.HALT
+        message = f"Missing output from inner distance: {missing_files}. 
Expected: {expected_file_str}" + return {"code": code, "message": message} + + +def check_strandedness_assessable_from_infer_experiment( + dataset: Dataset, + stranded_assessment_range: dict[str, float], + unstranded_assessment_range: dict[str, float], + valid_dominant_strandedness_assessments: list[str], +) -> FlagEntry: + # data specific preprocess + def get_median_strandedness( + dataset: Dataset, + ) -> dict[str, float]: + + df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ + "Infer experiment" + ].fillna( + 0 + ) # Nan is a zero for this MultiQC table + + median_strandedness = df.median().to_dict() + + return median_strandedness + + median_strandedness = get_median_strandedness(dataset) + + # check if dominant assessment is valid + strand_assessment: str = max( + median_strandedness, key=lambda k: median_strandedness[k] + ) + + # flag based on thresholds + assessment_value: float = median_strandedness[strand_assessment] + + is_stranded: bool = ( + stranded_assessment_range["max"] + > assessment_value + > stranded_assessment_range["min"] + ) + is_unstranded: bool = ( + unstranded_assessment_range["max"] + > assessment_value + > unstranded_assessment_range["min"] + ) + + def determine_samples_outside_range( + dataset: Dataset, min: float, max: float + ) -> list[str]: + df = dataset.compile_multiqc_data(["infer experiment out"])["plots"]["RSeQC"][ + "Infer experiment" + ].fillna( + 0 + ) # Nan is a zero for this MultiQC table + + return df.index[df[strand_assessment].between(min, max) == False].to_list() + + # Catalog and flag any samples outside of range + # flags based on samples that are out of the assessment range + samples_outside_range: list[str] + if is_stranded: + samples_outside_range = determine_samples_outside_range( + dataset, + stranded_assessment_range["min"], + stranded_assessment_range["max"], + ) + elif is_unstranded: + samples_outside_range = determine_samples_outside_range( + dataset, + unstranded_assessment_range["min"], + unstranded_assessment_range["max"], + ) + else: # this means that the strandedness is ambiguous + samples_outside_range = list() + + # check logic + if strand_assessment not in valid_dominant_strandedness_assessments: + code = FlagCode.HALT + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is invalid for processing. 
Valid assessments: {valid_dominant_strandedness_assessments}" + elif not samples_outside_range and any([is_stranded, is_unstranded]): + code = FlagCode.GREEN + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with no individual samples outside the assessment range" + elif samples_outside_range and any([is_stranded, is_unstranded]): + code = FlagCode.RED + message = f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] assessed with samples outside the assessment range: {samples_outside_range}" + else: + code = FlagCode.HALT + message = ( + f"Dominant strandedness [{strand_assessment} (median:{assessment_value:.2f})] is ambiguous due to being inside range " + f"({stranded_assessment_range['min']}-{unstranded_assessment_range['max']})" + ) + + return {"code": code, "message": message} + + +def check_rsem_counts_and_unnormalized_tables_parity( + rsem_table_path: Path, deseq2_table_path: Path +) -> FlagEntry: + # data specific preprocess + df_rsem = pd.read_csv(rsem_table_path) + df_deseq2 = pd.read_csv(deseq2_table_path) + + # return halt flag if column labels not conserved + if not set(df_deseq2.columns) == set(df_rsem.columns): + unique_to_deseq2 = set(df_deseq2.columns) - set(df_rsem.columns) + unique_to_rsem = set(df_rsem.columns) - set(df_deseq2.columns) + return { + "code": FlagCode.HALT, + "message": f"Columns do not match: unique to rsem: {unique_to_rsem}. unique to deseq2: {unique_to_deseq2}.", + } + + # rearrange columns to the same order + df_deseq2 = df_deseq2[df_rsem.columns] + + # check logic + if df_deseq2.equals(df_rsem): + code = FlagCode.GREEN + message = f"Tables of unnormalized counts match." + else: + code = FlagCode.HALT + message = ( + f"Tables of unnormalized counts have same columns but values do not match." + ) + return {"code": code, "message": message} + + +def check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables( + unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] +) -> FlagEntry: + STAR_COUNT_MODES = ["unstranded", "sense", "antisense"] + # data specific preprocess + df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) + + # based on which column matches the first entry + # all columns must match with the same strand column + strand_assessment: str = None # type: ignore + samples_with_issues: dict[str, list[str]] = { + "Not in aggregate table": list(), + "Sample counts mismatch": list(), + } + for sample, path in samplewise_tables.items(): + # check if samples exist as a column + if sample not in df_agg: + samples_with_issues["Not in aggregate table"].append(sample) + break + + # load + df_samp = pd.read_csv( + path, sep="\t", names=STAR_COUNT_MODES, index_col=0 + ).filter( + regex="^(?!N_.*).*", axis="rows" + ) # filter out N_* entries + + # check if the values match for any of the count modes + # unstranded, sense, antisense + # for remaining samples, only check the match for the first count mode + # TODO: Fix rare false postive related to zero counts, in those cases the strand_assessment can be prematurely determined which causes other samples to be compared with an inappropriate assessment + for count_mode in STAR_COUNT_MODES: + # make sure to sort indicies + if df_agg[sample].sort_index().equals(df_samp[count_mode].sort_index()): + # assign strand assessment if first sample + if strand_assessment is None: + strand_assessment = count_mode + + if strand_assessment == count_mode: + # no issues found (i.e. 
counts match with a consistent count mode column), break out + break + else: # no break + samples_with_issues["Sample counts mismatch"].append(sample) + + # check logic + if not any([issue_type for issue_type in samples_with_issues.values()]): + code = FlagCode.GREEN + message = ( + f"All samples accounted for and with matching counts " + f"between samplewise and aggregate table using strand assessment: '{strand_assessment}'" + ) + else: + code = FlagCode.HALT + message = f"Identified issues: {samples_with_issues}" + return {"code": code, "message": message} + + +def check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables( + unnormalizedCountTable: Path, samplewise_tables: dict[str, Path] +) -> FlagEntry: + # data specific preprocess + df_agg = pd.read_csv(unnormalizedCountTable, index_col=0) + + # based on which column matches the first entry + # TODO: LOW PRIORITY, fix this typehint + samples_with_issues: dict[str, Union[list[str], list[tuple[str, list[str]]]]] = { + "Not in aggregate table": list(), # type: ignore + "Sample counts mismatch": list(), # type: ignore + } + for sample, path in samplewise_tables.items(): + # check if samples exist as a column + if sample not in df_agg: + samples_with_issues["Not in aggregate table"].append(sample) + break + + # load + df_samp = pd.read_csv(path, sep="\t", index_col=0) + + # check if values match + if geneID_with_mismatched_counts := ( + list(df_agg.loc[df_agg[sample] != df_samp["expected_count"]].index) + ): + samples_with_issues["Sample counts mismatch"].append( + (sample, geneID_with_mismatched_counts) + ) + + # check logic + if not any([issue_type for issue_type in samples_with_issues.values()]): + code = FlagCode.GREEN + message = f"All samples accounted for and with matching counts between samplewise and aggregate table" + else: + code = FlagCode.HALT + message = f"Identified issues: {samples_with_issues}" + return {"code": code, "message": message} + + +def check_sample_table_against_runsheet( + runsheet: Path, sampleTable: Path, all_samples_required: bool +) -> FlagEntry: + """Check the sample table includes all samples as denoted in the runsheet. + + Args: + runsheet (Path): csv file used for processing, the index denotes all samples + sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) + all_samples_required (bool): denotes if all samples must be shared or if a subset of samples from the runsheet is okay. 
+ + Returns: + FlagEntry: A check result + """ + # data specific preprocess + df_rs = pd.read_csv(runsheet, index_col="Sample Name").sort_index() + df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() + + extra_samples: dict[str, set[str]] = { + "unique_to_runsheet": set(df_rs.index) - set(df_sample.index), + "unique_to_sampleTable": set(df_sample.index) - set(df_rs.index), + } + + # check logic + if any( + [ + (extra_samples["unique_to_runsheet"] and all_samples_required), + (extra_samples["unique_to_sampleTable"]), + ] + ): + code = FlagCode.HALT + message = f"Samples mismatched: {[f'{entry}:{v}' for entry, v in extra_samples.items() if v]}" + else: + code = FlagCode.GREEN + message = f"All samples accounted for based on runsheet (All samples required?: {all_samples_required})" + return {"code": code, "message": message} + + +class GroupFormatting(enum.Enum): + r_make_names = enum.auto() + ampersand_join = enum.auto() + + +def utils_runsheet_to_expected_groups( + runsheet: Path, + formatting: GroupFormatting = GroupFormatting.ampersand_join, + limit_to_samples: list = None, + map_to_lists: bool = False, +) -> Union[dict[str, str], dict[str, list[str]]]: + df_rs = ( + pd.read_csv(runsheet, index_col="Sample Name", dtype=str) + .filter(regex="^Factor Value\[.*\]") + .sort_index() + ) # using only Factor Value columns + + if limit_to_samples: + df_rs = df_rs.filter(items=limit_to_samples, axis="rows") + + match formatting: + case GroupFormatting.r_make_names: + expected_conditions_based_on_runsheet = ( + df_rs.apply(lambda x: "...".join(x), axis="columns") + .apply(r_style_make_names) # join factors with '...' + .to_dict() + ) # reformat entire group in the R style + case GroupFormatting.ampersand_join: + expected_conditions_based_on_runsheet = df_rs.apply( + lambda x: f"({' & '.join(x)})", axis="columns" + ).to_dict() + case _: + raise ValueError( + f"Formatting method invalid, must be one of the following: {list(GroupFormatting)}" + ) + + # convert from {sample: group} dict + # to {group: [samples]} dict + if map_to_lists: + unique_groups = set(expected_conditions_based_on_runsheet.values()) + reformatted_dict: dict[str, list[str]] = dict() + for query_group in unique_groups: + reformatted_dict[query_group] = [ + sample + for sample, group in expected_conditions_based_on_runsheet.items() + if group == query_group + ] + expected_conditions_based_on_runsheet: dict[str, list[str]] = reformatted_dict + + return expected_conditions_based_on_runsheet + + +def check_sample_table_for_correct_group_assignments( + runsheet: Path, sampleTable: Path +) -> FlagEntry: + """Check the sample table is assigned to the correct experimental group. + An experimental group is defined by the Factor Value columns found in the runsheet. 
+ + Args: + runsheet (Path): csv file used for processing, includes metadata used for experimental group designation + sampleTable (Path): csv file that pairs each sample with resolved experimental group (called condition within the table) + + Returns: + FlagEntry: A check result + """ + df_sample = pd.read_csv(sampleTable, index_col=0).sort_index() + # data specific preprocess + df_rs = ( + pd.read_csv(runsheet, index_col="Sample Name", dtype=str) # Ensure no factor value columns are misinterpreted as numeric + .filter(regex="^Factor Value\[.*\]") + .loc[df_sample.index] # ensure only sampleTable groups are checked + .sort_index() + ) # using only Factor Value columns + + # TODO: refactor with utils_runsheet_to_expected_groups + expected_conditions_based_on_runsheet = df_rs.apply( + lambda x: "...".join(x), axis="columns" + ).apply( # join factors with '...' + r_style_make_names + ) # reformat entire group in the R style + + mismatched_rows = expected_conditions_based_on_runsheet != df_sample["condition"] + + # check logic + if not any(mismatched_rows): + code = FlagCode.GREEN + message = f"Conditions are formatted and assigned correctly based on runsheet for all {len(df_sample)} samples in sample table: {list(df_sample.index)}" + else: + code = FlagCode.HALT + mismatch_description = ( + df_sample[mismatched_rows]["condition"] + + " <--SAMPLETABLE : RUNSHEET--> " + + expected_conditions_based_on_runsheet[mismatched_rows] + ).to_dict() + message = f"Mismatch in expected conditions based on runsheet for these rows: {mismatch_description}" + return {"code": code, "message": message} + + +def check_contrasts_table_headers(contrasts_table: Path, runsheet: Path) -> FlagEntry: + # data specific preprocess + expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True) + expected_comparisons = [ + "v".join(paired_groups) + for paired_groups in itertools.permutations(expected_groups, 2) + ] + df_contrasts = pd.read_csv(contrasts_table, index_col=0) + + # check logic + differences = set(expected_comparisons).symmetric_difference( + set(df_contrasts.columns) + ) + if not differences: + code = FlagCode.GREEN + message = f"Contrasts header includes expected comparisons as determined runsheet Factor Value Columns: {set(expected_comparisons)}" + else: + code = FlagCode.HALT + message = f"Contrasts header does not match expected comparisons as determined runsheet Factor Value Columns: {differences}" + return {"code": code, "message": message} + + +def check_contrasts_table_rows(contrasts_table: Path, **_) -> FlagEntry: + # data specific preprocess + df_contrasts = pd.read_csv(contrasts_table, index_col=0) + + def _get_groups_from_comparisions(s: str) -> set[str]: + """Converts '(G1)v(G2)' + into G1...G2 where G1 and G2 are renamed as per the r make names function + + Args: + s (str): Input that fits this format: '(G1)v(G2)' + + Returns: + str: Reformatted string + """ + g1, g2 = s.split(")v(") + # remove parens and reformat with r make names style + g1 = r_style_make_names(g1[1:].replace(" & ", "...")) + g2 = r_style_make_names(g2[:-1].replace(" & ", "...")) + return {g1, g2} + + bad_columns: dict[str, dict[str, set]] = dict() + for (col_name, col_series) in df_contrasts.iteritems(): + expected_values = _get_groups_from_comparisions(col_name) + if not expected_values == set(col_series): + bad_columns[col_name] = { + "expected": expected_values, + "actual": set(col_series), + } + + # check logic + if not bad_columns: + code = FlagCode.GREEN + message = f"Contrasts column and rows match 
expected formatting" + else: + code = FlagCode.HALT + message = f"Contrasts columns {bad_columns} have unexpected values" + return {"code": code, "message": message} + + +def check_dge_table_annotation_columns_exist( + dge_table: Path, organism: str, **_ +) -> FlagEntry: + REQUIRED_ANNOTATION_KEYS = { + "SYMBOL", + "GENENAME", + "REFSEQ", + "ENTREZID", + "STRING_id", + "GOSLIM_IDS", + } + MASTER_ANNOTATION_KEY = {"_DEFAULT": "ENSEMBL", "Arabidopsis thaliana": "TAIR"} + + df_dge = pd.read_csv(dge_table) + + required_columns = REQUIRED_ANNOTATION_KEYS.union( + {MASTER_ANNOTATION_KEY.get(organism, MASTER_ANNOTATION_KEY["_DEFAULT"])} + ) + + missing_columns = required_columns - set(df_dge.columns) + # check logic + if not missing_columns: + code = FlagCode.GREEN + message = f"Found all required annotation columns: {required_columns}" + else: + code = FlagCode.HALT + message = ( + f"Missing the following required annotation columns: {missing_columns}" + ) + return {"code": code, "message": message} + + +def check_dge_table_sample_columns_exist( + dge_table: Path, samples: set[str], **_ +) -> FlagEntry: + # data specific preprocess + df_dge = pd.read_csv(dge_table) + + missing_sample_columns = samples - set(df_dge.columns) + + # check logic + if not missing_sample_columns: + code = FlagCode.GREEN + message = f"All samplewise columns present" + else: + code = FlagCode.HALT + message = f"Missing these sample count columns: {missing_sample_columns}" + return {"code": code, "message": message} + + +def check_dge_table_sample_columns_constraints( + dge_table: Path, samples: set[str], **_ +) -> FlagEntry: + MINIMUM_COUNT = 0 + # data specific preprocess + df_dge = pd.read_csv(dge_table)[samples] + + column_meets_constraints = df_dge.apply( + lambda col: all(col >= MINIMUM_COUNT), axis="rows" + ) + + # check logic + contraint_description = f"All counts are greater or equal to {MINIMUM_COUNT}" + if all(column_meets_constraints): + code = FlagCode.GREEN + message = ( + f"All values in columns: {samples} met constraint: {contraint_description}" + ) + else: + code = FlagCode.HALT + message = ( + f"These columns {list(column_meets_constraints.index[~column_meets_constraints])} " + f"fail the contraint: {contraint_description}." + ) + return {"code": code, "message": message} + + +def check_dge_table_group_columns_exist( + dge_table: Path, runsheet: Path, **_ +) -> FlagEntry: + # data specific preprocess + GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"] + expected_groups = utils_runsheet_to_expected_groups(runsheet) + expected_columns = { + "".join(comb) + for comb in itertools.product(GROUP_PREFIXES, expected_groups.values()) + } + df_dge_columns = set(pd.read_csv(dge_table).columns) + missing_cols = expected_columns - df_dge_columns + + # check logic + if not missing_cols: + code = FlagCode.GREEN + message = f"All group summary statistic columns (Prefixes: {GROUP_PREFIXES}) present. 
{sorted(list(expected_columns))}"
+    else:
+        code = FlagCode.HALT
+        message = f"Missing these group summary statistic columns (Prefixes: {GROUP_PREFIXES}): {sorted(list(missing_cols))}"
+    return {"code": code, "message": message}
+
+
+def check_dge_table_group_columns_constraints(
+    dge_table: Path, runsheet: Path, samples: set[str], **_
+) -> FlagEntry:
+    FLOAT_TOLERANCE = (
+        0.001  # Percent allowed difference due to float precision differences
+    )
+    # data specific preprocess
+    GROUP_PREFIXES = ["Group.Stdev_", "Group.Mean_"]
+    expected_groups = utils_runsheet_to_expected_groups(runsheet)
+    query_columns = {
+        "".join(comb)
+        for comb in itertools.product(GROUP_PREFIXES, expected_groups.values())
+    }
+
+    expected_group_lists = utils_runsheet_to_expected_groups(
+        runsheet, map_to_lists=True, limit_to_samples=samples
+    )
+    df_dge = pd.read_csv(dge_table)
+
+    # issue trackers
+    issues: dict[str, list[str]] = {
+        f"mean computation deviates by more than {FLOAT_TOLERANCE} percent": [],
+        f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent": [],
+    }
+
+    group: str
+    sample_set: list[str]
+    for group, sample_set in expected_group_lists.items():
+        abs_percent_differences = abs(
+            (df_dge[f"Group.Mean_{group}"] - df_dge[sample_set].mean(axis="columns"))
+            / df_dge[sample_set].mean(axis="columns")
+            * 100
+        )
+        if any(abs_percent_differences > FLOAT_TOLERANCE):
+            issues[
+                f"mean computation deviates by more than {FLOAT_TOLERANCE} percent"
+            ].append(group)
+
+        abs_percent_differences = abs(
+            (df_dge[f"Group.Stdev_{group}"] - df_dge[sample_set].std(axis="columns"))
+            / df_dge[sample_set].mean(axis="columns")
+            * 100
+        )
+        if any(abs_percent_differences > FLOAT_TOLERANCE):
+            issues[
+                f"standard deviation deviates by more than {FLOAT_TOLERANCE} percent"
+            ].append(group)
+
+    # check logic
+    constraint_description = f"Group mean and standard deviations are correctly computed from samplewise normalized counts within a tolerance of {FLOAT_TOLERANCE} percent (to accommodate minor float related differences)"
+    if not any([issue_type for issue_type in issues.values()]):
+        code = FlagCode.GREEN
+        message = f"All values in columns: {query_columns} met constraint: {constraint_description}"
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"Issues found {issues} that "
+            f"fail the constraint: {constraint_description}."
+        )
+    return {"code": code, "message": message}
+
+
+def check_dge_table_comparison_statistical_columns_exist(
+    dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+    # data specific preprocess
+    COMPARISON_PREFIXES = ["Log2fc_", "Stat_", "P.value_", "Adj.p.value_"]
+    expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+    expected_comparisons = [
+        "v".join(paired_groups)
+        for paired_groups in itertools.permutations(expected_groups, 2)
+    ]
+    expected_columns = {
+        "".join(comb)
+        for comb in itertools.product(COMPARISON_PREFIXES, expected_comparisons)
+    }
+    df_dge_columns = set(pd.read_csv(dge_table).columns)
+    missing_cols = expected_columns - df_dge_columns
+
+    # check logic
+    if not missing_cols:
+        code = FlagCode.GREEN
+        message = f"All comparison summary statistic columns (Prefixes: {COMPARISON_PREFIXES}) present. {sorted(list(expected_columns))}"
+    else:
+        code = FlagCode.HALT
+        message = f"Missing these comparison summary statistic columns (Prefixes: {COMPARISON_PREFIXES}): {sorted(list(missing_cols))}"
+    return {"code": code, "message": message}
+
+
+def utils_common_constraints_on_dataframe(
+    df: pd.DataFrame, constraints: tuple[tuple[set, dict], ...]
+) -> dict:
+
+    issues: dict[str, list[str]] = {
+        "Failed non null constraint": list(),
+        "Failed non negative constraint": list(),
+        "Failed allowed values constraint": list(),
+    }
+
+    for (col_set, col_constraints) in constraints:
+        # this will avoid overriding the original constraints dictionary
+        # which is likely used in the check message
+        col_constraints = col_constraints.copy()
+
+        # limit to only columns of interest
+        query_df = df[col_set]
+        for (colname, colseries) in query_df.iteritems():
+            # check non null constraint
+            if col_constraints.pop("nonNull", False) and nonNull(colseries) == False:
+                issues["Failed non null constraint"].append(colname)
+            # check non negative constraint
+            if (
+                col_constraints.pop("nonNegative", False)
+                and nonNegative(colseries) == False
+            ):
+                issues["Failed non negative constraint"].append(colname)
+            # check allowed values constraint
+            if allowedValues := col_constraints.pop("allowedValues", False):
+                if onlyAllowedValues(colseries, allowedValues) == False:
+                    issues["Failed allowed values constraint"].append(colname)
+
+        # raise exception if there are unhandled constraint keys
+        if col_constraints:
+            raise ValueError(f"Unhandled constraint types: {col_constraints}")
+
+    return issues
+
+
+def check_dge_table_group_statistical_columns_constraints(
+    dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+    expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+    expected_comparisons = [
+        "v".join(paired_groups)
+        for paired_groups in itertools.permutations(expected_groups, 2)
+    ]
+
+    resolved_constraints = (
+        ({f"Log2fc_{comp}" for comp in expected_comparisons}, {"nonNull": True}),
+        ({f"Stat_{comp}" for comp in expected_comparisons}, {"nonNull": True}),
+        # can be removed from analysis before p-value and adj-p-value assessed
+        # ref: https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
+        (
+            {f"P.value_{comp}" for comp in expected_comparisons},
+            {"nonNegative": True, "nonNull": False},
+        ),
+        (
+            {f"Adj.p.value_{comp}" for comp in expected_comparisons},
+            {"nonNegative": True, "nonNull": False},
+        ),
+    )
+
+    df_dge = pd.read_csv(dge_table)
+
+    # issue trackers
+    # here: {prefix+constraint: [failed_columns]}
+    issues: dict[str, list[str]] = dict()
+
+    issues = utils_common_constraints_on_dataframe(df_dge, resolved_constraints)
+
+    # check logic
+    if not any([issue_type for issue_type in issues.values()]):
+        code = FlagCode.GREEN
+        message = f"All values in columns met constraint: {resolved_constraints}"
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"Issues found {issues} that " f"fail the constraint: {resolved_constraints}."
+        )
+    return {"code": code, "message": message}
+
+
+def check_dge_table_fixed_statistical_columns_exist(dge_table: Path, **_) -> FlagEntry:
+    # data specific preprocess
+    fixed_stats_columns = {
+        "All.mean": {"nonNull": True, "nonNegative": True},
+        "All.stdev": {"nonNull": True, "nonNegative": True},
+        "LRT.p.value": {"nonNull": False, "nonNegative": True},
+    }
+    expected_columns = set(fixed_stats_columns)
+    df_dge_columns = set(pd.read_csv(dge_table).columns)
+    missing_cols = expected_columns - df_dge_columns
+
+    # check logic
+    if not missing_cols:
+        code = FlagCode.GREEN
+        message = f"All dataset summary stat columns present. {sorted(list(expected_columns))}"
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"Missing these dataset summary stat columns: {sorted(list(missing_cols))}"
+        )
+    return {"code": code, "message": message}
+
+
+def check_dge_table_fixed_statistical_columns_constraints(
+    dge_table: Path, **_
+) -> FlagEntry:
+    # data specific preprocess
+    fixed_stats_columns = (
+        ({"All.mean", "All.stdev"}, {"nonNull": True, "nonNegative": True}),
+        ({"LRT.p.value"}, {"nonNull": False, "nonNegative": True}),
+    )
+
+    df_dge = pd.read_csv(dge_table)
+
+    # issue trackers
+    # here: {prefix+constraint: [failed_columns]}
+    issues: dict[str, list[str]] = dict()
+
+    issues = utils_common_constraints_on_dataframe(df_dge, fixed_stats_columns)
+
+    # check logic
+    if not any([issue_type for issue_type in issues.values()]):
+        code = FlagCode.GREEN
+        message = f"All values in columns met constraint: {fixed_stats_columns}"
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"Issues found {issues} that " f"fail the constraint: {fixed_stats_columns}."
+        )
+    return {"code": code, "message": message}
+
+
+def check_dge_table_log2fc_within_reason(
+    dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+    LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD = 10  # Percent
+    LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT = 50  # Percent
+
+    # TODO: discuss, this might even be fine to lower quite a bit
+    # e.g. THRESHOLD_PERCENT_MEANS_DIFFERENCE = 1  # percent
+    THRESHOLD_PERCENT_MEANS_DIFFERENCE = 50  # percent
+
+    # data specific preprocess
+    expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+    expected_comparisons = [
+        "v".join(paired_groups)
+        for paired_groups in itertools.permutations(expected_groups, 2)
+    ]
+    df_dge = pd.read_csv(dge_table)
+
+    # Track error messages
+    err_msg_yellow = ""
+    all_suspect_signs: dict[int, dict[str, float]] = dict()
+    for comparision in expected_comparisons:
+        query_column = f"Log2fc_{comparision}"
+        group1_mean_col = (
+            "Group.Mean_" + comparision.split(")v(")[0] + ")"
+        )  # Uses parens and adds them back to prevent slicing on 'v' within factor names
+        group2_mean_col = "Group.Mean_" + "(" + comparision.split(")v(")[1]
+        computed_log2fc = (df_dge[group1_mean_col] / df_dge[group2_mean_col]).apply(
+            math.log, args=[2]
+        )
+        abs_percent_difference = abs(
+            ((computed_log2fc - df_dge[query_column]) / df_dge[query_column]) * 100
+        )
+        percent_within_tolerance = (
+            mean(
+                abs_percent_difference
+                < LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD
+            )
+            * 100
+        )
+        # flag if not enough within tolerance
+        if percent_within_tolerance < LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT:
+            err_msg_yellow += (
+                f"For comparison: '{comparision}' {percent_within_tolerance:.2f} % of genes have absolute percent differences "
+                f"(between log2fc direct computation and DESeq2's approach) "
+                f"less than {LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD} % which does not meet the 
minimum percentage "
+                f"({LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT} %) of genes required. "
+                f"This may indicate misassigned or misaligned columns. "
+            )
+
+        #### sign based checks
+
+        # filter to genes based on group means
+        abs_percent_differences = (
+            abs(
+                (df_dge[group1_mean_col] - df_dge[group2_mean_col])
+                / df_dge[group2_mean_col]
+            )
+            * 100
+        )
+        df_dge_filtered = df_dge.loc[
+            abs_percent_differences > THRESHOLD_PERCENT_MEANS_DIFFERENCE
+        ]
+
+        df_dge_filtered["positive_sign_expected"] = (
+            df_dge[group1_mean_col] - df_dge[group2_mean_col] > 0
+        )
+
+        df_dge_filtered["matches_expected_sign"] = (
+            (df_dge[query_column] > 0) & df_dge_filtered["positive_sign_expected"]
+        ) | ((df_dge[query_column] < 0) & ~df_dge_filtered["positive_sign_expected"])
+
+        all_suspect_signs = all_suspect_signs | df_dge_filtered.loc[
+            df_dge_filtered["matches_expected_sign"] == False
+        ][[group1_mean_col, group2_mean_col, query_column]].to_dict("index")
+
+    if all_suspect_signs:
+        code = FlagCode.RED
+        message = f"At least one log2fc sign is suspect; the following log2fc values disagree with their actual group means: {all_suspect_signs}"
+    elif err_msg_yellow:
+        code = FlagCode.YELLOW
+        # report the per-comparison details accumulated above
+        message = err_msg_yellow
+    else:
+        code = FlagCode.GREEN
+        message = (
+            f"All log2fc within reason, specifically no more than {LOG2FC_CROSS_METHOD_TOLERANCE_PERCENT}% "
+            f"of genes (actual %: {100 - percent_within_tolerance:.2f}) have a percent difference greater than "
+            f"{LOG2FC_CROSS_METHOD_PERCENT_DIFFERENCE_THRESHOLD}%. Additionally, for comparisons with mean differences "
+            f"greater than {THRESHOLD_PERCENT_MEANS_DIFFERENCE}% all have reasonable log2fc signs"
+        )
+
+    return {"code": code, "message": message}
+
+
+def check_viz_table_columns_exist(dge_table: Path, runsheet: Path, **_) -> FlagEntry:
+    # data specific preprocess
+    expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+    expected_comparisons = [
+        "v".join(paired_groups)
+        for paired_groups in itertools.permutations(expected_groups, 2)
+    ]
+    viz_pairwise_columns_prefixes = (
+        (
+            {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons},
+            {"nonNull": False},
+        ),
+        (
+            {f"Sig.1_{comp}" for comp in expected_comparisons},
+            {"allowedValues": [False, True], "nonNull": False},
+        ),
+        (
+            {f"Sig.05_{comp}" for comp in expected_comparisons},
+            {"allowedValues": [False, True], "nonNull": False},
+        ),
+        (
+            {f"Log2_P.value_{comp}" for comp in expected_comparisons},
+            {"nonNegative": False, "nonNull": False},
+        ),
+        (
+            {f"Updown_{comp}" for comp in expected_comparisons},
+            {"allowedValues": [1, 0, -1], "nonNull": True},
+        ),
+    )
+
+    expected_columns = set(
+        itertools.chain(*[c1 for c1, _ in viz_pairwise_columns_prefixes])
+    )
+    df_dge_columns = set(pd.read_csv(dge_table).columns)
+    missing_cols = expected_columns - df_dge_columns
+
+    # check logic
+    if not missing_cols:
+        code = FlagCode.GREEN
+        message = f"All viz specific comparison columns present. 
{sorted(list(expected_columns))}"
+    else:
+        code = FlagCode.HALT
+        message = f"Missing these viz specific comparison columns: {sorted(list(missing_cols))}"
+    return {"code": code, "message": message}
+
+
+def check_viz_table_columns_constraints(
+    dge_table: Path, runsheet: Path, **_
+) -> FlagEntry:
+    # data specific preprocess
+    expected_groups = utils_runsheet_to_expected_groups(runsheet, map_to_lists=True)
+    expected_comparisons = [
+        "v".join(paired_groups)
+        for paired_groups in itertools.permutations(expected_groups, 2)
+    ]
+    viz_pairwise_columns_constraints = (
+        (
+            {f"Log2_Adj.p.value_{comp}" for comp in expected_comparisons},
+            {"nonNull": False},
+        ),
+        (
+            {f"Sig.1_{comp}" for comp in expected_comparisons},
+            {"allowedValues": [False, True], "nonNull": False},
+        ),
+        (
+            {f"Sig.05_{comp}" for comp in expected_comparisons},
+            {"allowedValues": [False, True], "nonNull": False},
+        ),
+        (
+            {f"Log2_P.value_{comp}" for comp in expected_comparisons},
+            {"nonNegative": False, "nonNull": False},
+        ),
+        (
+            {f"Updown_{comp}" for comp in expected_comparisons},
+            {"allowedValues": [1, 0, -1], "nonNull": True},
+        ),
+    )
+
+    df_viz = pd.read_csv(dge_table)
+
+    # issue trackers
+    # here: {prefix+constraint: [failed_columns]}
+    issues: dict[str, list[str]] = dict()
+
+    issues = utils_common_constraints_on_dataframe(
+        df_viz, viz_pairwise_columns_constraints
+    )
+
+    # check logic
+    if not any([issue_type for issue_type in issues.values()]):
+        code = FlagCode.GREEN
+        message = (
+            f"All values in columns met constraint: {viz_pairwise_columns_constraints}"
+        )
+    else:
+        code = FlagCode.HALT
+        message = (
+            f"Issues found {issues} that "
+            f"fail the constraint: {viz_pairwise_columns_constraints}."
+        )
+    return {"code": code, "message": message}
+
+
+def check_viz_pca_table_index_and_columns_exist(
+    pca_table: Path, samples: set[str]
+) -> FlagEntry:
+    EXPECTED_VIS_PCA_COLUMNS = {"PC1", "PC2", "PC3"}
+    err_msg = ""
+    # data specific preprocess
+    df = pd.read_csv(pca_table, index_col=0)
+
+    # check all samples included
+    if missing_samples := samples - set(df.index):
+        err_msg += f"Missing samples in index: {missing_samples}"
+
+    # check all expected columns exist
+    if missing_cols := EXPECTED_VIS_PCA_COLUMNS - set(df.columns):
+        err_msg += f"Missing expected columns: {missing_cols}"
+
+    if not err_msg:
+        code = FlagCode.GREEN
+        message = f"PCA Table has all the samples in the index and these columns exist: {EXPECTED_VIS_PCA_COLUMNS}"
+    else:
+        code = FlagCode.HALT
+        message = err_msg
+
+    return {"code": code, "message": message}
+
+
+def utils_formatting_list(l: list[str], spaces: int = 2) -> str:
+    """Reformats list to print friendly multi line string.
+ + Example: + Reformatting a list of samples:: + + l = ['groundControl_1','groundControl_2','spaceFlight_1','spaceFlight-2'] + print(f"Samples: \n{utils_formatting_list(l)}") + + Args: + l (list): A list of strings to format + spaces (int): Number of leading spaces per line + + Returns: + str: Print friendly multiline string + """ + leading_spaces = " " * spaces + return "\n".join([f"{leading_spaces}- {item}" for item in l]) + + +def utils_rsem_counts_table_to_dataframe( + counts_table: Path, describe: bool = True +) -> pd.DataFrame: + df = pd.read_csv(counts_table, index_col=0).rename_axis("geneID") + if describe: + print(f"Loaded rsem counts table:") + print(f" Samples: \n{utils_formatting_list(list(df.columns), spaces = 4)}") + print(f" Number of Genes: {len(df)}") + return df + + +def utils_get_asset(asset_name: str) -> Path: + [p] = (p for p in files("dp_tools") if p.name == asset_name) + return p.locate() + + +def check_ERCC_subgroup_representation(unnormalizedCountTable: Path, **_) -> FlagEntry: + """Check ERCC subgroup representation is robust. + Specifically, counts the dataset wide ERCC IDs then categorizes each subgroup + by the number of represented ERCC IDs in that subgroup. + Finally, generates a Flag result by comparison to thresholds. + + Args: + counts_table (Path): RSEM unnormalized counts table + + Returns: + FlagEntry: Result of the check. + """ + MINIMUM_GREEN = 21 + MINIMUM_YELLOW = 19 + MINIMUM_RED = 0 + MINIMUM_HALT = 0 + + # data specific preprocess + df_counts = utils_rsem_counts_table_to_dataframe(unnormalizedCountTable) + + ercc_file = utils_get_asset("cms_095046.txt") + df_ercc = pd.read_csv(ercc_file, sep="\t") + + # filter to only ercc genes + df_counts = df_counts.loc[df_counts.index.isin(df_ercc["ERCC ID"])] + + # filter to only genes with at least one count (i.e. 
ERCC genes represented in the dataset) + df_counts = df_counts.loc[df_counts.sum(axis="columns") > 0] + + # merge to ercc table data including subgroup + df_counts = df_counts.merge(df_ercc, left_index=True, right_on="ERCC ID") + + # generate subgroup counts + df_subgroup_counts = df_counts["subgroup"].value_counts().sort_index() + + green_key = f"green level subgroups: > {MINIMUM_GREEN} ERCC represented" + yellow_key = ( + f"yellow level subgroups: {MINIMUM_YELLOW}-{MINIMUM_GREEN} ERCC represented" + ) + red_key = f"red level subgroups: {MINIMUM_RED}-{MINIMUM_YELLOW} ERCC represented" + halt_key = f"halt level subgroups: < {MINIMUM_HALT} ERCC represented" + + # classify each representation count + representation_category: dict[str, dict[str,int]] = { + green_key: df_subgroup_counts.loc[df_subgroup_counts > MINIMUM_GREEN].to_dict(), + yellow_key: + df_subgroup_counts.loc[ + df_subgroup_counts.between(MINIMUM_YELLOW, MINIMUM_GREEN) + ].to_dict() + , + red_key: + df_subgroup_counts.loc[ + df_subgroup_counts.between( + MINIMUM_RED, MINIMUM_YELLOW, inclusive="left" + ) + ].to_dict() + , + halt_key: df_subgroup_counts.loc[df_subgroup_counts < MINIMUM_HALT].to_dict(), + } + + # check logic + if representation_category[halt_key]: + code = FlagCode.HALT + message = ( + f"Dataset wide ERCC representation is not robust: {representation_category}" + ) + elif representation_category[red_key]: + code = FlagCode.RED + message = ( + f"Dataset wide ERCC representation is not robust: {representation_category}" + ) + elif representation_category[yellow_key]: + code = FlagCode.YELLOW + message = ( + f"Dataset wide ERCC representation is not robust: {representation_category}" + ) + else: + code = FlagCode.GREEN + message = ( + f"Dataset wide ERCC representation is robust: {representation_category}" + ) + return {"code": code, "message": message} + + +def check_sample_in_multiqc_report( + samples: list[str], + multiqc_report_path: Path, + name_reformat_func: Callable = lambda s: s, +) -> FlagEntry: + """Determines if the query samples are present in the multiqc report. + + This is achieved by checking the 'multiqc_sources.txt' table, 'Sample Name' column. + An optional name_reformat_function can be supplied to address sample name changes that occur in the multiqc report. + An example being the renaming of Sample '-' characters to '_' for certain RSeQC modules. + + :param sample: Query sample names to check for presense + :type sample: list[str] + :param multiqc_report_path: MultiQC report directory + :type multiqc_report_path: Path + :param name_reformat_func: A function applied to the multiQC sample names before searching against query sample names, defaults to not renaming the multiQC sample names + :type name_reformat_func: Callable, optional + :return: Flag Entry denoting successful or failing results. Includes description of query sample names and any missing samples + :rtype: FlagEntry + """ + # Load multiQC sources table and retrieve set of samples + [sources_table] = multiqc_report_path.glob("**/multiqc_sources.txt") + multiQC_samples = list(pd.read_csv(sources_table, sep="\t")["Sample Name"]) + + # Transform multiQC samples using name_reformat_func + reformatted_multiQC_samples = [name_reformat_func(s) for s in multiQC_samples] + + # Check for any missing reformatted sample names. + # Also track extra samples, these are not errors but should be reported as well. 
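+ # (Illustrative sketch only: 'extra_samples' below is hypothetical and is not
+ # referenced by the flag messages that follow.)
+ # extra_samples = set(reformatted_multiQC_samples) - set(samples)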
+ missing_samples = set(samples) - set(reformatted_multiQC_samples) + + # check logic + if len(missing_samples) == 0: + code = FlagCode.GREEN + message = f"Found all query samples after reformatting multiQC sample names. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" + else: + code = FlagCode.HALT + message = f"Missing the following query samples: {missing_samples}. Details: { {'query samples': samples, 'original multiQC sample names': multiQC_samples, 'reformatted multiQC sample names': reformatted_multiQC_samples} }" + return {"code": code, "message": message} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/config.yaml b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/config.yaml new file mode 100644 index 00000000..ea4f7041 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/config.yaml @@ -0,0 +1,150 @@ +# TOP LEVEL +NAME: "metagenomics" +VERSION: "1" + +Staging: + General: + Required Metadata: + From ISA: + + - ISA Field Name: + - Characteristics[Organism] + - Characteristics[organism] + ISA Table Source: Sample + Runsheet Column Name: organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. + Example: Microbiota + + - ISA Field Name: + - Characteristics[host organism] + - Characteristics[Host Organism] + - Characteristics[Host organism] + ISA Table Source: Sample + Runsheet Column Name: host organism + Processing Usage: >- + Mapping to the appropriate alignment reference and annotation databases. + Example: Mus musculus + + - ISA Field Name: Sample Name + ISA Table Source: Assay + Runsheet Column Name: sample_name + Runsheet Index: true + Processing Usage: >- + Sample name is used as a unique sample identifier during processing + Example: Atha_Col-0_Root_WT_Ctrl_45min_Rep1_GSM502538 + + - ISA Field Name: + - Parameter Value[library layout] + - Parameter Value[Library Layout] + - Parameter Value: library layout + ISA Table Source: Assay + Runsheet Column Name: paired_end + Remapping: {"PAIRED":true, "Paired":true, "SINGLE":false, "Single":false} + Processing Usage: >- + Indicates if the sequencing was paired end. This controls how a variety of tools are invoked + including in-house written scripts. + Example: 'TRUE' + + # this entry denotes the following: + # retrieve from that ISA field name + # multiple values (separated by ",") + # index those to certain runsheet columns + # if the index doesn't exist, optional prevents raising an exception + # GLDS URL Mapping means the names are searched against the GLDS filelisting json for urls + # an exception will be raised if one and only one url is not mapped to each filename + - ISA Field Name: + - Parameter Value[Merged Sequence Data File] + - Characteristics[Merged Sequence Data File] + - Raw Data File + ISA Table Source: Assay + Multiple Values Per Entry: true + Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma + Runsheet Column Name: + - {'name':'read1_path', 'index':0} + - {'name':'read2_path', 'index':1, 'optional':true} + GLDS URL Mapping: True + Processing Usage: >- + Location to the raw data fastq file. May be a url or local path. 
+ Example: 'https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-194_rna...'
+
+ - ISA Field Name:
+ - Parameter Value[Merged Sequence Data File]
+ - Characteristics[Merged Sequence Data File]
+ - Raw Data File
+ ISA Table Source: Assay
+ Multiple Values Per Entry: true
+ Multiple Values Delimiter: '\s*,\s*' # whitespace surrounded comma
+ Runsheet Column Name:
+ - {'name':'raw_R1_suffix', 'index':0}
+ - {'name':'raw_R2_suffix', 'index':1, 'optional':true}
+
+ Processing Usage: >-
+ Suffix of the raw data fastq file names.
+ Example: '_R1_raw.fastq.gz or _raw.fastq.gz for SE'
+
+ - ISA Field Name: Factor Value[{factor_name}]
+ ISA Table Source: [Assay, Sample]
+ Runsheet Column Name: Factor Value[{factor_name}]
+ Matches Multiple Columns: true
+ Match Regex: "Factor Value\\[.*\\]"
+ Append Column Following: "Unit"
+ Processing Usage: >-
+ Factor values in a study. Used to assign experimental groups for each sample.
+ Note: On the runsheet, a subsequent 'Unit' Column value will be
+ suffix-concatenated if it exists.
+ Example: Basal Control
+
+ - ISA Field Name: Unit
+ ISA Table Source: [Assay, Sample]
+ Runsheet Column Name: null
+ Matches Multiple Columns: true
+ Autoload: false # handled by factor value loading above
+ Processing Usage: >-
+ Unit to be suffix-concatenated onto prior Factor value columns.
+ Example: day
+
+ From User:
+ # Removed since unused by Processing via the runsheet
+ # - Runsheet Column Name: GLDS
+ # Processing Usage: >-
+ # The GLDS accession number
+ # Example: GLDS-205
+
+ - Runsheet Column Name: read1_path
+ # used to generate candidate file names for searching GLDS repository filelisting
+ Data Asset Keys: ["raw forward reads fastq GZ", "raw reads fastq GZ"]
+ Processing Usage: >-
+ The location of either the forward reads (paired end) or only reads file (single end)
+ raw fastq file. Can be either a url or local path.
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI
+ may be used to retrieve urls given the array data filename (sourced from ISA archive).
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1
+
+
+ - Runsheet Column Name: read2_path
+ Data Asset Keys: ["raw reverse reads fastq GZ"]
+ Processing Usage: >-
+ The location of the reverse reads (paired end)
+ raw fastq file. Can be either a url or local path.
+ For single end studies, this should be an empty string.
+ Note: For GLDS raw data assets, either the filelisting json API or the OpenAPI
+ may be used to retrieve urls given the array data filename (sourced from ISA archive). 
+ Example: /some/local/path OR https://genelab-data.ndc.nasa.gov/genelab/static/media/dataset/GLDS-123_microarray_E-MTAB-3289.raw.1.zip?version=1 + +ISA Meta: + Valid Study Assay Technology And Measurement Types: + - measurement: "Metagenomic sequencing" + technology: "Whole-Genome Shotgun Sequencing" + + # this is prepended to all file names in the curation assay table + Global file prefix: "{datasystem}_metagenomics_" + + # # configuration related to updating investigation file + # # each must refer to a STUDY PROCESS in the 'ISA_investigation.yaml' file + # # LEADCAP_organism should be the studied organisms scientific name with a leading cap + # Post Processing Add Study Protocol: + # GeneLab Methyl-Seq data processing protocol::{LEADCAP_organism} V1 + +data assets: + # resource categories: *neverPublished \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/protocol.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/protocol.py new file mode 100644 index 00000000..5eaa896a --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/protocol.py @@ -0,0 +1,997 @@ +from pathlib import Path +import re +from typing import Union +import yaml +import logging + +from dp_tools.core.entity_model import Dataset + +log = logging.getLogger(__name__) + +from dp_tools.core.check_model import ValidationProtocol + +from .checks import * + +CONFIG = { + "Metadata-check_metadata_attributes_exist": { + "expected_attrs": ["paired_end", "has_ERCC", "organism"] + }, + "Raw Reads-check_for_outliers": { + "mqc_module": "FastQC", + "mqc_plot": "general_stats", + "mqc_keys": [ + "percent_gc", + "avg_sequence_length", + "total_sequences", + "percent_duplicates", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "Trim Reads-check_for_outliers": { + "mqc_module": "FastQC", + "mqc_plot": "general_stats", + "mqc_keys": [ + "percent_gc", + "avg_sequence_length", + "total_sequences", + "percent_duplicates", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "Raw Reads By Sample-check_fastqgz_file_contents": { + "count_lines_to_check": 200000000 + }, + "Trim Reads By Sample-check_fastqgz_file_contents": { + "count_lines_to_check": 200000000 + }, + "STAR Alignments By Sample-check_thresholds-Mapped": { + "mqc_key": "STAR", + "stat_string": "uniquely_mapped_percent + multimapped_percent", + "thresholds": [ + {"code": "YELLOW", "type": "lower", "value": 70}, + {"code": "RED", "type": "lower", "value": 50}, + ], + }, + "STAR Alignments By Sample-check_thresholds-MultiMapped": { + "mqc_key": "STAR", + "stat_string": "multimapped_toomany_percent + multimapped_percent", + "thresholds": [ + {"code": "YELLOW", "type": "lower", "value": 30}, + {"code": "RED", "type": "lower", "value": 15}, + ], + }, + "STAR Alignments-check_for_outliers": { + "mqc_module": "STAR", + "mqc_plot": "general_stats", + "mqc_keys": [ + "uniquely_mapped_percent", + "avg_mapped_read_length", + "mismatch_rate", + "deletion_rate", + "deletion_length", + "insertion_rate", + "insertion_length", + "multimapped_percent", + 
"multimapped_toomany_percent", + "unmapped_mismatches_percent", + "unmapped_tooshort_percent", + "unmapped_other_percent", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-geneBody_coverage": { + "mqc_module": "RSeQC", + "mqc_plot": "Gene Body Coverage", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-infer_experiment": { + "mqc_module": "RSeQC", + "mqc_plot": "Infer experiment", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-inner_distance": { + "mqc_module": "RSeQC", + "mqc_plot": "Inner Distance", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_for_outliers-read_distribution": { + "mqc_module": "RSeQC", + "mqc_plot": "Read Distribution", + "mqc_keys": ["_ALL"], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, + "RSeQC-check_strandedness_assessable_from_infer_experiment": { + "stranded_assessment_range": {"max": 100, "min": 75}, + "unstranded_assessment_range": {"min": 40, "max": 60}, + "valid_dominant_strandedness_assessments": [ + "Sense (% Tags)", + "Antisense (% Tags)", + ], + }, + "RSEM Counts-check_for_outliers": { + "mqc_module": "Rsem", + "mqc_plot": "general_stats", + "mqc_keys": [ + "Unalignable", + "Alignable", + "Filtered", + "Total", + "alignable_percent", + "Unique", + "Multi", + "Uncertain", + ], + "thresholds": [ + {"code": "YELLOW", "stdev_threshold": 2, "middle_fcn": "median"}, + {"code": "RED", "stdev_threshold": 4, "middle_fcn": "median"}, + ], + }, +} + +# Manual kept in sync for now +COMPONENTS_LIST = [ + "Metadata", # for raw reads V&V + "Raw Reads", # for raw reads V&V + "Raw Reads By Sample", # for raw reads V&V + "Trim Reads", # for trim reads V&V + "Trimmed Reads By Sample", # for trim reads V&V + "STAR Alignments", # for star alignment V&V + "STAR Alignments By Sample", # for star alignment V&V + "RSeQC By Sample", # for RSeQC V&V + "RSeQC", # for RSeQC V&V + "RSEM Counts", # for after RSEM V&V + "Unnormalized Gene Counts", # for after RSEM V&V + "DGE Metadata", # for post DGE + "DGE Metadata ERCC", # for post DGE + "DGE Output", # for post DGE + "DGE Output ERCC", # for post DGE +] + + +def validate( + dataset: Dataset, + config_path: Path = None, + run_args: dict = None, + report_args: dict = None, + protocol_args: dict = None, + defer_run: bool = False, +) -> Union[ValidationProtocol, ValidationProtocol.Report]: + + if config_path is not None: + with open(config_path, "r") as f: + config = yaml.safe_load(f) + else: + config = CONFIG + + if run_args is None: + run_args = dict() + + if report_args is None: + report_args = dict() + + if protocol_args is None: + protocol_args = dict() + + # Modify protocol_args to convert run_components to skip_components based on COMPONENTS_LIST + if ( + "run_components" in protocol_args + and protocol_args.get("run_components") is not None + ): + protocol_args["skip_components"] = [ + c for c in 
COMPONENTS_LIST if c not in protocol_args["run_components"] + ] + # Check if any run components are not in COMPONENTS_LIST + if set(protocol_args["run_components"]) - set(COMPONENTS_LIST): + raise ValueError( + f"run_components contains components not in COMPONENTS_LIST. Unique to run_components: {set(protocol_args['run_components']) - set(COMPONENTS_LIST)}. All Components: {COMPONENTS_LIST}" + ) + del protocol_args["run_components"] + + # init validation protocol + vp = ValidationProtocol(**protocol_args) + # fmt: on + with vp.component_start( + name=dataset.name, + description="Validate processing from trim reads through differential gene expression output", + ): + + with vp.component_start( + name="Metadata", description="Metadata file validation" + ): + with vp.payload(payloads=[{"dataset": dataset}]): + vp.add( + check_metadata_attributes_exist, + config=config["Metadata-check_metadata_attributes_exist"], + ) + + with vp.component_start( + name="Raw Reads", description="Raw Reads Outliers Detection" + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["raw reads fastQC ZIP"], + } + ] + if not dataset.metadata["paired_end"] + else [ + { + "dataset": dataset, + "data_asset_keys": [ + "raw forward reads fastQC ZIP", + ], + }, + { + "dataset": dataset, + "data_asset_keys": [ + "raw reverse reads fastQC ZIP", + ], + }, + ] + ): + vp.add( + check_for_outliers, config=config["Raw Reads-check_for_outliers"] + ) + + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "raw MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_raw|_R1_raw|_R2_raw$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in raw reads multiQC report", + ) + + with vp.component_start( + name="Trim Reads", description="Trimmed Reads Outliers Detection" + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["trimmed reads fastQC ZIP"], + } + ] + if not dataset.metadata["paired_end"] + else [ + { + "dataset": dataset, + "data_asset_keys": [ + "trimmed forward reads fastQC ZIP", + ], + }, + { + "dataset": dataset, + "data_asset_keys": [ + "trimmed reverse reads fastQC ZIP", + ], + }, + ] + ): + vp.add( + check_for_outliers, config=config["Trim Reads-check_for_outliers"] + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "trimmed fastQC MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_R1|_R2$", "", s + ), + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "trimming MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_raw|_R1_raw|_R2_raw$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check that all samples are present in the trimmed FastQC and trimming report multiQC reports", + ) + with vp.component_start( + name="STAR Alignments", + description="Dataset wide checks including outliers detection", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["aligned log Final"], + } + ] + ): + vp.add( + check_for_outliers, + config=config["STAR Alignments-check_for_outliers"], + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "aligned MultiQC directory" + ].path, + 
}, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in STAR multiQC report", + ) + + with vp.component_start( + name="RSeQC", + description="RSeQC submodule outliers checking and other submodule specific dataset wide checks", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["genebody coverage out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in geneBody Coverage", + config=config["RSeQC-check_for_outliers-geneBody_coverage"], + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["infer experiment out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in infer experiment", + config=config["RSeQC-check_for_outliers-infer_experiment"], + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["inner distance out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in inner distance", + config=config["RSeQC-check_for_outliers-inner_distance"], + skip=(not dataset.metadata["paired_end"]), + ) + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["read distribution out"], + } + ] + ): + vp.add( + check_for_outliers, + description="Check for outliers in read distribution", + config=config["RSeQC-check_for_outliers-read_distribution"], + ) + + with vp.payload(payloads=[{"dataset": dataset}]): + vp.add( + check_strandedness_assessable_from_infer_experiment, + config=config[ + "RSeQC-check_strandedness_assessable_from_infer_experiment" + ], + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "genebody coverage MultiQC directory" + ].path, + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "infer experiment MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_infer_expt$", "", s + ), + }, + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "read distribution MultiQC directory" + ].path, + "name_reformat_func": lambda: lambda s: re.sub( + "_read_dist$", "", s + ), + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSeQC multiQC reports", + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "inner distance MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSeQC inner distance multiQC report (paired end only)", + skip=(not dataset.metadata["paired_end"]), + ) + with vp.component_start( + name="RSEM Counts", + description="Dataset wide checks including outliers detection", + ): + with vp.payload( + payloads=[ + { + "dataset": dataset, + "data_asset_keys": ["sample counts stats directory"], + } + ] + ): + vp.add( + check_for_outliers, config=config["RSEM Counts-check_for_outliers"] + ) + with vp.payload( + payloads=[ + { + "samples": list(dataset.samples), + "multiqc_report_path": lambda: dataset.data_assets[ + "RSEM counts MultiQC directory" + ].path, + }, + ] + ): + vp.add( + check_sample_in_multiqc_report, + description="Check all samples are present in RSEM multiQC report", + ) + with vp.component_start( + name="Unnormalized Gene Counts", + description="Validate normalization related output", + ): + + with vp.payload( + 
payloads=[ + { + "unnormalizedCountTable": lambda: dataset.data_assets[ + "star unnormalized counts table" + ].path, + "samplewise_tables": lambda: { + s.name: s.data_assets["sample reads per gene table"].path + for s in dataset.samples.values() + }, + }, + ] + ): + vp.add( + check_aggregate_star_unnormalized_counts_table_values_against_samplewise_tables + ) + with vp.payload( + payloads=[ + { + "unnormalizedCountTable": lambda: dataset.data_assets[ + "rsem unnormalized counts table" + ].path, + "samplewise_tables": lambda: { + s.name: s.data_assets["sample gene counts table"].path + for s in dataset.samples.values() + }, + }, + ] + ): + vp.add( + check_aggregate_rsem_unnormalized_counts_table_values_against_samplewise_tables + ) + vp.add( + check_ERCC_subgroup_representation, + skip=(not dataset.metadata["has_ERCC"]), + ) + + with vp.component_start( + name="DGE Metadata", + description="", + ): + + with vp.component_start( + name="Sample Table", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "sampleTable": lambda: dataset.data_assets[ + "sample table" + ].path, + } + ] + ): + vp.add( + check_sample_table_against_runsheet, + config={"all_samples_required": True}, + ) + vp.add(check_sample_table_for_correct_group_assignments) + + with vp.component_start( + name="Contrasts Tables", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "contrasts_table": lambda: dataset.data_assets[ + "DESeq2 contrasts table" + ].path, + } + ] + ): + vp.add(check_contrasts_table_headers) + vp.add(check_contrasts_table_rows) + + with vp.component_start( + name="DGE Metadata ERCC", + description="", + skip=(not dataset.metadata["has_ERCC"]), + ): + + with vp.component_start( + name="Sample Table", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "sampleTable": lambda: dataset.data_assets[ + "ERCC sample table" + ].path, + } + ] + ): + vp.add( + check_sample_table_against_runsheet, + config={"all_samples_required": False}, + ) + vp.add(check_sample_table_for_correct_group_assignments) + + with vp.component_start( + name="Contrasts Tables", + description="", + ): + with vp.payload( + payloads=[ + { + "runsheet": lambda: dataset.data_assets["runsheet"].path, + "contrasts_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 contrasts table" + ].path, + } + ] + ): + vp.add(check_contrasts_table_headers) + vp.add(check_contrasts_table_rows) + + with vp.component_start( + name="DGE Output", + description="", + ): + with vp.payload( + payloads=[ + { + "rsem_table_path": lambda: dataset.data_assets[ + "rsem unnormalized counts table" + ].path, + "deseq2_table_path": lambda: dataset.data_assets[ + "DESeq2 unnormalized counts table" + ].path, + } + ] + ): + vp.add( + check_rsem_counts_and_unnormalized_tables_parity, + skip=( + "rsem unnormalized counts table" not in dataset.data_assets + or "DESeq2 unnormalized counts table" not in dataset.data_assets + ), + ) + + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set(dataset.samples), + "dge_table": lambda: dataset.data_assets[ + "DESeq2 annotated DGE table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + 
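+ # Remaining checks cover group mean/stdev columns, per-comparison statistics,
+ # and a final log2fc sanity check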
vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + + with vp.component_start( + name="Viz Tables", + description="Extended from the dge tables", + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set(dataset.samples), + "dge_table": lambda: dataset.data_assets[ + "DESeq2 annotated DGE extended for viz table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + vp.add(check_viz_table_columns_exist) + vp.add(check_viz_table_columns_constraints) + + with vp.payload( + payloads=[ + { + "samples": lambda: set(dataset.samples), + "pca_table": lambda: dataset.data_assets[ + "DESeq2 viz PCA table" + ].path, + } + ] + ): + vp.add(check_viz_pca_table_index_and_columns_exist) + + with vp.component_start( + name="DGE Output ERCC", + description="", + skip=(not dataset.metadata["has_ERCC"]), + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "dge_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 annotated DGE table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + + with vp.component_start( + name="Viz Tables", + description="Extended from the dge tables", + ): + with vp.payload( + payloads=[ + { + "organism": lambda: dataset.metadata["organism"], + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "dge_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 annotated DGE extended for viz table" + ].path, + "runsheet": lambda: dataset.data_assets["runsheet"].path, + } + ] + ): + vp.add(check_dge_table_annotation_columns_exist) + vp.add(check_dge_table_sample_columns_exist) + vp.add(check_dge_table_sample_columns_constraints) + vp.add(check_dge_table_group_columns_exist) + vp.add(check_dge_table_group_columns_constraints) + vp.add(check_dge_table_comparison_statistical_columns_exist) + 
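+ # Same column checks as the non-ERCC DGE table, rerun against the ERCC-normalized output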
vp.add(check_dge_table_group_statistical_columns_constraints) + vp.add(check_dge_table_fixed_statistical_columns_exist) + vp.add(check_dge_table_fixed_statistical_columns_constraints) + vp.add(check_dge_table_log2fc_within_reason) + vp.add(check_viz_table_columns_exist) + vp.add(check_viz_table_columns_constraints) + + with vp.payload( + payloads=[ + { + "samples": lambda: set( + pd.read_csv( + dataset.data_assets["ERCC sample table"].path, + index_col=0, + ).index + ), + "pca_table": lambda: dataset.data_assets[ + "ERCC normalized DESeq2 viz PCA table" + ].path, + } + ] + ): + vp.add(check_viz_pca_table_index_and_columns_exist) + + for sample in dataset.samples.values(): + with vp.component_start( + name=sample.name, description="Samples level checks" + ): + with vp.component_start( + name="Raw Reads By Sample", description="Raw reads" + ): + with vp.payload( + payloads=( + [ + { + "file": lambda sample=sample: sample.data_assets[ + "raw forward reads fastq GZ" + ].path + }, + { + "file": lambda sample=sample: sample.data_assets[ + "raw reverse reads fastq GZ" + ].path + }, + ] + if dataset.metadata["paired_end"] + else [ + { + "file": lambda sample=sample: sample.data_assets[ + "raw reads fastq GZ" + ].path + }, + ] + ) + ): + vp.add( + check_fastqgz_file_contents, + config=config[ + "Raw Reads By Sample-check_fastqgz_file_contents" + ], + ) + vp.add( + check_gzip_file_integrity, + ) + with vp.payload( + payloads=[ + { + "sample": sample, + "reads_key_1": "raw forward reads fastQC ZIP", + "reads_key_2": "raw reverse reads fastQC ZIP", + }, + ], + ): + vp.add( + check_forward_and_reverse_reads_counts_match, + skip=(not dataset.metadata["paired_end"]), + ) + with vp.component_start( + name="Trimmed Reads By Sample", description="Trimmed reads" + ): + with vp.payload( + payloads=( + [ + { + "file": lambda sample=sample: sample.data_assets[ + "trimmed forward reads fastq GZ" + ].path + }, + { + "file": lambda sample=sample: sample.data_assets[ + "trimmed reverse reads fastq GZ" + ].path + }, + ] + if dataset.metadata["paired_end"] + else [ + { + "file": lambda sample=sample: sample.data_assets[ + "trimmed reads fastq GZ" + ].path + } + ] + ) + ): + vp.add(check_file_exists, description="Check reads files exist") + vp.add( + check_fastqgz_file_contents, + config=config[ + "Trim Reads By Sample-check_fastqgz_file_contents" + ], + ) + + with vp.payload( + payloads=[ + { + "sample": sample, + "reads_key_1": "trimmed forward reads fastQC ZIP", + "reads_key_2": "trimmed reverse reads fastQC ZIP", + }, + ], + ): + vp.add( + check_forward_and_reverse_reads_counts_match, + skip=(not dataset.metadata["paired_end"]), + ) + + with vp.component_start( + name="STAR Alignments By Sample", + description="STAR Alignment outputs", + ): + + with vp.payload( + payloads=[ + { + "file": lambda sample=sample: sample.data_assets[ + "aligned ToTranscriptome Bam" + ].path, + }, + { + "file": lambda sample=sample: sample.data_assets[ + "aligned SortedByCoord Bam" + ].path, + }, + ] + ): + vp.add( + check_bam_file_integrity, + config={ + "samtools_bin": "samtools" + }, # assumes accessible on path already + ) + + with vp.payload( + payloads=[ + { + "multiqc_inputs": lambda sample=sample: [ + sample.data_assets["aligned log Final"].path + ], + }, + ] + ): + vp.add( + check_thresholds, + config=config[ + "STAR Alignments By Sample-check_thresholds-Mapped" + ], + description="Check that mapping rates are reasonable, specifically most reads map to the target genome", + ) + vp.add( + check_thresholds, + config=config[ + 
"STAR Alignments By Sample-check_thresholds-MultiMapped" + ], + description="Check that mapping rates are reasonable, specifically that a considerable amount of reads multimap to the target genome", + ) + + with vp.component_start( + name="RSeQC By Sample", + description="RNASeq QA outputs", + ): + with vp.component_start( + name="geneBody_coverage", + description="Assess integrity of transcripts and library prep signatures", + ): + with vp.payload( + payloads=[ + { + "input_dir": lambda sample=sample: sample.data_assets[ + "genebody coverage out" + ].path + }, + ] + ): + vp.add(check_genebody_coverage_output) + with vp.component_start( + name="inner_distance", + description="Reports on distance between mate reads based on gene annotations", + skip=(not dataset.metadata["paired_end"]), + ): + with vp.payload( + payloads=[ + { + "input_dir": lambda sample=sample: sample.data_assets[ + "inner distance out" + ].path + }, + ] + ): + vp.add(check_inner_distance_output) + # return protocol object without running or generating a report + if defer_run: + return vp + + vp.run(**run_args) + + # return report + return vp.report(**report_args, combine_with_flags=dataset.loaded_assets_dicts) \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/schemas.py b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/schemas.py new file mode 100644 index 00000000..d3db1810 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/dp_tools__metagenomics_estHost/schemas.py @@ -0,0 +1,62 @@ +""" Schemas for validation +Uses Schema to allow usage of validation functions +""" +from schema import Schema +from schema import Optional as schema_Optional +from typing import Optional +import pandera as pa + + +check_read2_path_populated_if_paired_end = pa.Check( + lambda df: ("read2_path" in df.columns and df['paired_end'].iloc[0] == True) or + ("read2_path" not in df.columns and df['paired_end'].iloc[0] == False), + title="Check 'read2_path' is only populated if paired_end is True", + description="Failures here are likely either due to manual user error or inappropriate source file (e.g. 
ISA archive)", + error="Expected 'read2_path' to be populated only if paired_end is True" + ) + +runsheet = { + "metagenomics": pa.DataFrameSchema( + columns={ + "Original Sample Name": pa.Column(str), + "read1_path": pa.Column(str), + "read2_path": pa.Column(str, required=False), # Expect if paired_end is True + }#, + # define checks at the DataFrameSchema-level + #checks=check_read2_path_populated_if_paired_end + ) +} + +import pandas as pd + +class runsheet: # Bad casing since we will use the class definition itself for all static methods + + @staticmethod + def check_single_value(column: pd.Series, error_msg: str, errors: list[str]) -> None: + if len(column.unique()) != 1: + errors.append(error_msg) + + @staticmethod + def check_read2_path_populated_if_paired_end(df: pd.DataFrame, errors: list[str]) -> None: + if (("read2_path" in df.columns and df['paired_end'][0] == True) or + ("read2_path" not in df.columns and df['paired_end'][0] == False)): + return + else: + errors.append("Expected 'read2_path' to be populated only if paired_end is True") + + @staticmethod + def validate(df_runsheet: pd.DataFrame) -> bool: + errors = [] + + # Check for single value in specified columns + + runsheet.check_single_value(df_runsheet['organism'], "Dataset level columns do NOT contain one unique value for 'organism'", errors) + runsheet.check_single_value(df_runsheet['paired_end'], "Dataset level columns do NOT contain one unique value for 'paired_end'", errors) + + # Check for 'read2_path' population if paired_end is True + #runsheet.check_read2_path_populated_if_paired_end(df_runsheet, errors) + + if errors: + raise ValueError("\n".join(errors)) + else: + return True \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/generate_protocol.sh b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/generate_protocol.sh new file mode 100644 index 00000000..560b918d --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/bin/generate_protocol.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +KRAKEN2=$(grep -i 'kraken2' $1 | sort -u |awk '{print $2}' |sed -E 's/v//') + + +HOST=$2 # ex: mouse +REFSEQ_ID=$3 +GENOME=$4 + +PROTOCOL="FASTQ files were analyzed using Kraken2 v"$KRAKEN2" against a "$HOST" genome reference database that was constructed from NCBI's RefSeq ("$REFSEQ_ID") "$GENOME", with --no-masking, and defaults of kmer-length 35 and minimizer-length of 31, and is fully compatible with Kraken2 v"$KRAKEN2". Reads classified as host were quantified and expressed as a percentage of total reads." 
+
+echo "${PROTOCOL}"
\ No newline at end of file
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/main.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/main.nf
new file mode 100644
index 00000000..7ab49513
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/main.nf
@@ -0,0 +1,155 @@
+// main.nf
+nextflow.enable.dsl=2
+
+// Terminal text color definitions
+c_back_bright_red = "\u001b[41;1m";
+c_reset = "\033[0m";
+
+include { FETCH_ISA } from './modules/fetch_isa.nf'
+include { ISA_TO_RUNSHEET } from './modules/isa_to_runsheet.nf'
+
+include { PARSE_RUNSHEET } from './parse_runsheet.nf'
+include { COPY_READS } from './modules/copy_reads.nf'
+
+include { KRAKEN2_DB } from './modules/kraken2_db.nf'
+include { KRAKEN_2 } from './modules/kraken2.nf'
+include { SUMMARY } from './modules/summary.nf'
+
+include { SOFTWARE_VERSIONS } from './modules/utils.nf'
+include { GENERATE_PROTOCOL } from './modules/generate_protocol.nf'
+
+
+dp_tools_plugin_ch = params.dp_tools_plugin ?
+ Channel.value(file(params.dp_tools_plugin)) :
+ Channel.value(file("$projectDir/bin/dp_tools__metagenomics_estHost"))
+
+runsheet_ch = params.runsheet_path ? Channel.fromPath(params.runsheet_path) : null
+isa_archive_ch = params.isa_archive_path ? Channel.fromPath(params.isa_archive_path) : null
+
+
+workflow {
+
+ if (!params.osd && !params.glds && !params.sample_id_list){
+ error("""${c_back_bright_red}INPUT ERROR!
+ Please supply either an accession (OSD and GLDS number) or a sample ID list file
+ by passing either to the --osd and --glds parameters or to the --sample_id_list parameter, respectively.
+ ${c_reset}""")
+ }
+
+ if ((params.osd && !params.glds) || (!params.osd && params.glds)){
+ error("""${c_back_bright_red}INPUT ERROR!
+ Please supply both accession numbers (OSD and GLDS)
+ by passing them to both the --osd and --glds parameters.
+ ${c_reset}""")
+ }
+
+ if (!params.ref_dbs_Dir){
+ error("""${c_back_bright_red}INPUT ERROR!
+ Please supply the path to the directory storing Kraken2 reference databases
+ by passing it to the --ref_dbs_Dir parameter.
+ ${c_reset}""")
+ }
+
+ // Capture software versions
+ software_versions_ch = Channel.empty()
+
+ if (params.osd) {
+ // If no runsheet path, fetch ISA archive from OSDR (if needed) and convert to runsheet
+ if (runsheet_ch == null) {
+ if (isa_archive_ch == null) {
+ FETCH_ISA()
+ isa_archive_ch = FETCH_ISA.out.isa_archive
+ }
+ ISA_TO_RUNSHEET( isa_archive_ch, dp_tools_plugin_ch )
+ runsheet_ch = ISA_TO_RUNSHEET.out.runsheet
+ ISA_TO_RUNSHEET.out.version | mix(software_versions_ch) | set{software_versions_ch}
+ }
+
+ // Get sample metadata from runsheet
+ PARSE_RUNSHEET( runsheet_ch )
+
+ samples = PARSE_RUNSHEET.out.samples
+
+ COPY_READS( samples )
+ generated_reads_ch = COPY_READS.out.raw_reads
+ }
+ else {
+ Channel
+ .fromPath(params.sample_id_list, checkIfExists: true)
+ .splitText()
+ .map { it.trim() }
+ .filter { it } // ignore blank lines
+ .map { sample_id ->
+ def meta = [id: sample_id, paired_end: !params.is_single]
+
+ def reads = params.is_single ?
+ file("$params.reads_dir/${sample_id}$params.single_suffix") :
+ [file("$params.reads_dir/${sample_id}$params.R1_suffix"), file("$params.reads_dir/${sample_id}$params.R2_suffix")]
+
+ return tuple(meta, reads)
+ }
+ .set {generated_reads_ch}
+ }
+
+ // Get host info
+ host_info = Channel
+ .fromPath(params.hosts_table)
+ .splitCsv(header:true)
+ .filter { row -> row.name.toLowerCase() == params.host.toLowerCase() } // match host
+ .map { row ->
+ def host_id = row.name.replaceAll(' ', '_').toLowerCase()
+ tuple(row.name, host_id, row.species, row.refseq, row.genome, row.fasta) }
+
+ host_info
+ .ifEmpty { error("INPUT ERROR: Host '${params.host}' not found in hosts table '${params.hosts_table}'") }
+
+ // Check if Kraken2 database already exists or needs to be built
+ def host_id = params.host.replaceAll(' ', '_').toLowerCase()
+ def host_db = file("${params.ref_dbs_Dir}/kraken2-${host_id}-db")
+ def db_exists = host_db.exists()
+
+ if (db_exists) {
+ database_ch = Channel.value(host_db)
+ }
+ else {
+ build_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, hostID, fasta) }
+ KRAKEN2_DB(build_ch)
+ database_ch = KRAKEN2_DB.out.first()
+ }
+
+ KRAKEN_2(database_ch, generated_reads_ch)
+ KRAKEN_2.out.version | mix(software_versions_ch) | set{software_versions_ch}
+
+ // Generate summary and compile one file
+ SUMMARY(KRAKEN_2.out.output)
+ SUMMARY.out
+ .collect()
+ .subscribe { summary_files ->
+ def outfile = file("${params.outdir}/results/Host-read-count-summary.tsv")
+ def header = "Sample_ID\tTotal_fragments\tTotal_host_fragments\tPercent_host\n"
+ outfile.text = header + summary_files.collect { it.text }.join()
+
+ // clean up the temporary per-sample summary files
+ summary_files.each { f ->
+ def tmpFile = f.toFile()
+ tmpFile.delete()
+ }
+ }
+
+ // Software Version Capturing - combining all captured software versions
+ nf_version = "Nextflow Version ${nextflow.version}"
+ nextflow_version_ch = Channel.value(nf_version)
+
+ // Write software versions to file
+ software_versions_ch | map { it.text.strip() }
+ | unique
+ | mix(nextflow_version_ch)
+ | collectFile({it -> it}, newLine: true, cache: false)
+ | SOFTWARE_VERSIONS
+
+ // Protocol always needs name, refseq ID, and genome build
+ protocol_ch = host_info.map { name, hostID, species, refseq, genome, fasta -> tuple(name, refseq, genome) }
+
+ GENERATE_PROTOCOL(protocol_ch, SOFTWARE_VERSIONS.out)
+
+}
\ No newline at end of file
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/copy_reads.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/copy_reads.nf
new file mode 100644
index 00000000..ca8f95cd
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/copy_reads.nf
@@ -0,0 +1,23 @@
+process COPY_READS {
+ publishDir "${params.outdir}/RawData/Fastq", mode: params.publishDir_mode
+ tag "${ meta.id }"
+
+ input:
+ // inputs are staged as 1.gz (forward/single) and 2.gz (reverse) regardless of original file names
+ tuple val(meta), path("?.gz")
+
+ output:
+ tuple val(meta), path("${meta.id}*.gz"), emit: raw_reads
+
+ script:
+ if ( meta.paired_end ) {
+ """
+ cp -P 1.gz ${meta.id}${params.R1_suffix}
+ cp -P 2.gz ${meta.id}${params.R2_suffix}
+ """
+ } else {
+ """
+ cp -P 1.gz ${meta.id}${params.single_suffix}
+ """
+ }
+}
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/fetch_isa.nf 
b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/fetch_isa.nf new file mode 100644 index 00000000..15eab23f --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/fetch_isa.nf @@ -0,0 +1,14 @@ +process FETCH_ISA { + + tag "${params.osd}_${params.glds}" + + publishDir "${params.outdir}/Metadata", mode: params.publishDir_mode + + output: + path "*.zip", emit: isa_archive + + script: + """ + dpt-get-isa-archive --accession ${ params.osd } + """ +} diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/generate_protocol.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/generate_protocol.nf new file mode 100644 index 00000000..49b4c2a9 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/generate_protocol.nf @@ -0,0 +1,18 @@ +process GENERATE_PROTOCOL { + + beforeScript "chmod +x ${projectDir}/bin/*" + tag "Generating your analysis protocol..." + publishDir "${params.outdir}/processing_info" + + input: + tuple val(host), val(refSeq_ID), val(genome) + path(software_versions) + + output: + path("protocol.txt") + + script: + """ + generate_protocol.sh ${software_versions} ${host} "${refSeq_ID}" ${genome} > protocol.txt + """ +} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/isa_to_runsheet.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/isa_to_runsheet.nf new file mode 100644 index 00000000..f1b45e1b --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/isa_to_runsheet.nf @@ -0,0 +1,19 @@ +process ISA_TO_RUNSHEET { + tag "${params.osd}_${params.glds}" + + publishDir "${params.outdir}/Metadata", mode: params.publishDir_mode + + input: + path isa_archive + path dp_tools_plugin + + output: + path "*.csv", emit: runsheet + path("versions.txt"), emit: version + + script: + """ + dpt-isa-to-runsheet --accession ${params.osd} --isa-archive ${isa_archive} --plugin-dir ${dp_tools_plugin} + echo "dp_tools \$(pip show dp_tools | grep Version | sed 's/Version: //')" >> versions.txt + """ +} diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2.nf new file mode 100644 index 00000000..5fadb95c --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2.nf @@ -0,0 +1,26 @@ +process KRAKEN_2 { + + tag "${meta.id}" + publishDir "${params.outdir}/results/kraken2-output" + + input: + path database + tuple val(meta), path(reads) + + output: + path "${meta.id}-kraken2-output.txt", emit: output + path "${meta.id}-kraken2-report.tsv", emit: report + path("versions.txt"), emit: version + + script: + def input = meta.paired_end ? 
"--paired ${reads[0]} ${reads[1]}" : "${reads}" + """ + kraken2 --db $database --gzip-compressed \ + --threads ${task.cpus} --use-names \ + --output ${meta.id}-kraken2-output.txt \ + --report ${meta.id}-kraken2-report.tsv \ + ${input} + + echo "Kraken2 \$(kraken2 -version | head -n 1 | awk '{print \$3}')" >> versions.txt + """ +} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2_db.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2_db.nf new file mode 100644 index 00000000..e9371343 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/kraken2_db.nf @@ -0,0 +1,26 @@ +process KRAKEN2_DB { + tag "Downloading ${host} reads database to ${params.ref_dbs_Dir}" + publishDir "${params.ref_dbs_Dir}", mode: 'copy' + + input: + tuple val(host), val(host_id), val(fasta_url) + + output: + path "kraken2-${host_id}-db/" + + script: + """ + k2 download-taxonomy --db kraken2-${host_id}-db/ + + # Download FASTA file and uncompress it + wget -q ${fasta_url} -O host_assembly.fasta.gz + gunzip -c host_assembly.fasta.gz > host_assembly.fasta + + kraken2-build --add-to-library host_assembly.fasta --db kraken2-${host_id}-db/ --threads ${task.cpus} --no-masking + + kraken2-build --build --db kraken2-${host_id}-db/ --threads ${task.cpus} + + kraken2-build --clean --db kraken2-${host_id}-db/ + + """ +} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/summary.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/summary.nf new file mode 100644 index 00000000..cc129d16 --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/summary.nf @@ -0,0 +1,22 @@ +process SUMMARY { + + tag "${kraken_output.simpleName.replaceFirst(/-kraken2-output$/, '')}" + publishDir "${params.outdir}/results/kraken2-output" + + input: + path kraken_output + + output: + path "*-removal-info.tmp" + + script: + """ + meta_id=\$(basename $kraken_output | sed 's/-kraken2-output.txt//') + + total_fragments=\$(wc -l $kraken_output | sed 's/^ *//' | cut -f 1 -d " ") + fragments_classified=\$(grep -w -c "^C" $kraken_output) + perc_host=\$(printf "%.2f\\n" \$(echo "scale=4; \$fragments_classified / \$total_fragments * 100" | bc -l)) + + echo -e "\$meta_id\\t\$total_fragments\\t\$fragments_classified\\t\$perc_host\\n" > \$meta_id-removal-info.tmp + """ +} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/utils.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/utils.nf new file mode 100644 index 00000000..1265c05e --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/modules/utils.nf @@ -0,0 +1,17 @@ +process SOFTWARE_VERSIONS { + + tag "Writing out software versions..." 
+ publishDir "${params.outdir}/processing_info" + + input: + path(software_versions) + + output: + path("software_versions.txt") + + script: + """ + # Delete white spaces and write out unique software versions + grep -v "^\$" ${software_versions} | sort -u > software_versions.txt + """ +} \ No newline at end of file diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/nextflow.config b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/nextflow.config new file mode 100644 index 00000000..da2ba2aa --- /dev/null +++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/nextflow.config @@ -0,0 +1,110 @@ +params { + dp_tools_plugin = null // Path to dp_tools plugin + osd = null // OSD accession number if running workflow in OSDR mode, null if running in list mode + glds = null // GeneLab accession number if running workflow in OSDR mode, null if running in list mode + runsheet_path = null // For OSDR datasets, runsheet will be generated based on the ISA.zip associated with the dataset + isa_archive_path = null // If a runsheet is not supplied, ISA.zip will be pulled from OSDR unless this is supplied + + sample_id_list = null // Path to Sample_ID list if running workflow in list mode, null if running in OSDR mode + reads_dir = null // Path to raw reads if running workflow in list mode, null if running in OSDR mode + is_single = false // Boolean to use only when running workflow in list mode, has no effect when running in OSDR mode + + R1_suffix = "_R1_raw.fastq.gz" + R2_suffix = "_R2_raw.fastq.gz" + single_suffix = "_raw.fastq.gz" + + host = "mouse" + hosts_table = "$projectDir/assets/hosts.csv" + ref_dbs_Dir = null // Path to directory where kraken2 databases are stored, required + + outdir = "${launchDir}" + publishDir_mode = "link" // "link", "copy" +} + +profiles { + singularity { + singularity.enabled = true + singularity.autoMounts = true + conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + } + + docker { + docker.enabled = true + docker.runOptions = '-u $(id -u):$(id -g)' + params.containerEngine = "docker" + } + + conda { + conda.enabled = true + params.use_conda = true + conda.channels = 'conda-forge,bioconda' + conda.cacheDir = 'conda/' // location of conda environments + conda.createTimeout = '2h' + } + + mamba { + conda.enabled = true + conda.useMamba = true + conda.channels = 'conda-forge,bioconda' + params.use_conda = true + conda.cacheDir = 'conda/' // location of conda environments + conda.createTimeout = '2h' + } + + slurm { + process.executor = 'slurm' + } + + local { + process.executor = 'local' + } +} + +process { + + cpus = { 1 * task.attempt } // Default + memory = { 2.GB * task.attempt } // Default + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
+    maxRetries = 1
+    maxErrors = '-1'
+
+    withName: 'FETCH_ISA|ISA_TO_RUNSHEET' {
+        container = "quay.io/nasa_genelab/dp_tools:1.3.8"
+        conda = "${projectDir}/envs/dp_tools.yaml"
+    }
+
+    withName: 'KRAKEN2_DB' {
+        cpus = 12
+        memory = '64GB'
+        time = '8h'
+        container = "quay.io/biocontainers/kraken2:2.1.6--pl5321h077b44d_0"
+        conda = "${projectDir}/envs/kraken2.yaml"
+    }
+
+    withName: 'KRAKEN_2' {
+        cpus = 8
+        memory = '8GB'
+        container = "quay.io/biocontainers/kraken2:2.1.6--pl5321h077b44d_0"
+        conda = "${projectDir}/envs/kraken2.yaml"
+    }
+}
+
+def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
+timeline {
+    enabled = true
+    file = "${params.outdir}/processing_info/execution_timeline_${trace_timestamp}.html"
+}
+report {
+    enabled = true
+    file = "${params.outdir}/processing_info/execution_report_${trace_timestamp}.html"
+}
+trace {
+    enabled = true
+    file = "${params.outdir}/processing_info/execution_trace_${trace_timestamp}.txt"
+}
\ No newline at end of file
diff --git a/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/parse_runsheet.nf b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/parse_runsheet.nf
new file mode 100644
index 00000000..53578d7b
--- /dev/null
+++ b/Metagenomics/Estimate_host_reads_in_raw_data/Workflow_Documentation/NF_MGEstHostReads/workflow_code/parse_runsheet.nf
@@ -0,0 +1,80 @@
+def colorCodes = [
+    c_line: "┅" * 70,
+    c_back_bright_red: "\u001b[41;1m",
+    c_bright_green: "\u001b[32;1m",
+    c_blue: "\033[0;34m",
+    c_yellow: "\u001b[33;1m",
+    c_reset: "\033[0m"
+]
+// Adapted from function: https://github.com/nf-core/rnaseq/blob/master/modules/local/process/samplesheet_check.nf
+// Function to get a list of [ meta, [ fastq_1_path, fastq_2_path ] ]
+def get_runsheet_paths(LinkedHashMap row) {
+    def meta = [:]
+    meta.id = row["Sample Name"]
+    meta.organism_sci = row.organism.replaceAll(" ", "_").toLowerCase()
+    meta.paired_end = row.paired_end.toBoolean()
+
+    // Extract factors
+    meta.factors = row.findAll { key, value ->
+        key.startsWith("Factor Value[") && key.endsWith("]")
+    }.collectEntries { key, value ->
+        [(key[13..-2]): value] // Remove "Factor Value[" and "]"
+    }
+
+    def array = []
+    def raw_reads = []
+    raw_reads.add(file(row.read1_path))
+    if (meta.paired_end) {
+        raw_reads.add(file(row.read2_path))
+    }
+    array = [meta, raw_reads]
+    return array
+}
+
+workflow PARSE_RUNSHEET {
+    take:
+        ch_runsheet
+
+    main:
+        ch_samples = ch_runsheet
+            | splitCsv(header: true)
+            | map { row -> get_runsheet_paths(row) }
+
+        // Validate consistency across samples
+        ch_samples
+            .map { meta, reads -> [meta.paired_end, meta.organism_sci] }
+            .unique()
+            .count()
+            .subscribe { count ->
+                if (count > 1) {
+                    log.error "${colorCodes.c_back_bright_red}ERROR: Inconsistent metadata across samples. Please check the runsheet.${colorCodes.c_reset}"
+                    exit 1
+                } else {
+                    println "${colorCodes.c_bright_green}Metadata consistency check passed.${colorCodes.c_reset}"
+                }
+            }
+
+        // Print autodetected processing metadata for the first sample
+        ch_samples.take(1) | view { meta, reads ->
+            """${colorCodes.c_bright_green}Autodetected Processing Metadata:
+            Paired End: ${meta.paired_end}
+            Organism: ${meta.organism_sci}${colorCodes.c_reset}"""
+        }
+        // Check that all read files are unique
+        ch_samples
+            .flatMap { meta, reads -> reads }
+            .collect()
+            .map { all_reads ->
+                def total_count = all_reads.size()
+                def unique_count = all_reads.toSet().size()
+
+                if (unique_count != total_count) {
+                    throw new RuntimeException("${colorCodes.c_back_bright_red}ERROR: Duplicate read files detected. Please check the runsheet.${colorCodes.c_reset}")
+                } else {
+                    println "${colorCodes.c_bright_green}All ${unique_count} read files are unique.${colorCodes.c_reset}"
+                }
+            }
+
+    emit:
+        samples = ch_samples
+}
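
Usage sketch (not part of the patch): KRAKEN2_DB consumes each host as tuple val(host), val(host_id), val(fasta_url), so a minimal assets/hosts.csv (the file referenced by params.hosts_table) might look like the lines below, assuming it is read with splitCsv(header: true) as the runsheet is; the column names and the Ensembl FASTA URL are illustrative assumptions, not contents of this patch:

    host,host_id,fasta_url
    mouse,mouse,https://ftp.ensembl.org/pub/release-110/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.toplevel.fa.gz

Likewise, given the parameters declared in nextflow.config, a hypothetical OSDR-mode launch (assuming main.nf is the entry script, which this patch does not show) might be:

    nextflow run main.nf -profile singularity --osd OSD-### --glds GLDS-### --ref_dbs_Dir /path/to/ref-dbs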