diff --git a/.gitignore b/.gitignore index a52ae4872..540d22e67 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,13 @@ docs/generated/ .pytest_cache/ .testing_data_cache/ +# For decision tree .tex flow charts do not archive intermediate files +decision_tree*.aux +decision_tree*.fdb_latexmk +decision_tree*.fls +decision_tree*.log +decision_tree*.synctex.gz + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/_static/decision_tree_demo_external_regressors_motion_task_models.pdf b/docs/_static/decision_tree_demo_external_regressors_motion_task_models.pdf new file mode 100644 index 000000000..35227d87f Binary files /dev/null and b/docs/_static/decision_tree_demo_external_regressors_motion_task_models.pdf differ diff --git a/docs/_static/decision_tree_demo_external_regressors_motion_task_models.png b/docs/_static/decision_tree_demo_external_regressors_motion_task_models.png new file mode 100644 index 000000000..f89fe2262 Binary files /dev/null and b/docs/_static/decision_tree_demo_external_regressors_motion_task_models.png differ diff --git a/docs/_static/decision_tree_demo_external_regressors_motion_task_models.tex b/docs/_static/decision_tree_demo_external_regressors_motion_task_models.tex new file mode 100644 index 000000000..2bb2afaf4 --- /dev/null +++ b/docs/_static/decision_tree_demo_external_regressors_motion_task_models.tex @@ -0,0 +1,122 @@ +\documentclass[border=2pt]{standalone} +\usepackage[utf8]{inputenc} % Required for inserting images +\usepackage{tikz} +\usepackage{helvet} +\usetikzlibrary{shapes.geometric, arrows} +\pagecolor{white} + +%-------------------------defining colorblind friendly colors +% Using pale color scheme in Figure 6 +% by Paul Tol https://personal.sron.nl/~pault/ +\definecolor{cbblue}{HTML}{BBCCEE} +\definecolor{cbcyan}{HTML}{CCEEFF} +\definecolor{cbgreen}{HTML}{CCDDAA} +\definecolor{cbyellow}{HTML}{EEEEBB} +\definecolor{cbred}{HTML}{FFCCCC} +\definecolor{cbgrey}{HTML}{DDDDDD} + +% 
-------------------------defining nodes +\tikzstyle{input} = [trapezium, trapezium left angle =80, trapezium right angle = 100, +minimum width= 3cm, minimum height=0.5cm, text centered, draw=black, fill=cbblue] +\tikzstyle{process} = [rectangle, minimum width = 3cm, minimum height = 1cm, +text centered, , text width=4cm,draw=black, fill=cbgrey] +\tikzstyle{decision} = [diamond, minimum width = 3cm, minimum height = 1cm, +text centered, , text width=3cm, draw=black, fill=cbcyan] +\tikzstyle{changeclass} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm, +text centered, draw = black, fill=cbyellow] +\tikzstyle{reject} = [trapezium, trapezium left angle =80, trapezium right angle = 100, +minimum width= 1cm, minimum height=0.5cm, text centered, draw=black, fill=cbred] +\tikzstyle{accept} = [trapezium, trapezium left angle =80, trapezium right angle = 100, +minimum width= 1cm, minimum height=0.5cm, text centered, draw=black, fill=cbgreen] + +% -------------------------defining connectors +\tikzstyle{arrow} = [thick,->, >=stealth] +\tikzstyle{line} = [thick,-,>=stealth] +\begin{document} + +% ------------------------- tikz image (flow chart) +\begin{tikzpicture}[node distance = 2cm] + +% ------------------------- nodes ------------------------- +% ----- node: 0 +\node(0)[input,label={90:\textbf{Demo Decision Tree with External Regressors, Motion, and Task Models}}, label={180:$node\ 0$}]{Set all components to unclassified}; +% ----- node: 1 +\node(1)[decision, below of=0,label={180:$node\ 1$}, yshift=-1.5cm]{$\rho$ $>$ $\kappa$}; +\node(rej1)[changeclass, right of=1, xshift=3cm, align=center]{Unlikely BOLD\\$\rightarrow$ Provisional reject}; +% ----- node: 2 +\node(2)[decision, below of=1,label={180:$node\ 2$}, label={[align=center] 315: voxel counts for signif fit\\of multi-echo data\\to $T_2$ or $S_0$ decay models}, yshift=-4.0cm]{$n \, FS_0 \, > \, n \, FT_2$ \& $n \,FT_2$ $>$ 0}; +\node(rej2)[changeclass, right of=2, xshift=3cm, 
align=center]{Unlikely BOLD\\$\rightarrow$ Provisional Reject}; +% ----- node: 3 +\node(3)[process, below of=2, label={180:$node\ 3$}, label={[align=center] 315: varex: variance explained\\by each component}, yshift=-2.0cm]{Calculate median(varex) across all components}; +% ----- node: 4 +\node(4)[decision, below of=3,label={180:$node\ 4$},label={[align=center] 315:DICE overlap between $T_2$ or $S_0$\\decay models and ICA component\\peak clusters}, yshift=-1.5cm]{dice $FS_0$ $>$ dice $FT_2$ \& varex $>$ median(varex) +}; +\node(rej4)[changeclass, right of=4, xshift=3cm, align=center]{Unlikely BOLD\\$\rightarrow$ Provisional Reject}; +% ----- node: 5 +\node(5)[decision, below of=4,label={180:$node\ 5$}, label={[align=center] 315: $t-statistic$ of $FT_2$ values\\in component peak clusters vs\\peak voxels outside of clusters}, yshift=-4.0cm]{ $0 \, >$ signal-noise \& varex $>$ median(varex)}; +\node(rej5)[changeclass, right of=5, xshift=3cm, align=center]{Unlikely BOLD\\$\rightarrow$ Provisional Reject}; +% ----- node: 6 +\node(6)[process, below of=5, label={180:$node\ 6$}, label={0: Uses all components}, yshift=-2.0cm]{Calculate $\kappa$ elbow}; +% ----- node: 7 +\node(7)[process, below of=6, label={180:$node\ 7$}, label={[align=center] 0: Uses all components and subset\\of unclassified components}]{Calculate $\rho$ elbow\\(liberal method)}; +% ----- node: 8 +\node(8)[decision, below of=7,label={180:$node\ 8$}, yshift=-1.5cm]{$\kappa \geq \kappa$ elbow\\$\rho$ $<$ $\rho$ elbow}; +\node(chrej8)[changeclass, below of=8, xshift=0cm, yshift=-2cm]{Provisional reject}; +\node(chacc8)[changeclass, right of=8, xshift=3cm, yshift=0cm]{Provisional accept}; +% ----- node: 9 +\node(9)[decision, below of=chrej8,label={180:$node\ 9$},label={20: Accept even if $\rho < \rho\ elbow$},yshift=-1.5cm]{$\kappa > 2\rho$\\$\kappa \geq \kappa$ elbow}; +\node(chrej9)[changeclass, below of=9, xshift=0cm, yshift=-2cm]{Provisional reject}; +\node(chacc9)[changeclass, right of=9, xshift=3cm, 
yshift=0cm]{Provisional accept}; +% ----- node: 10 +\node(10)[decision, below of=chacc9,label={150:$node\ 10$},label={[align=center] 310: Reject if\\fits external\\nuisance\\regressors},yshift=-2cm]{F test for\\Nuisance Regressors\\$p_{Full} \leq 0.05$\\$R^2_{Full} \geq 0.5$}; +\node(chrej10)[changeclass, below of=10, xshift=0cm, yshift=-2cm, align=center]{External regressors\\$\rightarrow$Provisional reject}; +% ----- node: 11 +\node(11)[decision, left of=chrej10,label={180:$node\ 11$},xshift=-3cm]{Partial F test for\\Motion Regressors\\$p_{Full} \leq 0.05$\\$R^2_{Full} \geq 0.5$\\$p_{Motion} \leq 0.05$}; +\node(chtag11)[changeclass, below of=11, xshift=0cm, yshift=-2cm, align=center]{Tag:\\Fits motion\\external regressors}; +% ----- node: 12 +\node(12)[decision, below of=chrej10,label={150:$node\ 12$},yshift=-2cm]{Partial F test for\\CSF Regressors\\$p_{Full} \leq 0.05$\\$R^2_{Full} \geq 0.5$\\$p_{CSF} \leq 0.05$}; +\node(chtag12)[changeclass, below of=12, xshift=0cm, yshift=-2cm, align=center]{Tag:\\Fits CSF\\external regressors}; +% ----- node: 13 +\node(prej13)[changeclass, below of=chtag11, xshift=0cm, yshift=-0.5cm]{Provisional reject}; +\node(13)[decision, below of=prej13,label={180:$node\ 13$},label={[align=center] 335: If fits task and\\contains T2*, accept\\even if other criteria\\would have rejected},yshift=-2cm]{F test for\\Task Regressors\\$p_{Task} \leq 0.05$\\$R^2_{Task} \geq 0.5$\\$\kappa \geq \kappa$ elbow}; +\node(chacc13)[accept, right of=13,xshift=3cm, align=center]{Fits task\\$\rightarrow$Accept}; +% ----- node: 14 +\node(14)[decision, below of=13,label={180:$node\ 14$},label={[align=left] 335: Will accept the lowest\\variance components until\\1\% of total variance is\\accepted this way}, yshift=-3.5cm]{$if$ component variance $<0.1$};%--check in kundu +\node(acc14)[accept, right of=14, xshift=2.5cm, align=center]{Low variance\\$\rightarrow$ Accept}; +% ----- node: 15 +\node(15)[accept, below of=14,label={180:$node\ 15$},yshift=-1.5cm, 
align=center]{Likely BOLD\\Change provisional accept\\$\rightarrow$Accept}; +% ----- node: 16 +\node(16)[reject, below of=15,label={180:$node\ 16$}, yshift=0cm, align=center]{Unlikely BOLD\\Change provisional reject\\$\rightarrow$Reject}; + +% ------------------------- connections ------------------------- +% draw[x](origin)--node[anchor=position]{text}(destination); +\draw[arrow](0)--(1); +\draw[arrow](1)--node[anchor=south, right=0] {no} (2); +\draw[arrow](1)--node[anchor=south] {yes} (rej1); +\draw[arrow](2)--node[anchor=south, right=0] {no} (3); +\draw[arrow](2)--node[anchor=south] {yes} (rej2); +\draw[arrow](3)--(4); +\draw[arrow](4)--node[anchor=south, right=0] {no} (5); +\draw[arrow](4)--node[anchor=south] {yes} (rej4); +\draw[arrow](5)--node[anchor=south, right=0] {no} (6); +\draw[arrow](5)--node[anchor=south] {yes} (rej5); +\draw[arrow](6)--(7); +\draw[arrow](7)--(8); +\draw[arrow](8)--node[anchor=south] {yes} (chacc8); +\draw[arrow](8)--node[anchor=south, right=0] {no} (chrej8); +\draw[arrow](chrej8)--(9); +\draw[arrow](9)--node[anchor=south] {yes} (chacc9); +\draw[arrow](9)--node[anchor=south, right=0] {no} (chrej9); +\draw[arrow](chacc9)--(10); +\draw[arrow](chrej9)--(10); +\draw[arrow](10)--node[anchor=south, right=0] {yes} (chrej10); +\draw[arrow](chrej10)--(11); +\draw[arrow](11)--node[anchor=south, right=0] {yes} (chtag11); +\draw[arrow](chrej10)--(12); +\draw[arrow](12)--node[anchor=south, right=0] {yes} (chtag12); +\draw[arrow](prej13)--(13); +\draw[arrow](13)--node[anchor=south] {yes} (chacc13); +\draw[arrow](13)--node[anchor=south, right=0] {no} (14); +\draw[arrow](14)--node[anchor=south] {yes} (acc14); +\end{tikzpicture} +\end{document} diff --git a/docs/_static/decision_tree_demo_external_regressors_single_model.pdf b/docs/_static/decision_tree_demo_external_regressors_single_model.pdf new file mode 100644 index 000000000..7aa4b3aab Binary files /dev/null and b/docs/_static/decision_tree_demo_external_regressors_single_model.pdf differ diff 
--git a/docs/_static/decision_tree_demo_external_regressors_single_model.png b/docs/_static/decision_tree_demo_external_regressors_single_model.png new file mode 100644 index 000000000..f40f41f65 Binary files /dev/null and b/docs/_static/decision_tree_demo_external_regressors_single_model.png differ diff --git a/docs/_static/decision_tree_demo_external_regressors_single_model.tex b/docs/_static/decision_tree_demo_external_regressors_single_model.tex new file mode 100644 index 000000000..72c8cac19 --- /dev/null +++ b/docs/_static/decision_tree_demo_external_regressors_single_model.tex @@ -0,0 +1,109 @@ +\documentclass[border=2pt]{standalone} +\usepackage[utf8]{inputenc} % Required for inserting images +\usepackage{tikz} +\usepackage{helvet} +\usetikzlibrary{shapes.geometric, arrows} +\pagecolor{white} + +%-------------------------defining colorblind friendly colors +% Using pale color scheme in Figure 6 +% by Paul Tol https://personal.sron.nl/~pault/ +\definecolor{cbblue}{HTML}{BBCCEE} +\definecolor{cbcyan}{HTML}{CCEEFF} +\definecolor{cbgreen}{HTML}{CCDDAA} +\definecolor{cbyellow}{HTML}{EEEEBB} +\definecolor{cbred}{HTML}{FFCCCC} +\definecolor{cbgrey}{HTML}{DDDDDD} + +% -------------------------defining nodes +\tikzstyle{input} = [trapezium, trapezium left angle =80, trapezium right angle = 100, +minimum width= 3cm, minimum height=0.5cm, text centered, draw=black, fill=cbblue] +\tikzstyle{process} = [rectangle, minimum width = 3cm, minimum height = 1cm, +text centered, , text width=4cm,draw=black, fill=cbgrey] +\tikzstyle{decision} = [diamond, minimum width = 3cm, minimum height = 1cm, +text centered, , text width=3cm, draw=black, fill=cbcyan] +\tikzstyle{changeclass} = [rectangle, rounded corners, minimum width=3cm, minimum height=1cm, +text centered, draw = black, fill=cbyellow] +\tikzstyle{reject} = [trapezium, trapezium left angle =80, trapezium right angle = 100, +minimum width= 1cm, minimum height=0.5cm, text centered, draw=black, fill=cbred] 
+\tikzstyle{accept} = [trapezium, trapezium left angle =80, trapezium right angle = 100, +minimum width= 1cm, minimum height=0.5cm, text centered, draw=black, fill=cbgreen] + +% -------------------------defining connectors +\tikzstyle{arrow} = [thick,->, >=stealth] +\tikzstyle{line} = [thick,-,>=stealth] +\begin{document} + +% ------------------------- tikz image (flow chart) +\begin{tikzpicture}[node distance = 2cm] + +% ------------------------- nodes ------------------------- +% ----- node: 0 +\node(0)[input,label={90:\textbf{Demo Decision Tree. Single model with external regressors}}, label={180:$node\ 0$}]{Set all components to unclassified}; +% ----- node: 1 +\node(1)[decision, below of=0,label={180:$node\ 1$}, yshift=-1cm]{$\rho$ $>$ $\kappa$}; +\node(rej1)[reject, right of=1, xshift=2cm, align=center]{Unlikely BOLD\\$\rightarrow$ Reject}; +% ----- node: 2 +\node(2)[decision, below of=1,label={180:$node\ 2$}, label={[align=center] 315: voxel counts for signif fit\\of multi-echo data\\to $T_2$ or $S_0$ decay models}, yshift=-3.0cm]{$n \, FS_0 \, > \, n \, FT_2$ \& $n \,FT_2$ $>$ 0}; +\node(rej2)[reject, right of=2, xshift=2cm, align=center]{Unlikely BOLD\\$\rightarrow$ Reject}; +% ----- node: 3 +\node(3)[process, below of=2, label={180:$node\ 3$}, label={[align=center] 315: varex: variance explained\\by each component}, yshift=-1.5cm]{Calculate median(varex) across all components}; +% ----- node: 4 +\node(4)[decision, below of=3,label={180:$node\ 4$},label={[align=center] 315:DICE overlap between $T_2$ or $S_0$\\decay models and ICA component\\peak clusters}, yshift=-1.5cm]{dice $FS_0$ $>$ dice $FT_2$ \& varex $>$ median(varex) +}; +\node(rej4)[reject, right of=4, xshift=2.5cm, align=center]{Unlikely BOLD\\$\rightarrow$ Reject}; +% ----- node: 5 +\node(5)[decision, below of=4,label={180:$node\ 5$}, label={[align=center] 315: $t-statistic$ of $FT_2$ values\\in component peak clusters vs\\peak voxels outside of clusters}, yshift=-4.0cm]{ $0 \, >$ signal-noise 
\& varex $>$ median(varex)}; +\node(rej5)[reject, right of=5, xshift=2.5cm, align=center]{Unlikely BOLD\\$\rightarrow$ Reject}; +% ----- node: 6 +\node(6)[process, below of=5, label={180:$node\ 6$}, label={0: Uses all components}, yshift=-2.0cm]{Calculate $\kappa$ elbow}; +% ----- node: 7 +\node(7)[process, below of=6, label={180:$node\ 7$}, label={[align=center] 0: Uses all components and subset\\of unclassified components}]{Calculate $\rho$ elbow\\(liberal method)}; +% ----- node: 7 +\node(8)[decision, below of=7,label={180:$node\ 8$}, yshift=-1.5cm]{$\kappa \geq \kappa$ elbow}; +\node(chrej8)[changeclass, below of=8, yshift=-1.5cm]{Provisional reject}; +\node(chacc8)[changeclass, right of=8, xshift=3cm, yshift=0cm]{Provisional accept}; +% ----- node: 8 +\node(9)[decision, below of=chacc8,label={170:$node\ 9$}, yshift=-1.5cm]{ $\rho$ $>$ $\rho$ elbow}; +\node(chrej9)[changeclass, below of=9, yshift=-1.5cm]{Provisional reject}; +% ----- node: 9 +\node(10)[decision, left of=chrej9,label={180:$node\ 10$},label={235: Accept even if $\rho < \rho\ elbow$},xshift=-3.5cm]{$\kappa \geq \kappa$ elbow\\$\kappa > 2\rho$ }; +\node(chacc10)[changeclass, below of=10, xshift=0cm, yshift=-1.5cm, align=center]{Provisional accept}; +% ----- node: 10 +\node(11)[decision, below of=chacc10,label={180:$node\ 11$}, xshift=0cm, yshift=-2cm]{External regressor\\nuisance model\\$p<0.05$\\$R^2>0.5$};%--check in kundu +\node(chrej11)[changeclass, below of=11, xshift=0cm, yshift=-1.5cm, align=center]{Tag: External Regressors\\Provisional reject}; +% ----- node: 11 +\node(12)[decision, below of=chrej11,label={180:$node\ 11$},label={[align=left] 335: Will accept the lowest\\variance components until\\1\% of total variance is\\accepted this way}, yshift=-1.5cm]{$if$ component variance $<0.1$};%--check in kundu +\node(acc12)[accept, right of=12, xshift=3cm, align=center]{Low variance\\$\rightarrow$ Accept}; +% ----- node: 12 +\node(13)[accept, below of=12,label={180:$node\ 12$},yshift=-1.5cm, 
align=center]{Likely BOLD\\Change provisional accept\\$\rightarrow$Accept}; +% ----- node: 13 +\node(14)[reject, below of=13,label={180:$node\ 13$}, yshift=0cm, align=center]{Unlikely BOLD\\Change provisional reject\\$\rightarrow$Reject}; + + +% ------------------------- connections ------------------------- +\draw[arrow](0)--(1); +\draw[arrow](1)--node[anchor=south, right=0] {no} (2); +\draw[arrow](1)--node[anchor=south] {yes} (rej1); +\draw[arrow](2)--node[anchor=south, right=0] {no} (3); +\draw[arrow](2)--node[anchor=south] {yes} (rej2); +\draw[arrow](3)--(4); +\draw[arrow](4)--node[anchor=south, right=0] {no} (5); +\draw[arrow](4)--node[anchor=south] {yes} (rej4); +\draw[arrow](5)--node[anchor=south, right=0] {no} (6); +\draw[arrow](5)--node[anchor=south] {yes} (rej5); +\draw[arrow](6)--(7); +\draw[arrow](7)--(8); +\draw[arrow](8)--node[anchor=south] {yes} (chacc8); +\draw[arrow](8)--node[anchor=south, right=0] {no} (chrej8); +\draw[arrow](chacc8)--(9); +\draw[arrow](chrej8)--(9); +\draw[arrow](9)--node[anchor=south, right=0] {yes} (chrej9); +\draw[arrow](chrej9)--(10); +\draw[arrow](10)--node[anchor=south, right=0] {yes} (chacc10); +\draw[arrow](chacc10)--(11); +\draw[arrow](11)--node[anchor=south, right=0] {yes} (chrej11); +\draw[arrow](chrej11)--(12); +\draw[arrow](12)--node[anchor=south] {yes} (acc12); +\end{tikzpicture} +\end{document} diff --git a/docs/api.rst b/docs/api.rst index c91211989..1722002ac 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -105,6 +105,7 @@ API tedana.metrics.collect tedana.metrics.dependence + tedana.metrics.external .. _api_selection_ref: diff --git a/docs/building_decision_trees.rst b/docs/building_decision_trees.rst index 7b0e2e479..1c31b6c2d 100644 --- a/docs/building_decision_trees.rst +++ b/docs/building_decision_trees.rst @@ -186,7 +186,7 @@ All functions are currently in :mod:`~tedana.selection.selection_nodes`. to interpret. .. 
_resources/decision_trees: https://github.com/ME-ICA/tedana/tree/main/tedana/resources/decision_trees - +.. _This is an example TSV file: https://github.com/ME-ICA/tedana/tree/main/tedana/tests/data/external_regress_Ftest_3echo.tsv General information fields ========================== @@ -239,6 +239,100 @@ that is used to check whether results are plausible & can help avoid mistakes. none of the components include the "Likely BOLD" tag, then ICA will be repeated with a different seed and then the selection process will repeat. +External regressor configuration +================================ + +``external_regressor_config`` is an optional field. If this field is specified, then +additional metrics will be calculated that include F, :math:`R^2`, and p values for the fit +of external regressors to each component time series. +The p value might be useful for defining a statistically significant fit, +while :math:`R^2` can assess whether the regressors model a meaningful amount of variance. +For example, even if nuisance regressors, like head motion, significantly fit an ICA component +time series, do not reject if they only model 5% of the total variance of that component. + +Users will need to specify the +external regressors using the ``--external`` option. +``--external`` takes a TSV file in which each column has a header label and is the length +of the fMRI time series. +This functionality can be used to integrate non-multi-echo decision criteria, +such as correlations to head motion, CSF, or respiration time series. +These added metrics can then be used in decision tree steps just like any other metric. +Two demonstration trees that apply this functionality are in `resources/decision_trees`_. +``demo_external_regressors_single_model.json`` demonstrates the simplest application +of external regressors and ``demo_external_regressors_motion_task_models.json`` +highlights the full range of functionality. 
+`This is an example TSV file`_ with column labels that work with both of these trees. +Both these trees are based on ``minimal.json``. +While these might be good decision trees to use as is, +they are both called "demo" because they demonstrate what is possible, +but the utility of these specific decision trees have **not yet** been validated. + +``external_regressor_config`` is a list of dictionaries. +Each dictionary defines statistical tests to apply to a group of +regressors specified in ``--external``. +``demo_external_regressors_single_model.json`` includes a single dictionary +that specifies fitting all supplied regressors to a single nuisance model that is +used to reject components. +``demo_external_regressors_motion_task_models`` includes one dictionary for +a nuisance model to reject components and one for a task model to retain more +task-correlated signal. For the nuisance model, columns with specific header names +are also fit to partial models for motion and CSF regressors that are used to label +why components were rejected. +(i.e. Component X was rejected because it fit to head motion regressors.) +Each dictionary in ``external_regressor_config`` must include the following sub-fields: + +- "regress_ID" + A descriptive name for the external regressors that will be logged. + Will be used in the ``selector.component_table_`` labels describing the outputs of the + statistical tests. + For example, if this field is ``nuisance`` then component table column labels will include: + ``Fstat nuisance model``, ``R2stat nuisance model``, and ``pval nuisance model``. + +- "info" + A brief description of the external regressors for info logging. + +- "report" + A narrative description of how the external regressors are used + that will be used in report logging. + This should include any citations, which must be included in the + `references BibTeX file`_. 
+ +- "detrend" + "true" or "false" to specify whether to include detrending regressors + when fitting the external regressors to the ICA component time series. + If "true", it will specify the number of detrending time regressors to + include based on the length of the time series. + If "false", it will just include an intercept regressor to remove the mean. + This can also be a integer that defines the number of regressors to include. + Can also be an integer specifying the number of detrending regressors. + +- "statistic" + The statistical test to use for fitting the external regressors to the + ICA component time series. Currently, the only valid option is "F" for + fitting using a linear model with an F statistic. + +- "regressors" + A list of strings or regular expressions to specify the columns in + the external regressor file to use in the model. Regular expressions begin with ``^`` + For example, ``["^.*$"]`` would mean use all regressors in the file, + while ``["^mot_.*$"]`` would mean use all regressors with labels beginging with ``mot_``. + ``["mot_x", "mot_y_", "mot_z"]`` would be use regressors with those specific labels. + Capitalization is ignored. + Note: When tedana is run, regular expressions are replaced with the named regressors. + The outputted decision tree will specify what was used and might be useful for validation. + +An optional field is **"partial_models"**. +This is a dictionary where each element is a descriptor and column specification is similar to +``regressors``. For example, ``"partial_models": {"Motion": ["^mot_.*$"], "CSF": ["^csf.*$"]}`` +specifies two partial models for motion and CSF time series, where the columns in +the external regressor tsv begin with either ``mot_`` or ``csf``. +When this field is used, statistics will be calculated for the full model with all regressors +and each specified partial model. 
This can be used to potentially reject components that fit +any combination of nuisance regressors and also note which components fit head motion regressors. +If this option is included, there would be added columns in ``selector.component_table_``, such as +``Fstat nuisance Motion partial model``, ``pval nuisance Motion partial model``, and +``R2stat nuisance Motion partial model`` + Nodes in the decision tree ========================== diff --git a/docs/included_decision_trees.rst b/docs/included_decision_trees.rst index c0d3f4e9a..af1d931ae 100644 --- a/docs/included_decision_trees.rst +++ b/docs/included_decision_trees.rst @@ -2,7 +2,7 @@ Included Decision Trees ####################### -Three decision trees are currently distributed with ``tedana``. +Five decision trees are currently distributed with ``tedana``. ``meica`` is the decision tree that is based on MEICA version 2.5 and ``tedana_orig`` is very similar and has been included with ``tedana`` @@ -18,7 +18,18 @@ and comprehensible, but it has not yet be extensively validated and parts of the tree may change in response to additional tests on a wider range of data sets. -Flowcharts describing the steps in both trees are below. +With the addition of options to fit external regressors to components, +there are two demonstration decision trees that implement this new functionality. +While these might work well, since they have not yet been validated on data, they +are labeled ``demo``. +``decision_tree_demo_external_regressors_single_model`` +demonstrates fitting all nuisance regressors to a single model. +``decision_tree_demo_external_regressors_motion_task_models`` +demonstrates fitting nuisance regressors to a model, +partial tests and tagging for components that fit Motion or CSF regressors, +and retention of some components that fit task regressors. + +Flowcharts describing the steps in these trees are below. 
As documented more in :doc:`building_decision_trees`, the input to each tree is a table with metrics, like :math:`\kappa` or :math:`\rho`, for each component. Each step or node in the decision tree either calculates @@ -122,3 +133,45 @@ is rejected (node 13) `LaTeX file to generate the minimal decision tree flow chart`_ .. _LaTeX file to generate the minimal decision tree flow chart: _static/decision_tree_minimal.tex + +********************************************* +Demo external regressors single model +********************************************* + +This tree is similar to the minimal tree except there is an added node (node 11) +where components are rejected if they significantly fit a model of external nuisance regressor +time series and the fit models a substantial amount of the total variance. +Unlike the minimal tree, components that would be accepted based on :math:`\kappa` & :math:`\rho` +criteria can be rejected based on a fit to external regressors. +This is called a "demo" tree because it is demonstrating how fits to external regressors can +be used. It might be a good decision tree to use, +but results have not yet been tested and validated. + +.. image:: _static/decision_tree_demo_external_regressors_single_model.png + :width: 400 + :alt: External Decision Tree With Motion and Task Models Flow Chart + +**************************************************** +Demo external regressors, motion task models +**************************************************** + +This is based on the minimal tree, but multiple nodes were added to demonstrate how to use +external regressors for fits. +Unlike the minimal tree, components that would be accepted based on :math:`\kappa` & :math:`\rho` +criteria can be rejected based on a fit to external regressors. +Components are rejected if they significantly fit a model of external nuisance regressor +time series and the fit models a substantial amount of the total variance (node 10). 
+For rejected components, if they also fit a partial model of motion external regressors (node 11),
+or CSF external regressors (node 12), the outputs are also tagged to say they fit those groups
+of regressors.
+Additionally, if a rejected component fits the task design and has
+:math:`\kappa` > :math:`\kappa` elbow, then it is accepted under the conservative assumption to
+retain task fitting components with some :math:`T_2^*` signal even if those components also
+contain potentially rejection-worthy noise (node 13).
+This is called a "demo" tree because it is demonstrating how fits to external regressors can
+be used. It might be a good decision tree to use,
+but results have not yet been tested and validated.
+
+.. image:: _static/decision_tree_demo_external_regressors_motion_task_models.png
+    :width: 400
+    :alt: External Decision Tree With Motion and Task Models Flow Chart
diff --git a/tedana/decomposition/pca.py b/tedana/decomposition/pca.py
index b252ef264..fa32c6537 100644
--- a/tedana/decomposition/pca.py
+++ b/tedana/decomposition/pca.py
@@ -348,14 +348,16 @@ def tedpca(
         "normalized variance explained",
         "d_table_score",
     ]
-    comptable = metrics.collect.generate_metrics(
-        data_cat,
-        data_oc,
-        comp_ts,
-        adaptive_mask,
-        tes,
-        io_generator,
-        "PCA",
+    # Even if user inputted, don't fit external_regressors to PCA components
+    comptable, _ = metrics.collect.generate_metrics(
+        data_cat=data_cat,
+        data_optcom=data_oc,
+        mixing=comp_ts,
+        adaptive_mask=adaptive_mask,
+        tes=tes,
+        io_generator=io_generator,
+        label="PCA",
+        external_regressors=None,
         metrics=required_metrics,
     )
diff --git a/tedana/gscontrol.py b/tedana/gscontrol.py
index 4ffddf631..a18e488c9 100644
--- a/tedana/gscontrol.py
+++ b/tedana/gscontrol.py
@@ -5,7 +5,6 @@ import numpy as np
 import pandas as pd
 from scipy import stats
-from scipy.special import lpmv
 
 from tedana import utils
 
@@ -64,8 +63,7 @@ def gscontrol_raw(catd, optcom, n_echos, io_generator, dtrank=4):
     )
 
     # Legendre polynomial 
basis for denoising - bounds = np.linspace(-1, 1, optcom.shape[-1]) - legendre_arr = np.column_stack([lpmv(0, vv, bounds) for vv in range(dtrank)]) + legendre_arr = utils.create_legendre_polynomial_basis_set(optcom.shape[-1], dtrank=dtrank) # compute mean, std, mask local to this function # inefficient, but makes this function a bit more modular diff --git a/tedana/metrics/_utils.py b/tedana/metrics/_utils.py index 8a9486a8f..4c28fb83c 100644 --- a/tedana/metrics/_utils.py +++ b/tedana/metrics/_utils.py @@ -1,14 +1,60 @@ """Miscellaneous utility functions for metric calculation.""" import logging +from typing import Dict, List import numpy as np +import numpy.typing as npt from scipy import stats LGR = logging.getLogger("GENERAL") -def dependency_resolver(dict_, requested_metrics, base_inputs): +def add_external_dependencies( + dependency_config: Dict, external_regressor_config: List[Dict] +) -> Dict: + """ + Add dependency information when external regressors are inputted. + + Parameters + ---------- + dependency_config: :obj:`dict` + A dictionary stored in ./config/metrics.json + with information on all the internally defined metrics like kappa and rho + external_regressor_config: :obj:`list[dict]` + A list of dictionaries with info for fitting external regressors to component time series + + Returns + ------- + dependency_config: :obj:`dict` + A dictionary with the internally defined regressors inputted with this parameter + and the information for fitting external regressors defined in external_regressor_config + """ + # Add "external regressors" and an existing input + dependency_config["inputs"].append("external regressors") + + for config_idx in range(len(external_regressor_config)): + model_names = [external_regressor_config[config_idx]["regress_ID"]] + if "partial_models" in set(external_regressor_config[config_idx].keys()): + partial_keys = external_regressor_config[config_idx]["partial_models"].keys() + for key_name in partial_keys: + model_names.append( 
+ f"{external_regressor_config[config_idx]['regress_ID']} {key_name} partial" + ) + + # F is currently the only option so this only names metrics if "statistic"=="f" + if external_regressor_config[config_idx]["statistic"].lower() == "f": + for model_name in model_names: + for stat_type in ["Fstat", "R2stat", "pval"]: + dependency_config["dependencies"][f"{stat_type} {model_name} model"] = [ + "external regressors" + ] + return dependency_config + + +def dependency_resolver( + dict_: Dict, requested_metrics: List[str], base_inputs: List[str] +) -> List[str]: """Identify all necessary metrics based on a list of requested metrics. This also determines which metrics each requested metric requires to be calculated, @@ -58,7 +104,7 @@ def dependency_resolver(dict_, requested_metrics, base_inputs): return required_metrics -def determine_signs(weights, axis=0): +def determine_signs(weights: npt.NDArray, axis: int = 0) -> npt.NDArray: """Determine component-wise optimal signs using voxel-wise parameter estimates. Parameters @@ -66,6 +112,9 @@ def determine_signs(weights, axis=0): weights : (S x C) array_like Parameter estimates for optimally combined data against the mixing matrix. + axis : int + The axis to calculate the weights over. + Default is 0 Returns ------- @@ -80,7 +129,7 @@ def determine_signs(weights, axis=0): return signs.astype(int) -def flip_components(*args, signs): +def flip_components(*args: npt.NDArray, signs: npt.NDArray) -> npt.NDArray: """Flip an arbitrary set of input arrays based on a set of signs. Parameters @@ -111,7 +160,7 @@ def flip_components(*args, signs): return [arg * signs for arg in args] -def check_mask(data, mask): +def check_mask(data: npt.NDArray, mask: npt.NDArray) -> None: """Check that no zero-variance voxels remain in masked data. 
Parameters diff --git a/tedana/metrics/collect.py b/tedana/metrics/collect.py index e71e5ee4a..e9c19a96b 100644 --- a/tedana/metrics/collect.py +++ b/tedana/metrics/collect.py @@ -2,13 +2,20 @@ import logging import os.path as op +from typing import Dict, List, Tuple, Union import numpy as np +import numpy.typing as npt import pandas as pd from tedana import io, utils -from tedana.metrics import dependence -from tedana.metrics._utils import dependency_resolver, determine_signs, flip_components +from tedana.metrics import dependence, external +from tedana.metrics._utils import ( + add_external_dependencies, + dependency_resolver, + determine_signs, + flip_components, +) from tedana.stats import getfbounds LGR = logging.getLogger("GENERAL") @@ -16,15 +23,18 @@ def generate_metrics( - data_cat, - data_optcom, - mixing, - adaptive_mask, - tes, - io_generator, - label, - metrics=None, -): + *, + data_cat: npt.NDArray, + data_optcom: npt.NDArray, + mixing: npt.NDArray, + adaptive_mask: npt.NDArray, + tes: Union[List[int], List[float], npt.NDArray], + io_generator: io.OutputGenerator, + label: str, + external_regressors: Union[pd.DataFrame, None] = None, + external_regressor_config: Union[List[Dict], None] = None, + metrics: Union[List[str], None] = None, +) -> Tuple[pd.DataFrame, Dict]: """Fit TE-dependence and -independence models to components. Parameters @@ -34,12 +44,12 @@ def generate_metrics( data_optcom : (S x T) array_like Optimally combined data mixing : (T x C) array_like - Mixing matrix for converting input data to component space, where `C` - is components and `T` is the same as in `data_cat` + Mixing matrix for converting input data to component space, + where `C` is components and `T` is the same as in `data_cat` adaptive_mask : (S) array_like Array where each value indicates the number of echoes with good signal - for that voxel. This mask may be thresholded; for example, with values - less than 3 set to 0. + for that voxel. 
+ This mask may be thresholded; for example, with values less than 3 set to 0. For more information on thresholding, see `make_adaptive_mask`. tes : list List of echo times associated with `data_cat`, in milliseconds @@ -47,14 +57,23 @@ def generate_metrics( The output generator object for this workflow label : str in ['ICA', 'PCA'] The label for this metric generation type + external_regressors : None or :obj:`pandas.DataFrame`, optional + External regressors (e.g., motion parameters, physiological noise) + to correlate with ICA components. + If None, no external regressor metrics will be calculated. + external_regressor_config : :obj:`list[dic]t` + A list of dictionaries defining how to fit external regressors to component time series metrics : list List of metrics to return Returns ------- comptable : (C x X) :obj:`pandas.DataFrame` - Component metric table. One row for each component, with a column for - each metric. The index is the component number. + Component metric table. One row for each component, with a column for each metric. + The index is the component number. + external_regressor_config : :obj:`dict` + Info describing the external regressors and method used for fitting and statistical tests + (or None if none were inputed) """ # Load metric dependency tree from json file dependency_config = op.join(utils.get_resource_path(), "config", "metrics.json") @@ -62,6 +81,16 @@ def generate_metrics( if metrics is None: metrics = ["map weight"] + + if external_regressors is not None: + if external_regressor_config is None: + raise ValueError( + "If external_regressors is defined, then " + "external_regressor_config also needs to be defined." 
+ ) + dependency_config = add_external_dependencies(dependency_config, external_regressor_config) + dependency_config["inputs"].append("external regressors") + RepLGR.info(f"The following metrics were calculated: {', '.join(metrics)}.") if not (data_cat.shape[0] == data_optcom.shape[0] == adaptive_mask.shape[0]): @@ -73,7 +102,7 @@ def generate_metrics( elif data_cat.shape[1] != len(tes): raise ValueError( f"Second dimension of data_cat ({data_cat.shape[1]}) does not match " - f"number of echoes provided (tes; {len(tes)})" + f"number of echoes provided ({tes}; {len(tes)})" ) elif not (data_cat.shape[2] == data_optcom.shape[1] == mixing.shape[0]): raise ValueError( @@ -320,6 +349,19 @@ def generate_metrics( comptable["countsigFT2"], ) + # External regressor-based metrics + if external_regressors is not None and external_regressor_config is not None: + # external_regressor_names = external_regressors.columns.tolist() + for config_idx in range(len(external_regressor_config)): + LGR.info( + "Calculating external regressor fits. " + f"{external_regressor_config[config_idx]['info']}" + ) + RepLGR.info({external_regressor_config[config_idx]["report"]}) + + comptable = external.fit_regressors( + comptable, external_regressors, external_regressor_config, mixing + ) # Write verbose metrics if needed if io_generator.verbose: write_betas = "map echo betas" in metric_maps @@ -374,6 +416,7 @@ def generate_metrics( "d_table_score", "kappa ratio", "d_table_score_scrub", + "external fit", "classification", "rationale", ) @@ -381,10 +424,10 @@ def generate_metrics( other_columns = [col for col in comptable.columns if col not in preferred_order] comptable = comptable[first_columns + other_columns] - return comptable + return comptable, external_regressor_config -def get_metadata(comptable): +def get_metadata(comptable: pd.DataFrame) -> Dict: """Fill in metric metadata for a given comptable. 
Parameters @@ -394,9 +437,9 @@ def get_metadata(comptable): Returns ------- - A dict containing the metadata for each column in the comptable for - which we have a metadata description, plus the "Component" metadata - description (always). + metric_metadata : dict + The metadata for each column in the comptable for which we have a metadata description, + plus the "Component" metadata description (always). """ metric_metadata = {} if "kappa" in comptable: diff --git a/tedana/metrics/external.py b/tedana/metrics/external.py new file mode 100644 index 000000000..0f89516a8 --- /dev/null +++ b/tedana/metrics/external.py @@ -0,0 +1,546 @@ +"""Metrics based on fits of component time series to external time series.""" + +import logging +import re +from typing import Dict, List, Tuple + +import numpy as np +import numpy.typing as npt +import pandas as pd +from scipy import stats + +from tedana import utils +from tedana.stats import fit_model + +LGR = logging.getLogger("GENERAL") +RepLGR = logging.getLogger("REPORT") + + +class RegressError(Exception): + """Passes errors that are raised when `validate_extern_regress` fails.""" + + pass + + +def load_validate_external_regressors( + external_regressors: str, external_regressor_config: Dict, n_vols: int +) -> Tuple[pd.DataFrame, Dict]: + """Load and validate external regressors and descriptors in dictionary. 
+ + Parameters + ---------- + external_regressors: :obj:`str` + Path and name of tsv file that includes external regressor time series + external_regressor_config: :obj:`dict` + A dictionary with info for fitting external regressors to component time series + n_vols: :obj:`int` + Number of timepoints in the fMRI time series + + Returns + ------- + external_regressors: :obj:`pandas.DataFrame` + Each column is a labelled regressor and the number of rows should + match the number of timepoints in the fMRI time series + external_regressor_config: :obj:`dict` + A validated dictionary with info for fitting external regressors to component time series. + If regex patterns like '^mot_.*$' are used to define regressor names, + this is replaced with a list of the match column names used in external_regressors + """ + try: + external_regressors = pd.read_table(external_regressors) + except FileNotFoundError: + raise ValueError(f"Cannot load tsv file with external regressors: {external_regressors}") + + external_regressor_config = validate_extern_regress( + external_regressors, external_regressor_config, n_vols + ) + + return external_regressors, external_regressor_config + + +def validate_extern_regress( + external_regressors: pd.DataFrame, external_regressor_config: List[Dict], n_vols: int +) -> List[Dict]: + """Confirm external regressor dictionary matches data and expands regular expressions. + + Most keys in external_regressor_config are valided in component_selector.validate_tree + which is run when a component selector object is initialized. + This function validates external_regressor_config against the dataset-specific + external_regressors time series. + If the config names specific column labels with the f_stats_partial_models key, + then this confirms those column labels are used in external_regressors + Also checks if the number of time points in the external regressors matches + the number of time point in the fMRI data. 
+ + Parameters + ---------- + external_regressors : :obj:`pandas.DataFrame` + Each column is a labelled regressor and the number of rows should + match the number of timepoints in the fMRI time series + external_regressor_config : :obj:`list[dict]` + Information describing the external regressors and + method to use for fitting and statistical tests. + Each element in the list is a dict defining the regressors + and statistical models for a test. + n_vols : :obj:`int` + The number of time points in the fMRI time series + + Returns + ------- + external_regressor_config: :obj:`list[dict]` + A validated list of dictionaries with info for fitting external regressors + to component time series. + If regex patterns like '^mot_.*$' are used to define regressor names, + these are replaced with a list of the matching column names used in external_regressors + + Raises + ------ + RegressorError if any validation test fails + """ + # err_msg is used to collect all errors in validation rather than require this to be run + # multiple times to see all validation errors. + # Will either collect errors and raise at the end of the function + # or raise errors that prevent the rest of the function from completing + err_msg = "" + + external_regressor_names = set(external_regressors.columns) + + def expand_regress_regex(regressor_templates, external_regressor_names, err_msg): + """Match or regex expand regressor names from config.""" + expanded_regressor_names = set() + for tmp_regressor_name in regressor_templates: + # If a regressor name is a regular expression, use re to match and expand + # by comparing to regressor names in external_regressor_names. 
+ if tmp_regressor_name.startswith("^"): + expanded_names = [ + reg_name + for reg_name in external_regressor_names + if re.match(tmp_regressor_name, reg_name, re.IGNORECASE) + ] + if not expanded_names: + err_msg += ( + "No external regressor labels matching " + f"regular expression '{tmp_regressor_name}' found.\n" + ) + else: + expanded_regressor_names.update(set(expanded_names)) + else: + # If a regressor name is a string, check if it is in external_regressor_names + if tmp_regressor_name in external_regressor_names: + expanded_regressor_names.add(tmp_regressor_name) + else: + err_msg += ( + f"No external regressor matching '{tmp_regressor_name}' was found.\n" + ) + return expanded_regressor_names, err_msg + + # Expanding the regressors used for each model + all_regressor_names = set() + for config_idx in range(len(external_regressor_config)): + expanded_regressor_names, err_msg = expand_regress_regex( + external_regressor_config[config_idx]["regressors"], external_regressor_names, err_msg + ) + reused_regressors = all_regressor_names.intersection(expanded_regressor_names) + if reused_regressors: + LGR.warning( + f"{list(reused_regressors).sort()} used in " + "more than one external regressor model" + ) + all_regressor_names.update(expanded_regressor_names) + external_regressor_config[config_idx]["regressors"] = sorted(expanded_regressor_names) + + extra_names = set(external_regressor_names) - all_regressor_names + if extra_names: + LGR.warning( + "User-provided external_regressors include columns not used in any " + f"external regressor model: {sorted(extra_names)}" + ) + + # If a model includes specifications for partial regressors, expand them + for config_idx in range(len(external_regressor_config)): + if "partial_models" in external_regressor_config[config_idx].keys(): + if not isinstance(external_regressor_config[config_idx]["partial_models"], type(None)): + part_model_regress_names = set() + for part_model in 
external_regressor_config[config_idx]["partial_models"].keys(): + expanded_regressor_names, err_msg = expand_regress_regex( + external_regressor_config[config_idx]["partial_models"][part_model], + external_regressor_names, + err_msg, + ) + reused_regressors = part_model_regress_names.intersection( + expanded_regressor_names + ) + if reused_regressors: + LGR.warning( + f"{sorted(reused_regressors)} used in " + "more than one partial regressor model for " + f"{external_regressor_config[config_idx]['regress_ID']}" + ) + part_model_regress_names.update(expanded_regressor_names) + external_regressor_config[config_idx]["partial_models"][part_model] = sorted( + expanded_regressor_names + ) + extra_names = part_model_regress_names - set( + external_regressor_config[config_idx]["regressors"] + ) + if extra_names: + err_msg += ( + f"Partial models in {external_regressor_config[config_idx]['regress_ID']} " + "include regressors that are excluded from its full model: " + f"{sorted(extra_names)}\n" + ) + + if len(external_regressors.index) != n_vols: + err_msg += ( + f"External Regressors have {len(external_regressors.index)} timepoints " + f"while fMRI data have {n_vols} timepoints\n" + ) + + if err_msg: + raise RegressError(err_msg) + + return external_regressor_config + + +def fit_regressors( + comptable: pd.DataFrame, + external_regressors: pd.DataFrame, + external_regressor_config: List[Dict], + mixing: npt.NDArray, +) -> pd.DataFrame: + """Fit regressors to the mixing matrix. + + Uses correlation or F statistics in a linear model depending on the calc_stats + value in external_regressor_config + + Parameters + ---------- + comptable : (C x X) :obj:`pandas.DataFrame` + Component metric table. One row for each component, + with a column for each metric. The index is the component number. 
+ external_regressors : (T x R) :obj:`pandas.DataFrame` + Each column is a labelled regressor and the number of rows should + match the number of timepoints in the fMRI time series + external_regressor_config : :obj:`list[dict]` + Information describing the external regressors and + method to use for fitting and statistical tests + mixing : (T x C) array_like + Mixing matrix for converting input data to component space, + where `C` is components and `T` is the same as in `data_cat` + + Returns + ------- + comptable : (C x X) :obj:`pandas.DataFrame` + Component metric table. + Same as inputted, with added columns for metrics related to the external regressor fits + """ + n_vols = mixing.shape[0] + + # For every model (i.e. nuisance and task) in external_regressor_config + # setup and run fit_mixing_to_regressors to add columns to comptable + for config_idx in range(len(external_regressor_config)): + regress_id = external_regressor_config[config_idx]["regress_ID"] + # If the order of detrending regressors is specified, then pass to + # create_legendre_polynomial_basis_set + # otherwise the function sets an order for the Legendre polynomials + if external_regressor_config[config_idx]["detrend"] is True: + legendre_arr = utils.create_legendre_polynomial_basis_set(n_vols, dtrank=None) + LGR.info( + f"External regressors fit for {regress_id} includes detrending with " + f"{legendre_arr.shape[1]} Legendre Polynomial regressors." + ) + + elif ( + isinstance(external_regressor_config[config_idx]["detrend"], int) + and external_regressor_config[config_idx]["detrend"] > 0 + ): + legendre_arr = utils.create_legendre_polynomial_basis_set( + n_vols, dtrank=external_regressor_config[config_idx]["detrend"] + ) + LGR.info( + f"External regressors fit for {regress_id} includes detrending with " + f"{legendre_arr.shape[1]} Legendre Polynomial regressors." + ) + else: + LGR.warning( + f"External regressor for {regress_id} fitted without detrending fMRI time series. 
" + "Only removing mean" + ) + legendre_arr = utils.create_legendre_polynomial_basis_set(n_vols, dtrank=1) + + detrend_labels = [] + for label_idx in range(legendre_arr.shape[1]): + detrend_labels.append(f"baseline {label_idx}") + detrend_regressors = pd.DataFrame(data=legendre_arr, columns=detrend_labels) + + if external_regressor_config[config_idx]["statistic"].lower() == "f": + comptable = fit_mixing_to_regressors( + comptable, + external_regressors, + external_regressor_config[config_idx], + mixing, + detrend_regressors, + ) + else: + # This should already be validated by this point, but keeping the catch clause here + # since this would otherwise just return comptable with no changes, which would + # make a hard-to-track error + raise ValueError( + f"statistic for {regress_id} external regressors in decision tree is " + f"{external_regressor_config[config_idx]['statistic'].lower()}, " + "which is not valid." + ) + + return comptable + + +def fit_mixing_to_regressors( + comptable: pd.DataFrame, + external_regressors: pd.DataFrame, + external_regressor_config: Dict, + mixing: npt.NDArray, + detrend_regressors: pd.DataFrame, +) -> pd.DataFrame: + """Compute Linear Model and calculate F statistics and P values for combinations of regressors. + + Equation: Y = XB + E + - Y = each ICA component (mixing) + - X = Design (Regressor) matrix (subsets of noise_regress_table) + - B = Weighting Factors (solving for B) + - E = errors (Y - Y_pred OR Y - XB) + + Parameters + ---------- + comptable : (C x X) :obj:`pandas.DataFrame` + Component metric table. + One row for each component, with a column for each metric. + The index is the component number. + external_regressors : :obj:`pandas.DataFrame` + Each column is a labelled regressor and the number of rows should + match the number of timepoints in the fMRI time series + external_regressor_config : :obj:`dict` + Information describing the external regressors and + method to use for fitting and statistical tests. 
+ In other functions this is a list[dict] but here it is a + single dict which is one element in the list[dict] + mixing : (T x C) array_like + Mixing matrix for converting input data to component space, + where `C` is components and `T` is the same as in `data_cat` + detrend_regressors: (n_vols x polort) :obj:`pandas.DataFrame` + Dataframe containing the detrending regressor time series + + Returns + ------- + comptable : (C x X) :obj:`pandas.DataFrame` + Component metric table. + Same as inputted, with added columns for metrics related to external regressor fits. + New columns for F, R2, and p values for the full model and all partial models. + Names are "Fstat Full Model", "pval Full Model", "R2stat Full Model", + and "Full" is replaced by the partial model name for each partial model + """ + regress_id = external_regressor_config["regress_ID"] + LGR.info(f"Running fit_mixing_to_regressors for {regress_id}") + LGR.info(f"ICA matrix has {mixing.shape[0]} time points and {mixing.shape[1]} components") + + # regressor_models is a dictionary of all the models that will be fit to the mixing matrix + # It will always include 'base' which is just the polort detrending regressors and + # 'full' which is all relevant regressors including the detrending regressors + # For F statistics, the other models need for tests are those that include everything + # EXCEPT the category of interest. 
For example, there will also be a field for "no Motion" + # which contains all regressors in the full model except those that model motion + regressor_models = build_fstat_regressor_models( + external_regressors, external_regressor_config, detrend_regressors + ) + + # This is the test for the fit of the full model vs the polort detrending baseline + # The outputs will be what we use to decide which components to reject + betas_full, f_vals_tmp, p_vals_tmp, r2_vals_tmp = fit_model_with_stats( + y=mixing, regressor_models=regressor_models, base_label="base" + ) + + # TODO beta_full_model are the fits to all external regressors and might be useful to save + # TODO Also consider saving regressor_models or the detrending regressors + # betas_full_model = pd.DataFrame( + # data=betas_full.T, + # columns=np.concatenate( + # (np.array(detrend_regressors.columns), np.array(exte rnal_regressors.columns)) + # ), + # ) + f_vals = pd.DataFrame(data=f_vals_tmp, columns=[f"Fstat {regress_id} model"]) + p_vals = pd.DataFrame(data=p_vals_tmp, columns=[f"pval {regress_id} model"]) + r2_vals = pd.DataFrame(data=r2_vals_tmp, columns=[f"R2stat {regress_id} model"]) + + # Test the fits between the full model and the full model excluding one category of regressor + if "partial_models" in external_regressor_config.keys(): + for pmodel in external_regressor_config["partial_models"].keys(): + _, f_vals_tmp, p_vals_tmp, r2_vals_tmp = fit_model_with_stats( + mixing, regressor_models, f"no {pmodel}" + ) + f_vals[f"Fstat {regress_id} {pmodel} partial model"] = f_vals_tmp + p_vals[f"pval {regress_id} {pmodel} partial model"] = p_vals_tmp + r2_vals[f"R2stat {regress_id} {pmodel} partial model"] = r2_vals_tmp + + # Add all F p and R2 statistics to comptable + comptable = pd.concat((comptable, f_vals, p_vals, r2_vals), axis=1) + + return comptable + + +def build_fstat_regressor_models( + external_regressors: pd.DataFrame, + external_regressor_config: Dict, + detrend_regressors: pd.DataFrame, +) -> 
Dict: + """Combine detrending all or subsets of external regressors to make models to fit and test. + + Parameters + ---------- + external_regressors : :obj:`pandas.DataFrame` + Each column is a labelled regressor and the number of rows should + match the number of timepoints in the fMRI time series + external_regressor_config : :obj:`dict` + Information describing the external regressors and + method to use for fitting and statistical tests. + In other functions this is a list[dict] but here it is a + single dict which is one element in the list[dict] + detrend_regressors: (n_vols x polort) :obj:`pandas.DataFrame` + Dataframe containing the detrending regressor time series + + Returns + ------- + regressor_models: :obj:`dict` + Each element in the dictionary is a numpy array defining the regressors in a + regressor model. + The models that are always included are 'base' which is just the detrending regressors, + and 'full' which is all user-provided external regressors and the detrending regressors. + If partial models are named in external_regressor_config["f_stats_partial_models"], + then each of those will have a dictionary element named "no" then model name and the + regressors included will be everything except the specified regressors. + That is "no motion" will include all regressors except the motion regressors. + This is for the F test which compares the variance explained with the full model to the + variance explained if the regressors-of-interest for the partial model are removed. 
+ """ + regress_id = external_regressor_config["regress_ID"] + # The category titles to group each regressor + if "partial_models" in external_regressor_config: + partial_models = external_regressor_config["partial_models"].keys() + else: + partial_models = [] + + # All regressor labels for the full model + regressor_labels = set(external_regressor_config["regressors"]) + + detrend_regressors_arr = detrend_regressors.to_numpy() + regressor_models = {"base": detrend_regressors_arr} + LGR.info(f"Size for base regressor model for {regress_id}: {regressor_models['base'].shape}") + + regressor_models["full"] = detrend_regressors_arr + for keep_label in regressor_labels: + regressor_models["full"] = np.concatenate( + ( + regressor_models["full"], + np.atleast_2d(stats.zscore(external_regressors[keep_label].to_numpy(), axis=0)).T, + ), + axis=1, + ) + regressor_labels.update(set(detrend_regressors.columns)) + # regressor_models["full"] = np.concatenate( + # (detrend_regressors_arr, stats.zscore(external_regressors.to_numpy(), axis=0)), axis=1 + # ) + LGR.info(f"Size for full regressor model for {regress_id}: {regressor_models['full'].shape}") + LGR.info(f"Regressors in full model for {regress_id}: {sorted(set(regressor_labels))}") + + for pmodel in partial_models: + # For F statistics, the other models to test are those that include everything EXCEPT + # the category of interest + # That is "no motion" should contain the full model excluding motion regressors + keep_labels = set(external_regressor_config["regressors"]) - set( + external_regressor_config["partial_models"][pmodel] + ) + no_pmodel = f"no {pmodel}" + regressor_models[no_pmodel] = detrend_regressors_arr + for keep_label in keep_labels: + regressor_models[no_pmodel] = np.concatenate( + ( + regressor_models[no_pmodel], + np.atleast_2d( + stats.zscore(external_regressors[keep_label].to_numpy(), axis=0) + ).T, + ), + axis=1, + ) + keep_labels.update(set(detrend_regressors.columns)) + LGR.info( + f"Size of external 
regressor partial model '{no_pmodel}': " + f"{regressor_models[no_pmodel].shape}" + ) + LGR.info( + "Regressors in partial model (everything but regressors of interest) " + f"'{no_pmodel}': {sorted(keep_labels)}" + ) + + return regressor_models + + +def fit_model_with_stats( + y: npt.NDArray, regressor_models: Dict, base_label: str, full_label: str = "full" +) -> Tuple[npt.NDArray, npt.NDArray, npt.NDArray, npt.NDArray]: + """Fit full and partial models and calculate F stats, R2, and p values. + + Math from page 11-14 of https://afni.nimh.nih.gov/pub/dist/doc/manual/3dDeconvolve.pdf + Calculates Y=betas*X + error for the base and the full model + F = ((SSE_base-SSE_full)/(DF_base-DF_full)) / (SSE_full/DF_full) + DF = degrees of freedom + SSE = sum of squares error + + Parameters + ---------- + y : (T X C) :obj:`numpy.ndarray` + Time by mixing matrix components for the time series for fitting + regressor_models: :obj:`dict` + Each element in the dictionary is a numpy array defining the regressors in a + regressor model. + Inclues 'full', 'base' and partial models. + base_label : :obj:`str` + The base model to compare the full model against. + For F stat for the full model, this should be 'base'. + For partial models, this should be the name of the partial model (i.e. "no motion"). + full_label : :obj:`str` + The full model to use. + Default="full". + "full" is expected if the goal is to compare all nuissance regressors to a base model. + "task_keep" for the special case of fitting pre-defined task-locked regressors. 
+ + Returns + ------- + betas_full : (C x R) :obj:`numpy.ndarray` + The beta fits for the full model (components x regressors) + f_vals : (C x M) :obj:`numpy.ndarray` + The F statistics for the fits to the full and partial models + p_vals : (C x M) :obj:`numpy.ndarray` + The p values for the fits to the full and partial models + r2_vals : (C x M) :obj:`numpy.ndarray` + The R2 statistics for the fits to the full and partial models + """ + betas_base, sse_base, df_base = fit_model(regressor_models[base_label], y) + betas_full, sse_full, df_full = fit_model(regressor_models[full_label], y) + + # larger sample variance / smaller sample variance (F = (SSE1 – SSE2 / m) / SSE2 / n-k, + # where SSE = residual sum of squares, m = number of restrictions and k = number of + # independent variables) -> the 'm' restrictions in this case is the DOF range between + # the base - full model, the 'n-k' is the number of DOF (independent variables/timepoints) + # from the fully-fitted model + f_vals = np.divide((sse_base - sse_full) / (df_base - df_full), (sse_full / df_full)) + # cumulative distribution (FX(x) = P(X<=x), X = real-valued variable, x=probable variable + # within distribution) of the F-values + extra parameters to shape the range of the + # distribution values (range: start = DOF_base (unfitted) - + # DOF_Full (fitted with full model), end = DOF_Full) + p_vals = 1 - stats.f.cdf(f_vals, df_base - df_full, df_full) + # estimate proportion of variance (R2-squared fit) by SSE full [fully fitted] + # (sum of 1st errors) / SSE base [non-fitted] (sum of 2nd errors) ... 
and subtracting + # the error ratio from 1: R² = SSR (fitted model)/SST(total or model base) = + # Σ (Y_pred-Y_actual)**2 / Σ (Y_pred-Y_actual)**2 + r2_vals = 1 - np.divide(sse_full, sse_base) + print(y.shape) + + return betas_full, f_vals, p_vals, r2_vals diff --git a/tedana/resources/decision_trees/demo_external_regressors_motion_task_models.json b/tedana/resources/decision_trees/demo_external_regressors_motion_task_models.json new file mode 100644 index 000000000..87b548961 --- /dev/null +++ b/tedana/resources/decision_trees/demo_external_regressors_motion_task_models.json @@ -0,0 +1,293 @@ +{ + "tree_id": "demo_external_regressors_motion_task_models", + "info": "Demonstration based on the minimal decision tree that uses partial F stats on a model with multiple external regressors divided by category and task regressors to bias towards keeping.", + "report": "This is based on the minimal criteria of the original MEICA decision tree \\citep{kundu2013integrated} without the more aggressive noise removal steps \\citep{dupre2021te}.", + "necessary_metrics": [ + "kappa", + "rho", + "countsigFS0", + "countsigFT2", + "dice_FS0", + "dice_FT2", + "signal-noise_t", + "variance explained", + "pval nuisance model", + "pval task model", + "pval nuisance Motion partial model", + "pval nuisance CSF partial model", + "R2stat nuisance model", + "R2stat task model" + ], + "intermediate_classifications": ["provisionalaccept", "provisionalreject"], + "classification_tags": [ + "Likely BOLD", + "Unlikely BOLD", + "Low variance", + "External regressors", + "Fits motion external regressors", + "Fits CSF external regressors", + "Fits task" + ], + "external_regressor_config": [ + { + "regress_ID": "nuisance", + "info": "Fits all external nuisance regressors to a single model using an F statistic", + "report": "External nuisance regressors that fit to components using a linear model were rejected.", + "detrend": true, + "statistic": "F", + "regressors": ["^(?!signal).*$"], + 
"partial_models": { + "Motion": ["^mot_.*$"], + "CSF": ["^csf.*$"] + } + }, + { + "regress_ID": "task", + "info": "Fits all task regressors to a single model using an F statistic", + "report": "Task regressors that fit to components using a linear model and have some T2* weighting were accepted even if they would have been rejected base on other criteriea.", + "detrend": true, + "statistic": "F", + "regressors": ["^signal.*$"] + } + ], + "nodes": [ + { + "functionname": "manual_classify", + "parameters": {"new_classification": "unclassified", "decide_comps": "all"}, + "kwargs": {"clear_classification_tags": true, "dont_warn_reclassify": true} + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalreject", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": "rho", + "right": "kappa" + }, + "kwargs": {"tag_if_true": "Unlikely BOLD"} + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalreject", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": "countsigFS0", + "right": "countsigFT2" + }, + "kwargs": { + "left2": "countsigFT2", + "op2": ">", + "right2": 0, + "tag_if_true": "Unlikely BOLD" + } + }, + { + "functionname": "calc_median", + "parameters": { + "decide_comps": "all", + "metric_name": "variance explained", + "median_label": "varex" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalreject", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": "dice_FS0", + "right": "dice_FT2" + }, + "kwargs": { + "left2": "variance explained", + "op2": ">", + "right2": "median_varex", + "tag_if_true": "Unlikely BOLD" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalreject", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": 0, + "right": "signal-noise_t" + }, + "kwargs": { + "left2": "variance explained", + "op2": ">", + 
"right2": "median_varex", + "tag_if_true": "Unlikely BOLD" + } + }, + { + "functionname": "calc_kappa_elbow", + "parameters": {"decide_comps": "all"}, + "_comment": "" + }, + { + "functionname": "calc_rho_elbow", + "parameters": {"decide_comps": "all"}, + "kwargs": { + "subset_decide_comps": "unclassified", + "rho_elbow_type": "liberal", + "log_extra_info": "" + }, + "_comment": "" + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalaccept", + "if_false": "provisionalreject", + "decide_comps": "unclassified", + "op": ">=", + "left": "kappa", + "right": "kappa_elbow_kundu" + }, + "kwargs": { + "log_extra_info": "If kappa> kappa elbow and rho", + "left": "kappa", + "right": "rho" + }, + "kwargs": { + "log_extra_info": "If kappa>elbow and kappa>2*rho accept even if rho>elbow", + "right_scale": 2, + "op2": ">", + "left2": "kappa", + "right2": "kappa_elbow_kundu", + "tag_if_true": "Likely BOLD", + "tag_if_false": "Unlikely BOLD" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalreject", + "if_false": "nochange", + "decide_comps": "all", + "op": "<", + "left": "pval nuisance model", + "right": 0.05 + }, + "kwargs": { + "op2": ">", + "left2": "R2stat nuisance model", + "right2": 0.5, + "log_extra_info": "If external regressors fit with p<0.05 and model R2>0.5 of the variance, then reject.", + "tag_if_true": "External regressors" + }, + "_comment": "Provisionally rejecting components that fit to the external regressor noise model" + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "nochange", + "if_false": "nochange", + "decide_comps": "provisionalreject", + "op": "<", + "left": "pval nuisance model", + "right": 0.05 + }, + "kwargs": { + "op2": ">", + "left2": "R2stat nuisance model", + "right2": 0.5, + "op3": "<", + "left3": "pval nuisance Motion partial model", + "right3": 0.05, + "tag_if_true": "Fits motion external regressors" + }, + "_comment": "Identical to 
the one above, & not changing classifications, but tagging if fits to motion regressors" + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "nochange", + "if_false": "nochange", + "decide_comps": "provisionalreject", + "op": "<", + "left": "pval nuisance model", + "right": 0.05 + }, + "kwargs": { + "op2": ">", + "left2": "R2stat nuisance model", + "right2": 0.5, + "op3": "<", + "left3": "pval nuisance CSF partial model", + "right3": 0.05, + "tag_if_true": "Fits CSF external regressors" + }, + "_comment": "Identical to the one above, & not changing classifications, but tagging if fits to CSF regressors" + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "accepted", + "if_false": "nochange", + "decide_comps": ["provisionalreject"], + "op": "<", + "left": "pval task model", + "right": 0.05 + }, + "kwargs": { + "op2": ">", + "left2": "R2stat task model", + "right2": 0.5, + "op3": ">=", + "left3": "kappa", + "right3": "kappa_elbow_kundu", + "tag_if_true": "Fits task" + }, + "_comment": "Keep if it fits task regressors and contains T2* signal, as defined by kappa>elbow" + }, + { + "functionname": "dec_variance_lessthan_thresholds", + "parameters": { + "if_true": "accepted", + "if_false": "nochange", + "decide_comps": "provisionalreject" + }, + "kwargs": { + "var_metric": "variance explained", + "single_comp_threshold": 0.1, + "all_comp_threshold": 1.0, + "tag_if_true": "Low variance" + } + }, + { + "functionname": "manual_classify", + "parameters": {"new_classification": "accepted", "decide_comps": "provisionalaccept"}, + "kwargs": {"tag": "Likely BOLD"} + }, + { + "functionname": "manual_classify", + "parameters": { + "new_classification": "rejected", + "decide_comps": ["provisionalreject", "unclassified"] + }, + "kwargs": {"tag": "Unlikely BOLD"} + } + ] +} diff --git a/tedana/resources/decision_trees/demo_external_regressors_single_model.json 
b/tedana/resources/decision_trees/demo_external_regressors_single_model.json new file mode 100644 index 000000000..1900ca2e5 --- /dev/null +++ b/tedana/resources/decision_trees/demo_external_regressors_single_model.json @@ -0,0 +1,205 @@ +{ + "tree_id": "demo_external_regressors_single_model", + "info": "Demonstration based on the minimal decision tree that uses an F stat on a model with multiple external regressors.", + "report": "This is based on the minimal criteria of the original MEICA decision tree \\citep{kundu2013integrated} without the more aggressive noise removal steps \\citep{dupre2021te}.", + "necessary_metrics": [ + "kappa", + "rho", + "countsigFS0", + "countsigFT2", + "dice_FS0", + "dice_FT2", + "signal-noise_t", + "variance explained", + "pval nuisance model", + "R2stat nuisance model" + ], + "intermediate_classifications": ["provisionalaccept", "provisionalreject"], + "classification_tags": ["Likely BOLD", "Unlikely BOLD", "Low variance", "External regressors"], + "external_regressor_config": [ + { + "regress_ID": "nuisance", + "info": "Fits all external nuisance regressors to a single model using an F statistic", + "report": "External nuisance regressors that fit to components using a linear model were rejected.", + "detrend": true, + "statistic": "F", + "regressors": ["^.*$"] + } + ], + "nodes": [ + { + "functionname": "manual_classify", + "parameters": {"new_classification": "unclassified", "decide_comps": "all"}, + "kwargs": {"clear_classification_tags": true, "dont_warn_reclassify": true} + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "rejected", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": "rho", + "right": "kappa" + }, + "kwargs": {"tag_if_true": "Unlikely BOLD"} + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "rejected", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": "countsigFS0", + "right": "countsigFT2" + }, + "kwargs": 
{ + "left2": "countsigFT2", + "op2": ">", + "right2": 0, + "tag_if_true": "Unlikely BOLD" + } + }, + { + "functionname": "calc_median", + "parameters": { + "decide_comps": "all", + "metric_name": "variance explained", + "median_label": "varex" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "rejected", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": "dice_FS0", + "right": "dice_FT2" + }, + "kwargs": { + "left2": "variance explained", + "op2": ">", + "right2": "median_varex", + "tag_if_true": "Unlikely BOLD" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "rejected", + "if_false": "nochange", + "decide_comps": "all", + "op": ">", + "left": 0, + "right": "signal-noise_t" + }, + "kwargs": { + "left2": "variance explained", + "op2": ">", + "right2": "median_varex", + "tag_if_true": "Unlikely BOLD" + } + }, + { + "functionname": "calc_kappa_elbow", + "parameters": {"decide_comps": "all"}, + "_comment": "" + }, + { + "functionname": "calc_rho_elbow", + "parameters": {"decide_comps": "all"}, + "kwargs": {"subset_decide_comps": "unclassified", "rho_elbow_type": "liberal"}, + "_comment": "" + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalaccept", + "if_false": "provisionalreject", + "decide_comps": "unclassified", + "op": ">=", + "left": "kappa", + "right": "kappa_elbow_kundu" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalreject", + "if_false": "nochange", + "decide_comps": ["provisionalreject", "provisionalaccept"], + "op": ">", + "left": "rho", + "right": "rho_elbow_liberal" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "provisionalaccept", + "if_false": "nochange", + "decide_comps": "provisionalreject", + "op": ">", + "left": "kappa", + "right": "rho" + }, + "kwargs": { + "log_extra_info": "If kappa>elbow and kappa>2*rho accept even if rho>elbow", + 
"right_scale": 2, + "op2": ">", + "left2": "kappa", + "right2": "kappa_elbow_kundu", + "tag_if_true": "Likely BOLD" + } + }, + { + "functionname": "dec_left_op_right", + "parameters": { + "if_true": "rejected", + "if_false": "nochange", + "decide_comps": "all", + "op": "<", + "left": "pval nuisance model", + "right": 0.05 + }, + "kwargs": { + "op2": ">", + "left2": "R2stat nuisance model", + "right2": 0.5, + "log_extra_info": "If external regressors fit with p<0.05 and model R2>0.5 of the variance, then reject.", + "tag_if_true": "External regressors" + } + }, + { + "functionname": "dec_variance_lessthan_thresholds", + "parameters": { + "if_true": "accepted", + "if_false": "nochange", + "decide_comps": "provisionalreject" + }, + "kwargs": { + "var_metric": "variance explained", + "single_comp_threshold": 0.1, + "all_comp_threshold": 1.0, + "tag_if_true": "Low variance" + } + }, + + { + "functionname": "manual_classify", + "parameters": {"new_classification": "accepted", "decide_comps": "provisionalaccept"}, + "kwargs": {"tag": "Likely BOLD"} + }, + { + "functionname": "manual_classify", + "parameters": { + "new_classification": "rejected", + "decide_comps": ["provisionalreject", "unclassified"] + }, + "kwargs": {"tag": "Unlikely BOLD"} + } + ] +} diff --git a/tedana/selection/component_selector.py b/tedana/selection/component_selector.py index f78477f92..e54845546 100644 --- a/tedana/selection/component_selector.py +++ b/tedana/selection/component_selector.py @@ -3,7 +3,9 @@ import inspect import logging import os.path as op +from typing import Dict, List, Union +import pandas as pd from numpy import asarray from tedana.io import load_json @@ -23,7 +25,16 @@ # A user can run the desision tree either using one of these # names or by giving the full path to a tree in a different # location -DEFAULT_TREES = ["minimal", "meica", "tedana_orig"] +# Including the demo trees here so people do not need to type the full +# path, but these are not listed as options in the CLI 
help menu until +# they are actually validated as useful on some data +DEFAULT_TREES = [ + "minimal", + "meica", + "tedana_orig", + "demo_external_regressors_single_model", + "demo_external_regressors_motion_task_models", +] class TreeError(Exception): @@ -32,7 +43,7 @@ class TreeError(Exception): pass -def load_config(tree): +def load_config(tree: str) -> Dict: """Load the json file with the decision tree and validate the fields in the decision tree. Parameters @@ -73,7 +84,7 @@ def load_config(tree): return validate_tree(dectree) -def validate_tree(tree): +def validate_tree(tree: Dict) -> Dict: """Confirm that provided `tree` is a valid decision tree. Parameters @@ -100,11 +111,17 @@ def validate_tree(tree): "intermediate_classifications", "classification_tags", "nodes", + "external_regressor_config", ] + tree_optional_keys = ["generated_metrics"] defaults = {"selector", "decision_node_idx"} default_classifications = {"nochange", "accepted", "rejected", "unclassified"} default_decide_comps = {"all", "accepted", "rejected", "unclassified"} + # If a tree doesn't include "external_regressor_config", instead of crashing, set to None + if "external_regressor_config" not in set(tree.keys()): + tree["external_regressor_config"] = None + # Confirm that the required fields exist missing_keys = set(tree_expected_keys) - set(tree.keys()) if missing_keys: @@ -115,16 +132,15 @@ def validate_tree(tree): # Warn if unused fields exist unused_keys = set(tree.keys()) - set(tree_expected_keys) - {"used_metrics"} - # Make sure some fields don't trigger a warning; hacky, sorry - ok_to_not_use = ( - "reconstruct_from", - "generated_metrics", - ) - for k in ok_to_not_use: + + # Separately check for optional, but used keys + for k in tree_optional_keys: if k in unused_keys: unused_keys.remove(k) if unused_keys: - LGR.warning(f"Decision tree includes fields that are not used or logged {unused_keys}") + LGR.warning( + f"Decision tree includes fields that are not used or logged 
{sorted(unused_keys)}" + ) # Combine the default classifications with the user inputted classifications all_classifications = set(tree.get("intermediate_classifications")) | set( @@ -194,7 +210,9 @@ def validate_tree(tree): compclass = compclass | set(tmp_comp) nonstandard_labels = compclass.difference(all_classifications) if nonstandard_labels: - LGR.warning(f"{compclass} in node {i} of the decision tree includes a classification") + LGR.warning( + f"{sorted(compclass)} in node {i} of the decision tree includes a classification" + ) if "decide_comps" in node.get("parameters").keys(): tmp_comp = node["parameters"]["decide_comps"] if isinstance(tmp_comp, str): @@ -203,7 +221,7 @@ def validate_tree(tree): nonstandard_labels = compclass.difference(all_decide_comps) if nonstandard_labels: LGR.warning( - f"{compclass} in node {i} of the decision tree includes a classification " + f"{sorted(compclass)} in node {i} of the decision tree includes a classification " "label that was not predefined" ) @@ -218,10 +236,68 @@ def validate_tree(tree): undefined_classification_tags = tagset.difference(set(tree.get("classification_tags"))) if undefined_classification_tags: LGR.warning( - f"{tagset} in node {i} of the decision tree includes a classification " + f"{sorted(tagset)} in node {i} of the decision tree includes a classification " "tag that was not predefined" ) + # If there is an external_regressor_config field, validate it + if tree["external_regressor_config"] is not None: + external_regressor_config = tree["external_regressor_config"] + if isinstance(external_regressor_config, list): + # Define the fields that should always be present + dict_expected_keys = set( + ["regress_ID", "info", "report", "detrend", "statistic", "regressors"] + ) + dict_optional_keys = set(["partial_models"]) + # Right now, "f" is the only option, but this leaves open the possibility + # to have additional options + statistic_key_options = set("f") + + for config_idx in 
range(len(external_regressor_config)): + # Confirm that the required fields exist + missing_keys = dict_expected_keys - set( + external_regressor_config[config_idx].keys() + ) + if missing_keys: + err_msg += ( + f"External regressor dictionary {config_idx} " + f"is missing required fields: {missing_keys}\n" + ) + + if "statistic" in set(external_regressor_config[config_idx].keys()): + if ( + external_regressor_config[config_idx]["statistic"].lower() + not in statistic_key_options + ): + err_msg += ( + "statistic in external_regressor_config 1 is " + f"{external_regressor_config[config_idx]['statistic']}. " + "It must be one of the following: " + f"{statistic_key_options}\n" + ) + + if (external_regressor_config[config_idx]["statistic"].lower() != "f") and ( + "partial_models" in set(external_regressor_config[config_idx].keys()) + ): + err_msg += ( + "External regressor dictionary cannot include " + "partial_models if statistic is not F\n" + ) + + # Warn if unused fields exist + unused_keys = ( + set(external_regressor_config[config_idx].keys()) + - dict_expected_keys + - dict_optional_keys + ) + if unused_keys: + LGR.warning( + f"External regressor dictionary {config_idx} includes fields that " + f"are not used or logged {sorted(unused_keys)}" + ) + else: + err_msg += "External regressor dictional exists, but is not a list\n" + if err_msg: raise TreeError("\n" + err_msg) @@ -231,7 +307,7 @@ def validate_tree(tree): class ComponentSelector: """Load and classify components based on a specified ``tree``.""" - def __init__(self, tree): + def __init__(self, tree: str): """Initialize the class using the info specified in the json file ``tree``. 
Parameters @@ -255,7 +331,12 @@ def __init__(self, tree): self.classification_tags = set(self.tree["classification_tags"]) self.tree["used_metrics"] = set(self.tree.get("used_metrics", [])) - def select(self, component_table, cross_component_metrics={}, status_table=None): + def select( + self, + component_table: pd.DataFrame, + cross_component_metrics: Dict = {}, + status_table: Union[pd.DataFrame, None] = None, + ): """Apply the decision tree to data. Using the validated tree in ``ComponentSelector`` to run the decision @@ -274,6 +355,9 @@ def select(self, component_table, cross_component_metrics={}, status_table=None) A table tracking the status of each component at each step. Pass a status table if running additional steps on a decision tree that was already executed. Default=None. + external_regressor_config : :obj:`dict` + Information describing the external regressors and + method to use for fitting and statistical tests Notes ----- @@ -334,11 +418,40 @@ def select(self, component_table, cross_component_metrics={}, status_table=None) # this will crash the program with an error message if not all # necessary_metrics are in the comptable confirm_metrics_exist( - self.component_table_, - self.necessary_metrics, + component_table=self.component_table_, + necessary_metrics=self.necessary_metrics, function_name=self.tree_name, ) + # To run a decision tree, each component needs to have an initial classification + # If the classification column doesn't exist, create it and label all components + # as unclassified + if "classification" not in self.component_table_: + self.component_table_["classification"] = "unclassified" + + if status_table is None: + self.component_status_table_ = self.component_table_[ + ["Component", "classification"] + ].copy() + self.component_status_table_ = self.component_status_table_.rename( + columns={"classification": "initialized classification"} + ) + self.start_idx_ = 0 + else: + # Since a status table exists, we need to skip nodes up 
to the + # point where the last tree finished. Notes that were executed + # have an output field. Identify the last node with an output field + tmp_idx = len(self.tree["nodes"]) - 1 + while ("outputs" not in self.tree["nodes"][tmp_idx]) and (tmp_idx > 0): + tmp_idx -= 1 + # start at the first node that does not have an output field + self.start_idx_ = tmp_idx + 1 + LGR.info(f"Start is {self.start_idx_}") + self.component_status_table_ = status_table + + if "classification_tags" not in self.component_table_.columns: + self.component_table_["classification_tags"] = "" + # To run a decision tree, each component needs to have an initial classification # If the classification column doesn't exist, create it and label all components # as unclassified @@ -384,6 +497,7 @@ def select(self, component_table, cross_component_metrics={}, status_table=None) kwargs = self.check_null(kwargs, node["functionname"]) all_params = {**params, **kwargs} else: + kwargs = {} kwargs = {} all_params = {**params} @@ -413,7 +527,7 @@ def select(self, component_table, cross_component_metrics={}, status_table=None) self.are_all_components_accepted_or_rejected() - def add_manual(self, indices, classification): + def add_manual(self, indices: List[int], classification: str): """Add nodes that will manually classify components. Parameters @@ -437,16 +551,25 @@ def add_manual(self, indices, classification): } ) - def check_null(self, params, fcn): + def check_null(self, params: Dict, fcn: str) -> Dict: """ Check that required parameters for selection node functions are attributes in the class. Error if any are undefined. + Parameters + ---------- + params : :obj:`dict` + The keys and values for the inputted parameters + fcn : :obj:`str` + The name of a component selection function in selection_nodes.py + Returns ------- params : :obj:`dict` - The keys and values for the inputted parameters + The keys and values for the inputted parameters. 
+ If a parameter's value was defined in self.cross_component_metrics + then that value is also in params when returned """ for key, val in params.items(): if val is None: @@ -508,12 +631,12 @@ def are_all_components_accepted_or_rejected(self): ) @property - def n_comps_(self): + def n_comps_(self) -> int: """The number of components in the component table.""" return len(self.component_table_) @property - def likely_bold_comps_(self): + def likely_bold_comps_(self) -> int: """A boolean :obj:`pandas.Series` of components that are tagged "Likely BOLD".""" likely_bold_comps = self.component_table_["classification_tags"].copy() for idx in range(len(likely_bold_comps)): @@ -524,22 +647,22 @@ def likely_bold_comps_(self): return likely_bold_comps @property - def n_likely_bold_comps_(self): + def n_likely_bold_comps_(self) -> int: """The number of components that are tagged "Likely BOLD".""" return self.likely_bold_comps_.sum() @property - def accepted_comps_(self): + def accepted_comps_(self) -> pd.Series: """A boolean :obj:`pandas.Series` of components that are accepted.""" return self.component_table_["classification"] == "accepted" @property - def n_accepted_comps_(self): + def n_accepted_comps_(self) -> int: """The number of components that are accepted.""" return self.accepted_comps_.sum() @property - def rejected_comps_(self): + def rejected_comps_(self) -> pd.Series: """A boolean :obj:`pandas.Series` of components that are rejected.""" return self.component_table_["classification"] == "rejected" diff --git a/tedana/selection/selection_utils.py b/tedana/selection/selection_utils.py index aea51ad05..350d53009 100644 --- a/tedana/selection/selection_utils.py +++ b/tedana/selection/selection_utils.py @@ -1,8 +1,11 @@ """Utility functions for tedana.selection.""" import logging +import re +from typing import Dict, List, Union import numpy as np +import pandas as pd from tedana.stats import getfbounds @@ -14,7 +17,9 @@ 
############################################################## -def selectcomps2use(component_table, decide_comps): +def selectcomps2use( + component_table: pd.DataFrame, decide_comps: Union[str, List[str], List[int]] +) -> List[int]: """Get a list of component numbers that fit the classification types in ``decide_comps``. Parameters @@ -83,14 +88,14 @@ def selectcomps2use(component_table, decide_comps): def change_comptable_classifications( - selector, - if_true, - if_false, - decision_boolean, - tag_if_true=None, - tag_if_false=None, - dont_warn_reclassify=False, -): + selector, # ComponentSelector + if_true: str, + if_false: str, + decision_boolean: pd.Series, + tag_if_true: str = None, + tag_if_false: str = None, + dont_warn_reclassify: bool = False, +): # -> Tuple[ComponentSelector, int, int] """ Change or don't change the component classification. @@ -166,13 +171,13 @@ def change_comptable_classifications( def comptable_classification_changer( - selector, - boolstate, - classify_if, - decision_boolean, - tag_if=None, - dont_warn_reclassify=False, -): + selector, # : ComponentSelector, + boolstate: bool, + classify_if: str, + decision_boolean: pd.Series, + tag_if: Union[str, None] = None, + dont_warn_reclassify: bool = False, +): # -> ComponentSelector """Implement the component classification changes from ``change_comptable_classifications``. Parameters @@ -281,7 +286,7 @@ def comptable_classification_changer( return selector -def clean_dataframe(component_table): +def clean_dataframe(component_table: pd.DataFrame) -> pd.DataFrame: """ Reorder columns in component table. 
@@ -313,7 +318,11 @@ def clean_dataframe(component_table): ################################################# -def confirm_metrics_exist(component_table, necessary_metrics, function_name=None): +def confirm_metrics_exist( + component_table: pd.DataFrame, + necessary_metrics: List[str], + function_name: Union[str, None] = None, +) -> Union[None, bool]: """Confirm that all metrics declared in necessary_metrics are already included in comptable. Parameters @@ -321,11 +330,16 @@ def confirm_metrics_exist(component_table, necessary_metrics, function_name=None component_table : (C x M) :obj:`pandas.DataFrame` Component metric table. One row for each component, with a column for each metric. The index should be the component number. - necessary_metrics : :obj:`list` + necessary_metrics : :obj:`list[str]` A list of strings of metric names. function_name : :obj:`str` Text identifying the function name that called this function. + Returns + ------- + metrics_are_missing : :obj:`bool` + If there are no metrics missing, this returns True. + Raises ------ ValueError @@ -337,26 +351,42 @@ def confirm_metrics_exist(component_table, necessary_metrics, function_name=None Also, the string in ``necessary_metrics`` and the column labels in ``component_table`` will only be matched if they're identical. """ - missing_metrics = set(necessary_metrics) - set(component_table.columns) - if missing_metrics: - function_name = function_name or "unknown function" - raise ValueError( - f"Necessary metrics for {function_name}: {necessary_metrics}. " + hardcoded_metrics = [metric for metric in necessary_metrics if not metric.startswith("^")] + regex_metrics = [metric for metric in necessary_metrics if metric.startswith("^")] + # Check that all hardcoded (literal string) metrics are accounted for. + missing_metrics = sorted(list(set(hardcoded_metrics) - set(component_table.columns))) + # Check that the regular expression-based metrics are accounted for. 
+ found_metrics = component_table.columns.tolist() + for regex_metric in regex_metrics: + if not any(re.match(regex_metric, metric) for metric in found_metrics): + missing_metrics.append(regex_metric) + + metrics_are_missing = len(missing_metrics) > 0 + if metrics_are_missing: + if function_name is None: + function_name = "unknown function" + + error_msg = ( + f"Necessary metrics for {function_name}: " + f"{necessary_metrics}. " f"Comptable metrics: {set(component_table.columns)}. " f"MISSING METRICS: {missing_metrics}." ) + raise ValueError(error_msg) + + return metrics_are_missing def log_decision_tree_step( - function_name_idx, - comps2use, - decide_comps=None, - n_true=None, - n_false=None, - if_true=None, - if_false=None, - calc_outputs=None, -): + function_name_idx: str, + comps2use: Union[List[int], int], + decide_comps: Union[str, List[str], List[int], None] = None, + n_true: Union[int, None] = None, + n_false: Union[int, None] = None, + if_true: Union[str, None] = None, + if_false: Union[str, None] = None, + calc_outputs: Union[Dict, None] = None, +) -> None: """Log text to add after every decision tree calculation. Parameters @@ -423,7 +453,7 @@ def log_decision_tree_step( ) -def log_classification_counts(decision_node_idx, component_table): +def log_classification_counts(decision_node_idx: int, component_table: pd.DataFrame) -> None: """Log the total counts for each component classification in component_table. Parameters diff --git a/tedana/selection/tedica.py b/tedana/selection/tedica.py index c12d0870f..5ea6b79f2 100644 --- a/tedana/selection/tedica.py +++ b/tedana/selection/tedica.py @@ -52,13 +52,15 @@ def automatic_selection(component_table, selector, **kwargs): .. 
_FAQ: faq.html """ - LGR.info("Performing ICA component selection") + # TODO external_regressor_config was inputted in this function + LGR.info(f"Performing ICA component selection with tree: {selector.tree_name}") RepLGR.info( "\n\nNext, component selection was performed to identify BOLD (TE-dependent) and " "non-BOLD (TE-independent) components using a decision tree." ) + # TODO external_regressor_config=external_regressor_config, was here component_table["classification_tags"] = "" selector.select(component_table, cross_component_metrics=kwargs) selector.metadata_ = collect.get_metadata(selector.component_table_) diff --git a/tedana/stats.py b/tedana/stats.py index cb6991bbc..ad9e63c0f 100644 --- a/tedana/stats.py +++ b/tedana/stats.py @@ -3,7 +3,7 @@ import logging import numpy as np -from scipy import stats +from scipy import linalg, stats from tedana import utils @@ -223,3 +223,46 @@ def t_to_z(t_values, dof): if ret_float: out = out[0] return out + + +def fit_model(x, y, output_residual=False): + """ + Linear regression for a model y = betas * x + error. + + Parameters + ---------- + x : (T X R) :obj:`numpy.ndarray` + 2D array with the regressors for the specified model an time + y : (T X C) :obj:`numpy.ndarray` + Time by mixing matrix components for the time series for fitting + output_residual : :obj:`bool` + If true, then this just outputs the residual of the fit. 
+ If false, then outputs beta fits, sse, and df + + Returns + ------- + residual : (T X C) :obj:`numpy.ndarray` + The residual time series for the fit (only if output_residual is True) + betas : (R X C) :obj:`numpy.ndarray` + The magnitude fits for the model (only if output_residual is False) + sse : (C) :obj:`numpy.ndarray` + The sum of square error for the model (only if output_residual is False) + df : :obj:`int` + The degrees of freeom for the model (only if output_residual is False) + (timepoints - number of regressors) + """ + betas, _, _, _ = linalg.lstsq(x, y) + # matrix-multiplication on the regressors with the betas -> to create a new 'estimated' + # component matrix = fitted regressors (least squares beta solution * regressors) + fitted_regressors = np.matmul(x, betas) + residual = y - fitted_regressors + if output_residual: + return residual + else: + # sum the differences between the actual ICA components and the 'estimated' + # component matrix (beta-fitted regressors) + sse = np.sum(np.square(residual), axis=0) + # calculate how many individual values [timepoints] are free to vary after + # the least-squares solution [beta] betw X & Y is calculated + df = y.shape[0] - betas.shape[0] + return betas, sse, df diff --git a/tedana/tests/data/cornell_three_echo_preset_mixing_outputs.txt b/tedana/tests/data/cornell_three_echo_preset_mixing_outputs.txt new file mode 100644 index 000000000..1af72cda8 --- /dev/null +++ b/tedana/tests/data/cornell_three_echo_preset_mixing_outputs.txt @@ -0,0 +1,105 @@ +S0map.nii.gz +T2starmap.nii.gz +dataset_description.json +desc-ICAAccepted_components.nii.gz +desc-ICAAccepted_stat-z_components.nii.gz +desc-ICA_components.nii.gz +desc-ICA_decomposition.json +desc-tedana_metrics.json +desc-tedana_metrics.tsv +desc-tedana_registry.json +desc-ICACrossComponent_metrics.json +desc-ICA_status_table.tsv +desc-ICA_decision_tree.json +desc-ICA_mixing.tsv +desc_ICA_mixing_static.tsv +desc-ICA_stat-z_components.nii.gz 
+desc-adaptiveGoodSignal_mask.nii.gz +desc-denoised_bold.nii.gz +desc-optcom_bold.nii.gz +desc-confounds_timeseries.tsv +desc-rmse_statmap.nii.gz +figures +figures/adaptive_mask.svg +figures/carpet_optcom.svg +figures/carpet_denoised.svg +figures/carpet_accepted.svg +figures/carpet_rejected.svg +figures/comp_000.png +figures/comp_001.png +figures/comp_002.png +figures/comp_003.png +figures/comp_004.png +figures/comp_005.png +figures/comp_006.png +figures/comp_007.png +figures/comp_008.png +figures/comp_009.png +figures/comp_010.png +figures/comp_011.png +figures/comp_012.png +figures/comp_013.png +figures/comp_014.png +figures/comp_015.png +figures/comp_016.png +figures/comp_017.png +figures/comp_018.png +figures/comp_019.png +figures/comp_020.png +figures/comp_021.png +figures/comp_022.png +figures/comp_023.png +figures/comp_024.png +figures/comp_025.png +figures/comp_026.png +figures/comp_027.png +figures/comp_028.png +figures/comp_029.png +figures/comp_030.png +figures/comp_031.png +figures/comp_032.png +figures/comp_033.png +figures/comp_034.png +figures/comp_035.png +figures/comp_036.png +figures/comp_037.png +figures/comp_038.png +figures/comp_039.png +figures/comp_040.png +figures/comp_041.png +figures/comp_042.png +figures/comp_043.png +figures/comp_044.png +figures/comp_045.png +figures/comp_046.png +figures/comp_047.png +figures/comp_048.png +figures/comp_049.png +figures/comp_050.png +figures/comp_051.png +figures/comp_052.png +figures/comp_053.png +figures/comp_054.png +figures/comp_055.png +figures/comp_056.png +figures/comp_057.png +figures/comp_058.png +figures/comp_059.png +figures/comp_060.png +figures/comp_061.png +figures/comp_062.png +figures/comp_063.png +figures/comp_064.png +figures/comp_065.png +figures/comp_066.png +figures/comp_067.png +figures/comp_068.png +figures/rmse_brain.svg +figures/rmse_timeseries.svg +figures/s0_brain.svg +figures/s0_histogram.svg +figures/t2star_brain.svg +figures/t2star_histogram.svg +references.bib +report.txt 
+tedana_report.html diff --git a/tedana/tests/data/external_regress_Ftest_3echo.tsv b/tedana/tests/data/external_regress_Ftest_3echo.tsv new file mode 100644 index 000000000..ee5ab71cc --- /dev/null +++ b/tedana/tests/data/external_regress_Ftest_3echo.tsv @@ -0,0 +1,76 @@ +Mot_X Mot_Y Mot_Z Mot_Pitch Mot_Roll Mot_Yaw Mot_d1_X Mot_d1_Y Mot_d1_Z Mot_d1_Pitch Mot_d1_Roll Mot_d1_Yaw CSF Signal +-1.918448296 0.613840782 0.931540667 -0.6673914 -0.920974104 0.414696683 0 0 0 0 0 0 -0.682069874 -0.267104969 +-1.499312896 -1.436337287 -0.002877744 -1.4486174 1.920982078 -0.662135106 0.419135399 -1.049966602 1.371602444 -0.381774669 2.841956181 -1.076831788 1.299506949 -1.441873441 +-0.464347503 0.77742439 -2.018920101 -0.5813718 -0.08376844 -2.049602342 1.034965394 -0.639791483 -0.34571542 0.481431869 -2.004750518 -1.387467237 1.03956566 -0.012546791 +-0.882468232 0.613586639 -0.900634617 -1.8710858 0.832274633 -0.715980861 -0.418120729 -0.701576096 -1.5077406 -1.966556791 0.916043073 1.333621481 -2.117416134 -1.514608378 +-2.876432946 0.162624246 -2.693742538 0.40271992 -0.101695422 -2.322836044 -1.993964714 -0.986958583 1.393128708 1.22698911 -0.933970055 -1.606855183 0.165056144 -0.730488362 +-2.770599691 0.773069239 -0.594607386 0.87415002 -1.095591014 0.409168659 0.105833255 0.722646172 0.85406937 -0.028751384 -0.993895591 2.732004703 2.249547675 -0.118317264 +-3.205528009 -1.240282529 -0.231044327 1.2539739 0.868004428 0.741313722 -0.434928317 -0.361780621 -2.896983594 0.845013265 1.963595441 0.332145064 1.06534899 1.088560942 +-1.731140824 -0.585488802 -1.289487317 -1.4567711 -2.241535224 -1.303961556 1.474387185 -0.739533675 0.635886989 1.444834965 -3.109539652 -2.045275278 1.897657856 3.373106633 +-1.649385201 -1.010216314 0.409064517 1.82693469 0.072142733 1.011135469 0.081755623 1.713588814 0.760907196 1.438742606 2.313677958 2.315097025 0.475455108 2.886092128 +-0.356722413 -2.18462177 -1.960832678 1.34803872 0.038795691 0.586439931 1.292662788 -1.754630085 
0.947755796 -2.51869842 -0.033347042 -0.424695538 -1.782949546 0.990997189 +-2.068139427 -2.368583778 -1.31756829 0.14838923 0.48600498 0.188669434 -1.711417014 -1.922701681 -2.548820211 -1.004376035 0.447209289 -0.397770497 0.319060478 -0.009543645 +0.715406929 -0.794055247 -0.659208429 -0.9088572 0.832244029 -0.298597633 2.783546356 3.433036569 3.681919631 0.171322901 0.346239049 -0.487267067 -2.360487189 -0.86352509 +-1.040454954 -1.460509829 0.730612891 0.48960456 1.662017221 1.097618703 -1.755861883 0.771015618 -0.57528371 -1.618451595 0.829773192 1.396216336 -0.075080291 -1.79176057 +0.641959638 -0.420512505 1.474108981 -0.4546902 0.644800249 0.785479669 1.682414591 -1.407900471 -0.134985528 -0.118394819 -1.017216972 -0.312139034 0.492524814 -1.707843326 +1.17282228 -1.440402159 -1.827017963 0.13785463 0.423777039 1.865866632 0.530862642 1.413588482 -3.183847724 1.162659639 -0.22102321 1.080386963 1.96694161 0.56742326 +0.019226195 -1.206922803 -1.019112095 1.89767797 0.03740602 -0.365314788 -1.153596085 -1.368131607 2.550710877 2.038738178 -0.386371019 -2.231181419 -0.506011411 2.15937525 +0.230716637 1.025871474 -0.210799297 2.72895514 -0.090854654 1.203749971 0.211490441 1.382776835 -1.693338606 -1.1407467 -0.128260675 1.569064758 -3.346663807 2.050447006 +0.407382846 1.273926231 1.842308698 0.2197399 -1.348759461 1.743040689 0.176666209 -0.27185966 1.602569217 -0.76463226 -1.257904807 0.539290719 -1.247235976 -0.433977049 +0.1612234 0.48067487 0.69887392 -1.0384601 -0.545209917 -0.538861847 -0.246159445 -0.795743419 -0.833587695 -0.113367872 0.803549544 -2.281902536 -2.528311547 -1.005952731 +-0.774435034 0.250436935 -1.302969683 -1.4253112 -0.31725089 0.997865351 -0.935658434 0.217343614 0.663050904 -2.285610786 0.227959027 1.536727197 -0.125139007 -1.532868153 +-2.21405771 -1.19855816 0.157180224 -1.2830147 0.915847837 -1.855905265 -1.439622676 -2.135317516 1.499162395 0.711683009 1.233098727 -2.853770615 -1.372953894 -0.723766052 +0.644602937 
-0.593727894 1.116393898 -2.2446977 2.308808799 -0.738470514 2.858660646 -0.02026364 -3.100275341 0.653486815 1.392960962 1.11743475 0.219777736 -0.346882257 +-1.70817007 -0.569744613 0.856896127 2.2565618 -1.54181769 0.718533392 -2.352773007 0.345506953 1.767981229 1.488999395 -3.850626489 1.457003906 1.811748869 2.011863542 +-1.361399301 0.794794152 0.587718056 1.75394479 -0.337715782 -0.80254175 0.34677077 2.190844145 2.688794228 0.082987972 1.204101908 -1.521075142 -1.355637416 0.760877094 +-1.501020749 0.083640965 1.076805817 0.08823669 1.332131724 -1.603750955 -0.139621449 -1.359729639 -1.341695095 1.338693701 1.669847505 -0.801209205 -0.849180593 0.488569214 +0.48778649 0.710004314 0.835636131 -0.0622049 1.299686527 1.15627752 1.98880724 2.683677358 0.04576576 -0.440197601 -0.032445197 2.760028475 -0.611650727 0.011338428 +-0.442521714 -2.429135912 -0.344746589 -0.1539904 -0.361764912 2.15020488 -0.930308204 0.480707608 -1.904999213 -2.219798226 -1.661451439 0.99392736 -0.063962351 -0.36410246 +-1.293284679 -0.119698923 -2.140007228 -2.3016194 -0.092783134 0.846265045 -0.850762965 -2.56285129 -0.135733172 -0.07728598 0.268981778 -1.303939835 2.046688925 -0.08031042 +0.293540519 2.273792315 -0.922759343 -0.472177 0.371524998 -0.187843855 1.586825197 -1.795807531 0.65538016 -0.121451694 0.464308132 -1.0341089 0.913184123 -2.255100827 +-0.693487854 0.351463719 -0.162894519 0.08909674 0.702698899 -2.032021551 -0.987028372 2.557562924 -1.210489276 0.904088386 0.331173901 -1.844177697 1.386981796 -0.650041216 +0.173105971 0.46616017 0.566506326 0.55123039 -0.821761475 0.780445909 0.866593825 0.123577956 2.327322112 -0.048680501 -1.524460374 2.812467461 0.361548921 -0.515174586 +0.284124435 0.109629879 -0.256780429 0.92513744 -0.478904608 0.317718609 0.111018464 -1.192917786 -1.933131477 1.459177382 0.342856867 -0.4627273 0.152004422 -1.362814301 +-0.881234565 -2.388114332 1.356737359 0.98653926 -0.691392499 0.386870289 -1.165359 0.254732835 2.37031576 -0.124326207 
-0.212487891 0.069151679 -2.490702928 -1.635385346 +-0.255444705 -2.054123065 -0.305918819 -0.0787619 0.063676012 0.039098394 0.62578986 -0.587539233 -1.981628457 -1.33038225 0.755068511 -0.347771894 1.310144787 -0.548120778 +0.780073222 -1.176335262 -1.008397871 -2.5274442 -0.133381314 -0.262708036 1.035517928 -0.953462691 2.52450507 -0.992425857 -0.197057326 -0.30180643 -0.699933248 -1.09400854 +1.096676395 2.630985961 1.101324011 0.72981317 -0.18497425 -1.860502604 0.316603173 1.339281552 -1.131021166 -0.395124649 -0.051592935 -1.597794569 1.066356413 -0.098655736 +-1.111129192 0.648260708 1.302645215 -2.8573443 -0.191034251 0.513984918 -2.207805588 1.048362324 -1.370405474 0.426137996 -0.006060001 2.374487522 0.404978742 -1.291616158 +0.924918217 3.136233551 -1.957241535 -0.0284925 1.565544896 -0.360526519 2.036047409 0.250771109 0.145908513 1.408397727 1.756579147 -0.874511437 -0.126338151 -0.038948249 +-0.469509665 1.138025998 2.606253963 -0.5611022 1.453765216 -0.840789872 -1.394427882 -0.656810745 0.193848406 -0.165074206 -0.11177968 -0.480263353 -0.211848963 0.738228755 +-0.456972678 3.643029733 2.451272614 1.63729865 1.483563223 -1.429801084 0.012536987 -0.116567016 -0.490289096 1.226139395 0.029798008 -0.589011213 1.481842619 1.457544972 +1.117822835 0.424896113 0.317851072 1.13970873 -0.515525219 0.831537695 1.574795513 -0.93531283 -0.051682394 1.459313684 -1.999088443 2.261338779 -1.387046241 1.360885272 +0.11902894 -0.263988644 0.992680446 -0.7257977 -1.60180806 -1.4286882 -0.998793895 1.380749638 2.61009726 -1.105881546 -1.086282841 -2.260225895 -4.090650432 1.074110525 +0.782962389 0.190310101 0.886954126 -0.327059 -0.440194623 -1.257077276 0.663933449 0.825127605 -0.900437185 -1.359456187 1.161613437 0.171610924 -2.605810353 -0.116600574 +-1.225328525 -0.01698158 0.25889854 -0.6122607 0.16691481 -1.412548542 -2.008290914 -2.091189114 0.37629829 -1.893617713 0.607109433 -0.155471266 0.801223562 -1.108415283 +2.004227441 1.490583611 1.351189959 
-1.9813324 1.379480559 0.499874622 3.229555967 0.776274988 -0.551334681 1.771100233 1.212565749 1.912423164 3.117322611 -0.3030232 +-0.021224349 -0.546717557 0.42114983 -1.1798323 0.710741431 -0.478383763 -2.02545179 -0.711745628 1.618662502 -0.017105838 -0.668739128 -0.978258385 0.199583792 0.648559962 +-1.585894619 0.471012009 2.34339681 0.2217914 -1.300710729 -0.115940822 -1.56467027 0.13315835 -0.978953361 0.825405711 -2.01145216 0.36244294 1.919451105 0.709433343 +-0.322444366 -1.486051289 -0.695615769 1.60677983 1.006851176 0.229014053 1.263450253 0.762023344 -2.438725646 -0.300422879 2.307561905 0.344954875 1.9562584 0.784237939 +-1.23588898 0.223047682 -1.661200563 1.70669699 -0.430495205 0.688400238 -0.913444614 -0.144699562 0.711704777 1.625311402 -1.437346381 0.459386185 1.136081926 -0.862600046 +-0.113014634 -1.327742478 -2.356161038 -1.935502 0.524175949 -1.914071379 1.122874346 -1.028438305 -2.108397279 -1.732740987 0.954671153 -2.602471617 0.727916107 0.029407721 +1.896517135 -0.074610997 -1.792468809 -0.4276473 -1.126535717 -0.121119705 2.009531769 -0.376769575 1.833398649 -0.307528232 -1.650711665 1.792951674 -0.179938446 -2.194911615 +0.54927595 -1.469166384 -1.061423099 -1.7537277 0.081635502 0.188482433 -1.347241185 1.00375727 -1.605109539 -0.006448868 1.208171219 0.309602138 0.587593716 -1.401211947 +0.609241805 -1.600051424 -0.334116527 -1.8937495 -1.409227385 -3.090343971 0.059965855 2.191032891 -0.722138977 -1.440807059 -1.490862886 -3.278826404 0.204311075 0.023719982 +0.553297243 -1.082413327 -1.432198474 -0.8830019 -0.532461972 1.704602033 -0.055944562 -3.670809628 4.059332093 0.602669002 0.876765413 4.794946004 1.058054927 -0.499666887 +1.070313239 -1.987180682 0.597461687 -0.7079295 -0.644730406 1.487137198 0.517015996 2.814079678 -1.518649218 1.856274528 -0.112268434 -0.217464835 -0.04792865 -1.047042678 +1.178754327 -0.119901901 -0.454913138 1.28551571 0.247207569 -0.389487979 0.108441088 -1.137957841 1.495094438 0.662909128 
0.891937975 -1.876625177 -0.368409886 0.146653233 +2.154376652 -3.508128379 0.842994286 1.65380175 1.660919467 -0.488961614 0.975622325 -0.263936276 -0.562174092 -0.279882361 1.413711898 -0.099473635 -1.463377013 2.687429287 +0.09128902 -0.783193205 0.617096023 0.68672227 -0.11425155 -0.599971624 -2.063087632 0.043172816 -1.287801089 0.335411313 -1.775171017 -0.11101001 -1.959369594 2.540784371 +2.311450852 -1.018120612 0.260947086 -0.0561641 -1.636916947 -0.816931299 2.220161831 0.300834316 0.767505849 -2.686875234 -1.522665397 -0.216959675 -2.275737467 0.040025881 +1.158741617 0.918316824 1.237957484 -0.3404328 -0.563502682 -1.573657474 -1.152709235 -2.340120734 -0.096825229 2.837647319 1.073414265 -0.756726175 1.591038481 -1.650004329 +1.479471242 0.973441967 -0.875383651 0.37398953 0.099716682 -0.330379693 0.320729625 2.528326789 0.60868415 -1.606010454 0.663219364 1.243277781 1.935694967 -0.667546755 +1.777648491 -1.097974442 1.53805825 -2.2046334 0.720276278 0.96057892 0.298177249 -0.10858881 0.430761715 0.126426695 0.620559596 1.290958612 1.714946716 0.490716727 +2.300528229 -0.019891598 -0.843192928 0.38002863 0.345593137 -1.927906889 0.522879738 -1.255608226 -3.36776509 -0.740799207 -0.374683141 -2.888485808 -0.104321121 0.189054556 +1.718219352 1.968615215 0.99258286 1.00367245 0.472089401 -1.151827276 -0.582308877 1.876953321 4.151726133 0.814804546 0.126496264 0.776079613 -1.886967124 0.88261297 +1.818796046 1.251516902 2.48112936 0.120162 -2.174404986 -0.288678607 0.100576694 -2.104495113 0.990463227 -0.11508072 -2.646494386 0.863148669 -1.843686685 -1.162003063 +2.650511053 0.666817523 2.116072146 -0.7304728 -0.289560275 0.757223995 0.831715007 1.332630432 -1.635287484 -0.835249062 1.88484471 1.045902602 -1.502048982 -0.81862897 +1.003358656 -0.553431082 0.253195231 0.29818877 -0.644888975 0.415375762 -1.647152397 -0.066198816 -1.68303734 -1.137958872 -0.3553287 -0.341848233 -0.209471432 -0.71773952 +-0.308505642 -1.009669814 -1.293291819 -0.5731582 
0.223328279 -1.858455246 -1.311864297 0.095181926 -0.571466561 0.854871892 0.868217254 -2.273831008 1.524835998 -0.693049382 +2.580580394 1.431738472 -0.712151248 -1.8779663 -0.689204561 0.28543817 2.889086036 -0.20747373 0.812222922 -0.290684446 -0.91253284 2.143893416 1.199375477 -1.885283082 +1.278114657 0.200109843 2.617031393 -0.5918718 1.189929684 -1.246224619 -1.302465737 -0.902096937 1.316570806 0.226138509 1.879134246 -1.531662789 0.847608147 -0.843658546 +1.77544775 0.872762242 0.204838601 -1.1457108 0.511356667 -0.056703264 0.497333092 1.081594434 -3.06925756 0.580810673 -0.678573017 1.189521355 1.266513646 1.286820743 +2.216557752 0.736849721 0.321308305 0.48927743 -0.109493948 -0.438554941 0.441110002 0.598981665 1.452579029 1.709194708 -0.620850615 -0.381851677 2.911628578 2.000828511 +1.254647386 1.826695112 3.299852074 2.2897055 -0.324070512 -1.119656585 -0.961910366 -1.077788301 -1.502492458 -0.590872341 -0.214576565 -0.681101644 1.403046925 2.495912365 +-0.209384481 3.059836405 1.525797667 0.99017857 -0.829861324 1.067304931 -1.464031867 0.008085359 0.914983495 0.38404106 -0.505790812 2.186961516 1.291264032 -0.189273644 +1.671697742 2.588938477 -0.472348787 -1.4281572 0.57578738 -0.557224395 1.881082223 3.100744209 2.376132007 -3.233419839 1.405648703 -1.624529326 -1.465703731 -0.51309839 diff --git a/tedana/tests/test_component_selector.py b/tedana/tests/test_component_selector.py index a8ed26231..f536339e1 100644 --- a/tedana/tests/test_component_selector.py +++ b/tedana/tests/test_component_selector.py @@ -2,6 +2,7 @@ import glob import json +import logging import os import os.path as op @@ -12,6 +13,8 @@ from tedana.utils import get_resource_path THIS_DIR = os.path.dirname(os.path.abspath(__file__)) +LGR = logging.getLogger("GENERAL") + # ---------------------------------------------------------------------- # Functions Used For Tests @@ -40,6 +43,8 @@ def dicts_to_test(treechoice): "missing_function": An undefined decision node function 
"missing_key": A dict missing one of the required keys (report) "null_value": A parameter in one node improperly has a null value + "external_missing_key": external_regressors_config missing a required key + "external_invalid_statistic": external_regressors_config statistic is not "F" Returns ------- @@ -56,6 +61,36 @@ def dicts_to_test(treechoice): "necessary_metrics": ["kappa", "rho"], "intermediate_classifications": ["random1"], "classification_tags": ["Random1"], + "external_regressor_config": [ + { + "regress_ID": "nuisance", + "info": ( + "Fits all external nuisance regressors to " + "a single model using an F statistic" + ), + "report": ( + "External nuisance regressors that fit to " + "components using a linear model were rejected." + ), + "detrend": True, + "statistic": "F", + "regressors": ["^(?!signal).*$"], + "partial_models": {"Motion": ["^mot_.*$"], "CSF": ["^csf.*$"]}, + }, + { + "regress_ID": "task", + "info": "Fits all task regressors to a single model using an F statistic", + "report": ( + "Task regressors that fit to components using a linear model and " + "have some T2* weighting were accepted even if they would have " + "been rejected base on other criteriea." 
+ ), + "detrend": True, + "statistic": "F", + "regressors": ["^signal.*$"], + "extra field": 42, + }, + ], "nodes": [ { "functionname": "dec_left_op_right", @@ -134,6 +169,11 @@ def dicts_to_test(treechoice): tree.pop("report") elif treechoice == "null_value": tree["nodes"][0]["parameters"]["left"] = None + elif treechoice == "external_missing_key": + tree["external_regressor_config"][1].pop("statistic") + elif treechoice == "external_invalid_statistic": + # Will also trigger statistic isn't F and partial models exist + tree["external_regressor_config"][0]["statistic"] = "corr" else: raise Exception(f"{treechoice} is an invalid option for treechoice") @@ -204,11 +244,11 @@ def test_validate_tree_succeeds(): Tested on all default trees in ./tedana/resources/decision_trees Note: If there is a tree in the default trees directory that is being developed and not yet valid, it's file name should - include 'invalid' as a prefix. + begin with 'X'. """ default_tree_names = glob.glob( - os.path.join(THIS_DIR, "../resources/decision_trees/[!invalid]*.json") + os.path.join(f"{THIS_DIR}/../resources/decision_trees/", "[!X]*.json") ) for tree_name in default_tree_names: @@ -218,19 +258,46 @@ def test_validate_tree_succeeds(): # Test a few extra possabilities just using the minimal.json tree if "/minimal.json" in tree_name: - # Should remove/ignore the "reconstruct_from" key during validation - tree["reconstruct_from"] = "testinput" # Need to test handling of the tag_if_false kwarg somewhere tree["nodes"][1]["kwargs"]["tag_if_false"] = "testing tag" assert component_selector.validate_tree(tree) -def test_validate_tree_warnings(): +def test_validate_tree_warnings(caplog): """Test to make sure validate_tree triggers all warning conditions.""" # A tree that raises all possible warnings in the validator should still be valid assert component_selector.validate_tree(dicts_to_test("valid")) + assert ( + r"Decision tree includes fields that are not used or logged ['unused_key']" in 
caplog.text + ) + assert ( + r"['random1'] in node 0 of the decision tree includes " + "a classification tag that was not predefined" + ) in caplog.text + assert ( + r"['nochange', 'random2'] in node 1 of the decision tree includes a classification" + in caplog.text + ) + assert ( + r"['random2notpredefined'] in node 1 of the decision tree " + "includes a classification tag that was not predefined" + ) in caplog.text + assert ( + r"['random2notpredefined'] in node 2 of the decision tree includes " + "a classification label that was not predefined" + ) in caplog.text + assert ( + r"['Random2_NotPredefined'] in node 2 of the decision tree " + "includes a classification tag that was not predefined" + ) in caplog.text + assert (r"Node 3 includes the 'log_extra_report' parameter.") in caplog.text + assert ( + "External regressor dictionary 1 includes fields " + r"that are not used or logged ['extra field']" + ) in caplog.text + def test_validate_tree_fails(): """Test to make sure validate_tree fails for invalid trees. 
@@ -240,27 +307,57 @@ def test_validate_tree_fails(): """ # An empty dict should not be valid - with pytest.raises(component_selector.TreeError): + with pytest.raises( + component_selector.TreeError, match="Decision tree missing required fields" + ): component_selector.validate_tree({}) # A tree that is missing a required key should not be valid - with pytest.raises(component_selector.TreeError): + with pytest.raises( + component_selector.TreeError, match=r"Decision tree missing required fields: {'report'}" + ): component_selector.validate_tree(dicts_to_test("missing_key")) # Calling a selection node function that does not exist should not be valid - with pytest.raises(component_selector.TreeError): + with pytest.raises( + component_selector.TreeError, + match=r"Node 0 has invalid functionname parameter: not_a_function", + ): component_selector.validate_tree(dicts_to_test("missing_function")) # Calling a function with an non-existent required parameter should not be valid - with pytest.raises(component_selector.TreeError): + with pytest.raises( + component_selector.TreeError, + match=r"Node 0 has additional, undefined required parameters: {'nonexistent_req_param'}", + ): component_selector.validate_tree(dicts_to_test("extra_req_param")) # Calling a function with an non-existent optional parameter should not be valid - with pytest.raises(component_selector.TreeError): + with pytest.raises( + component_selector.TreeError, match=r"Node 0 has additional, undefined optional parameters" + ): component_selector.validate_tree(dicts_to_test("extra_opt_param")) # Calling a function missing a required parameter should not be valid - with pytest.raises(component_selector.TreeError): + with pytest.raises( + component_selector.TreeError, match=r"Node 0 is missing required parameter" + ): component_selector.validate_tree(dicts_to_test("missing_req_param")) + with pytest.raises( + component_selector.TreeError, + match=r"External regressor dictionary 1 is missing required fields: 
{'statistic'}", + ): + component_selector.validate_tree(dicts_to_test("external_missing_key")) + + with pytest.raises( + component_selector.TreeError, + match=( + "statistic in external_regressor_config 1 is corr. It must be one of the following: " + "{'f'}\nExternal regressor dictionary cannot include partial_models " + "if statistic is not F" + ), + ): + component_selector.validate_tree(dicts_to_test("external_invalid_statistic")) + def test_check_null_fails(): """Tests to trigger check_null missing parameter error.""" diff --git a/tedana/tests/test_external_metrics.py b/tedana/tests/test_external_metrics.py new file mode 100644 index 000000000..848b281ae --- /dev/null +++ b/tedana/tests/test_external_metrics.py @@ -0,0 +1,572 @@ +"""Tests for tedana.metrics.external.""" + +import logging +import os.path as op +import re + +import pandas as pd +import pytest + +from tedana import utils +from tedana.io import load_json +from tedana.metrics import external +from tedana.tests.utils import data_for_testing_info, download_test_data + +THIS_DIR = op.dirname(op.abspath(__file__)) +LGR = logging.getLogger("GENERAL") + +# ---------------------------------------------------------------------- +# Functions Used For Tests +# ---------------------------------------------------------------------- + + +def sample_external_regressors(regress_choice="valid"): + """ + Retrieves a sample external regressor dataframe. + + Parameters + ---------- + regress_choice : :obj:`str` How to keep or alter the external regressor data + Options are: + "valid": Column labels expected in demo_external_regressors_motion_task_models + The labels in the config file are lowercase and this file is capitalized, but it should + still be valid. + "no_mot_y_column": The column labeled "Mot_Pitch" is removed. 
+ + Returns + ------- + external_regressors : :obj:`pandas.DataFrame` External regressor table + n_vols : :obj:`int` Number of time points (rows) in external_regressors + + Notes + ----- + The loaded external regressors are in ./tests/data/external_regress_Ftest_3echo.tsv + These are based on tedana being run with default parameters on the 3 echo data using + the mixing matrix downloaded with the three echo data + .testing_data_cache/three-echo/TED.three-echo/desc_ICA_mixing_static.tsv + For the external regressors: + Column 0 (Mot_X) is the time series for ICA component 8 + Gaussian noise + Column 1 (Mot_Y) is 0.6 * comp 18 + 0.4 * comp 29 + Gaussian Noise + Column 2 (Mot_Z) is 0.8 * comp 18 + 1.2 * Gaussian Noise + Column 3 (Mot_Pitch) is 0.9 * comp 30 + 0.1 * comp 61 + Gaussian Noise + Columns 4-5 are Gaussian noise + Columns 6-11 are the first derivatives of columns 0-5 + Column 12 (CSF) is comp 11 + Gaussian Noise + Column 13 (Signal) is comp 30 + Gaussian Noise + + The base gaussian noise is mean=0, stdev=1. 
+ The scaling weights for components and noise are set so that, with an R^2>0.5 threshold: + ICA Comp 8 rejected solely based on the fit to Mot_X + ICA Comp 31 looks strongly inversely correlated to Comp 8 and is also rejected + ICA Comp 18 rejected based on the combined fit to Mot_X and Mot_Y (signif Motion partial model) + ICA Comp 29 NOT rejected based only on the fit to Mot_Y + ICA Comp 11 rejected based on a fit to CSF (signif CSF partial model) + ICA Comp 30 accepted based on fit to task model even through also fits to Mot_Pitch + """ + sample_fname = op.join(THIS_DIR, "data", "external_regress_Ftest_3echo.tsv") + + external_regressors = pd.read_csv(sample_fname, delimiter="\t") + + if regress_choice == "no_mot_y_column": + external_regressors = external_regressors.drop(columns="Mot_Y") + elif regress_choice != "valid": + raise ValueError(f"regress_choice is {regress_choice}, which is not a listed option") + + n_vols = len(external_regressors) + + return external_regressors, n_vols + + +def sample_external_regressor_config(config_choice="valid"): + """ + Retrieves a sample external regressor configuration dictionary. 
+ + Parameters + ---------- + config_choice : :obj:`str` How to keep or alter the config file + Options are: + "valid": Config dictionary stored in demo_external_regressors_motion_task_models + "no_task_partial": Removes "task_keep" and everything with partial F stats + "csf_in_mot": Adds "CSF" to the list of motion regressor partial models + "signal_in_mot": Adds "Signal" to the list of motion regressor partial models + + Returns + ------- + external_regressor_config : :obj:`dict` External Regressor Dictionary + """ + + sample_fname = op.join( + THIS_DIR, + "../resources/decision_trees", + "demo_external_regressors_motion_task_models.json", + ) + tree = load_json(sample_fname) + external_regressor_config = tree["external_regressor_config"] + + if config_choice == "no_task_partial": + external_regressor_config = [external_regressor_config[0]] + external_regressor_config[0].pop("partial_models") + elif config_choice == "csf_in_mot": + external_regressor_config[0]["partial_models"]["Motion"].append("^csf.*$") + elif config_choice == "signal_in_mot": + external_regressor_config[0]["partial_models"]["Motion"].append("Signal") + elif config_choice == "unmatched_regex": + external_regressor_config[0]["partial_models"]["Motion"] = ["^translation_.*$"] + elif config_choice != "valid": + raise ValueError(f"config_choice is {config_choice}, which is not a listed option") + + return external_regressor_config + + +def sample_mixing_matrix(): + """Load and return the three-echo mixing matrix.""" + + test_data_path, osf_id = data_for_testing_info("three-echo") + download_test_data(osf_id, test_data_path) + + return pd.read_csv( + op.join( + THIS_DIR, + "../../.testing_data_cache/three-echo/TED.three-echo/desc_ICA_mixing_static.tsv", + ), + delimiter="\t", + ).to_numpy() + + +def sample_comptable(n_components): + """Create an empty component table. 
+ + Parameters + ---------- + n_components : :obj:`int` + The number of components (rows) in the compponent table DataFrame + + Returns + ------- + component_table : :obj:`pd.DataFrame` + A component table with a single "Component" column with + "ICA_" number for each row + """ + + row_vals = [] + for ridx in range(n_components): + row_vals.append(f"ICA_{str(ridx).zfill(2)}") + + return pd.DataFrame(data={"Component": row_vals}) + + +def sample_detrend_regressors(n_vols, dtrank=None): + """ + Creates Legendre polynomial detrending regressors. + + Parameters + ---------- + n_vols: :obj:`int` + The number of volumes or time points for the regressors + dtrank : :obj:`int` or None + The rank (number) of detrending regressors to create + Automatically calculate if None (default) + + Returns + ------- + detrend_regressors : :obj:`pd.DataFrame` The specified detrending regressors + """ + + legendre_arr = utils.create_legendre_polynomial_basis_set(n_vols, dtrank) + detrend_labels = [] + for label_idx in range(legendre_arr.shape[1]): + detrend_labels.append(f"baseline {label_idx}") + return pd.DataFrame(data=legendre_arr, columns=detrend_labels) + + +# validate_extern_regress +# ----------------------- +def test_validate_extern_regress_succeeds(caplog): + """Test validate_extern_regress works as expected.""" + + external_regressors, n_vols = sample_external_regressors() + external_regressor_config = sample_external_regressor_config() + external_regressor_config_expanded = external.validate_extern_regress( + external_regressors, external_regressor_config, n_vols + ) + + # The regex patterns should have been replaced with the full names of the regressors + assert set(external_regressor_config_expanded[0]["partial_models"]["Motion"]) == set( + [ + "Mot_X", + "Mot_d1_Yaw", + "Mot_d1_Y", + "Mot_d1_Pitch", + "Mot_Z", + "Mot_d1_Z", + "Mot_d1_Roll", + "Mot_d1_X", + "Mot_Pitch", + "Mot_Y", + "Mot_Yaw", + "Mot_Roll", + ] + ) + assert 
external_regressor_config_expanded[0]["partial_models"]["CSF"] == ["CSF"] + assert external_regressor_config_expanded[1]["regressors"] == ["Signal"] + assert "WARNING" not in caplog.text + + # Rerunning with explicit names for the above three categories instead of regex patterns + # Shouldn't change anything, but making sure it runs + caplog.clear() + external_regressor_config_expanded = external.validate_extern_regress( + external_regressors, external_regressor_config_expanded, n_vols + ) + assert "WARNING" not in caplog.text + + # Removing all partial model and task_keep stuff to confirm it still runs, but with a warning + caplog.clear() + external_regressor_config = sample_external_regressor_config("no_task_partial") + external.validate_extern_regress(external_regressors, external_regressor_config, n_vols) + assert ( + "User-provided external_regressors include columns not used " + "in any external regressor model: ['Signal']" + ) in caplog.text + + # Add "CSF" to "Motion" partial model (also in "CSF" partial model) to test if warning appears + caplog.clear() + external_regressor_config = sample_external_regressor_config("csf_in_mot") + external.validate_extern_regress(external_regressors, external_regressor_config, n_vols) + assert "['CSF'] used in more than one partial regressor model for nuisance" in caplog.text + + +def test_validate_extern_regress_fails(): + """Test validate_extern_regress fails when expected.""" + + external_regressors, n_vols = sample_external_regressors() + external_regressor_config = sample_external_regressor_config() + + # If there are a different number of time points in the fMRI data and external regressors + with pytest.raises( + external.RegressError, match=f"while fMRI data have {n_vols - 1} timepoints" + ): + external.validate_extern_regress( + external_regressors, external_regressor_config, n_vols - 1 + ) + + # If no external regressor labels match the regex label in config + external_regressor_config = 
sample_external_regressor_config("unmatched_regex") + with pytest.raises( + external.RegressError, + match=( + re.escape( + "No external regressor labels matching regular expression '^translation_.*$' found" + ) + ), + ): + external.validate_extern_regress(external_regressors, external_regressor_config, n_vols) + + # If Signal is in a partial model, but not "regressors" for the full model + external_regressor_config = sample_external_regressor_config("signal_in_mot") + with pytest.raises( + external.RegressError, + match=( + re.escape( + "Partial models in nuisance include regressors that " + "are excluded from its full model: ['Signal']" + ) + ), + ): + external.validate_extern_regress(external_regressors, external_regressor_config, n_vols) + + # If a regressor expected in the config is not in external_regressors + # Run successfully to expand Motion labels in config and then create error + # when "Mot_Y" is in the config, but removed from external_regressors + external_regressor_config = sample_external_regressor_config() + external_regressors, n_vols = sample_external_regressors() + external_regressor_config_expanded = external.validate_extern_regress( + external_regressors, external_regressor_config, n_vols + ) + external_regressors, n_vols = sample_external_regressors("no_mot_y_column") + # The same error message will appear twice. + # One for "regressor" and once for motion partial model + with pytest.raises( + external.RegressError, + match=re.escape( + "No external regressor matching 'Mot_Y' was found.\n" + "No external regressor matching 'Mot_Y' was found." 
+ ), + ): + external.validate_extern_regress( + external_regressors, external_regressor_config_expanded, n_vols + ) + + +# load_validate_external_regressors +# --------------------------------- + + +def test_load_validate_external_regressors_fails(): + """Test load_validate_external_regressors fails when not given a tsv file.""" + + external_regressors = "NotATSVFile.tsv" + external_regressor_config = sample_external_regressor_config("valid") + with pytest.raises( + ValueError, match=f"Cannot load tsv file with external regressors: {external_regressors}" + ): + external.load_validate_external_regressors( + external_regressors, external_regressor_config, 200 + ) + + +def test_load_validate_external_regressors_smoke(): + """Test load_validate_external_regressors succeeds.""" + + external_regressors = op.join(THIS_DIR, "data", "external_regress_Ftest_3echo.tsv") + n_vols = 75 + external_regressor_config = sample_external_regressor_config() + + # Not testing outputs because this is just calling validate_extern_regress and + # outputs are checked in those tests + external.load_validate_external_regressors( + external_regressors, external_regressor_config, n_vols + ) + + +# fit_regressors +# -------------- + + +def test_fit_regressors(caplog): + """Test conditions fit_regressors succeeds and fails.""" + + caplog.set_level(logging.INFO) + external_regressors, n_vols = sample_external_regressors() + external_regressor_config = sample_external_regressor_config() + external_regressor_config_expanded = external.validate_extern_regress( + external_regressors, external_regressor_config, n_vols + ) + mixing = sample_mixing_matrix() + + # Running with external_regressor_config["detrend"] is True, + # which results in 1 detrending regressor + comptable = sample_comptable(mixing.shape[1]) + comptable = external.fit_regressors( + comptable, external_regressors, external_regressor_config_expanded, mixing + ) + + # Contents will be valided in fit_mixing_to_regressors so just checking 
column labels here + assert set(comptable.keys()) == { + "Component", + "Fstat nuisance model", + "Fstat task model", + "Fstat nuisance Motion partial model", + "Fstat nuisance CSF partial model", + "pval nuisance model", + "pval task model", + "pval nuisance Motion partial model", + "pval nuisance CSF partial model", + "R2stat nuisance model", + "R2stat task model", + "R2stat nuisance Motion partial model", + "R2stat nuisance CSF partial model", + } + + assert ( + "External regressors fit for nuisance includes detrending " + "with 1 Legendre Polynomial regressors" in caplog.text + ) + + caplog.clear() + # Running with external_regressor_config["detrend"]=3, which results in 3 detrending regressors + external_regressor_config[1]["detrend"] = 3 + comptable = sample_comptable(mixing.shape[1]) + comptable = external.fit_regressors( + comptable, external_regressors, external_regressor_config_expanded, mixing + ) + assert ( + "External regressors fit for task includes detrending " + "with 3 Legendre Polynomial regressors" in caplog.text + ) + + caplog.clear() + # Running with external_regressor_config["detrend"]=0, + # which results in 1 detrend regressors (demeaning) + external_regressor_config[0]["detrend"] = 0 + comptable = sample_comptable(mixing.shape[1]) + comptable = external.fit_regressors( + comptable, external_regressors, external_regressor_config_expanded, mixing + ) + assert ( + "External regressor for nuisance fitted without detrending fMRI time series. " + "Only removing mean" in caplog.text + ) + + caplog.clear() + external_regressor_config[1]["statistic"] = "Corr" + comptable = sample_comptable(mixing.shape[1]) + with pytest.raises( + ValueError, + match=( + "statistic for task external regressors in decision tree is corr, " + "which is not valid." 
+ ), + ): + comptable = external.fit_regressors( + comptable, external_regressors, external_regressor_config_expanded, mixing + ) + + +# fit_mixing_to_regressors +# -------------- + + +def test_fit_mixing_to_regressors(caplog): + """Test conditions fit_mixing_to_regressors succeeds and fails.""" + + # Note: Outputs from fit_model_with_stats are also tested within this function + + caplog.set_level(logging.INFO) + external_regressors, n_vols = sample_external_regressors() + external_regressor_config = sample_external_regressor_config() + external_regressor_config_expanded = external.validate_extern_regress( + external_regressors, external_regressor_config, n_vols + ) + mixing = sample_mixing_matrix() + + detrend_regressors = sample_detrend_regressors(n_vols, dtrank=None) + + # Running with external_regressor_config["detrend"] is True, + # which results in 1 detrending regressor + comptable = sample_comptable(mixing.shape[1]) + + for config_idx in range(2): + comptable = external.fit_mixing_to_regressors( + comptable, + external_regressors, + external_regressor_config_expanded[config_idx], + mixing, + detrend_regressors, + ) + + # Since a fixed mixing matrix is used, the values should always be consistent + # Comparing just 3 rows and rounding to 6 decimal places to avoid testing failures + # due to differences in floating point precision between systems + output_rows_to_validate = comptable.iloc[[0, 11, 30]].round(decimals=6) + expected_results = pd.DataFrame( + columns=[ + "Component", + "Fstat nuisance model", + "Fstat task model", + "Fstat nuisance Motion partial model", + "Fstat nuisance CSF partial model", + "pval nuisance model", + "pval task model", + "pval nuisance Motion partial model", + "pval nuisance CSF partial model", + "R2stat nuisance model", + "R2stat task model", + "R2stat nuisance Motion partial model", + "R2stat nuisance CSF partial model", + ] + ) + expected_results.loc[0] = [ + "ICA_00", + 0.5898043794795538, + 0.040242260292383224, + 
0.6359651437299336, + 0.5882298391006501, + 0.8529159565446598, + 0.8415655022508225, + 0.8033066601119929, + 0.4460627598486151, + 0.1116607090996441, + 0.0005509601152330346, + 0.11119635498661495, + 0.009551010649882286, + ] + expected_results.loc[11] = [ + "ICA_11", + 5.050391950932562, + 0.3101483992796387, + 0.6191428219572478, + 37.021610927761515, + 5.897391126885587e-06, + 0.5792925973677331, + 0.8177727274388485, + 8.422777264538439e-08, + 0.518377055217612, + 0.004230633903377634, + 0.10857438156630017, + 0.377688252390028, + ] + expected_results.loc[30] = [ + "ICA_30", + 5.869398664558788, + 109.32951177196031, + 6.215675922255525, + 1.524970189426933, + 7.193855290354989e-07, + 3.3306690738754696e-16, + 5.303071232143353e-07, + 0.22160450819684074, + 0.5557244697248107, + 0.5996259777665551, + 0.5501080476751579, + 0.024389778752502478, + ] + + assert ( + (output_rows_to_validate.sort_index(axis=1)) + .compare(expected_results.sort_index(axis=1).round(decimals=6)) + .empty + ) + + +# build_fstat_regressor_models +# -------------- + + +def test_build_fstat_regressor_models(caplog): + """Test conditions build_fstat_regressor_models succeeds and fails.""" + + caplog.set_level(logging.INFO) + external_regressors, n_vols = sample_external_regressors() + external_regressor_config = sample_external_regressor_config() + external_regressor_config_expanded = external.validate_extern_regress( + external_regressors, external_regressor_config, n_vols + ) + + detrend_regressors = sample_detrend_regressors(n_vols, dtrank=3) + + # Running nuisance with partial_models + regressor_models = external.build_fstat_regressor_models( + external_regressors, external_regressor_config_expanded[0], detrend_regressors + ) + + assert regressor_models["full"].shape == (n_vols, 16) + assert ( + "Regressors in full model for nuisance: " + "['CSF', 'Mot_Pitch', 'Mot_Roll', 'Mot_X', 'Mot_Y', 'Mot_Yaw', " + "'Mot_Z', 'Mot_d1_Pitch', 'Mot_d1_Roll', 'Mot_d1_X', 'Mot_d1_Y', 'Mot_d1_Yaw', " 
+ "'Mot_d1_Z', 'baseline 0', 'baseline 1', 'baseline 2']" + ) in caplog.text + assert regressor_models["no CSF"].shape == (n_vols, 15) + assert ( + "Regressors in partial model (everything but regressors of interest) 'no CSF': " + "['Mot_Pitch', 'Mot_Roll', 'Mot_X', 'Mot_Y', 'Mot_Yaw', 'Mot_Z', 'Mot_d1_Pitch', " + "'Mot_d1_Roll', 'Mot_d1_X', 'Mot_d1_Y', 'Mot_d1_Yaw', 'Mot_d1_Z', " + "'baseline 0', 'baseline 1', 'baseline 2']" + ) in caplog.text + assert regressor_models["no Motion"].shape == (n_vols, 4) + assert ( + "Regressors in partial model (everything but regressors of interest) 'no Motion': " + "['CSF', 'baseline 0', 'baseline 1', 'baseline 2']" in caplog.text + ) + + # Running task regressor + caplog.clear() + regressor_models = external.build_fstat_regressor_models( + external_regressors, external_regressor_config_expanded[1], detrend_regressors + ) + + assert regressor_models["full"].shape == (n_vols, 4) + assert ( + "Regressors in full model for task: ['Signal', 'baseline 0', 'baseline 1', 'baseline 2']" + in caplog.text + ) diff --git a/tedana/tests/test_integration.py b/tedana/tests/test_integration.py index 23a320509..142efc175 100644 --- a/tedana/tests/test_integration.py +++ b/tedana/tests/test_integration.py @@ -1,24 +1,19 @@ """Integration tests for "real" data.""" import glob -import json import logging import os import os.path as op import re import shutil import subprocess -import tarfile -from datetime import datetime -from gzip import GzipFile -from io import BytesIO import pandas as pd import pytest -import requests from pkg_resources import resource_filename from tedana.io import InputHarvester +from tedana.tests.utils import data_for_testing_info, download_test_data from tedana.workflows import t2smap as t2smap_cli from tedana.workflows import tedana as tedana_cli from tedana.workflows.ica_reclassify import ica_reclassify_workflow @@ -76,129 +71,6 @@ def check_integration_outputs(fname, outpath, n_logs=1): raise ValueError(msg) -def 
data_for_testing_info(test_dataset=str): - """ - Get the path and download link for each dataset used for testing. - - Also creates the base directories into which the data and output - directories are written - - Parameters - ---------- - test_dataset : str - References one of the datasets to download. It can be: - three-echo - three-echo-reclassify - four-echo - five-echo - - Returns - ------- - test_data_path : str - The path to the local directory where the data will be downloaded - osf_id : str - The ID for the OSF file. - Data download link would be https://osf.io/osf_id/download - Metadata download link would be https://osf.io/osf_id/metadata/?format=datacite-json - """ - - tedana_path = os.path.dirname(tedana_cli.__file__) - base_data_path = os.path.abspath(os.path.join(tedana_path, "../../.testing_data_cache")) - os.makedirs(base_data_path, exist_ok=True) - os.makedirs(os.path.join(base_data_path, "outputs"), exist_ok=True) - if test_dataset == "three-echo": - test_data_path = os.path.join(base_data_path, "three-echo/TED.three-echo") - osf_id = "rqhfc" - os.makedirs(os.path.join(base_data_path, "three-echo"), exist_ok=True) - os.makedirs(os.path.join(base_data_path, "outputs/three-echo"), exist_ok=True) - elif test_dataset == "three-echo-reclassify": - test_data_path = os.path.join(base_data_path, "reclassify") - osf_id = "f6g45" - os.makedirs(os.path.join(base_data_path, "outputs/reclassify"), exist_ok=True) - elif test_dataset == "four-echo": - test_data_path = os.path.join(base_data_path, "four-echo/TED.four-echo") - osf_id = "gnj73" - os.makedirs(os.path.join(base_data_path, "four-echo"), exist_ok=True) - os.makedirs(os.path.join(base_data_path, "outputs/four-echo"), exist_ok=True) - elif test_dataset == "five-echo": - test_data_path = os.path.join(base_data_path, "five-echo/TED.five-echo") - osf_id = "9c42e" - os.makedirs(os.path.join(base_data_path, "five-echo"), exist_ok=True) - os.makedirs(os.path.join(base_data_path, "outputs/five-echo"), 
exist_ok=True) - else: - raise ValueError(f"{test_dataset} is not a valid dataset string for data_for_testing_info") - - return test_data_path, osf_id - - -def download_test_data(osf_id, test_data_path): - """If current data is not already available, downloads tar.gz data. - - Data are stored at `https://osf.io/osf_id/download`. - It unpacks into `out_path`. - - Parameters - ---------- - osf_id : str - The ID for the OSF file. - out_path : str - Path to directory where OSF data should be extracted - """ - - try: - datainfo = requests.get(f"https://osf.io/{osf_id}/metadata/?format=datacite-json") - except Exception: - if len(os.listdir(test_data_path)) == 0: - raise ConnectionError( - f"Cannot access https://osf.io/{osf_id} and testing data " "are not yet downloaded" - ) - else: - TestLGR.warning( - f"Cannot access https://osf.io/{osf_id}. " - f"Using local copy of testing data in {test_data_path} " - "but cannot validate that local copy is up-to-date" - ) - return - datainfo.raise_for_status() - metadata = json.loads(datainfo.content) - # 'dates' is a list with all udpates to the file, the last item in the list - # is the most recent and the 'date' field in the list is the date of the last - # update. - osf_filedate = metadata["dates"][-1]["date"] - - # File the file with the most recent date for comparision with - # the lsst updated date for the osf file - if os.path.exists(test_data_path): - filelist = glob.glob(f"{test_data_path}/*") - most_recent_file = max(filelist, key=os.path.getctime) - if os.path.exists(most_recent_file): - local_filedate = os.path.getmtime(most_recent_file) - local_filedate_str = str(datetime.fromtimestamp(local_filedate).date()) - local_data_exists = True - else: - local_data_exists = False - else: - local_data_exists = False - if local_data_exists: - if local_filedate_str == osf_filedate: - TestLGR.info( - f"Downloaded and up-to-date data already in {test_data_path}. 
Not redownloading" - ) - return - else: - TestLGR.info( - f"Downloaded data in {test_data_path} was last modified on " - f"{local_filedate_str}. Data on https://osf.io/{osf_id} " - f" was last updated on {osf_filedate}. Deleting and redownloading" - ) - shutil.rmtree(test_data_path) - req = requests.get(f"https://osf.io/{osf_id}/download") - req.raise_for_status() - t = tarfile.open(fileobj=GzipFile(fileobj=BytesIO(req.content))) - os.makedirs(test_data_path, exist_ok=True) - t.extractall(test_data_path) - - def reclassify_raw() -> str: test_data_path, _ = data_for_testing_info("three-echo-reclassify") return os.path.join(test_data_path, "TED.three-echo") @@ -352,6 +224,8 @@ def test_integration_three_echo(skip_integration): ) # Test re-running, but use the CLI + # TODO Move this to a separate integration test, use the fixed desc_ICA_mixing_static.tsv that + # is distributed with the testing data, and test specific outputs for consistent values args = [ "-d", f"{test_data_path}/three_echo_Cornell_zcat.nii.gz", @@ -374,6 +248,90 @@ def test_integration_three_echo(skip_integration): check_integration_outputs(fn, out_dir) +def test_integration_three_echo_external_regressors_single_model(skip_integration): + """Integration test of tedana workflow with extern regress and F stat.""" + + if skip_integration: + pytest.skip("Skipping three-echo with external regressors integration test") + + test_data_path, osf_id = data_for_testing_info("three-echo") + out_dir = os.path.abspath( + os.path.join(test_data_path, "../../outputs/three-echo-externalreg-Ftest") + ) + + if os.path.exists(out_dir): + shutil.rmtree(out_dir) + + # download data and run the test + # external_regress_Ftest_3echo.tsv has 13 rows. 
Based on a local run on the 3 echo data: + # Col 1 (trans_x_correlation) is the TS for ICA comp 59 + similar stdev Gaussian Noise + # Col 2 (trans_y_correlation) is 0.4*comp29+0.5+comp20+Gaussian Noise + # Col 3 (trans_z_correlation) is comp20+Gaussian Noise + # Col 4-6 are Gaussian noise representing pitch/roll/yaw + # Col 7-12 are the first derivative of col 1-6 + # With the currently set up decision tree, + # Component 59 should be rejected because it is correlated to trans_x and, + # comp 20 should be rejected because of a signif fit to a combination of trans_y and trans_z. + # Component 29 is not rejected because the fit does not cross a r>0.8 threshold + # Note that the above is in comparision to the minimal decision tree + # but the integration test for 3 echoes uses the kundu tree + download_test_data(osf_id, test_data_path) + tree_name = "resources/decision_trees/demo_external_regressors_single_model.json" + tedana_cli.tedana_workflow( + data=f"{test_data_path}/three_echo_Cornell_zcat.nii.gz", + tes=[14.5, 38.5, 62.5], + out_dir=out_dir, + tree=resource_filename("tedana", tree_name), + external_regressors=resource_filename( + "tedana", "tests/data/external_regress_Ftest_3echo.tsv" + ), + low_mem=True, + tedpca="aic", + ) + + # compare the generated output files + fn = resource_filename("tedana", "tests/data/cornell_three_echo_outputs.txt") + check_integration_outputs(fn, out_dir) + + +def test_integration_three_echo_external_regressors_motion_task_models(skip_integration): + """Integration test of tedana workflow with extern regress and F stat.""" + + if skip_integration: + pytest.skip("Skipping three-echo with external regressors integration test") + + test_data_path, osf_id = data_for_testing_info("three-echo") + out_dir = os.path.abspath( + os.path.join(test_data_path, "../../outputs/three-echo-externalreg-Ftest-multimodels") + ) + + if os.path.exists(out_dir): + shutil.rmtree(out_dir) + + # download data and run the test + # 
external_regress_Ftest_3echo.tsv has 12 columns for motion, 1 for CSF, and 1 for task signal + # The regressor values and expected fits with the data are detailed in: + # tests.test_external_metrics.sample_external_regressors + download_test_data(osf_id, test_data_path) + tree_name = "resources/decision_trees/demo_external_regressors_motion_task_models.json" + tedana_cli.tedana_workflow( + data=f"{test_data_path}/three_echo_Cornell_zcat.nii.gz", + tes=[14.5, 38.5, 62.5], + out_dir=out_dir, + tree=resource_filename("tedana", tree_name), + external_regressors=resource_filename( + "tedana", "tests/data/external_regress_Ftest_3echo.tsv" + ), + mixm=f"{test_data_path}/desc_ICA_mixing_static.tsv", + low_mem=True, + tedpca="aic", + ) + + # compare the generated output files + fn = resource_filename("tedana", "tests/data/cornell_three_echo_preset_mixing_outputs.txt") + check_integration_outputs(fn, out_dir) + + def test_integration_reclassify_insufficient_args(skip_integration): if skip_integration: pytest.skip("Skipping reclassify insufficient args") diff --git a/tedana/tests/test_metrics.py b/tedana/tests/test_metrics.py index dfcd5ee2f..377deea02 100644 --- a/tedana/tests/test_metrics.py +++ b/tedana/tests/test_metrics.py @@ -7,7 +7,11 @@ import pytest from tedana import io, utils -from tedana.metrics import collect, dependence +from tedana.metrics import collect, dependence, external +from tedana.tests.test_external_metrics import ( + sample_external_regressor_config, + sample_external_regressors, +) from tedana.tests.utils import get_test_data_path @@ -24,15 +28,20 @@ def testdata1(): methods=["dropout", "decay"], ) data_optcom = np.mean(data_cat, axis=1) - mixing = np.random.random((data_optcom.shape[1], 50)) + mixing = np.random.random((data_optcom.shape[1], 3)) io_generator = io.OutputGenerator(ref_img) + + # includes adaptive_mask_cut and mixing_cut which are used for ValueError tests + # for when dimensions do not align data_dict = { "data_cat": data_cat, "tes": 
tes, "data_optcom": data_optcom, "adaptive_mask": adaptive_mask, + "adaptive_mask_cut": np.delete(adaptive_mask, (0), axis=0), "generator": io_generator, "mixing": mixing, + "mixing_cut": np.delete(mixing, (0), axis=0), } return data_dict @@ -52,19 +61,109 @@ def test_smoke_generate_metrics(testdata1): "normalized variance explained", "d_table_score", ] - comptable = collect.generate_metrics( - testdata1["data_cat"], - testdata1["data_optcom"], - testdata1["mixing"], - testdata1["adaptive_mask"], - testdata1["tes"], - testdata1["generator"], - "ICA", + + external_regressors, _ = sample_external_regressors() + # these data have 50 volumes so cut external_regressors to 50 vols for these tests + # This is just testing execution. Accuracy of values for external regressors are + # tested in test_external_metrics + n_vols = 5 + external_regressors = external_regressors.drop(labels=range(5, 75), axis=0) + + external_regressor_config = sample_external_regressor_config() + external_regressor_config_expanded = external.validate_extern_regress( + external_regressors, external_regressor_config, n_vols + ) + + comptable, _ = collect.generate_metrics( + data_cat=testdata1["data_cat"], + data_optcom=testdata1["data_optcom"], + mixing=testdata1["mixing"], + adaptive_mask=testdata1["adaptive_mask"], + tes=testdata1["tes"], + io_generator=testdata1["generator"], + label="ICA", + external_regressors=external_regressors, + external_regressor_config=external_regressor_config_expanded, metrics=metrics, ) assert isinstance(comptable, pd.DataFrame) +def test_generate_metrics_fails(testdata1): + """Testing error conditions for tedana.metrics.collect.generate_metrics.""" + + metrics = [ + "kappa", + "rho", + ] + + # missing external regressors + external_regress = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) + with pytest.raises( + ValueError, + match=( + "If external_regressors is defined, then " + "external_regressor_config also needs to be defined." 
+ ), + ): + comptable, _ = collect.generate_metrics( + data_cat=testdata1["data_cat"], + data_optcom=testdata1["data_optcom"], + mixing=testdata1["mixing"], + adaptive_mask=testdata1["adaptive_mask"], + tes=testdata1["tes"], + io_generator=testdata1["generator"], + label="ICA", + external_regressors=external_regress, + metrics=metrics, + ) + + with pytest.raises( + ValueError, + match=(r"First dimensions \(number of samples\) of data_cat"), + ): + comptable, _ = collect.generate_metrics( + data_cat=testdata1["data_cat"], + data_optcom=testdata1["data_optcom"], + mixing=testdata1["mixing"], + adaptive_mask=testdata1["adaptive_mask_cut"], + tes=testdata1["tes"], + io_generator=testdata1["generator"], + label="ICA", + metrics=metrics, + ) + + with pytest.raises( + ValueError, + match=("does not match number of echoes provided"), + ): + comptable, _ = collect.generate_metrics( + data_cat=testdata1["data_cat"], + data_optcom=testdata1["data_optcom"], + mixing=testdata1["mixing"], + adaptive_mask=testdata1["adaptive_mask"], + tes=testdata1["tes"][0:2], + io_generator=testdata1["generator"], + label="ICA", + metrics=metrics, + ) + + with pytest.raises( + ValueError, + match=("Number of volumes in data_cat"), + ): + comptable, _ = collect.generate_metrics( + data_cat=testdata1["data_cat"], + data_optcom=testdata1["data_optcom"], + mixing=testdata1["mixing_cut"], + adaptive_mask=testdata1["adaptive_mask"], + tes=testdata1["tes"], + io_generator=testdata1["generator"], + label="ICA", + metrics=metrics, + ) + + def test_smoke_calculate_weights(): """Smoke test for tedana.metrics.dependence.calculate_weights.""" n_voxels, n_volumes, n_components = 1000, 100, 50 diff --git a/tedana/tests/test_selection_utils.py b/tedana/tests/test_selection_utils.py index 465fb8bcc..73ae84c30 100644 --- a/tedana/tests/test_selection_utils.py +++ b/tedana/tests/test_selection_utils.py @@ -241,8 +241,12 @@ def test_confirm_metrics_exist_succeeds(): # Testing for metrics that exist with 1 or 2 
necessary metrics in a set # Returns True if an undefined metric exists so using "assert not" + # Testing if it can find a single metric assert not selection_utils.confirm_metrics_exist(component_table, {"kappa"}) + # Testing if it can find multiple metrics assert not selection_utils.confirm_metrics_exist(component_table, {"kappa", "rho"}) + # Testing if it can find metrics that use regular expressions + assert not selection_utils.confirm_metrics_exist(component_table, {"kappa", "^count.*$"}) def test_confirm_metrics_exist_fails(): @@ -251,13 +255,22 @@ def test_confirm_metrics_exist_fails(): component_table = sample_component_table(options="comptable") # Should fail with and error would have default or pre-defined file name - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="Necessary metrics for unknown function"): selection_utils.confirm_metrics_exist(component_table, {"kappa", "quack"}) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=r"MISSING METRICS: \['quack'\]"): + selection_utils.confirm_metrics_exist( + component_table, + necessary_metrics={"kappa", "quack"}, + function_name="dec_left_op_right", + ) + with pytest.raises(ValueError, match="Necessary metrics for farm"): selection_utils.confirm_metrics_exist( component_table, {"kappa", "mooo"}, function_name="farm" ) + with pytest.raises(ValueError, match=r"MISSING METRICS: \['\^mount.\*\$'\]."): + selection_utils.confirm_metrics_exist(component_table, {"kappa", "^mount.*$"}) + def test_log_decision_tree_step_smoke(): """A smoke test for log_decision_tree_step.""" diff --git a/tedana/tests/test_stats.py b/tedana/tests/test_stats.py index 9d64f67ac..5214daa72 100644 --- a/tedana/tests/test_stats.py +++ b/tedana/tests/test_stats.py @@ -4,8 +4,9 @@ import numpy as np import pytest +from numpy.matlib import repmat -from tedana.stats import computefeats2, get_coeffs, getfbounds +from tedana.stats import computefeats2, fit_model, get_coeffs, getfbounds def 
test_break_computefeats2(): @@ -140,3 +141,32 @@ def test_smoke_getfbounds(): assert f05 is not None assert f025 is not None assert f01 is not None + + +def test_fit_model(): + """Tests for fit_model.""" + + # set up data where y = weights*x + residuals + r = 15 # number of regressors + t = 300 # number of time points + c = 50 # number of components + rng = np.random.default_rng(42) # using a fixed seed + x = rng.random(size=(t, r)) + weights = rng.random(size=(r, c)) + # Making the residuals sufficiently small for the fit to be precise to 4 decimals + residuals = rng.random(size=(t, c)) / 1000000 + y = np.empty((t, c)) + for cidx in range(c): + y[:, cidx] = (x * repmat(weights[:, cidx], t, 1)).sum(axis=1) + y = y + residuals + + # Fitting model and confirming outputs are the correct shape + # and beta fits match inputted weights to four decimal places + betas, sse, df = fit_model(x, y) + assert df == (t - r) + assert sse.shape == (c,) + assert (np.round(betas, decimals=4) == np.round(weights, decimals=4)).all() + + # Outputting the residual and checking it matches the inputted residual + fit_residuals = fit_model(x, y, output_residual=True) + assert (np.round(fit_residuals, decimals=4) == np.round(residuals, decimals=4)).all() diff --git a/tedana/tests/test_utils.py b/tedana/tests/test_utils.py index b81bf62b3..20c3edd5e 100644 --- a/tedana/tests/test_utils.py +++ b/tedana/tests/test_utils.py @@ -323,6 +323,34 @@ def test_smoke_threshold_map(): assert utils.threshold_map(img, min_cluster_size, sided="bi") is not None +def test_create_legendre_polynomial_basis_set(): + """Checking that accurate Legendre polynomials are created.""" + + n_vols = 100 + legendre_arr = utils.create_legendre_polynomial_basis_set(n_vols, dtrank=6) + + # Testing first 6 orders to 6 decimal accuracy using + # explicit equations rathern than scipy's lpmv + legendre_rounded = np.round(legendre_arr, decimals=6) + bounds = np.linspace(-1, 1, n_vols) + # order 0 (all 1's) + assert 
(legendre_arr[:, 0] == 1).sum() == n_vols + # order 1 (y=x) + assert np.abs((legendre_rounded[:, 1] - np.round(bounds, decimals=6))).sum() == 0 + # order 2 (y = 0.5*(3*x^2 - 1)) + tmp_o2 = 0.5 * (3 * bounds**2 - 1) + assert np.abs((legendre_rounded[:, 2] - np.round(tmp_o2, decimals=6))).sum() == 0 + # order 3 (y = 0.5*(5*x^3 - 3*x)) + tmp_o3 = 0.5 * (5 * bounds**3 - 3 * bounds) + assert np.abs((legendre_rounded[:, 3] - np.round(tmp_o3, decimals=6))).sum() == 0 + # order 4 (y = 0.125*(35*x^4 - 30*x^2 + 3)) + tmp_o4 = 0.125 * (35 * bounds**4 - 30 * bounds**2 + 3) + assert np.abs((legendre_rounded[:, 4] - np.round(tmp_o4, decimals=6))).sum() == 0 + # order 5 (y = 0.125*(63*x^5 - 70*x^3 + 15x)) + tmp_o5 = 0.125 * (63 * bounds**5 - 70 * bounds**3 + 15 * bounds) + assert np.abs((legendre_rounded[:, 5] - np.round(tmp_o5, decimals=6))).sum() == 0 + + def test_sec2millisec(): """Ensure that sec2millisec returns 1000x the input values.""" assert utils.sec2millisec(5) == 5000 diff --git a/tedana/tests/utils.py b/tedana/tests/utils.py index 5b4a473e3..6147165be 100644 --- a/tedana/tests/utils.py +++ b/tedana/tests/utils.py @@ -1,6 +1,22 @@ """Utility functions for testing tedana.""" -from os.path import abspath, dirname, join, sep +import glob +import json +import logging +import shutil +import tarfile +from datetime import datetime +from gzip import GzipFile +from io import BytesIO +from os import listdir, makedirs +from os.path import abspath, dirname, exists, getctime, getmtime, join, sep + +import requests + +from tedana.workflows import tedana as tedana_cli + +# Added a testing logger to output whether or not testing data were downlaoded +TestLGR = logging.getLogger("TESTING") def get_test_data_path(): @@ -12,3 +28,126 @@ def get_test_data_path(): Based on function by Yaroslav Halchenko used in Neurosynth Python package. 
""" return abspath(join(dirname(__file__), "data") + sep) + + +def data_for_testing_info(test_dataset=str): + """ + Get the path and download link for each dataset used for testing. + + Also creates the base directories into which the data and output + directories are written + + Parameters + ---------- + test_dataset : str + References one of the datasets to download. It can be: + three-echo + three-echo-reclassify + four-echo + five-echo + + Returns + ------- + test_data_path : str + The path to the local directory where the data will be downloaded + osf_id : str + The ID for the OSF file. + Data download link would be https://osf.io/osf_id/download + Metadata download link would be https://osf.io/osf_id/metadata/?format=datacite-json + """ + + tedana_path = dirname(tedana_cli.__file__) + base_data_path = abspath(join(tedana_path, "../../.testing_data_cache")) + makedirs(base_data_path, exist_ok=True) + makedirs(join(base_data_path, "outputs"), exist_ok=True) + if test_dataset == "three-echo": + test_data_path = join(base_data_path, "three-echo/TED.three-echo") + osf_id = "rqhfc" + makedirs(join(base_data_path, "three-echo"), exist_ok=True) + makedirs(join(base_data_path, "outputs/three-echo"), exist_ok=True) + elif test_dataset == "three-echo-reclassify": + test_data_path = join(base_data_path, "reclassify") + osf_id = "f6g45" + makedirs(join(base_data_path, "outputs/reclassify"), exist_ok=True) + elif test_dataset == "four-echo": + test_data_path = join(base_data_path, "four-echo/TED.four-echo") + osf_id = "gnj73" + makedirs(join(base_data_path, "four-echo"), exist_ok=True) + makedirs(join(base_data_path, "outputs/four-echo"), exist_ok=True) + elif test_dataset == "five-echo": + test_data_path = join(base_data_path, "five-echo/TED.five-echo") + osf_id = "9c42e" + makedirs(join(base_data_path, "five-echo"), exist_ok=True) + makedirs(join(base_data_path, "outputs/five-echo"), exist_ok=True) + else: + raise ValueError(f"{test_dataset} is not a valid dataset string 
for data_for_testing_info") + + return test_data_path, osf_id + + +def download_test_data(osf_id, test_data_path): + """If current data is not already available, downloads tar.gz data. + + Data are stored at `https://osf.io/osf_id/download`. + It unpacks into `out_path`. + + Parameters + ---------- + osf_id : str + The ID for the OSF file. + out_path : str + Path to directory where OSF data should be extracted + """ + + try: + datainfo = requests.get(f"https://osf.io/{osf_id}/metadata/?format=datacite-json") + except Exception: + if len(listdir(test_data_path)) == 0: + raise ConnectionError( + f"Cannot access https://osf.io/{osf_id} and testing data " "are not yet downloaded" + ) + else: + TestLGR.warning( + f"Cannot access https://osf.io/{osf_id}. " + f"Using local copy of testing data in {test_data_path} " + "but cannot validate that local copy is up-to-date" + ) + return + datainfo.raise_for_status() + metadata = json.loads(datainfo.content) + # 'dates' is a list with all udpates to the file, the last item in the list + # is the most recent and the 'date' field in the list is the date of the last + # update. + osf_filedate = metadata["dates"][-1]["date"] + + # File the file with the most recent date for comparision with + # the lsst updated date for the osf file + if exists(test_data_path): + filelist = glob.glob(f"{test_data_path}/*") + most_recent_file = max(filelist, key=getctime) + if exists(most_recent_file): + local_filedate = getmtime(most_recent_file) + local_filedate_str = str(datetime.fromtimestamp(local_filedate).date()) + local_data_exists = True + else: + local_data_exists = False + else: + local_data_exists = False + if local_data_exists: + if local_filedate_str == osf_filedate: + TestLGR.info( + f"Downloaded and up-to-date data already in {test_data_path}. Not redownloading" + ) + return + else: + TestLGR.info( + f"Downloaded data in {test_data_path} was last modified on " + f"{local_filedate_str}. 
Data on https://osf.io/{osf_id} " + f" was last updated on {osf_filedate}. Deleting and redownloading" + ) + shutil.rmtree(test_data_path) + req = requests.get(f"https://osf.io/{osf_id}/download") + req.raise_for_status() + t = tarfile.open(fileobj=GzipFile(fileobj=BytesIO(req.content))) + makedirs(test_data_path, exist_ok=True) + t.extractall(test_data_path) diff --git a/tedana/utils.py b/tedana/utils.py index 5235a63f7..9c2c0c67b 100644 --- a/tedana/utils.py +++ b/tedana/utils.py @@ -5,9 +5,11 @@ import platform import sys import warnings +from typing import Union import nibabel as nib import numpy as np +import numpy.typing as npt from bokeh import __version__ as bokeh_version from mapca import __version__ as mapca_version from matplotlib import __version__ as matplotlib_version @@ -18,6 +20,7 @@ from pandas import __version__ as pandas_version from scipy import __version__ as scipy_version from scipy import ndimage +from scipy.special import lpmv from sklearn import __version__ as sklearn_version from sklearn.utils import check_array from threadpoolctl import __version__ as threadpoolctl_version @@ -455,6 +458,42 @@ def threshold_map(img, min_cluster_size, threshold=None, mask=None, binarize=Tru return clust_thresholded +def create_legendre_polynomial_basis_set( + n_vols: int, dtrank: Union[int, None] = None +) -> npt.NDArray: + """ + Create Legendre polynomial basis set for detrending time series. + + Parameters + ---------- + n_vols : :obj:`int` + The number of time points in the fMRI time series + dtrank : :obj:`int`, optional + Specifies degree of Legendre polynomial basis function for estimating + spatial global signal. Default: None + If None, then this is set to 1+floor(n_vols/150) + + Returns + ------- + legendre_arr : (T X R) :obj:`np.ndarray` + A time by rank matrix of the first dtrank order Legendre polynomials. 
+ These include: + Order 0: y = 1 + Order 1: y = x + Order 2: y = 0.5*(3*x^2 - 1) + Order 3: y = 0.5*(5*x^3 - 3*x) + Order 4: y = 0.125*(35*x^4 - 30*x^2 + 3) + Order 5: y = 0.125*(63*x^5 - 70*x^3 + 15x) + """ + if dtrank is None: + dtrank = int(1 + np.floor(n_vols / 150)) + + bounds = np.linspace(-1, 1, n_vols) + legendre_arr = np.column_stack([lpmv(0, vv, bounds) for vv in range(dtrank)]) + + return legendre_arr + + def sec2millisec(arr): """ Convert seconds to milliseconds. diff --git a/tedana/workflows/tedana.py b/tedana/workflows/tedana.py index 5bdff23fc..b3e39022f 100644 --- a/tedana/workflows/tedana.py +++ b/tedana/workflows/tedana.py @@ -174,6 +174,18 @@ def _get_parser(): ), default="tedana_orig", ) + optional.add_argument( + "--external", + dest="external_regressors", + type=lambda x: is_valid_file(parser, x), + help=( + "File containing external regressors to compare to ICA component be used in the " + "decision tree. For example, to identify components fit head motion time series." + "The file must be a TSV file with the same number of rows as the number of volumes in " + "the input data. Column labels and statistical tests are defined with external_labels." + ), + default=None, + ) optional.add_argument( "--seed", dest="fixed_seed", @@ -334,6 +346,7 @@ def tedana_workflow( fittype="loglin", combmode="t2s", tree="tedana_orig", + external_regressors=None, tedpca="aic", fixed_seed=42, maxit=500, @@ -395,10 +408,15 @@ def tedana_workflow( process in MEICA. A difference between that tree and the older MEICA was identified so the original meica tree is also included. meica will always accept the same or more components, but those accepted components are sometimes - high variance so the differences can be non-trivial. Minimal is intented + high variance so the differences can be non-trivial. Minimal is intended to be a simpler process, but it accepts and rejects some distinct components compared to the others. 
Testing to better understand the effects of the differences is ongoing. Default is 'tedana_orig'. + external_regressors : :obj:`str` or None, optional + File containing external regressors to be used in the decision tree. + The file must be a TSV file with the same number of rows as the number of volumes in + the input data. Each column in the file will be treated as a separate regressor. + Default is None. tedpca : {'mdl', 'aic', 'kic', 'kundu', 'kundu-stabilize', float, int}, optional Method with which to select components in TEDPCA. If a float is provided, then it is assumed to represent percentage of variance @@ -522,6 +540,19 @@ def tedana_workflow( LGR.info(f"Loading input data: {[f for f in data]}") catd, ref_img = io.load_data(data, n_echos=n_echos) + # Load external regressors if provided + # Decided to do the validation here so that, if there are issues, an error + # will be raised before PCA/ICA + if ( + "external_regressor_config" in set(selector.tree.keys()) + and selector.tree["external_regressor_config"] is not None + ): + external_regressors, selector.tree["external_regressor_config"] = ( + metrics.external.load_validate_external_regressors( + external_regressors, selector.tree["external_regressor_config"], catd.shape[2] + ) + ) + io_generator = io.OutputGenerator( ref_img, convention=convention, @@ -708,16 +739,26 @@ def tedana_workflow( extra_metrics = ["variance explained", "normalized variance explained", "kappa", "rho"] necessary_metrics = sorted(list(set(necessary_metrics + extra_metrics))) - comptable = metrics.collect.generate_metrics( - catd, - data_oc, - mmix, - masksum_clf, - tes, - io_generator, - "ICA", + comptable, _ = metrics.collect.generate_metrics( + data_cat=catd, + data_optcom=data_oc, + mixing=mmix, + adaptive_mask=masksum_clf, + tes=tes, + io_generator=io_generator, + label="ICA", metrics=necessary_metrics, + external_regressors=external_regressors, + external_regressor_config=selector.tree["external_regressor_config"], + ) + 
LGR.info("Selecting components from ICA results") + selector = selection.automatic_selection( + comptable, + selector, + n_echos=n_echos, + n_vols=n_vols, ) + n_likely_bold_comps = selector.n_likely_bold_comps_ LGR.info("Selecting components from ICA results") selector = selection.automatic_selection( comptable, @@ -738,7 +779,12 @@ def tedana_workflow( if keep_restarting: io_generator.overwrite = True # Create a re-initialized selector object if rerunning + # Since external_regressor_config might have been expanded to remove + # regular expressions immediately after initialization, + # store and copy this key + tmp_external_regressor_config = selector.tree["external_regressor_config"] selector = ComponentSelector(tree) + selector.tree["external_regressor_config"] = tmp_external_regressor_config RepLGR.disabled = True # Disable the report to avoid duplicate text RepLGR.disabled = False # Re-enable the report after the while loop is escaped @@ -748,27 +794,26 @@ def tedana_workflow( mixing_file = io_generator.get_name("ICA mixing tsv") mmix = pd.read_table(mixing_file).values - selector = ComponentSelector(tree) + # selector = ComponentSelector(tree) necessary_metrics = selector.necessary_metrics # The figures require some metrics that might not be used by the decision tree. 
extra_metrics = ["variance explained", "normalized variance explained", "kappa", "rho"] necessary_metrics = sorted(list(set(necessary_metrics + extra_metrics))) - comptable = metrics.collect.generate_metrics( - catd, - data_oc, - mmix, - masksum_clf, - tes, - io_generator, - "ICA", + comptable, _ = metrics.collect.generate_metrics( + data_cat=catd, + data_optcom=data_oc, + mixing=mmix, + adaptive_mask=masksum_clf, + tes=tes, + io_generator=io_generator, + label="ICA", metrics=necessary_metrics, + external_regressors=external_regressors, + external_regressor_config=selector.tree["external_regressor_config"], ) selector = selection.automatic_selection( - comptable, - selector, - n_echos=n_echos, - n_vols=n_vols, + comptable, selector, n_echos=n_echos, n_vols=n_vols ) # TODO The ICA mixing matrix should be written out after it is created