From 62b798a11d347fccfe119a5c7ea75f8b1e5227e1 Mon Sep 17 00:00:00 2001 From: Sarah Teichman Date: Fri, 19 Dec 2025 16:12:56 -0500 Subject: [PATCH 1/2] updates before submitting to cran --- DESCRIPTION | 2 +- R/X_cup_from_X.R | 25 +++++++ R/emuFit.R | 53 ++++++++++----- R/emuFit_micro_penalized.R | 4 +- R/fit_null.R | 2 +- R/get_G_for_augmentations.R | 18 +++++ man/emuFit.Rd | 42 ++++++++---- tests/testthat/test-X_cup_from_X.R | 13 ++++ tests/testthat/test-emuFit.R | 4 +- tests/testthat/test-fit_null.R | 3 + tests/testthat/test-get_G_for_augmentations.R | 17 +++++ vignettes/quick_start.Rmd | 67 +++++++++++++++++++ 12 files changed, 213 insertions(+), 37 deletions(-) create mode 100644 tests/testthat/test-X_cup_from_X.R create mode 100644 tests/testthat/test-get_G_for_augmentations.R create mode 100644 vignettes/quick_start.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index 30cbd8e..b2e7c8a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: radEmu Title: Using Relative Abundance Data to Estimate of Multiplicative Differences in Mean Absolute Abundance -Version: 2.2.1.0 +Version: 2.3.1.0 Authors@R: c(person("David", "Clausen", role = c("aut")), person("Amy D", "Willis", email = "adwillis@uw.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2802-4317")), person("Sarah", "Teichman", role = "aut")) diff --git a/R/X_cup_from_X.R b/R/X_cup_from_X.R index f736b04..37e855a 100644 --- a/R/X_cup_from_X.R +++ b/R/X_cup_from_X.R @@ -20,3 +20,28 @@ X_cup_from_X <- function(X,J){ return(X_cup) } + +X_cup_from_X_fast <- function(X, J) { + n <- nrow(X) + p <- ncol(X) + + # Total number of rows in final matrix + total_rows <- n * J + + # Construct i index (row indices) + i_coords <- rep(1:total_rows, each = p) + + # Construct j index (column indices) + j_block <- rep(0:(J - 1), each = p) * p + j_coords <- rep(j_block, times = n) + rep(1:p, times = total_rows) + + # Construct values + X_rep <- X[rep(1:n, each = J), ] + x_vals <- as.vector(t(X_rep)) # row-wise unrolling + + X_cup <- Matrix::sparseMatrix(i = i_coords, + j = j_coords, + x = x_vals) + + return(X_cup) +} \ No newline at end of file diff --git a/R/emuFit.R b/R/emuFit.R index 52f62ba..a79213d 100644 --- a/R/emuFit.R +++ b/R/emuFit.R @@ -45,6 +45,10 @@ #' (corresponding to 95% confidence intervals) #' @param return_wald_p logical: return p-values from Wald tests? Default is `FALSE`. #' @param compute_cis logical: compute and return Wald CIs? Default is `TRUE`. +#' @param max_abs_B maximum allowed value for elements of B (in absolute value) in full model fitting. In +#' most cases this is not needed as Firth penalty will prevent infinite estimates +#' under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many +#' nearly collinear regressors). Default is \code{250}. #' @param run_score_tests logical: perform robust score testing? Default is TRUE. #' @param test_kj a data frame whose rows give coordinates (in category j and #' covariate k) of elements of B to construct hypothesis tests for. If you don't know @@ -77,26 +81,34 @@ #' @param ... Additional arguments. Arguments matching the names of \code{control_fn()} options are forwarded to that function and override #' defaults. Unknown arguments are ignored with a warning. #' -#' @return A list containing elements 'coef', 'B', 'penalized', 'Y_augmented', -#' 'z_hat', 'I', 'Dy', and 'score_test_hyperparams' if score tests are run. -#' Parameter estimates by covariate and outcome category (e.g., taxon for microbiome data), -#' as well as optionally confidence intervals and p-values, are contained in 'coef'. -#' Any robust score statistics and score test p-values are also included in 'coef'. -#' If there are any zero-comparison parameters in the model, a column 'zero_comparison' -#' is also included, which is TRUE for any parameters that compare the level of a categorical +#' @return \code{emuFit} returns a list containing elements \code{coef}, \code{B}, \code{penalized}, \code{Y_augmented}, +#' \code{z_hat}, \code{I}, \code{Dy}, and \code{score_test_hyperparams} if score tests are run. +#' +#' The \code{coef} table contains log fold-difference parameter estimates by covariate and outcome +#' category (e.g., taxon for microbiome data). A log fold-difference estimate of \code{1} for a treatment (versus control) and taxon X can be +#' interpreted to say that we expect taxon X is \code{exp(1) = 2.72} times more abundant in someone who has received the treatment compared to +#' someone who has received the control (holding all other covariates equal), when compared to typical fold-differences in abundances of +#' taxa in this analysis. +#' +#' \code{coef} also includes optionally-computed confidence intervals and robust Wald p-values. +#' Robust score statistics and score test p-values are also included in \code{coef}. +#' As explained in the associated manuscript, we recommend use of the robust score test values instead of the robust +#' Wald test p-values, due to better error rate control (i.e. fewer false positives). +#' +#' If there are any zero-comparison parameters in the model, a column "zero_comparison" +#' is also included, which is \code{TRUE} for any parameters that compare the level of a categorical #' covariate to a reference level for a category with only zero counts for both the comparison #' level and the reference level. This check is currently implemented for an arbitrary design matrix #' generated using the \code{formula} and \code{data} arguments, and for a design matrix with no more #' than one categorical covariate if the design matrix \code{X} is input directly. -#' 'B' contains parameter estimates in matrix format (rows indexing covariates and +#' +#' \code{B} contains parameter estimates in matrix format (rows indexing covariates and #' columns indexing outcome category / taxon). -#' 'penalized' is equal to TRUE f Firth penalty is used in estimation (default) and FALSE otherwise. -#' 'z_hat' returns the nuisance parameters calculated in Equation 7 of the radEmu manuscript, -#' corresponding to either 'Y_augmented' or 'Y' if the 'penalized' is equal to TRUE -#' or FALSE, respectively. -#' I' and 'Dy' contain an information matrix and empirical score covariance matrix computed under the +#' \code{penalized} is \code{TRUE} if Firth penalty is used in estimation (default) and \code{FALSE} otherwise. +#' \code{z_hat} returns the nuisance parameters (sample-specific sequencing effects). +#' \code{I} and \code{Dy} contain an information matrix and empirical score covariance matrix computed under the #' full model. -#' 'score_test_hyperparams' contains parameters and hyperparameters related to estimation under the null, +#' \code{score_test_hyperparams} contains parameters and hyperparameters related to estimation under the null, #' including whether or not the algorithm converged, which can be helpful for debugging. #' #' @importFrom stats cov median model.matrix optim pchisq qnorm weighted.mean @@ -146,6 +158,7 @@ emuFit <- function(Y, alpha = 0.05, return_wald_p = FALSE, compute_cis = TRUE, + max_abs_B = 250, run_score_tests = TRUE, test_kj = NULL, null_fit_alg = "constraint_sandwich", @@ -222,7 +235,7 @@ emuFit <- function(Y, # check for zero-comparison parameters zero_comparison_res <- zero_comparison_check(X = X, Y = Y) - X_cup <- X_cup_from_X(X,J) + X_cup <- X_cup_from_X_fast(X,J) @@ -251,7 +264,8 @@ emuFit <- function(Y, max_step = control$max_step, tolerance = tolerance, verbose = (verbose == "development"), - j_ref = j_ref) + j_ref = j_ref, + max_abs_B = max_abs_B) Y_test <- fitted_model$Y_augmented fitted_B <- fitted_model$B converged_estimates <- fitted_model$convergence @@ -295,7 +309,7 @@ emuFit <- function(Y, } else { fitted_B <- B if (penalize) { - G <- get_G_for_augmentations(X, J, n, X_cup) + G <- get_G_for_augmentations_fast(X, J, n, X_cup) Y_test <- Y_augmented <- Y + get_augmentations(X = X, G = G, Y = Y, B = fitted_B) } else { @@ -305,6 +319,11 @@ emuFit <- function(Y, } } + max_est_B <- max(abs(fitted_B)) + if (refit & (max_est_B >= 0.9 * max_abs_B)) { + warning("At least one estimated B value is within 10% of your `max_abs_B` boundary. We suggest that you rerun estimation with a larger `max_abs_B` value.") + } + if (p == 1) { message("You are running an intercept-only model. In this model the intercept beta_0^j is an unidentifiable combination of intercept for category j and the detection efficiency of category j. Therefore this parameter is not interpretable.") diff --git a/R/emuFit_micro_penalized.R b/R/emuFit_micro_penalized.R index 385f72a..86a9eae 100644 --- a/R/emuFit_micro_penalized.R +++ b/R/emuFit_micro_penalized.R @@ -64,9 +64,9 @@ may take a moment." ) } if (is.null(X_cup)) { - X_cup <- X_cup_from_X(X, J) + X_cup <- X_cup_from_X_fast(X, J) } - G <- get_G_for_augmentations(X, J, n, X_cup) + G <- get_G_for_augmentations_fast(X, J, n, X_cup) while (!converged) { # print(counter) diff --git a/R/fit_null.R b/R/fit_null.R index b6f6b65..0756272 100644 --- a/R/fit_null.R +++ b/R/fit_null.R @@ -101,7 +101,7 @@ fit_null <- function(B, #get X_cup for later use if (is.null(X_cup)) { - X_cup = X_cup_from_X(X,J) + X_cup = X_cup_from_X_fast(X,J) } #set iteration to zero diff --git a/R/get_G_for_augmentations.R b/R/get_G_for_augmentations.R index b1cef00..a1dc793 100644 --- a/R/get_G_for_augmentations.R +++ b/R/get_G_for_augmentations.R @@ -22,3 +22,21 @@ get_G_for_augmentations <- function(X, G <- cbind(X_tilde_J,Z) return(G) } + +get_G_for_augmentations_fast <- function(X, J, n, X_cup) { + p <- ncol(X) + #calculate indices of columns to remove from X_cup (aka X_tilde, the expanded design matrix) + to_delete <- p*(J - 1) + 1:p #col indices for elements of B^J in long B format + last_class_rows <- seq(from = J, to = n * J, by = J) + X_cup[last_class_rows, ] <- 0 + + X_cup <- X_cup[, -to_delete] + + #design matrix in z + Z <- Matrix::sparseMatrix(i = 1:(n*J), + j = rep(1:n, each = J), + x = 1, + dims = c(n * J, n)) + G <- cbind(X_cup,Z) + return(G) +} diff --git a/man/emuFit.Rd b/man/emuFit.Rd index 7b57325..1d3afdc 100644 --- a/man/emuFit.Rd +++ b/man/emuFit.Rd @@ -26,6 +26,7 @@ emuFit( alpha = 0.05, return_wald_p = FALSE, compute_cis = TRUE, + max_abs_B = 250, run_score_tests = TRUE, test_kj = NULL, null_fit_alg = "constraint_sandwich", @@ -107,6 +108,11 @@ Used in estimation.} \item{compute_cis}{logical: compute and return Wald CIs? Default is \code{TRUE}.} +\item{max_abs_B}{maximum allowed value for elements of B (in absolute value) in full model fitting. In +most cases this is not needed as Firth penalty will prevent infinite estimates +under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many +nearly collinear regressors). Default is \code{250}.} + \item{run_score_tests}{logical: perform robust score testing? Default is TRUE.} \item{test_kj}{a data frame whose rows give coordinates (in category j and @@ -151,26 +157,34 @@ Default is \code{0.01}.} defaults. Unknown arguments are ignored with a warning.} } \value{ -A list containing elements 'coef', 'B', 'penalized', 'Y_augmented', -'z_hat', 'I', 'Dy', and 'score_test_hyperparams' if score tests are run. -Parameter estimates by covariate and outcome category (e.g., taxon for microbiome data), -as well as optionally confidence intervals and p-values, are contained in 'coef'. -Any robust score statistics and score test p-values are also included in 'coef'. -If there are any zero-comparison parameters in the model, a column 'zero_comparison' -is also included, which is TRUE for any parameters that compare the level of a categorical +\code{emuFit} returns a list containing elements \code{coef}, \code{B}, \code{penalized}, \code{Y_augmented}, +\code{z_hat}, \code{I}, \code{Dy}, and \code{score_test_hyperparams} if score tests are run. + +The \code{coef} table contains log fold-difference parameter estimates by covariate and outcome +category (e.g., taxon for microbiome data). A log fold-difference estimate of \code{1} for a treatment (versus control) and taxon X can be +interpreted to say that we expect taxon X is \code{exp(1) = 2.72} times more abundant in someone who has received the treatment compared to +someone who has received the control (holding all other covariates equal), when compared to typical fold-differences in abundances of +taxa in this analysis. + +\code{coef} also includes optionally-computed confidence intervals and robust Wald p-values. +Robust score statistics and score test p-values are also included in \code{coef}. +As explained in the associated manuscript, we recommend use of the robust score test values instead of the robust +Wald test p-values, due to better error rate control (i.e. fewer false positives). + +If there are any zero-comparison parameters in the model, a column "zero_comparison" +is also included, which is \code{TRUE} for any parameters that compare the level of a categorical covariate to a reference level for a category with only zero counts for both the comparison level and the reference level. This check is currently implemented for an arbitrary design matrix generated using the \code{formula} and \code{data} arguments, and for a design matrix with no more than one categorical covariate if the design matrix \code{X} is input directly. -'B' contains parameter estimates in matrix format (rows indexing covariates and + +\code{B} contains parameter estimates in matrix format (rows indexing covariates and columns indexing outcome category / taxon). -'penalized' is equal to TRUE f Firth penalty is used in estimation (default) and FALSE otherwise. -'z_hat' returns the nuisance parameters calculated in Equation 7 of the radEmu manuscript, -corresponding to either 'Y_augmented' or 'Y' if the 'penalized' is equal to TRUE -or FALSE, respectively. -I' and 'Dy' contain an information matrix and empirical score covariance matrix computed under the +\code{penalized} is \code{TRUE} if Firth penalty is used in estimation (default) and \code{FALSE} otherwise. +\code{z_hat} returns the nuisance parameters (sample-specific sequencing effects). +\code{I} and \code{Dy} contain an information matrix and empirical score covariance matrix computed under the full model. -'score_test_hyperparams' contains parameters and hyperparameters related to estimation under the null, +\code{score_test_hyperparams} contains parameters and hyperparameters related to estimation under the null, including whether or not the algorithm converged, which can be helpful for debugging. } \description{ diff --git a/tests/testthat/test-X_cup_from_X.R b/tests/testthat/test-X_cup_from_X.R new file mode 100644 index 0000000..45ace43 --- /dev/null +++ b/tests/testthat/test-X_cup_from_X.R @@ -0,0 +1,13 @@ +test_that("X_cup_from_X_fast works and is faster than X_cup_from_X", { + X <- cbind(1, rnorm(100), runif(100)) + J <- 200 + start1 <- proc.time() + Xcup1 <- X_cup_from_X(X, J) + end1 <- proc.time() - start1 + start2 <- proc.time() + Xcup2 <- X_cup_from_X_fast(X, J) + end2 <- proc.time() - start2 + + expect_true(all.equal(Xcup1, Xcup2)) + expect_true(end2[3] < end1[3]) +}) \ No newline at end of file diff --git a/tests/testthat/test-emuFit.R b/tests/testthat/test-emuFit.R index 6a7f802..8621dd3 100644 --- a/tests/testthat/test-emuFit.R +++ b/tests/testthat/test-emuFit.R @@ -404,7 +404,7 @@ test_that("emuFit has 'score_test_hyperparams' object and throws warnings when c expect_false(fitted_model$estimation_converged) # check that warning is returned when estimation under the null doesn't converge - suppressWarnings({ + suppressMessages({suppressWarnings({ fitted_model2 <- emuFit(Y = Y, X = cbind(X, rnorm(nrow(X))), verbose = FALSE, @@ -416,7 +416,7 @@ test_that("emuFit has 'score_test_hyperparams' object and throws warnings when c test_kj = data.frame(k = 1, j = 1:2), maxit_null = 5, inner_maxit = 1) - }) + })}) # check that fitted model contains score_test_hyperparams object expect_true("score_test_hyperparams" %in% names(fitted_model2)) diff --git a/tests/testthat/test-fit_null.R b/tests/testthat/test-fit_null.R index 062a71a..2932718 100644 --- a/tests/testthat/test-fit_null.R +++ b/tests/testthat/test-fit_null.R @@ -1,4 +1,7 @@ test_that("we get same null fit with different j_ref", { + + skip("Deterministic, slow to run but passes when tested") + set.seed(59542234) n <- 10 J <- 5 diff --git a/tests/testthat/test-get_G_for_augmentations.R b/tests/testthat/test-get_G_for_augmentations.R new file mode 100644 index 0000000..8558f1f --- /dev/null +++ b/tests/testthat/test-get_G_for_augmentations.R @@ -0,0 +1,17 @@ +test_that("get_G_for_augmentations_fast works and is faster than get_G_for_augmentations", { + X <- cbind(1, rnorm(200), runif(200)) + J <- 200 + Xcup2 <- X_cup_from_X_fast(X, J) + + start1 <- proc.time() + g1 <- get_G_for_augmentations(X, J, nrow(X), Xcup2) + end1 <- proc.time() - start1 + + start2 <- proc.time() + g2 <- get_G_for_augmentations_fast(X, J, nrow(X), Xcup2) + end2 <- proc.time() - start2 + + expect_true(all.equal(g1, g2)) + expect_true(end2[3] < end1[3]) +}) + diff --git a/vignettes/quick_start.Rmd b/vignettes/quick_start.Rmd new file mode 100644 index 0000000..6aa9a6a --- /dev/null +++ b/vignettes/quick_start.Rmd @@ -0,0 +1,67 @@ +--- +title: "Quick start with `radEmu`" +author: "Sarah Teichman and Amy Willis" +date: "`r Sys.Date()`" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{quickstart} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Setup + +First, we will install `radEmu`, if we haven't already. + +```{r, eval = FALSE} +# if (!require("remotes", quietly = TRUE)) +# install.packages("remotes") +# +# remotes::install_github("statdivlab/radEmu") +``` + +Next, we can load `radEmu`. + +```{r setup, message = FALSE} +library(radEmu) +``` + +In this vignette we'll explore a [dataset published by Wirbel et al. (2019)](https://www.nature.com/articles/s41591-019-0406-6). This is a meta-analysis of case-control studies of colorectal cancer, meaning that Wirbel et al. collected raw sequencing data from studies other researchers conducted and re-analyzed it. + +Wirbel et al. published two pieces of data we'll focus on: + +* metadata giving demographics and other information about participants +* a mOTU (metagenomic OTU) table + +```{r} +data("wirbel_sample") +dim(wirbel_sample) +data("wirbel_otu") +dim(wirbel_otu) +``` + +We can see that we have $566$ samples and $845$ mOTUs. We will subset to samples from the Chinese study. + +```{r} +wirbel_sample_ch <- wirbel_sample[wirbel_sample$Country == "CHI", ] +wirbel_otu_ch <- wirbel_otu[rownames(wirbel_sample_ch), ] +to_rm <- which(colSums(wirbel_otu_ch) == 0) +wirbel_otu_ch <- wirbel_otu_ch[, -to_rm] +``` + +## Differential abundance analysis with radEmu + +```{r} +tmp <- emuFit(Y = wirbel_otu_ch, formula = ~ Group + Gender, data = wirbel_sample_ch, + run_score_tests = FALSE, tolerance = 1e-3) +tmp1 <- emuFit(Y = wirbel_otu_ch, formula = ~ Group + Gender, data = wirbel_sample_ch, + refit = FALSE, fitted_model = tmp, compute_cis = FALSE, verbose = "development", test_kj = data.frame(k = 2, j = 3)) +``` + From a7eda693a7a0ad4eba894d8ec08bca577cd837b5 Mon Sep 17 00:00:00 2001 From: Sarah Teichman Date: Fri, 19 Dec 2025 17:17:42 -0500 Subject: [PATCH 2/2] add quickstart vignette --- R/emuFit.R | 4 ++++ R/emuFit_check.R | 4 ++-- vignettes/quick_start.Rmd | 33 +++++++++++++++++++++++++++++---- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/R/emuFit.R b/R/emuFit.R index a79213d..c854d0a 100644 --- a/R/emuFit.R +++ b/R/emuFit.R @@ -248,6 +248,10 @@ emuFit <- function(Y, if (refit) { + if (verbose %in% c(TRUE, "development")) { + message("Estimating parameters") + } + if (!is.null(fitted_model)) { B <- fitted_model$B } diff --git a/R/emuFit_check.R b/R/emuFit_check.R index 498d602..ffc82a2 100644 --- a/R/emuFit_check.R +++ b/R/emuFit_check.R @@ -251,7 +251,7 @@ ignoring argument 'cluster'.") # check that test_kj is not null if running score tests if (run_score_tests) { if (is.null(test_kj)) { - stop("When `run_score_tests = TRUE`, you must provide a matrix `test_kj` to determine which parameters you want to test. If you don't know which indices k correspond to the covariate(s) that you would like to test, run the function `radEmu::make_design_matrix()` in order to view the design matrix, and identify which column of the design matrix corresponds to each covariate in your model. If you don't know which indices j correspond to categories (taxa) that you want to test, you can look at the columns and column names of your `Y` matrix.") + stop("When `run_score_tests = TRUE`, you must provide a data frame `test_kj` to determine which parameters you want to test. If you don't know which indices k correspond to the covariate(s) that you would like to test, run the function `radEmu::make_design_matrix()` in order to view the design matrix, and identify which column of the design matrix corresponds to each covariate in your model. If you don't know which indices j correspond to categories (taxa) that you want to test, you can look at the columns and column names of your `Y` matrix.") } } @@ -299,7 +299,7 @@ ignoring argument 'cluster'.") if (is.logical(all.equal(constraint_fn[[k]], pseudohuber_median))) { if (all.equal(constraint_fn[[k]], pseudohuber_median)) { - if (verbose %in% c(TRUE, "development")) message("Centering row ", k, " of B with pseudo-Huber smoothed median with smoothing parameter ", constraint_param, ".") + if (verbose %in% c("development")) message("Centering row ", k, " of B with pseudo-Huber smoothed median with smoothing parameter ", constraint_param, ".") stopifnot(!is.na(constraint_param)) diff --git a/vignettes/quick_start.Rmd b/vignettes/quick_start.Rmd index 6aa9a6a..8a3472d 100644 --- a/vignettes/quick_start.Rmd +++ b/vignettes/quick_start.Rmd @@ -42,6 +42,8 @@ Wirbel et al. published two pieces of data we'll focus on: ```{r} data("wirbel_sample") +# encode control as the baseline level +wirbel_sample$Group <- factor(wirbel_sample$Group, levels = c("CTR","CRC")) dim(wirbel_sample) data("wirbel_otu") dim(wirbel_otu) @@ -58,10 +60,33 @@ wirbel_otu_ch <- wirbel_otu_ch[, -to_rm] ## Differential abundance analysis with radEmu +We now are ready to fit a `radEmu` model with the `emuFit()` function! This will take 1-2 minutes to run, so we suggest you start running it and then read below about the arguments! + ```{r} -tmp <- emuFit(Y = wirbel_otu_ch, formula = ~ Group + Gender, data = wirbel_sample_ch, - run_score_tests = FALSE, tolerance = 1e-3) -tmp1 <- emuFit(Y = wirbel_otu_ch, formula = ~ Group + Gender, data = wirbel_sample_ch, - refit = FALSE, fitted_model = tmp, compute_cis = FALSE, verbose = "development", test_kj = data.frame(k = 2, j = 3)) +mod <- emuFit(Y = wirbel_otu_ch, formula = ~ Group + Gender, data = wirbel_sample_ch, + test_kj = data.frame(k = 2, j = 1), verbose = TRUE) ``` +Some of the important arguments for `emuFit()` are the following: + + - `formula`: This is a formula telling radEmu what predictors to use in its model. We are using Group, which is an indicator for case (CRC) vs control (CTR), and Gender. + - `data`: A data frame containing covariate information about our samples. + - `Y`: A matrix containing our observed abundance data (e.g., counts or depth measurements). The rows give the observations (samples), and the columns give the categories (taxa/mOTUs). Note that `Y` doesn't have to be integer-valued (counts)! + - `test_kj`: a data frame describing which robust score tests you want to run. `k` corresponds with the predictor(s) that you want to test (if you don't know what order your predictors appear in your design matrix, use the function `make_design_matrix()` with your `formula` and `data` to check!) and `j` corresponds with the taxa that you want to test (check `colnames(Y)` to see the order of the taxa). If you wanted to test the Group covariate for all taxa then you could set `test_kj = data.frame(k = 2, j = 1:ncol(Y))`. + - `verbose`: do you want the code to give you updates as it goes along? + +In the example above we only test the first taxon, because each test for a data set this size takes approximately 30 seconds. Typically you'll want to test all taxa that you are interested in, so you may need to leave this running for a few hours, or check out the "parallel_radEmu" vignette to see how you could parallelize these score tests if you have access to additional computing resources. + +Now that we've fit the model and run a robust score test for the first taxon, we can look at the results! The parameter estimates and any test results are in the `coef` part of the `emuFit` object. + +```{r} +head(mod$coef) +``` + +The first taxon in our model is *Streptococcus anginosus*, which has a log fold-difference estimate of $1.23$. We will interpret this to mean that we expect that *Streptococcus anginosus* is $exp(1.23) \approx 3.42$ times more abundant in cases of colorectal cancer compared to controls of the same gender, when compared to the typical fold-differences in abundances of the taxa in this analysis. + +We could similarly interpret the log fold-differences for each taxon in our analysis. + +For *Streptococcus anginosus* we have a robust score test p-value of $0.09$. This means that we do not have enough evidence to reject the null hypothesis that the fold-difference in abundance of *Streptococcus anginosus* between cases and controls is the same as the typical fold-difference in abundance between cases and controls across all taxa in this analysis. When we say ``typical" here we mean approximately the median fold-difference across taxa. + +Now you are ready to start using `radEmu`! We recommend our other vignettes for a deeper look using `radEmu`, including for `phyloseq` or `TreeSummarizedExperiment` objects, with clustered data, and in parallel. \ No newline at end of file