statdivlab · svteichman · Dec 19, 2025 · Dec 19, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: radEmu
 Title: Using Relative Abundance Data to Estimate of Multiplicative Differences in Mean Absolute Abundance 
-Version: 2.2.1.0
+Version: 2.3.1.0
 Authors@R: c(person("David", "Clausen", role = c("aut")),
              person("Amy D", "Willis", email = "adwillis@uw.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2802-4317")),
              person("Sarah", "Teichman", role = "aut"))

diff --git a/R/X_cup_from_X.R b/R/X_cup_from_X.R
@@ -20,3 +20,28 @@ X_cup_from_X <- function(X,J){
 
   return(X_cup)
 }
+
+X_cup_from_X_fast <- function(X, J) {
+  n <- nrow(X)
+  p <- ncol(X)
+
+  # Total number of rows in final matrix
+  total_rows <- n * J
+
+  # Construct i index (row indices)
+  i_coords <- rep(1:total_rows, each = p)
+
+  # Construct j index (column indices)
+  j_block <- rep(0:(J - 1), each = p) * p
+  j_coords <- rep(j_block, times = n) + rep(1:p, times = total_rows)
+
+  # Construct values
+  X_rep <- X[rep(1:n, each = J), ]
+  x_vals <- as.vector(t(X_rep))  # row-wise unrolling
+
+  X_cup <- Matrix::sparseMatrix(i = i_coords,
+                                j = j_coords,
+                                x = x_vals)
+
+  return(X_cup)
+}
diff --git a/R/emuFit.R b/R/emuFit.R
@@ -45,6 +45,10 @@
 #' (corresponding to 95% confidence intervals)
 #' @param return_wald_p logical: return p-values from Wald tests? Default is `FALSE`.
 #' @param compute_cis logical: compute and return Wald CIs? Default is `TRUE`.
+#' @param max_abs_B maximum allowed value for elements of B (in absolute value) in full model fitting. In
+#' most cases this is not needed as Firth penalty will prevent infinite estimates
+#' under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many
+#' nearly collinear regressors). Default is \code{250}.
 #' @param run_score_tests logical: perform robust score testing? Default is TRUE.
 #' @param test_kj a data frame whose rows give coordinates (in category j and
 #' covariate k) of elements of B to construct hypothesis tests for. If you don't know
@@ -77,26 +81,34 @@
 #' @param ... Additional arguments. Arguments matching the names of \code{control_fn()} options are forwarded to that function and override
 #' defaults. Unknown arguments are ignored with a warning.
 #' 
-#' @return A list containing elements 'coef', 'B', 'penalized', 'Y_augmented',
-#' 'z_hat', 'I', 'Dy', and 'score_test_hyperparams' if score tests are run.  
-#' Parameter estimates by covariate and outcome category (e.g., taxon for microbiome data),
-#' as well as optionally confidence intervals and p-values, are contained in 'coef'.
-#' Any robust score statistics and score test p-values are also included in 'coef'. 
-#' If there are any zero-comparison parameters in the model, a column 'zero_comparison'
-#' is also included, which is TRUE for any parameters that compare the level of a categorical
+#' @return \code{emuFit} returns a list containing elements \code{coef}, \code{B}, \code{penalized}, \code{Y_augmented},
+#' \code{z_hat}, \code{I}, \code{Dy}, and \code{score_test_hyperparams} if score tests are run.  
+#' 
+#' The \code{coef} table contains log fold-difference parameter estimates by covariate and outcome 
+#' category (e.g., taxon for microbiome data). A log fold-difference estimate of \code{1} for a treatment (versus control) and taxon X can be 
+#' interpreted to say that we expect taxon X is \code{exp(1) = 2.72} times more abundant in someone who has received the treatment compared to 
+#' someone who has received the control (holding all other covariates equal), when compared to typical fold-differences in abundances of 
+#' taxa in this analysis.
+#'
+#' \code{coef} also includes optionally-computed confidence intervals and robust Wald p-values.
+#' Robust score statistics and score test p-values are also included in \code{coef}. 
+#' As explained in the associated manuscript, we recommend use of the robust score test values instead of the robust 
+#' Wald test p-values, due to better error rate control (i.e. fewer false positives). 
+#' 
+#' If there are any zero-comparison parameters in the model, a column "zero_comparison"
+#' is also included, which is \code{TRUE} for any parameters that compare the level of a categorical
 #' covariate to a reference level for a category with only zero counts for both the comparison
 #' level and the reference level. This check is currently implemented for an arbitrary design matrix
 #' generated using the \code{formula} and \code{data} arguments, and for a design matrix with no more
 #' than one categorical covariate if the design matrix \code{X} is input directly.
-#' 'B' contains parameter estimates in matrix format (rows indexing covariates and
+#' 
+#' \code{B} contains parameter estimates in matrix format (rows indexing covariates and
 #' columns indexing outcome category / taxon). 
-#' 'penalized' is equal to TRUE f Firth penalty is used in estimation (default) and FALSE otherwise. 
-#' 'z_hat' returns the nuisance parameters calculated in Equation 7 of the radEmu manuscript,
-#' corresponding to either 'Y_augmented' or 'Y' if the 'penalized' is equal to TRUE
-#' or FALSE, respectively. 
-#' I' and 'Dy' contain an information matrix and empirical score covariance matrix computed under the 
+#' \code{penalized} is \code{TRUE} if Firth penalty is used in estimation (default) and \code{FALSE} otherwise. 
+#' \code{z_hat} returns the nuisance parameters (sample-specific sequencing effects).
+#' \code{I} and \code{Dy} contain an information matrix and empirical score covariance matrix computed under the 
 #' full model. 
-#' 'score_test_hyperparams' contains parameters and hyperparameters related to estimation under the null,
+#' \code{score_test_hyperparams} contains parameters and hyperparameters related to estimation under the null,
 #' including whether or not the algorithm converged, which can be helpful for debugging. 
 #'
 #' @importFrom stats cov median model.matrix optim pchisq qnorm weighted.mean
@@ -146,6 +158,7 @@ emuFit <- function(Y,
                    alpha = 0.05,
                    return_wald_p = FALSE,
                    compute_cis = TRUE,
+                   max_abs_B = 250, 
                    run_score_tests = TRUE,
                    test_kj = NULL,
                    null_fit_alg = "constraint_sandwich",
@@ -222,7 +235,7 @@ emuFit <- function(Y,
   # check for zero-comparison parameters
   zero_comparison_res <- zero_comparison_check(X = X, Y = Y)
 
-  X_cup <- X_cup_from_X(X,J)
+  X_cup <- X_cup_from_X_fast(X,J)
 
 
 
@@ -235,6 +248,10 @@ emuFit <- function(Y,
 
   if (refit) {
 
+    if (verbose %in% c(TRUE, "development")) {
+      message("Estimating parameters")
+    }
+
     if (!is.null(fitted_model)) {
       B <- fitted_model$B
     }
@@ -251,7 +268,8 @@ emuFit <- function(Y,
                                max_step = control$max_step,
                                tolerance = tolerance,
                                verbose = (verbose == "development"),
-                               j_ref = j_ref)
+                               j_ref = j_ref,
+                               max_abs_B = max_abs_B)
       Y_test <- fitted_model$Y_augmented
       fitted_B <- fitted_model$B
       converged_estimates <- fitted_model$convergence
@@ -295,7 +313,7 @@ emuFit <- function(Y,
     } else {
       fitted_B <- B
       if (penalize) {
-        G <- get_G_for_augmentations(X, J, n, X_cup)
+        G <- get_G_for_augmentations_fast(X, J, n, X_cup)
         Y_test <- Y_augmented <- Y + 
           get_augmentations(X = X, G = G, Y = Y, B = fitted_B)
       } else {
@@ -305,6 +323,11 @@ emuFit <- function(Y,
     }
   }
 
+  max_est_B <- max(abs(fitted_B))
+  if (refit & (max_est_B >= 0.9 * max_abs_B)) {
+    warning("At least one estimated B value is within 10% of your `max_abs_B` boundary. We suggest that you rerun estimation with a larger `max_abs_B` value.")
+  }
+
   if (p == 1) {
     message("You are running an intercept-only model. In this model the intercept beta_0^j is an unidentifiable combination of intercept for category j and the detection efficiency of category j. Therefore this parameter is not interpretable.")
 

diff --git a/R/emuFit_check.R b/R/emuFit_check.R
@@ -251,7 +251,7 @@ ignoring argument 'cluster'.")
   # check that test_kj is not null if running score tests
   if (run_score_tests) {
     if (is.null(test_kj)) {
-      stop("When `run_score_tests = TRUE`, you must provide a matrix `test_kj` to determine which parameters you want to test. If you don't know which indices k correspond to the covariate(s) that you would like to test, run the function `radEmu::make_design_matrix()` in order to view the design matrix, and identify which column of the design matrix corresponds to each covariate in your model. If you don't know which indices j correspond to categories (taxa) that you want to test, you can look at the columns and column names of your `Y` matrix.")
+      stop("When `run_score_tests = TRUE`, you must provide a data frame `test_kj` to determine which parameters you want to test. If you don't know which indices k correspond to the covariate(s) that you would like to test, run the function `radEmu::make_design_matrix()` in order to view the design matrix, and identify which column of the design matrix corresponds to each covariate in your model. If you don't know which indices j correspond to categories (taxa) that you want to test, you can look at the columns and column names of your `Y` matrix.")
     }
   }
 
@@ -299,7 +299,7 @@ ignoring argument 'cluster'.")
 
     if (is.logical(all.equal(constraint_fn[[k]], pseudohuber_median))) {
       if (all.equal(constraint_fn[[k]], pseudohuber_median)) {
-        if (verbose %in% c(TRUE, "development")) message("Centering row ", k, " of B with pseudo-Huber smoothed median with smoothing parameter ", constraint_param, ".")
+        if (verbose %in% c("development")) message("Centering row ", k, " of B with pseudo-Huber smoothed median with smoothing parameter ", constraint_param, ".")
 
         stopifnot(!is.na(constraint_param))
 

diff --git a/R/emuFit_micro_penalized.R b/R/emuFit_micro_penalized.R
@@ -64,9 +64,9 @@ may take a moment."
       )
     }
     if (is.null(X_cup)) {
-      X_cup <- X_cup_from_X(X, J)
+      X_cup <- X_cup_from_X_fast(X, J)
     }
-    G <- get_G_for_augmentations(X, J, n, X_cup)
+    G <- get_G_for_augmentations_fast(X, J, n, X_cup)
 
     while (!converged) {
       # print(counter)

diff --git a/R/fit_null.R b/R/fit_null.R
@@ -101,7 +101,7 @@ fit_null <- function(B,
 
   #get X_cup for later use
   if (is.null(X_cup)) {
-    X_cup = X_cup_from_X(X,J)
+    X_cup = X_cup_from_X_fast(X,J)
   }
 
   #set iteration to zero

diff --git a/R/get_G_for_augmentations.R b/R/get_G_for_augmentations.R
@@ -22,3 +22,21 @@ get_G_for_augmentations <- function(X,
   G <- cbind(X_tilde_J,Z)
   return(G)
 }
+
+get_G_for_augmentations_fast <- function(X, J, n, X_cup) {
+  p <- ncol(X)
+  #calculate indices of columns to remove from X_cup (aka X_tilde, the expanded design matrix)
+  to_delete <- p*(J - 1) + 1:p #col indices for elements of B^J in long B format
+  last_class_rows <- seq(from = J, to = n * J, by = J)
+  X_cup[last_class_rows, ] <- 0
+
+  X_cup <- X_cup[, -to_delete]
+
+  #design matrix in z
+  Z <- Matrix::sparseMatrix(i = 1:(n*J),
+                            j = rep(1:n, each = J),
+                            x = 1,
+                            dims = c(n * J, n))
+  G <- cbind(X_cup,Z)
+  return(G)
+}
diff --git a/man/emuFit.Rd b/man/emuFit.Rd
diff --git a/tests/testthat/test-X_cup_from_X.R b/tests/testthat/test-X_cup_from_X.R
@@ -0,0 +1,13 @@
+test_that("X_cup_from_X_fast works and is faster than X_cup_from_X", {
+  X <- cbind(1, rnorm(100), runif(100))
+  J <- 200
+  start1 <- proc.time()
+  Xcup1 <- X_cup_from_X(X, J)
+  end1 <- proc.time() - start1
+  start2 <- proc.time()
+  Xcup2 <- X_cup_from_X_fast(X, J)
+  end2 <- proc.time() - start2
+
+  expect_true(all.equal(Xcup1, Xcup2))
+  expect_true(end2[3] < end1[3])
+})
diff --git a/tests/testthat/test-emuFit.R b/tests/testthat/test-emuFit.R
@@ -404,7 +404,7 @@ test_that("emuFit has 'score_test_hyperparams' object and throws warnings when c
   expect_false(fitted_model$estimation_converged)
 
   # check that warning is returned when estimation under the null doesn't converge
-  suppressWarnings({
+  suppressMessages({suppressWarnings({
     fitted_model2 <- emuFit(Y = Y,
                            X = cbind(X, rnorm(nrow(X))),
                            verbose = FALSE,
@@ -416,7 +416,7 @@ test_that("emuFit has 'score_test_hyperparams' object and throws warnings when c
                            test_kj = data.frame(k = 1, j = 1:2),
                            maxit_null = 5,
                            inner_maxit = 1)
-  })
+  })})
 
   # check that fitted model contains score_test_hyperparams object
   expect_true("score_test_hyperparams" %in% names(fitted_model2))

diff --git a/tests/testthat/test-fit_null.R b/tests/testthat/test-fit_null.R
@@ -1,4 +1,7 @@
 test_that("we get same null fit with different j_ref", {
+
+  skip("Deterministic, slow to run but passes when tested")
+
   set.seed(59542234)
   n <- 10
   J <- 5

diff --git a/tests/testthat/test-get_G_for_augmentations.R b/tests/testthat/test-get_G_for_augmentations.R
@@ -0,0 +1,17 @@
+test_that("get_G_for_augmentations_fast works and is faster than get_G_for_augmentations", {
+  X <- cbind(1, rnorm(200), runif(200))
+  J <- 200
+  Xcup2 <- X_cup_from_X_fast(X, J)
+
+  start1 <- proc.time()
+  g1 <- get_G_for_augmentations(X, J, nrow(X), Xcup2)
+  end1 <- proc.time() - start1
+
+  start2 <- proc.time()
+  g2 <- get_G_for_augmentations_fast(X, J, nrow(X), Xcup2)
+  end2 <- proc.time() - start2
+
+  expect_true(all.equal(g1, g2))
+  expect_true(end2[3] < end1[3])
+})
+