Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: radEmu
Title: Using Relative Abundance Data to Estimate of Multiplicative Differences in Mean Absolute Abundance
Version: 2.2.1.0
Version: 2.3.1.0
Authors@R: c(person("David", "Clausen", role = c("aut")),
person("Amy D", "Willis", email = "adwillis@uw.edu", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-2802-4317")),
person("Sarah", "Teichman", role = "aut"))
Expand Down
25 changes: 25 additions & 0 deletions R/X_cup_from_X.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,28 @@ X_cup_from_X <- function(X,J){

return(X_cup)
}

X_cup_from_X_fast <- function(X, J) {
n <- nrow(X)
p <- ncol(X)

# Total number of rows in final matrix
total_rows <- n * J

# Construct i index (row indices)
i_coords <- rep(1:total_rows, each = p)

# Construct j index (column indices)
j_block <- rep(0:(J - 1), each = p) * p
j_coords <- rep(j_block, times = n) + rep(1:p, times = total_rows)

# Construct values
X_rep <- X[rep(1:n, each = J), ]
x_vals <- as.vector(t(X_rep)) # row-wise unrolling

X_cup <- Matrix::sparseMatrix(i = i_coords,
j = j_coords,
x = x_vals)

return(X_cup)
}
57 changes: 40 additions & 17 deletions R/emuFit.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@
#' (corresponding to 95% confidence intervals)
#' @param return_wald_p logical: return p-values from Wald tests? Default is `FALSE`.
#' @param compute_cis logical: compute and return Wald CIs? Default is `TRUE`.
#' @param max_abs_B maximum allowed value for elements of B (in absolute value) in full model fitting. In
#' most cases this is not needed as Firth penalty will prevent infinite estimates
#' under separation. However, such a threshold may be helpful in very poorly conditioned problems (e.g., with many
#' nearly collinear regressors). Default is \code{250}.
#' @param run_score_tests logical: perform robust score testing? Default is TRUE.
#' @param test_kj a data frame whose rows give coordinates (in category j and
#' covariate k) of elements of B to construct hypothesis tests for. If you don't know
Expand Down Expand Up @@ -77,26 +81,34 @@
#' @param ... Additional arguments. Arguments matching the names of \code{control_fn()} options are forwarded to that function and override
#' defaults. Unknown arguments are ignored with a warning.
#'
#' @return A list containing elements 'coef', 'B', 'penalized', 'Y_augmented',
#' 'z_hat', 'I', 'Dy', and 'score_test_hyperparams' if score tests are run.
#' Parameter estimates by covariate and outcome category (e.g., taxon for microbiome data),
#' as well as optionally confidence intervals and p-values, are contained in 'coef'.
#' Any robust score statistics and score test p-values are also included in 'coef'.
#' If there are any zero-comparison parameters in the model, a column 'zero_comparison'
#' is also included, which is TRUE for any parameters that compare the level of a categorical
#' @return \code{emuFit} returns a list containing elements \code{coef}, \code{B}, \code{penalized}, \code{Y_augmented},
#' \code{z_hat}, \code{I}, \code{Dy}, and \code{score_test_hyperparams} if score tests are run.
#'
#' The \code{coef} table contains log fold-difference parameter estimates by covariate and outcome
#' category (e.g., taxon for microbiome data). A log fold-difference estimate of \code{1} for a treatment (versus control) and taxon X can be
#' interpreted to say that we expect taxon X is \code{exp(1) = 2.72} times more abundant in someone who has received the treatment compared to
#' someone who has received the control (holding all other covariates equal), when compared to typical fold-differences in abundances of
#' taxa in this analysis.
#'
#' \code{coef} also includes optionally-computed confidence intervals and robust Wald p-values.
#' Robust score statistics and score test p-values are also included in \code{coef}.
#' As explained in the associated manuscript, we recommend use of the robust score test values instead of the robust
#' Wald test p-values, due to better error rate control (i.e. fewer false positives).
#'
#' If there are any zero-comparison parameters in the model, a column "zero_comparison"
#' is also included, which is \code{TRUE} for any parameters that compare the level of a categorical
#' covariate to a reference level for a category with only zero counts for both the comparison
#' level and the reference level. This check is currently implemented for an arbitrary design matrix
#' generated using the \code{formula} and \code{data} arguments, and for a design matrix with no more
#' than one categorical covariate if the design matrix \code{X} is input directly.
#' 'B' contains parameter estimates in matrix format (rows indexing covariates and
#'
#' \code{B} contains parameter estimates in matrix format (rows indexing covariates and
#' columns indexing outcome category / taxon).
#' 'penalized' is equal to TRUE f Firth penalty is used in estimation (default) and FALSE otherwise.
#' 'z_hat' returns the nuisance parameters calculated in Equation 7 of the radEmu manuscript,
#' corresponding to either 'Y_augmented' or 'Y' if the 'penalized' is equal to TRUE
#' or FALSE, respectively.
#' I' and 'Dy' contain an information matrix and empirical score covariance matrix computed under the
#' \code{penalized} is \code{TRUE} if Firth penalty is used in estimation (default) and \code{FALSE} otherwise.
#' \code{z_hat} returns the nuisance parameters (sample-specific sequencing effects).
#' \code{I} and \code{Dy} contain an information matrix and empirical score covariance matrix computed under the
#' full model.
#' 'score_test_hyperparams' contains parameters and hyperparameters related to estimation under the null,
#' \code{score_test_hyperparams} contains parameters and hyperparameters related to estimation under the null,
#' including whether or not the algorithm converged, which can be helpful for debugging.
#'
#' @importFrom stats cov median model.matrix optim pchisq qnorm weighted.mean
Expand Down Expand Up @@ -146,6 +158,7 @@ emuFit <- function(Y,
alpha = 0.05,
return_wald_p = FALSE,
compute_cis = TRUE,
max_abs_B = 250,
run_score_tests = TRUE,
test_kj = NULL,
null_fit_alg = "constraint_sandwich",
Expand Down Expand Up @@ -222,7 +235,7 @@ emuFit <- function(Y,
# check for zero-comparison parameters
zero_comparison_res <- zero_comparison_check(X = X, Y = Y)

X_cup <- X_cup_from_X(X,J)
X_cup <- X_cup_from_X_fast(X,J)



Expand All @@ -235,6 +248,10 @@ emuFit <- function(Y,

if (refit) {

if (verbose %in% c(TRUE, "development")) {
message("Estimating parameters")
}

if (!is.null(fitted_model)) {
B <- fitted_model$B
}
Expand All @@ -251,7 +268,8 @@ emuFit <- function(Y,
max_step = control$max_step,
tolerance = tolerance,
verbose = (verbose == "development"),
j_ref = j_ref)
j_ref = j_ref,
max_abs_B = max_abs_B)
Y_test <- fitted_model$Y_augmented
fitted_B <- fitted_model$B
converged_estimates <- fitted_model$convergence
Expand Down Expand Up @@ -295,7 +313,7 @@ emuFit <- function(Y,
} else {
fitted_B <- B
if (penalize) {
G <- get_G_for_augmentations(X, J, n, X_cup)
G <- get_G_for_augmentations_fast(X, J, n, X_cup)
Y_test <- Y_augmented <- Y +
get_augmentations(X = X, G = G, Y = Y, B = fitted_B)
} else {
Expand All @@ -305,6 +323,11 @@ emuFit <- function(Y,
}
}

max_est_B <- max(abs(fitted_B))
if (refit & (max_est_B >= 0.9 * max_abs_B)) {
warning("At least one estimated B value is within 10% of your `max_abs_B` boundary. We suggest that you rerun estimation with a larger `max_abs_B` value.")
}

if (p == 1) {
message("You are running an intercept-only model. In this model the intercept beta_0^j is an unidentifiable combination of intercept for category j and the detection efficiency of category j. Therefore this parameter is not interpretable.")

Expand Down
4 changes: 2 additions & 2 deletions R/emuFit_check.R
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ ignoring argument 'cluster'.")
# check that test_kj is not null if running score tests
if (run_score_tests) {
if (is.null(test_kj)) {
stop("When `run_score_tests = TRUE`, you must provide a matrix `test_kj` to determine which parameters you want to test. If you don't know which indices k correspond to the covariate(s) that you would like to test, run the function `radEmu::make_design_matrix()` in order to view the design matrix, and identify which column of the design matrix corresponds to each covariate in your model. If you don't know which indices j correspond to categories (taxa) that you want to test, you can look at the columns and column names of your `Y` matrix.")
stop("When `run_score_tests = TRUE`, you must provide a data frame `test_kj` to determine which parameters you want to test. If you don't know which indices k correspond to the covariate(s) that you would like to test, run the function `radEmu::make_design_matrix()` in order to view the design matrix, and identify which column of the design matrix corresponds to each covariate in your model. If you don't know which indices j correspond to categories (taxa) that you want to test, you can look at the columns and column names of your `Y` matrix.")
}
}

Expand Down Expand Up @@ -299,7 +299,7 @@ ignoring argument 'cluster'.")

if (is.logical(all.equal(constraint_fn[[k]], pseudohuber_median))) {
if (all.equal(constraint_fn[[k]], pseudohuber_median)) {
if (verbose %in% c(TRUE, "development")) message("Centering row ", k, " of B with pseudo-Huber smoothed median with smoothing parameter ", constraint_param, ".")
if (verbose %in% c("development")) message("Centering row ", k, " of B with pseudo-Huber smoothed median with smoothing parameter ", constraint_param, ".")

stopifnot(!is.na(constraint_param))

Expand Down
4 changes: 2 additions & 2 deletions R/emuFit_micro_penalized.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ may take a moment."
)
}
if (is.null(X_cup)) {
X_cup <- X_cup_from_X(X, J)
X_cup <- X_cup_from_X_fast(X, J)
}
G <- get_G_for_augmentations(X, J, n, X_cup)
G <- get_G_for_augmentations_fast(X, J, n, X_cup)

while (!converged) {
# print(counter)
Expand Down
2 changes: 1 addition & 1 deletion R/fit_null.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ fit_null <- function(B,

#get X_cup for later use
if (is.null(X_cup)) {
X_cup = X_cup_from_X(X,J)
X_cup = X_cup_from_X_fast(X,J)
}

#set iteration to zero
Expand Down
18 changes: 18 additions & 0 deletions R/get_G_for_augmentations.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,21 @@ get_G_for_augmentations <- function(X,
G <- cbind(X_tilde_J,Z)
return(G)
}

get_G_for_augmentations_fast <- function(X, J, n, X_cup) {
p <- ncol(X)
#calculate indices of columns to remove from X_cup (aka X_tilde, the expanded design matrix)
to_delete <- p*(J - 1) + 1:p #col indices for elements of B^J in long B format
last_class_rows <- seq(from = J, to = n * J, by = J)
X_cup[last_class_rows, ] <- 0

X_cup <- X_cup[, -to_delete]

#design matrix in z
Z <- Matrix::sparseMatrix(i = 1:(n*J),
j = rep(1:n, each = J),
x = 1,
dims = c(n * J, n))
G <- cbind(X_cup,Z)
return(G)
}
42 changes: 28 additions & 14 deletions man/emuFit.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions tests/testthat/test-X_cup_from_X.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
test_that("X_cup_from_X_fast works and is faster than X_cup_from_X", {
X <- cbind(1, rnorm(100), runif(100))
J <- 200
start1 <- proc.time()
Xcup1 <- X_cup_from_X(X, J)
end1 <- proc.time() - start1
start2 <- proc.time()
Xcup2 <- X_cup_from_X_fast(X, J)
end2 <- proc.time() - start2

expect_true(all.equal(Xcup1, Xcup2))
expect_true(end2[3] < end1[3])
})
4 changes: 2 additions & 2 deletions tests/testthat/test-emuFit.R
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ test_that("emuFit has 'score_test_hyperparams' object and throws warnings when c
expect_false(fitted_model$estimation_converged)

# check that warning is returned when estimation under the null doesn't converge
suppressWarnings({
suppressMessages({suppressWarnings({
fitted_model2 <- emuFit(Y = Y,
X = cbind(X, rnorm(nrow(X))),
verbose = FALSE,
Expand All @@ -416,7 +416,7 @@ test_that("emuFit has 'score_test_hyperparams' object and throws warnings when c
test_kj = data.frame(k = 1, j = 1:2),
maxit_null = 5,
inner_maxit = 1)
})
})})

# check that fitted model contains score_test_hyperparams object
expect_true("score_test_hyperparams" %in% names(fitted_model2))
Expand Down
3 changes: 3 additions & 0 deletions tests/testthat/test-fit_null.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
test_that("we get same null fit with different j_ref", {

skip("Deterministic, slow to run but passes when tested")

set.seed(59542234)
n <- 10
J <- 5
Expand Down
17 changes: 17 additions & 0 deletions tests/testthat/test-get_G_for_augmentations.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
test_that("get_G_for_augmentations_fast works and is faster than get_G_for_augmentations", {
X <- cbind(1, rnorm(200), runif(200))
J <- 200
Xcup2 <- X_cup_from_X_fast(X, J)

start1 <- proc.time()
g1 <- get_G_for_augmentations(X, J, nrow(X), Xcup2)
end1 <- proc.time() - start1

start2 <- proc.time()
g2 <- get_G_for_augmentations_fast(X, J, nrow(X), Xcup2)
end2 <- proc.time() - start2

expect_true(all.equal(g1, g2))
expect_true(end2[3] < end1[3])
})

Loading
Loading