BigMindLab · DanielGarbozo · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -39,6 +39,8 @@ Suggests:
     ComplexHeatmap,
     dbscan,
     knitr,
+    rmarkdown,
+    DESeq2,
     ggnewscale,
     ggraph,
     ggrepel,
@@ -48,7 +50,6 @@ Suggests:
     openxlsx,
     pheatmap,
     readr,
-    rmarkdown,
     survival,
     survminer,
     tidygraph,
@@ -59,4 +60,6 @@ Roxygen: list(markdown = TRUE)
 Depends: 
     R (>= 3.5)
 LazyData: true
+LazyDataCompression: xz
 VignetteBuilder: knitr
+
diff --git a/NAMESPACE b/NAMESPACE
@@ -17,6 +17,7 @@ export(nice_KM)
 export(nice_PCA)
 export(nice_UMAP)
 export(nice_VSB)
+export(nice_VSB_DEseq2)
 export(nice_Volcano)
 export(nice_tSNE)
 export(plot_PA)

diff --git a/R/add_annotations.R b/R/add_annotations.R
@@ -8,6 +8,35 @@
 #' @param reference A reference table with the annotations including a column named "geneID".
 #' @param variables Character vector of columns in `reference` to add. If NULL (default), all columns except geneID are used.
 #' @param data_frame Logical; if TRUE, coerce `object` to a data.frame first. Default: FALSE.
+#'
+#' @examples
+#' \dontrun{
+#' data(norm_counts)
+#'
+#' # Requires a reference table with a "geneID" column.
+#' # Use get_annotations() to generate it:
+#' annotations <- get_annotations(
+#'   ensembl_ids = rownames(norm_counts),
+#'   mode        = "genes"
+#' )
+#'
+#' # Add gene symbol and biotype columns to the counts matrix
+#' norm_counts_annot <- add_annotations(
+#'   object    = norm_counts,
+#'   reference = annotations,
+#'   variables = c("symbol", "biotype")
+#' )
+#'
+#' # Inspect result
+#' head(norm_counts_annot[, c("geneID", "symbol", "biotype")])
+#'
+#' # Add all annotation columns (variables = NULL uses everything)
+#' norm_counts_full <- add_annotations(
+#'   object    = norm_counts,
+#'   reference = annotations
+#' )
+#' }
+#'
 #' @export
 
 add_annotations <- function(object, reference, variables = NULL, data_frame = FALSE){

diff --git a/R/deseq2_results.R b/R/deseq2_results.R
@@ -0,0 +1,267 @@
+######################
+#   deseq2_results   #
+######################
+
+#' DESeq2 differential expression results for TCGA-LUAD
+#'
+#' DESeq2 results from a differential expression analysis
+#' comparing primary lung adenocarcinoma tumors versus normal tissue using
+#' TCGA-LUAD RNA-seq data. Contains 21,330 genes, enough to produce
+#' informative visualizations with [nice_Volcano()] and related plotting
+#' functions. Also suitable as input for [detect_filter()] and
+#' [add_annotations()].
+#'
+#' @format A data frame with 21,330 rows and 7 columns:
+#'   \describe{
+#'     \item{gene_id}{Character. Ensembl gene ID (e.g., `"ENSG00000141510"`).}
+#'     \item{baseMean}{Numeric. Mean of normalized counts across all samples.}
+#'     \item{log2FoldChange}{Numeric. Shrunken log2 fold change
+#'       (tumor vs. normal).}
+#'     \item{lfcSE}{Numeric. Standard error of the log2 fold change estimate.}
+#'     \item{stat}{Numeric. Wald test statistic.}
+#'     \item{pvalue}{Numeric. Raw p-value.}
+#'     \item{padj}{Numeric. Benjamini-Hochberg adjusted p-value (FDR).}
+#'   }
+#'
+#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal
+#'   (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
+#'   DESeq2 analysis performed with default settings; results generated by
+#'   `data-raw/deseq2_results.R`.
+#'
+#' @examples
+#' data(deseq2_results)
+#'
+#' # Overview
+#' head(deseq2_results)
+#'
+#' # Significant genes
+#' sum(deseq2_results$padj < 0.05, na.rm = TRUE)
+#'
+#' # Volcano plot
+#' nice_Volcano(
+#'   results   = deseq2_results,
+#'   x_var     = "log2FoldChange",
+#'   y_var     = "padj",
+#'   label_var = "gene_id",
+#'   title     = "TCGA-LUAD: Tumor vs Normal"
+#' )
+#' \dontrun{
+#' # detect_filter() requires an "ensembl" column in the results table
+#' deseq2_res <- deseq2_results
+#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl"
+#' rownames(deseq2_res) <- deseq2_res$ensembl
+#'
+#' # Get sample IDs per group from sampledata
+#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"]
+#' samples_tumor  <- sampledata$patient_id[sampledata$sample_type == "tumor"]
+#'
+#' detected <- detect_filter(
+#'   norm.counts        = as.data.frame(norm_counts),
+#'   df.BvsA            = deseq2_res,
+#'   samples.baseline   = samples_normal,
+#'   samples.condition1 = samples_tumor,
+#'   cutoffs            = c(50, 50, 0)
+#' )
+#'
+#' # Number of detectable genes
+#' length(detected$DetectGenes)
+#'
+#' # Subset results to detectable genes
+#' head(detected$Comparison1)
+#' }
+#' @seealso [nice_Volcano()], [raw_counts], [sampledata]
+"deseq2_results"
+
+
+#####################
+# norm_counts data  #
+#####################
+
+#' Normalized counts matrix for TCGA-LUAD
+#'
+#' DESeq2 size-factor normalized counts derived from the TCGA-LUAD RNA-seq
+#' dataset (16 tumor samples, 16 normal samples). Counts are divided by
+#' DESeq2 size factors to correct for differences in library size across
+#' samples, but remain in counts scale (not log-transformed).
+#'
+#' Suitable as input for [nice_VSB()], [detect_filter()], and
+#' [add_annotations()]. For dimensionality reduction methods ([nice_PCA()],
+#' [nice_UMAP()], [nice_tSNE()]) use [vst_counts] instead, which removes the
+#' mean-variance dependence of RNA-seq data.
+#'
+#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples):
+#'   \describe{
+#'     \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).}
+#'     \item{columns}{Sample IDs matching the `patient_id` column of
+#'       [sampledata].}
+#'     \item{values}{Non-negative numeric. Size-factor normalized counts.
+#'       Range: \[0, 1,889,573\].}
+#'   }
+#'
+#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal
+#'   (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
+#'   Normalized with `DESeq2::counts()` (`normalized = TRUE`); generated by
+#'   `data-raw/deseq2_results.R`.
+#'
+#' @examples
+#' data(norm_counts)
+#' data(sampledata)
+#'
+#' # Dimensions
+#' dim(norm_counts)
+#'
+#' # Value range
+#' range(norm_counts)
+#'
+#' # Expression of a specific gene across samples
+#' norm_counts["ENSG00000141510", ]
+#'
+#' # Violin-Scatter-Box plot for one gene
+#' nice_VSB(
+#'   object      = norm_counts,
+#'   annotations = sampledata,
+#'   variables   = c(fill = "sample_type"),
+#'   genename    = "ENSG00000141510",
+#'   categories  = c("normal", "tumor"),
+#'   labels      = c("Normal", "Tumor"),
+#'   colors      = c("steelblue", "firebrick")
+#' )
+#'
+#' \dontrun{
+#' # detect_filter() requires an "ensembl" column in the results table
+#' deseq2_res <- deseq2_results
+#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl"
+#' rownames(deseq2_res) <- deseq2_res$ensembl
+#'
+#' # Get sample IDs per group from sampledata
+#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"]
+#' samples_tumor  <- sampledata$patient_id[sampledata$sample_type == "tumor"]
+#'
+#' detected <- detect_filter(
+#'   norm.counts        = as.data.frame(norm_counts),
+#'   df.BvsA            = deseq2_res,
+#'   samples.baseline   = samples_normal,
+#'   samples.condition1 = samples_tumor,
+#'   cutoffs            = c(50, 50, 0)
+#' )
+#'
+#' # Number of detectable genes
+#' length(detected$DetectGenes)
+#'
+#' # Subset results to detectable genes
+#' head(detected$Comparison1)
+#'
+#' # add_annotations(): add gene symbols
+#' # Requires a reference table with a "geneID" column plus annotations
+#' # Example using biomaRt to fetch gene symbols
+#' library(biomaRt)
+#' mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl")
+#' ref  <- getBM(
+#'   attributes = c("ensembl_gene_id", "hgnc_symbol", "gene_biotype"),
+#'   filters    = "ensembl_gene_id",
+#'   values     = rownames(norm_counts),
+#'   mart       = mart
+#' )
+#' colnames(ref)[1] <- "geneID"
+#'
+#' norm_counts_annot <- add_annotations(
+#'   object    = norm_counts,
+#'   reference = ref,
+#'   variables = c("hgnc_symbol", "gene_biotype")
+#' )
+#'
+#' head(norm_counts_annot[, c("geneID", "hgnc_symbol", "gene_biotype")])
+#' }
+#'
+#' @seealso [vst_counts], [deseq2_results], [sampledata], [nice_VSB()],
+#'   [detect_filter()], [add_annotations()]
+"norm_counts"
+
+
+####################
+# vst_counts data  #
+####################
+
+#' Variance-stabilized counts matrix for TCGA-LUAD
+#'
+#' Variance Stabilizing Transformation (VST) applied to the TCGA-LUAD RNA-seq
+#' dataset (16 tumor samples, 16 normal samples) using [DESeq2::vst()] with
+#' `blind = TRUE`. VST removes the mean-variance dependence characteristic of
+#' RNA-seq count data, placing all genes on a comparable log2-like scale. This
+#' makes it the appropriate input for sample-level dimensionality reduction and
+#' clustering methods.
+#'
+#' Suitable as input for [nice_PCA()], [nice_UMAP()], and [nice_tSNE()]. For
+#' gene-level expression plots ([nice_VSB()]) or filtering ([detect_filter()])
+#' use [norm_counts] instead.
+#'
+#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples):
+#'   \describe{
+#'     \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).}
+#'     \item{columns}{Sample IDs matching the `patient_id` column of
+#'       [sampledata].}
+#'     \item{values}{Numeric. VST-transformed expression values on a log2-like
+#'       scale. Range: \[1.78, 20.85\].}
+#'   }
+#'
+#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal
+#'   (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
+#'   Transformed with [DESeq2::vst()] (`blind = TRUE`); generated by
+#'   `data-raw/deseq2_results.R`.
+#'
+#' @examples
+#' data(vst_counts)
+#' data(sampledata)
+#'
+#' # Dimensions
+#' dim(vst_counts)
+#'
+#' # Value range (log2-like scale)
+#' range(vst_counts)
+#'
+#' # PCA plot colored by sample type
+#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
+#'
+#' nice_PCA(
+#'   object      = vst_counts,
+#'   annotations = sampledata,
+#'   variables   = c(fill = "sample_type"),
+#'   legend_names = c(fill = "Sample Type"),
+#'   colors      = c("steelblue", "firebrick"),
+#'   shapes      = c(21, 21),
+#'   title       = "TCGA-LUAD PCA"
+#' )
+#'
+#' \dontrun{
+#' # UMAP plot
+#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
+#'
+#' nice_UMAP(
+#'   object      = vst_counts,
+#'   annotations = sampledata,
+#'   variables   = c(fill = "sample_type"),
+#'   legend_names = c(fill = "Sample Type"),
+#'   colors      = c("steelblue", "firebrick"),
+#'   shapes      = c(21, 21),
+#'   title       = "TCGA-LUAD UMAP"
+#' )
+#'
+#' # tSNE plot
+#' # perplexity must be lower than the number of samples divided by 3
+#'
+#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
+#' nice_tSNE(
+#'   object       = vst_counts,
+#'   annotations  = sampledata,
+#'   perplexity   = 5,
+#'   variables    = c(fill = "sample_type"),
+#'   legend_names = c(fill = "Sample Type"),
+#'   colors       = c("steelblue", "firebrick"),
+#'   shapes       = c(21, 21),
+#'   title        = "TCGA-LUAD tSNE"
+#' )
+#' }
+#'
+#' @seealso [norm_counts], [deseq2_results], [sampledata], [nice_PCA()],
+#'   [nice_UMAP()], [nice_tSNE()]
+"vst_counts"