Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ Suggests:
ComplexHeatmap,
dbscan,
knitr,
rmarkdown,
DESeq2,
ggnewscale,
ggraph,
ggrepel,
Expand All @@ -48,7 +50,6 @@ Suggests:
openxlsx,
pheatmap,
readr,
rmarkdown,
survival,
survminer,
tidygraph,
Expand All @@ -59,4 +60,6 @@ Roxygen: list(markdown = TRUE)
Depends:
R (>= 3.5)
LazyData: true
LazyDataCompression: xz
VignetteBuilder: knitr

1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ export(nice_KM)
export(nice_PCA)
export(nice_UMAP)
export(nice_VSB)
export(nice_VSB_DEseq2)
export(nice_Volcano)
export(nice_tSNE)
export(plot_PA)
Expand Down
29 changes: 29 additions & 0 deletions R/add_annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,35 @@
#' @param reference A reference table with the annotations including a column named "geneID".
#' @param variables Character vector of columns in `reference` to add. If NULL (default), all columns except geneID are used.
#' @param data_frame Logical; if TRUE, coerce `object` to a data.frame first. Default: FALSE.
#'
#' @examples
#' \dontrun{
#' data(norm_counts)
#'
#' # Requires a reference table with a "geneID" column.
#' # Use get_annotations() to generate it:
#' annotations <- get_annotations(
#' ensembl_ids = rownames(norm_counts),
#' mode = "genes"
#' )
#'
#' # Add gene symbol and biotype columns to the counts matrix
#' norm_counts_annot <- add_annotations(
#' object = norm_counts,
#' reference = annotations,
#' variables = c("symbol", "biotype")
#' )
#'
#' # Inspect result
#' head(norm_counts_annot[, c("geneID", "symbol", "biotype")])
#'
#' # Add all annotation columns (variables = NULL uses everything)
#' norm_counts_full <- add_annotations(
#' object = norm_counts,
#' reference = annotations
#' )
#' }
#'
#' @export

add_annotations <- function(object, reference, variables = NULL, data_frame = FALSE){
Expand Down
267 changes: 267 additions & 0 deletions R/deseq2_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
######################
# deseq2_results #
######################

#' DESeq2 differential expression results for TCGA-LUAD
#'
#' DESeq2 results from a differential expression analysis
#' comparing primary lung adenocarcinoma tumors versus normal tissue using
#' TCGA-LUAD RNA-seq data. Contains 21,330 genes, enough to produce
#' informative visualizations with [nice_Volcano()]. Also suitable as input
#' for [detect_filter()], [add_annotations()], and related plotting functions.
#'
#' @format A data frame with 21,330 rows and 7 columns:
#' \describe{
#' \item{gene_id}{Character. Ensembl gene ID (e.g., `"ENSG00000141510"`).}
#' \item{baseMean}{Numeric. Mean of normalized counts across all samples.}
#' \item{log2FoldChange}{Numeric. Shrunken log2 fold change
#' (tumor vs. normal).}
#' \item{lfcSE}{Numeric. Standard error of the log2 fold change estimate.}
#' \item{stat}{Numeric. Wald test statistic.}
#' \item{pvalue}{Numeric. Raw p-value.}
#' \item{padj}{Numeric. Benjamini-Hochberg adjusted p-value (FDR).}
#' }
#'
#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal
#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
#' DESeq2 analysis performed with default settings; results generated by
#' `data-raw/deseq2_results.R`.
#'
#' @examples
#' data(deseq2_results)
#'
#' # Overview
#' head(deseq2_results)
#'
#' # Significant genes
#' sum(deseq2_results$padj < 0.05, na.rm = TRUE)
#'
#' # Volcano plot
#' nice_Volcano(
#' results = deseq2_results,
#' x_var = "log2FoldChange",
#' y_var = "padj",
#' label_var = "gene_id",
#' title = "TCGA-LUAD: Tumor vs Normal"
#' )
#' \dontrun{
#' # detect_filter() requires an "ensembl" column in the results table
#' deseq2_res <- deseq2_results
#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl"
#' rownames(deseq2_res) <- deseq2_res$ensembl
#'
#' # Get sample IDs per group from sampledata
#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"]
#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"]
#'
#' detected <- detect_filter(
#' norm.counts = as.data.frame(norm_counts),
#' df.BvsA = deseq2_res,
#' samples.baseline = samples_normal,
#' samples.condition1 = samples_tumor,
#' cutoffs = c(50, 50, 0)
#' )
#'
#' # Number of detectable genes
#' length(detected$DetectGenes)
#'
#' # Subset results to detectable genes
#' head(detected$Comparison1)
#' }
#' @seealso [nice_Volcano()], [raw_counts], [sampledata]
"deseq2_results"


#####################
# norm_counts data #
#####################

#' Normalized counts matrix for TCGA-LUAD
#'
#' DESeq2 size-factor normalized counts derived from the TCGA-LUAD RNA-seq
#' dataset (16 tumor samples, 16 normal samples). Counts are divided by
#' DESeq2 size factors to correct for differences in library size across
#' samples, but remain in counts scale (not log-transformed).
#'
#' Suitable as input for [nice_VSB()], [detect_filter()], and
#' [add_annotations()]. For dimensionality reduction methods ([nice_PCA()],
#' [nice_UMAP()], [nice_tSNE()]) use [vst_counts] instead, which removes the
#' mean-variance dependence of RNA-seq data.
#'
#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples):
#' \describe{
#' \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).}
#' \item{columns}{Sample IDs matching the `patient_id` column of
#' [sampledata].}
#' \item{values}{Non-negative numeric. Size-factor normalized counts.
#' Range: \[0, 1,889,573\].}
#' }
#'
#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal
#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
#' Normalized with `DESeq2::counts()` (`normalized = TRUE`); generated by
#' `data-raw/deseq2_results.R`.
#'
#' @examples
#' data(norm_counts)
#' data(sampledata)
#'
#' # Dimensions
#' dim(norm_counts)
#'
#' # Value range
#' range(norm_counts)
#'
#' # Expression of a specific gene across samples
#' norm_counts["ENSG00000141510", ]
#'
#' # Violin-Scatter-Box plot for one gene
#' nice_VSB(
#' object = norm_counts,
#' annotations = sampledata,
#' variables = c(fill = "sample_type"),
#' genename = "ENSG00000141510",
#' categories = c("normal", "tumor"),
#' labels = c("Normal", "Tumor"),
#' colors = c("steelblue", "firebrick")
#' )
#'
#' \dontrun{
#' # detect_filter() requires an "ensembl" column in the results table
#' deseq2_res <- deseq2_results
#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl"
#' rownames(deseq2_res) <- deseq2_res$ensembl
#'
#' # Get sample IDs per group from sampledata
#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"]
#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"]
#'
#' detected <- detect_filter(
#' norm.counts = as.data.frame(norm_counts),
#' df.BvsA = deseq2_res,
#' samples.baseline = samples_normal,
#' samples.condition1 = samples_tumor,
#' cutoffs = c(50, 50, 0)
#' )
#'
#' # Number of detectable genes
#' length(detected$DetectGenes)
#'
#' # Subset results to detectable genes
#' head(detected$Comparison1)
#'
#' # add_annotations: add gene symbols
#' # Required: reference df with geneID + annotation columns
#' # Example using biomaRt to fetch gene symbols
#' library(biomaRt)
#' mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl")
#' ref <- getBM(
#' attributes = c("ensembl_gene_id", "hgnc_symbol", "gene_biotype"),
#' filters = "ensembl_gene_id",
#' values = rownames(norm_counts),
#' mart = mart
#' )
#' colnames(ref)[1] <- "geneID"
#'
#' norm_counts_annot <- add_annotations(
#' object = norm_counts,
#' reference = ref,
#' variables = c("hgnc_symbol", "gene_biotype")
#' )
#'
#' head(norm_counts_annot[, c("geneID", "hgnc_symbol", "gene_biotype")])
#' }
#'
#' @seealso [vst_counts], [deseq2_results], [sampledata], [nice_VSB()],
#' [detect_filter()], [add_annotations()]
"norm_counts"


####################
# vst_counts data #
####################

#' Variance-stabilized counts matrix for TCGA-LUAD
#'
#' Variance Stabilizing Transformation (VST) applied to the TCGA-LUAD RNA-seq
#' dataset (16 tumor samples, 16 normal samples) using [DESeq2::vst()] with
#' `blind = TRUE`. VST removes the mean-variance dependence characteristic of
#' RNA-seq count data, placing all genes on a comparable log2-like scale. This
#' makes it the appropriate input for sample-level dimensionality reduction and
#' clustering methods.
#'
#' Suitable as input for [nice_PCA()], [nice_UMAP()], and [nice_tSNE()]. For
#' gene-level expression plots ([nice_VSB()]) or filtering ([detect_filter()])
#' use [norm_counts] instead.
#'
#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples):
#' \describe{
#' \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).}
#' \item{columns}{Sample IDs matching the `patient_id` column of
#' [sampledata].}
#' \item{values}{Numeric. VST-transformed expression values on a log2-like
#' scale. Range: \[1.78, 20.85\].}
#' }
#'
#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal
#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}).
#' Transformed with [DESeq2::vst()] (`blind = TRUE`); generated by
#' `data-raw/deseq2_results.R`.
#'
#' @examples
#' data(vst_counts)
#' data(sampledata)
#'
#' # Dimensions
#' dim(vst_counts)
#'
#' # Value range (log2-like scale)
#' range(vst_counts)
#'
#' # PCA plot colored by sample type
#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
#'
#' nice_PCA(
#' object = vst_counts,
#' annotations = sampledata,
#' variables = c(fill = "sample_type"),
#' legend_names = c(fill = "Sample Type"),
#' colors = c("steelblue", "firebrick"),
#' shapes = c(21, 21),
#' title = "TCGA-LUAD PCA"
#' )
#'
#' \dontrun{
#' # UMAP plot
#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
#'
#' nice_UMAP(
#' object = vst_counts,
#' annotations = sampledata,
#' variables = c(fill = "sample_type"),
#' legend_names = c(fill = "Sample Type"),
#' colors = c("steelblue", "firebrick"),
#' shapes = c(21, 21),
#' title = "TCGA-LUAD UMAP"
#' )
#'
#' # tSNE plot
#' # perplexity must be lower than the number of samples divided by 3
#'
#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id"
#' nice_tSNE(
#' object = vst_counts,
#' annotations = sampledata,
#' perplexity = 5,
#' variables = c(fill = "sample_type"),
#' legend_names = c(fill = "Sample Type"),
#' colors = c("steelblue", "firebrick"),
#' shapes = c(21, 21),
#' title = "TCGA-LUAD tSNE"
#' )
#' }
#'
#' @seealso [norm_counts], [deseq2_results], [sampledata], [nice_PCA()],
#' [nice_UMAP()], [nice_tSNE()]
"vst_counts"
Loading
Loading