diff --git a/DESCRIPTION b/DESCRIPTION index d00b63d..ce290a4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,6 +39,8 @@ Suggests: ComplexHeatmap, dbscan, knitr, + rmarkdown, + DESeq2, ggnewscale, ggraph, ggrepel, @@ -48,7 +50,6 @@ Suggests: openxlsx, pheatmap, readr, - rmarkdown, survival, survminer, tidygraph, @@ -59,4 +60,6 @@ Roxygen: list(markdown = TRUE) Depends: R (>= 3.5) LazyData: true +LazyDataCompression: xz VignetteBuilder: knitr + diff --git a/NAMESPACE b/NAMESPACE index 89065f0..9cd61e0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(nice_KM) export(nice_PCA) export(nice_UMAP) export(nice_VSB) +export(nice_VSB_DEseq2) export(nice_Volcano) export(nice_tSNE) export(plot_PA) diff --git a/R/add_annotations.R b/R/add_annotations.R index 7fb4985..aef3c96 100644 --- a/R/add_annotations.R +++ b/R/add_annotations.R @@ -8,6 +8,35 @@ #' @param reference A reference table with the annotations including a column named "geneID". #' @param variables Character vector of columns in `reference` to add. If NULL (default), all columns except geneID are used. #' @param data_frame Logical; if TRUE, coerce `object` to a data.frame first. Default: FALSE. +#' +#' @examples +#' \dontrun{ +#' data(norm_counts) +#' +#' # Requires a reference table with a "geneID" column. +#' # Use get_annotations() to generate it: +#' annotations <- get_annotations( +#' ensembl_ids = rownames(norm_counts), +#' mode = "genes" +#' ) +#' +#' # Add gene symbol and biotype columns to the counts matrix +#' norm_counts_annot <- add_annotations( +#' object = norm_counts, +#' reference = annotations, +#' variables = c("symbol", "biotype") +#' ) +#' +#' # Inspect result +#' head(norm_counts_annot[, c("geneID", "symbol", "biotype")]) +#' +#' # Add all annotation columns (variables = NULL uses everything) +#' norm_counts_full <- add_annotations( +#' object = norm_counts, +#' reference = annotations +#' ) +#' } +#' #' @export add_annotations <- function(object, reference, variables = NULL, data_frame = FALSE){ diff --git a/R/deseq2_results.R b/R/deseq2_results.R new file mode 100644 index 0000000..5c1f89c --- /dev/null +++ b/R/deseq2_results.R @@ -0,0 +1,267 @@ +###################### +# deseq2_results # +###################### + +#' DESeq2 differential expression results for TCGA-LUAD +#' +#' DESeq2 results from a differential expression analysis +#' comparing primary lung adenocarcinoma tumors versus normal tissue using +#' TCGA-LUAD RNA-seq data. Contains 21330 genes to produce informative +#' visualizations with [nice_Volcano()], and also suitable as input for +#' [detect_filter()], and [add_annotations()] +#' and related plotting functions. +#' +#' @format A data frame with 21,330 rows and 7 columns: +#' \describe{ +#' \item{gene_id}{Character. Ensembl gene ID (e.g., `"ENSG00000141510"`).} +#' \item{baseMean}{Numeric. Mean of normalized counts across all samples.} +#' \item{log2FoldChange}{Numeric. Shrunken log2 fold change +#' (tumor vs. normal).} +#' \item{lfcSE}{Numeric. Standard error of the log2 fold change estimate.} +#' \item{stat}{Numeric. Wald test statistic.} +#' \item{pvalue}{Numeric. Raw p-value.} +#' \item{padj}{Numeric. Benjamini-Hochberg adjusted p-value (FDR).} +#' } +#' +#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal +#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +#' DESeq2 analysis performed with default settings; results generated by +#' `data-raw/deseq2_results.R`. +#' +#' @examples +#' data(deseq2_results) +#' +#' # Overview +#' head(deseq2_results) +#' +#' # Significant genes +#' sum(deseq2_results$padj < 0.05, na.rm = TRUE) +#' +#' # Volcano plot +#' nice_Volcano( +#' results = deseq2_results, +#' x_var = "log2FoldChange", +#' y_var = "padj", +#' label_var = "gene_id", +#' title = "TCGA-LUAD: Tumor vs Normal" +#' ) +#' \dontrun{ +#' # detect_filter (required: "ensembl" column in results) +#' deseq2_res <- deseq2_results +#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +#' rownames(deseq2_res) <- deseq2_res$ensembl +#' +#' # Get sample IDs per group from sampledata +#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] +#' +#' detected <- detect_filter( +#' norm.counts = as.data.frame(norm_counts), +#' df.BvsA = deseq2_res, +#' samples.baseline = samples_normal, +#' samples.condition1 = samples_tumor, +#' cutoffs = c(50, 50, 0) +#' ) +#' +#' # Number of detectable genes +#' length(detected$DetectGenes) +#' +#' # Subset results to detectable genes +#' head(detected$Comparison1) +#' } +#' @seealso [nice_Volcano()], [raw_counts], [sampledata] +"deseq2_results" + + +##################### +# norm_counts data # +##################### + +#' Normalized counts matrix for TCGA-LUAD +#' +#' DESeq2 size-factor normalized counts derived from the TCGA-LUAD RNA-seq +#' dataset (16 tumor samples, 16 normal samples). Counts are divided by +#' DESeq2 size factors to correct for differences in library size across +#' samples, but remain in counts scale (not log-transformed). +#' +#' Suitable as input for [nice_VSB()], [detect_filter()], and +#' [add_annotations()]. For dimensionality reduction methods ([nice_PCA()], +#' [nice_UMAP()], [nice_tSNE()]) use [vst_counts] instead, which removes the +#' mean-variance dependence of RNA-seq data. +#' +#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +#' \describe{ +#' \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).} +#' \item{columns}{Sample IDs matching the `patient_id` column of +#' [sampledata].} +#' \item{values}{Non-negative numeric. Size-factor normalized counts. +#' Range: \[0, 1,889,573\].} +#' } +#' +#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal +#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +#' Normalized with DESeq2::counts() (`normalized = TRUE`); generated by +#' `data-raw/deseq2_results.R`. +#' +#' @examples +#' data(norm_counts) +#' data(sampledata) +#' +#' # Dimensions +#' dim(norm_counts) +#' +#' # Value range +#' range(norm_counts) +#' +#' # Expression of a specific gene across samples +#' norm_counts["ENSG00000141510", ] +#' +#' # Violin-Scatter-Box plot for one gene +#' nice_VSB( +#' object = norm_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' genename = "ENSG00000141510", +#' categories = c("normal", "tumor"), +#' labels = c("Normal", "Tumor"), +#' colors = c("steelblue", "firebrick") +#' ) +#' +#' \dontrun{ +#' # detect_filter: (required: "ensembl" column in results) +#' deseq2_res <- deseq2_results +#' colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +#' rownames(deseq2_res) <- deseq2_res$ensembl +#' +#' # Get sample IDs per group from sampledata +#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] +#' +#' detected <- detect_filter( +#' norm.counts = as.data.frame(norm_counts), +#' df.BvsA = deseq2_res, +#' samples.baseline = samples_normal, +#' samples.condition1 = samples_tumor, +#' cutoffs = c(50, 50, 0) +#' ) +#' +#' # Number of detectable genes +#' length(detected$DetectGenes) +#' +#' # Subset results to detectable genes +#' head(detected$Comparison1) +#' +#' # add_annotations: add gene symbols +#' # Required: reference df with geneID + annotation columns +#' # Example using biomaRt to fetch gene symbols +#' library(biomaRt) +#' mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl") +#' ref <- getBM( +#' attributes = c("ensembl_gene_id", "hgnc_symbol", "gene_biotype"), +#' filters = "ensembl_gene_id", +#' values = rownames(norm_counts), +#' mart = mart +#' ) +#' colnames(ref)[1] <- "geneID" +#' +#' norm_counts_annot <- add_annotations( +#' object = norm_counts, +#' reference = ref, +#' variables = c("hgnc_symbol", "gene_biotype") +#' ) +#' +#' head(norm_counts_annot[, c("geneID", "hgnc_symbol", "gene_biotype")]) +#' } +#' +#' @seealso [vst_counts], [deseq2_results], [sampledata], [nice_VSB()], +#' [detect_filter()], [add_annotations()] +"norm_counts" + + +#################### +# vst_counts data # +#################### + +#' Variance-stabilized counts matrix for TCGA-LUAD +#' +#' Variance Stabilizing Transformation (VST) applied to the TCGA-LUAD RNA-seq +#' dataset (16 tumor samples, 16 normal samples) using [DESeq2::vst()] with +#' `blind = TRUE`. VST removes the mean-variance dependence characteristic of +#' RNA-seq count data, placing all genes on a comparable log2-like scale. This +#' makes it the appropriate input for sample-level dimensionality reduction and +#' clustering methods. +#' +#' Suitable as input for [nice_PCA()], [nice_UMAP()], and [nice_tSNE()]. For +#' gene-level expression plots ([nice_VSB()]) or filtering ([detect_filter()]) +#' use [norm_counts] instead. +#' +#' @format A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +#' \describe{ +#' \item{rows}{Ensembl gene IDs (e.g., `"ENSG00000141510"`).} +#' \item{columns}{Sample IDs matching the `patient_id` column of +#' [sampledata].} +#' \item{values}{Numeric. VST-transformed expression values on a log2-like +#' scale. Range: \[1.78, 20.85\].} +#' } +#' +#' @source TCGA-LUAD STAR counts downloaded from the GDC Data Portal +#' (\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +#' Transformed with [DESeq2::vst()] (`blind = TRUE`); generated by +#' `data-raw/deseq2_results.R`. +#' +#' @examples +#' data(vst_counts) +#' data(sampledata) +#' +#' # Dimensions +#' dim(vst_counts) +#' +#' # Value range (log2-like scale) +#' range(vst_counts) +#' +#' # PCA plot colored by sample type +#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +# +#' nice_PCA( +#' object = vst_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD PCA" +#' ) +#' +#' \dontrun{ +#' # UMAP plot +#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +#' +#' nice_UMAP( +#' object = vst_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD UMAP" +#' ) +#' +#' # tSNE plot +#' # perplexity must be lower than the number of samples divided by 3 +#' +#' colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +#' nice_tSNE( +#' object = vst_counts, +#' annotations = sampledata, +#' perplexity = 5, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD tSNE" +#' ) +#' } +#' +#' @seealso [norm_counts], [deseq2_results], [sampledata], [nice_PCA()], +#' [nice_UMAP()], [nice_tSNE()] +"vst_counts" diff --git a/R/detect_filter.R b/R/detect_filter.R index f3f5c85..2699720 100644 --- a/R/detect_filter.R +++ b/R/detect_filter.R @@ -17,6 +17,37 @@ #' @param samples.condition2 Vector of Sample IDs or indexes corresponding to the second condition (optional). #' @param samples.condition3 Vector of Sample IDs or indexes corresponding to the third condition (optional). #' @param cutoffs Vector containing threshold values for baseMean, mean normalized counts and Log2 Fold Change; respectively. Default: c(50, 50, 0). +#' +#' @examples +#' \dontrun{ +#' data(norm_counts) +#' data(deseq2_results) +#' data(sampledata) +#' +#' # detect_filter requires an "ensembl" column in the results data frame +#' res <- deseq2_results +#' colnames(res)[colnames(res) == "gene_id"] <- "ensembl" +#' rownames(res) <- res$ensembl +#' +#' # Get sample IDs per group +#' samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +#' samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] +#' +#' detected <- detect_filter( +#' norm.counts = as.data.frame(norm_counts), +#' df.BvsA = res, +#' samples.baseline = samples_normal, +#' samples.condition1 = samples_tumor, +#' cutoffs = c(50, 50, 0) +#' ) +#' +#' # Number of detectable genes +#' length(detected$DetectGenes) +#' +#' # Subset results +#' head(detected$Comparison1) +#' } +#' #' @export @@ -28,27 +59,27 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, if (length(cutoffs) != 3) { stop("Cutoffs vector must contain three values: baseMean, mean normalized counts, and Log2 Fold Change thresholds.") } - + # Create an empty vector to store genes temporary genes_vector <- c() - + # Obtain mean normalized counts per phenotype norm.counts$Mean.Baseline <- rowMeans(norm.counts[, samples.baseline]) norm.counts$Mean.Condition1 <- rowMeans(norm.counts[, samples.condition1]) - + # Filter data frames by baseMean df.BvsA.det <- df.BvsA[df.BvsA$baseMean > cutoffs[1], ] - + if (!is.null(df.CvsA)) { norm.counts$Mean.Condition2 <- rowMeans(norm.counts[, samples.condition2]) df.CvsA.det <- df.CvsA[df.CvsA$baseMean > cutoffs[1], ] } - + if (!is.null(df.DvsA)) { norm.counts$Mean.Condition3 <- rowMeans(norm.counts[, samples.condition3]) df.DvsA.det <- df.DvsA[df.DvsA$baseMean > cutoffs[1], ] } - + # Get detectable genes for (i in 1:length(df.BvsA.det[, "ensembl"])) { if (df.BvsA.det[i, "log2FoldChange"] > cutoffs[3]) { @@ -61,7 +92,7 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, } } } - + if (!is.null(df.CvsA)) { for (i in 1:length(df.CvsA.det[, "ensembl"])) { if (df.CvsA.det[i, "log2FoldChange"] > cutoffs[3]) { @@ -75,7 +106,7 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, } } } - + if (!is.null(df.DvsA)) { for (i in 1:length(df.DvsA.det[, "ensembl"])) { if (df.DvsA.det[i, "log2FoldChange"] > cutoffs[3]) { @@ -89,23 +120,23 @@ detect_filter <- function(norm.counts, df.BvsA, df.CvsA = NULL, df.DvsA = NULL, } } } - + # Remove duplicates genes_vector <- unique(genes_vector) - + # Subset data frames to only include detectable genes df.BvsA.det <- df.BvsA.det[df.BvsA.det$ensembl %in% genes_vector, ] detected_genes <- list(Comparison1 = df.BvsA.det, DetectGenes = genes_vector) - + if (!is.null(df.CvsA)) { df.CvsA.det <- df.CvsA.det[df.CvsA.det$ensembl %in% genes_vector, ] detected_genes$Comparison2 <- df.CvsA.det } - + if (!is.null(df.DvsA)) { df.DvsA.det <- df.DvsA.det[df.DvsA.det$ensembl %in% genes_vector, ] detected_genes$Comparison3 <- df.DvsA.det } - + return(detected_genes) } diff --git a/R/get_annotations.R b/R/get_annotations.R index 6d89c0f..3b3dc50 100644 --- a/R/get_annotations.R +++ b/R/get_annotations.R @@ -18,6 +18,29 @@ #' @param format The output is saved in .csv or .xlsx formats. Default = csv. #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @examples +#' \dontrun{ +#' # Annotate genes from Normalized counts (requires internet connection) +#' data(norm_counts) +#' +#' # Requires a reference table with a "geneID" column. +#' # Use get_annotations() to generate it: +#' annotations <- get_annotations( +#' ensembl_ids = rownames(norm_counts), +#' mode = "genes" +#' ) +#' +#' head(annotations) +#' +#' # Use with add_annotations() +#' norm_counts_annot <- add_annotations( +#' object = norm_counts, +#' reference = annotations, +#' variables = c("symbol", "biotype") +#' ) +#' } +#' #' @export get_annotations <- function(ensembl_ids, mode = "genes", filename = "gene_annotations", version = "Current", format = "csv") { diff --git a/R/get_stars.R b/R/get_stars.R index a2a2298..117564b 100644 --- a/R/get_stars.R +++ b/R/get_stars.R @@ -9,6 +9,33 @@ #' @param geneID Ensembl ID of the gene of interest. #' @param object DESeq2 results object of a comparison. #' @param thresholds Vector with 4 values of significance. Default c(0.001, 0.01, 0.1, 0.25). +#' +#' @examples +#' data(deseq2_results) +#' +#' # get_stars expects a column named "ensembl" +#' res <- deseq2_results +#' colnames(res)[colnames(res) == "gene_id"] <- "ensembl" +#' +#' # Get significance stars for the most significant gene +#' get_stars( +#' geneID = res$ensembl[1], +#' object = res +#' ) +#' +#' # Custom thresholds +#' get_stars( +#' geneID = res$ensembl[1], +#' object = res, +#' thresholds = c(0.001, 0.01, 0.05, 0.10) +#' ) +#' +#' # Non-significant gene +#' get_stars( +#' geneID = res$ensembl[nrow(res)], +#' object = res +#' ) +#' #' @export get_stars <- function(geneID, object, thresholds = c(0.001, 0.01, 0.1, 0.25)) diff --git a/R/nice_PCA.R b/R/nice_PCA.R index a0e48d7..a79281a 100644 --- a/R/nice_PCA.R +++ b/R/nice_PCA.R @@ -36,6 +36,37 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @examples +#' data(vst_counts) +#' data(sampledata) +#' +#' # nice_PCA joins by a column named "id" in annotations +#' sampledata_pca <- sampledata +#' colnames(sampledata_pca)[colnames(sampledata_pca) == "patient_id"] <- "id" +#' +#' nice_PCA( +#' object = vst_counts, +#' annotations = sampledata_pca, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD PCA" +#' ) +#' +#' # Return PCA coordinates instead of plot +#' pca_data <- nice_PCA( +#' object = vst_counts, +#' annotations = sampledata_pca, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' returnData = TRUE +#' ) +#' head(pca_data) +#' #' @export nice_PCA <- function(object, annotations = NULL, PCs = c(1,2), ntop = NULL, @@ -51,7 +82,7 @@ nice_PCA <- function(object, annotations = NULL, PCs = c(1,2), ntop = NULL, object <- as.matrix(object) expr <- if (transform) log2(object + 0.001) else object - + if (scale) { expr <- expr[matrixStats::rowVars(expr) > 0, , drop = FALSE] } diff --git a/R/nice_UMAP.R b/R/nice_UMAP.R index 48bbb2f..be6e620 100644 --- a/R/nice_UMAP.R +++ b/R/nice_UMAP.R @@ -29,6 +29,29 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @examples +#' \dontrun{ +#' data(vst_counts) +#' data(sampledata) +#' +#' sampledata_u <- sampledata +#' colnames(sampledata_u)[colnames(sampledata_u) == "patient_id"] <- "id" +#' +#' nice_UMAP( +#' object = vst_counts, +#' annotations = sampledata_u, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD UMAP", +#' neighbors = 5, +#' epochs = 1000, +#' seed = 1905 +#' ) +#' } +#' #' @export nice_UMAP <- function(object, annotations = NULL, neighbors = 5, components = 2, epochs = 10000, seed = 0, diff --git a/R/nice_VSB.R b/R/nice_VSB.R index 97be5c6..6d0ee8e 100644 --- a/R/nice_VSB.R +++ b/R/nice_VSB.R @@ -2,12 +2,12 @@ # Function nice_VSB # ##################### -#' Function to make Violin-Scatter-Box plots. +#' Function to make Violin-Scatter-Box plots from data frames. #' #' This function will make a Boxplot, using a DEseq object. #' It will show the data points on top with a small deviation (jitter) for a better visualization. #' -#' @param object A DEseq object already transformed with the variance stabilizing or rlog transformations. +#' @param object A data frame object with normalized counts genes(in rows) across samples(in columns). #' @param annotations Data frame with annotations. #' @param variables To indicate the variables to be used as Shape and Fill of the markers. #' @param genename The gene name to be used for the plot. @@ -25,6 +25,23 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @examples +#' data(norm_counts) +#' data(sampledata) +#' +#' nice_VSB( +#' object = norm_counts, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' genename = rownames(norm_counts)[1], +#' categories = c("normal", "tumor"), +#' labels = c("Normal", "Tumor"), +#' colors = c("steelblue", "firebrick"), +#' shapes = 21, +#' markersize = 3 +#' ) +#' #' @export nice_VSB <- function (object = NULL, annotations, variables = c(fill = "VarFill", shape = "VarShape"), diff --git a/R/nice_VSB_DEseq2.R b/R/nice_VSB_DEseq2.R new file mode 100644 index 0000000..70592f3 --- /dev/null +++ b/R/nice_VSB_DEseq2.R @@ -0,0 +1,115 @@ +############################## +# Function nice_VSB_DEseq2.R # +############################# + +#' Function to make Box-Scatter-Violin plots from DEseq2 output directly. +#' +#' This function will make a Boxplot, using a DEseq object. +#' It will show the data points on top with a small deviation (jitter) for a better visualization. +#' +#' @param object A DEseq object already transformed with the variance stabilizing or rlog transformations. + +#' @param variables To indicate the variables to be used as Shape and Fill of the markers. +#' @param genename The gene name to be used for the plot. +#' @param symbol The gene symbol to be used for the plot. +#' @param labels A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M"). +#' @param categories A vector containing the labels for the legend. Default: c("normal", "primary", "recurrence", "metastasis"). +#' @param colors Vector of colors to be used for the categories of the variable assigned as Marker Fill. +#' @param shapes Vector of shapes to be used for the categories of the variable assigned as Marker Shape. +#' @param markersize Size of the marker. +#' @param alpha Transparency of the marker, which goes from 0 (transparent) to 1 (no transparent). Default: 0.8. +#' @param width Width of the plot. +#' @param height Height of the plot. +#' @param jitter Random deviation added to the dots. Default: 0.2. +#' @param dpi DPI of the plot. Default: 150. +#' @param save To save the plot. Default: FALSE. +#' @param title_size Font of the title and axis names. Default: c(axis = 20, fig = 24). +#' @param label_size Font of the labels (x-axis) and numbers (y-axis). Default: c(x = 20, y = 16). +#' @param legend_size Font of the title and elements of the legend. Default: c(title = 14, elements = 12). +#' @import ggplot2 + +#' @importFrom rlang .data +#' +#' @examples +#' \dontrun{ +#' # requires a DESeq2 object +#' +#' data(sampledata) +#' +#' nice_VSB_DEseq2( +#' object = vst, +#' annotations = sampledata, +#' variables = c(fill = "sample_type"), +#' genename = rownames(norm_counts)[1], +#' categories = c("normal", "tumor"), +#' labels = c("Normal", "Tumor"), +#' colors = c("steelblue", "firebrick"), +#' shapes = 21, +#' markersize = 3 +#' ) +#' } +#' @export + +nice_VSB_DEseq2 <- function (object = NULL, variables = c(fill = "VarFill", shape = "VarShape"), + genename = NULL, symbol = NULL, labels = c("N", "P", "R", "M"), + categories = c("normal", "primary", "recurrence", "metastasis"), + colors = NULL, shapes = NULL, markersize = NULL, alpha = 0.8, + width = NULL, height = NULL, jitter = 0.2, dpi = 150, save = FALSE, + title_size = c(axis = 20, fig = 24), label_size = c(x = 20, y = 16), + legend_size = c(title = 14, elements = 12)) { + + if (!requireNamespace("DESeq2", quietly = TRUE)) { + stop( + "Package \"DESeq2\" must be installed to use this function.", + call. = FALSE + ) + } + + # Extracting the vector of counts for that gene + gene_counts <- DESeq2::counts(object, normalized = TRUE)[genename, ] + log2_gc <- log2(gene_counts) + + # Making a dataframe for the plot + df.box <- data.frame(object@colData[, c("id", "sample_type", variables)], log2_gc) + + # Re-ordering sample_type for the plot + df.box[, "sample_type"] <- factor(df.box[, "sample_type"], + levels = categories, + labels = labels) + + # Plot + p.bs <- ggplot(df.box, aes(x = .data$sample_type, y = log2_gc)) + theme_bw() + + geom_violin(alpha = 0.1, scale = "width", fill = "yellow", color = "peru", + show.legend = FALSE, trim = TRUE) + + geom_boxplot(width = 0.6, fill = "gray90") + + labs(title = paste("Gene:", genename, symbol), + x = expression("Sample Type"), + y = expression("log"[2]*"(Normalized Gene Counts)")) + + theme(plot.title = element_text(size = title_size["fig"]), + axis.title = element_text(size = title_size["axis"]), + axis.text.x = element_text(size = label_size["x"]), + axis.text.y = element_text(size = label_size["y"]), + legend.title = element_text(size = legend_size["title"]), + legend.text=element_text(size = legend_size["elements"])) + + if (length(variables) == 1) { + p.bs <- p.bs + geom_point(data = df.box, aes_string(fill = variables["fill"]), shape = 21, + size = markersize, alpha = alpha, color = "black", + position = position_jitter(width = jitter)) + + } else if (length(variables) == 2) { + p.bs <- p.bs + geom_point(data = df.box, aes_string(fill = variables["fill"], shape = variables["shape"]), + size = markersize, alpha = alpha, color = "black", + position = position_jitter(width = jitter)) + + scale_shape_manual(name = variables["shape"], values = shapes) + + } else { return("Up to two variables allowed") } + + p.bs <- p.bs + scale_fill_manual(name = variables[1], values = colors, + guide = guide_legend(override.aes = aes(shape = 21, size = 7))) + + if (save == T) { + ggsave(paste0(symbol,".jpg"), plot = p.bs, width = width, height = height, dpi = dpi) + + } else { return(p.bs) } +} diff --git a/R/nice_Volcano.R b/R/nice_Volcano.R index cd8d4b6..6fc337b 100644 --- a/R/nice_Volcano.R +++ b/R/nice_Volcano.R @@ -25,6 +25,32 @@ #' @param genes Vector of genes to label in the plot. Default: NULL. #' @import ggplot2 #' @importFrom rlang .data +#' +#' @examples +#' data(deseq2_results) +#' +#' nice_Volcano( +#' results = deseq2_results, +#' x_var = "log2FoldChange", +#' y_var = "padj", +#' label_var = "gene_id", +#' title = "TCGA-LUAD: Tumor vs Normal", +#' cutoff_y = 0.05, +#' cutoff_x = 1, +#' x_range = 8, +#' y_max = 10 +#' ) +#' +#' # Highlight specific genes +#' nice_Volcano( +#' results = deseq2_results, +#' x_var = "log2FoldChange", +#' y_var = "padj", +#' label_var = "gene_id", +#' title = "TCGA-LUAD: Tumor vs Normal", +#' genes = deseq2_results$gene_id[1:5] +#' ) +#' #' @export nice_Volcano <- function(results, x_range = 9, y_max = 8, cutoff_y = 0.05, cutoff_x = 1, diff --git a/R/nice_tSNE.R b/R/nice_tSNE.R index a38d27f..451f606 100644 --- a/R/nice_tSNE.R +++ b/R/nice_tSNE.R @@ -28,6 +28,30 @@ #' @import ggplot2 #' @importFrom magrittr %>% #' @importFrom rlang .data +#' +#' @examples +#' \dontrun{ +#' data(vst_counts) +#' data(sampledata) +#' +#' sampledata_t <- sampledata +#' colnames(sampledata_t)[colnames(sampledata_t) == "patient_id"] <- "id" +#' +#' # perplexity must be < n_samples / 3; with 32 samples use perplexity = 5 +#' nice_tSNE( +#' object = vst_counts, +#' annotations = sampledata_t, +#' perplexity = 5, +#' max_iterations = 1000, +#' variables = c(fill = "sample_type"), +#' legend_names = c(fill = "Sample Type"), +#' colors = c("steelblue", "firebrick"), +#' shapes = c(21, 21), +#' title = "TCGA-LUAD tSNE", +#' seed = 1905 +#' ) +#' } +#' #' @export nice_tSNE <- function(object, annotations = NULL, perplexity = 3, max_iterations = 10000, seed = 0, diff --git a/R/power_analysis.R b/R/power_analysis.R index 33f033d..0342cc4 100644 --- a/R/power_analysis.R +++ b/R/power_analysis.R @@ -14,6 +14,28 @@ #' @param plot Logical. If TRUE, draws the power curve; if FALSE, skips plotting. #' @import ggplot2 #' @importFrom rlang .data +#' +#' @examples +#' # Basic power analysis with default parameters +#' result <- power_analysis( +#' effect_size = 1, +#' dispersion = 0.1, +#' n_genes = 20000, +#' prop_de = 0.05, +#' alpha = 0.05, +#' power_target = 0.8, +#' max_n = 20 +#' ) +#' +#' # Minimum sample size to reach 80% power +#' result$min_sample_size +#' +#' # Full power table +#' head(result$power_table) +#' +#' # Higher effect size requires fewer samples +#' power_analysis(effect_size = 2, dispersion = 0.1, plot = FALSE)$min_sample_size +#' #' @export power_analysis <- function( diff --git a/R/save_results.R b/R/save_results.R index e7dd63d..c8a32ef 100644 --- a/R/save_results.R +++ b/R/save_results.R @@ -15,6 +15,24 @@ #' @param l2fc The cut-off of Log2(Fold Change) for the over- and under-expressed tables. Default = 0. #' @param cutoff_alpha The cut-off of the False Discovery Rate (FDR o padj). Default = 0.25. #' @importFrom rlang .data +#' +#' @examples +#' \dontrun{ +#' data(deseq2_results) +#' +#' # Save full results + over/under-expressed tables as .xlsx files +#' save_results( +#' df = deseq2_results, +#' name = "TCGA_LUAD_TumorVsNormal", +#' l2fc = 1, +#' cutoff_alpha = 0.05 +#' ) +#' # Creates: +#' # TCGA_LUAD_TumorVsNormal_full.xlsx +#' # TCGA_LUAD_TumorVsNormal_Overexp.xlsx +#' # TCGA_LUAD_TumorVsNormal_Underexp.xlsx +#' } +#' #' @export save_results <- function(df, name, l2fc = 0, cutoff_alpha = 0.25){ @@ -33,16 +51,12 @@ save_results <- function(df, name, l2fc = 0, cutoff_alpha = 0.25){ file = paste0(name, "_full.xlsx"), overwrite = T) #Saving over-expressed genes: - #df.sig.fold_over <- subset(df, ((FDR < cutoff_alpha) & !is.na(FDR)) & - # log2FoldChange >= l2fc) df.sig.fold_over <- df[df$FDR < cutoff_alpha & !is.na(df$FDR) & df$log2FoldChange >= l2fc, ] openxlsx::write.xlsx(df.sig.fold_over, colNames = T, rowNames = F, append = F, file = paste0(name, "_Overexp.xlsx"), overwrite = T) #Saving under-expressed genes: - #df.sig.fold_under <- subset(df, ((FDR < cutoff_alpha) & !is.na(FDR)) & - # log2FoldChange <= -l2fc) df.sig.fold_under <- df[df$FDR < cutoff_alpha & !is.na(df$FDR) & df$log2FoldChange <= -l2fc, ] openxlsx::write.xlsx(df.sig.fold_under, colNames = T, rowNames = F, append = F, diff --git a/R/split_cases.R b/R/split_cases.R index 6e187e0..8df3996 100644 --- a/R/split_cases.R +++ b/R/split_cases.R @@ -8,7 +8,7 @@ #' including the baseline, there are 10 mutually exclusive cases where genes can #' fall into. This function allows us to obtain these 10 cases and saves them #' into a list. -#' +#' #' @param df.BvsA Data frame comparing the first condition to the baseline. #' @param df.CvsA Data frame comparing the second condition to the baseline. #' @param df.BvsC Data frame comparing the two conditions of the study. @@ -17,236 +17,276 @@ #' @param significance_cutoff Cut-off of the significance variable. #' @param change_var Variable that indicates the direction of the change (i.e. log2FoldChange in DESeq2, NES in GSEA). #' @param change_cutoff The values of the change variable will be filtered by |change_var| > change_cutoff. Default: 0. +#' +#' @examples +#' \dontrun{ +#' # split_cases requires three DESeq2 comparisons. +#' # Simulate a 3-phenotype study: Normal (A), Primary (B), Metastasis (C) +#' set.seed(174) +#' n_genes <- 500 +#' +#' make_res <- function(seed) { +#' set.seed(seed) +#' data.frame( +#' ensembl = paste0("ENSG", sprintf("%011d", seq_len(n_genes))), +#' log2FoldChange = rnorm(n_genes, 0, 1.5), +#' padj = runif(n_genes, 0, 0.5), +#' stringsAsFactors = FALSE +#' ) +#' } +#' +#' df_BvsA <- make_res(1) +#' df_CvsA <- make_res(2) +#' df_BvsC <- make_res(3) +#' +#' cases <- split_cases( +#' df.BvsA = df_BvsA, +#' df.CvsA = df_CvsA, +#' df.BvsC = df_BvsC, +#' unique_id = "ensembl", +#' significance_var = "padj", +#' significance_cutoff = 0.25, +#' change_var = "log2FoldChange", +#' change_cutoff = 0 +#' ) +#' +#' # Number of genes per case +#' sapply(cases, nrow) +#' +#' # Inspect Case 1 (ladder genes: significant in all 3 comparisons) +#' head(cases$Case1) +#' } +#' #' @export split_cases <- function (df.BvsA = NULL, df.CvsA = NULL, df.BvsC = NULL, unique_id = "ensembl", significance_var = "padj", significance_cutoff = 0.25, change_var = "log2FoldChange", change_cutoff = 0) - + { # Set row names as unique identifiers if (!all(rownames(df.BvsA) == df.BvsA[, unique_id])) { rownames(df.BvsA) <- df.BvsA[, unique_id] } - + if (!all(rownames(df.CvsA) == df.CvsA[, unique_id])) { rownames(df.CvsA) <- df.CvsA[, unique_id] } - + if (!all(rownames(df.BvsC) == df.BvsC[, unique_id])) { rownames(df.BvsC) <- df.BvsC[, unique_id] } - + # Create subsets by significance df.BvsA.sig <- df.BvsA[df.BvsA[, significance_var] < significance_cutoff & !is.na(df.BvsA[, significance_var]), ] df.CvsA.sig <- df.CvsA[df.CvsA[, significance_var] < significance_cutoff & !is.na(df.CvsA[, significance_var]), ] df.BvsC.sig <- df.BvsC[df.BvsC[, significance_var] < significance_cutoff & !is.na(df.BvsC[, significance_var]), ] - + df.BvsA.nsig <- df.BvsA[df.BvsA[, significance_var] >= significance_cutoff | is.na(df.BvsA[, significance_var]), ] df.CvsA.nsig <- df.CvsA[df.CvsA[, significance_var] >= significance_cutoff | is.na(df.CvsA[, significance_var]), ] df.BvsC.nsig <- df.BvsC[df.BvsC[, significance_var] >= significance_cutoff | is.na(df.BvsC[, significance_var]), ] - - + + # Defining cases - + ####################### Significant in all comparisons ####################### - + ############################# # Case 1: Ladders up/down ############################# # These genes would show a progressively increasing or decreasing expression, # particularly useful when comparing conditions over time, intensity or evolution - - + + case1.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case1.up <- as.data.frame(df.BvsA.sig[case1.up.genes, ]) case1.up$trend <- rep("up", nrow(case1.up)) - + case1.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case1.dn <- as.data.frame(df.BvsA.sig[case1.dn.genes, ]) case1.dn$trend <- rep("dn", nrow(case1.dn)) - + ################################## - # Case 2: Stronger in condition 1 + # Case 2: Stronger in condition 1 ################################## # Genes with stronger dysregulation in the first condition, which means that # they change their expression trend while still being significant - + case2.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case2.up <- as.data.frame(df.BvsA.sig[case2.up.genes, ]) case2.up$trend <- rep("up", nrow(case2.up)) - + case2.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case2.dn <- as.data.frame(df.BvsA.sig[case2.dn.genes, ]) case2.dn$trend <- rep("dn", nrow(case2.dn)) - + ################################## # Case 3: Stronger in condition 2 ################################## # Genes with stronger dysregulation in the second condition, which means that # they change their expression trend while still being significant - + case3.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case3.up <- as.data.frame(df.BvsA.sig[case3.up.genes, ]) case3.up$trend <- rep("up", nrow(case3.up)) - + case3.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case3.dn <- as.data.frame(df.BvsA.sig[case3.dn.genes, ]) case3.dn$trend <- rep("dn", nrow(case3.dn)) - - + + ##################### Significant in only two comparisons ##################### - + ########################################### # Case 4: Significant in data 2 and data 3 ########################################### # These genes would be consider markers of the second condition since they are # dysregulated compared to the baseline and to the first condition only - + case4.up.genes <- Reduce(intersect, list(df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id], df.BvsA.nsig[, unique_id])) - + case4.up <- as.data.frame(df.CvsA.sig[case4.up.genes, ]) case4.up$trend <- rep("up", nrow(case4.up)) - + case4.dn.genes <- Reduce(intersect, list(df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id], df.BvsA.nsig[, unique_id])) - + case4.dn <- as.data.frame(df.CvsA.sig[case4.dn.genes, ]) case4.dn$trend <- rep("dn", nrow(case4.dn)) - + ########################################### # Case 5: Significant in data 1 and data 3 ########################################### # These genes would be consider markers of the first condition since they are # dysregulated compared to the baseline and to the second condition only - + case5.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.nsig[, unique_id])) - + case5.up <- as.data.frame(df.BvsA.sig[case5.up.genes, ]) case5.up$trend <- rep("up", nrow(case5.up)) - + case5.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id], df.CvsA.nsig[, unique_id])) - + case5.dn <- as.data.frame(df.BvsA.sig[case5.dn.genes, ]) case5.dn$trend <- rep("dn", nrow(case5.dn)) - + ########################################### # Case 6: Significant in data 1 and data 2 ########################################### # According to the study's design, lets consider that genes dysregulated # in both conditions when compared to the baseline are the ones to focus on - + case6.up.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id], df.BvsC.nsig[, unique_id])) - + case6.up <- as.data.frame(df.BvsA.sig[case6.up.genes, ]) case6.up$trend <- rep("up", nrow(case6.up)) - + case6.dn.genes <- Reduce(intersect, list(df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id], df.BvsC.nsig[, unique_id])) - + case6.dn <- as.data.frame(df.BvsA.sig[case6.dn.genes, ]) case6.dn$trend <- rep("dn", nrow(case6.dn)) - - + + ################ Significant in only one comparison or neither ################ - + # None of these cases are insightful or provide relevant information for the # analysis performed - + ######################################## # Case 7: Significant in data 3 only ######################################## - + case7.up.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.CvsA.nsig[, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] > change_cutoff, unique_id])) - + case7.up <- as.data.frame(df.BvsC.sig[case7.up.genes, ]) case7.up$trend <- rep("up", nrow(case7.up)) - + case7.dn.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.CvsA.nsig[, unique_id], df.BvsC.sig[df.BvsC.sig[, change_var] < -change_cutoff, unique_id])) - + case7.dn <- as.data.frame(df.BvsC.sig[case7.dn.genes, ]) case7.dn$trend <- rep("dn", nrow(case7.dn)) - + ######################################## # Case 8: Significant in data 2 only ######################################## - + case8.up.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] > change_cutoff, unique_id])) - + case8.up <- as.data.frame(df.CvsA.sig[case8.up.genes, ]) case8.up$trend <- rep("up", nrow(case8.up)) - + case8.dn.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.CvsA.sig[df.CvsA.sig[, change_var] < -change_cutoff, unique_id])) - + case8.dn <- as.data.frame(df.CvsA.sig[case8.dn.genes, ]) case8.dn$trend <- rep("dn", nrow(case8.dn)) - + ######################################## # Case 9: Significant in data 1 only ######################################## - + case9.up.genes <- Reduce(intersect, list(df.CvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.BvsA.sig[df.BvsA.sig[, change_var] > change_cutoff, unique_id])) - + case9.up <- as.data.frame(df.BvsA.sig[case9.up.genes, ]) case9.up$trend <- rep("up", nrow(case9.up)) - + case9.dn.genes <- Reduce(intersect, list(df.CvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id], df.BvsA.sig[df.BvsA.sig[, change_var] < -change_cutoff, unique_id])) - + case9.dn <- as.data.frame(df.BvsA.sig[case9.dn.genes, ]) case9.dn$trend <- rep("dn", nrow(case9.dn)) - + ######################################## # Case 10: Significant in none ######################################## - + case10.genes <- Reduce(intersect, list(df.BvsA.nsig[, unique_id], df.CvsA.nsig[, unique_id], df.BvsC.nsig[, unique_id])) - + case10 <- as.data.frame(df.BvsA.nsig[case10.genes, ]) - + # Create data frames of results per cases case1 <- rbind(case1.up, case1.dn) case2 <- rbind(case2.up, case2.dn) @@ -257,8 +297,8 @@ split_cases <- function (df.BvsA = NULL, df.CvsA = NULL, df.BvsC = NULL, unique_ case7 <- rbind(case7.up, case7.dn) case8 <- rbind(case8.up, case8.dn) case9 <- rbind(case9.up, case9.dn) - + return(list(Case1 = case1, Case2 = case2, Case3 = case3, Case4 = case4, Case5 = case5, Case6 = case6, Case7 = case7, Case8 = case8, Case9 = case9, Case10 = case10)) - + } diff --git a/R/tpm.R b/R/tpm.R index 6a0a04b..b975dc3 100644 --- a/R/tpm.R +++ b/R/tpm.R @@ -12,6 +12,31 @@ #' #' @param raw_counts A table with the gene counts. #' @param gene_lengths A column with the gene lengths. +#' +#' @examples +#' \dontrun{ +#' data(raw_counts) +#' data(deseq2_results) +#' +#' # Gene lengths are needed, retrieve from get_annotations() or use +#' # pre-fetched lengths. Here we use the gene_length column if available. +#' annotations <- get_annotations( +#' ensembl_ids = rownames(raw_counts), +#' mode = "genes" +#' ) +#' +#' # Match gene lengths to raw_counts row order +#' gene_lengths <- annotations$gene_length[ +#' match(rownames(raw_counts), annotations$geneID) +#' ] +#' +#' # Calculate TPM +#' tpm_matrix <- tpm(raw_counts, gene_lengths) +#' +#' # Check: column sums should all be 1,000,000 +#' round(colSums(tpm_matrix)[1:3]) +#' } +#' #' @export tpm <- function(raw_counts, gene_lengths) { diff --git a/data-raw/deseq2_results.R b/data-raw/deseq2_results.R new file mode 100644 index 0000000..daa9840 --- /dev/null +++ b/data-raw/deseq2_results.R @@ -0,0 +1,97 @@ +## data-raw/deseq2_results.R +## Generates deseq2_results, , norm_counts, and vst_counts from TCGA-LUAD raw_counts and sampledata. +## Run once to regenerate the three .rda files in data/. +## Requires: DESeq2, dplyr, tibble + +# ============================================================================= +# DESeq2 pipeline - TCGA LUAD +# Comparison: Tumor vs Normal (sample_type column) +# Source: GDC Data Portal - TCGA-LUAD STAR Counts +# Outputs: +# data/deseq2_results.rda : DESeq2 results table (all genes, FDR filtered) +# data/norm_counts.rda : Normalized counts matrix (counts(dds, normalized=TRUE)) +# data/vst_counts.rda : VST-transformed matrix (assay(vst(dds))) +# ============================================================================= + +library(DESeq2) +library(dplyr) +library(tibble) + +data(raw_counts) +data(sampledata) + +# Inspect sample_type levels +cat("sample_type levels:\n") +print(table(sampledata$sample_type)) + +# Reference = "normal" (baseline), comparison = "tumor" + +sampledata$sample_type <- factor( + sampledata$sample_type, + levels = c("normal", "tumor") +) + +# Build DESeq2 object +dds <- DESeqDataSetFromMatrix( + countData = round(raw_counts), + colData = sampledata, + design = ~ sample_type +) + +# Filter: keep genes with >= 10 counts in at least 10 samples +keep <- rowSums(counts(dds) >= 10) >= 10 +dds <- dds[keep, ] +cat("Genes after filtering:", nrow(dds), "\n") + +# Run DESeq2 +dds <- DESeq(dds) + +# 1. deseq2 results +res <- results( + dds, + contrast = c("sample_type", "tumor", "normal"), + alpha = 0.05 +) + +deseq2_results <- as.data.frame(res) %>% + rownames_to_column("gene_id") %>% + filter(!is.na(padj), !is.na(log2FoldChange)) %>% + arrange(padj) + +cat("\ndeseq2_results:\n") +cat("Total genes with results:", nrow(deseq2_results), "\n") +cat("Significant (FDR < 0.05):", sum(deseq2_results$padj < 0.05), "\n") +cat(" Columns :", paste(colnames(deseq2_results), collapse = ", "), "\n") + +# 2. norm_counts +# Normalized counts: suitable for nice_VSB, detect_filter, add_annotations +# Counts are divided by DESeq2 size factors to correct for library size. +# Still in counts scale (not log-transformed). +norm_counts <- counts(dds, normalized = TRUE) + +cat("\nnorm_counts:\n") +cat(" Dimensions :", nrow(norm_counts), "genes x", ncol(norm_counts), "samples\n") +cat(" Value range : [", round(min(norm_counts), 1), ",", + round(max(norm_counts), 1), "]\n") + +# 3. vst_counts +# Variance Stabilizing Transformation: suitable for nice_PCA, nice_UMAP, nice_tSNE. +# VST removes the mean-variance dependence of RNA-seq counts, +# placing all genes on a comparable log2-like scale for dimensionality +# reduction and sample-level clustering. +vst_counts <- assay(vst(dds, blind = TRUE)) + +cat("\nvst_counts:\n") +cat(" Dimensions :", nrow(vst_counts), "genes x", ncol(vst_counts), "samples\n") +cat(" Value range : [", round(min(vst_counts), 2), ",", + round(max(vst_counts), 2), "]\n") + +# Save +usethis::use_data(deseq2_results, compress = "xz", overwrite = TRUE) +usethis::use_data(norm_counts, compress = "xz", overwrite = TRUE) +usethis::use_data(vst_counts, compress = "xz", overwrite = TRUE) + +message("\nDone. Saved:") +message(" data/deseq2_results.rda") +message(" data/norm_counts.rda") +message(" data/vst_counts.rda") diff --git a/data/deseq2_results.rda b/data/deseq2_results.rda new file mode 100644 index 0000000..6e5edbe Binary files /dev/null and b/data/deseq2_results.rda differ diff --git a/data/norm_counts.rda b/data/norm_counts.rda new file mode 100644 index 0000000..0d835f2 Binary files /dev/null and b/data/norm_counts.rda differ diff --git a/data/vst_counts.rda b/data/vst_counts.rda new file mode 100644 index 0000000..e1af716 Binary files /dev/null and b/data/vst_counts.rda differ diff --git a/man/add_annotations.Rd b/man/add_annotations.Rd index 4bfae0e..677a6cc 100644 --- a/man/add_annotations.Rd +++ b/man/add_annotations.Rd @@ -18,3 +18,32 @@ add_annotations(object, reference, variables = NULL, data_frame = FALSE) \description{ A function to add annotations to a table of gene counts. } +\examples{ +\dontrun{ +data(norm_counts) + +# Requires a reference table with a "geneID" column. +# Use get_annotations() to generate it: +annotations <- get_annotations( + ensembl_ids = rownames(norm_counts), + mode = "genes" +) + +# Add gene symbol and biotype columns to the counts matrix +norm_counts_annot <- add_annotations( + object = norm_counts, + reference = annotations, + variables = c("symbol", "biotype") +) + +# Inspect result +head(norm_counts_annot[, c("geneID", "symbol", "biotype")]) + +# Add all annotation columns (variables = NULL uses everything) +norm_counts_full <- add_annotations( + object = norm_counts, + reference = annotations +) +} + +} diff --git a/man/deseq2_results.Rd b/man/deseq2_results.Rd new file mode 100644 index 0000000..003b423 --- /dev/null +++ b/man/deseq2_results.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2_results.R +\docType{data} +\name{deseq2_results} +\alias{deseq2_results} +\title{DESeq2 differential expression results for TCGA-LUAD} +\format{ +A data frame with 21,330 rows and 7 columns: +\describe{ +\item{gene_id}{Character. Ensembl gene ID (e.g., \code{"ENSG00000141510"}).} +\item{baseMean}{Numeric. Mean of normalized counts across all samples.} +\item{log2FoldChange}{Numeric. Shrunken log2 fold change +(tumor vs. normal).} +\item{lfcSE}{Numeric. Standard error of the log2 fold change estimate.} +\item{stat}{Numeric. Wald test statistic.} +\item{pvalue}{Numeric. Raw p-value.} +\item{padj}{Numeric. Benjamini-Hochberg adjusted p-value (FDR).} +} +} +\source{ +TCGA-LUAD STAR counts downloaded from the GDC Data Portal +(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +DESeq2 analysis performed with default settings; results generated by +\code{data-raw/deseq2_results.R}. +} +\usage{ +deseq2_results +} +\description{ +DESeq2 results from a differential expression analysis +comparing primary lung adenocarcinoma tumors versus normal tissue using +TCGA-LUAD RNA-seq data. Contains 21330 genes to produce informative +visualizations with \code{\link[=nice_Volcano]{nice_Volcano()}}, and also suitable as input for +\code{\link[=detect_filter]{detect_filter()}}, and \code{\link[=add_annotations]{add_annotations()}} +and related plotting functions. +} +\examples{ +data(deseq2_results) + +# Overview +head(deseq2_results) + +# Significant genes +sum(deseq2_results$padj < 0.05, na.rm = TRUE) + +# Volcano plot +nice_Volcano( + results = deseq2_results, + x_var = "log2FoldChange", + y_var = "padj", + label_var = "gene_id", + title = "TCGA-LUAD: Tumor vs Normal" +) +\dontrun{ +# detect_filter (required: "ensembl" column in results) +deseq2_res <- deseq2_results +colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +rownames(deseq2_res) <- deseq2_res$ensembl + +# Get sample IDs per group from sampledata +samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] + +detected <- detect_filter( + norm.counts = as.data.frame(norm_counts), + df.BvsA = deseq2_res, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor, + cutoffs = c(50, 50, 0) +) + +# Number of detectable genes +length(detected$DetectGenes) + +# Subset results to detectable genes +head(detected$Comparison1) +} +} +\seealso{ +\code{\link[=nice_Volcano]{nice_Volcano()}}, \link{raw_counts}, \link{sampledata} +} +\keyword{datasets} diff --git a/man/detect_filter.Rd b/man/detect_filter.Rd index e7cce47..6bdd934 100644 --- a/man/detect_filter.Rd +++ b/man/detect_filter.Rd @@ -40,3 +40,34 @@ This function identifies genes with measurable expression levels across samples. Detectable genes must meet two conditions: the baseMean and their mean normalized counts in the phenotypes of interest must be greater than a set threshold. It returns a list of detectable genes and the comparisons in which they can be found. } +\examples{ +\dontrun{ +data(norm_counts) +data(deseq2_results) +data(sampledata) + +# detect_filter requires an "ensembl" column in the results data frame +res <- deseq2_results +colnames(res)[colnames(res) == "gene_id"] <- "ensembl" +rownames(res) <- res$ensembl + +# Get sample IDs per group +samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] + +detected <- detect_filter( + norm.counts = as.data.frame(norm_counts), + df.BvsA = res, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor, + cutoffs = c(50, 50, 0) +) + +# Number of detectable genes +length(detected$DetectGenes) + +# Subset results +head(detected$Comparison1) +} + +} diff --git a/man/get_annotations.Rd b/man/get_annotations.Rd index 9909682..4591f4b 100644 --- a/man/get_annotations.Rd +++ b/man/get_annotations.Rd @@ -34,3 +34,26 @@ The Gene information added include: \item Gene start, end and length } } +\examples{ +\dontrun{ +# Annotate genes from Normalized counts (requires internet connection) +data(norm_counts) + +# Requires a reference table with a "geneID" column. +# Use get_annotations() to generate it: +annotations <- get_annotations( + ensembl_ids = rownames(norm_counts), + mode = "genes" +) + +head(annotations) + +# Use with add_annotations() +norm_counts_annot <- add_annotations( + object = norm_counts, + reference = annotations, + variables = c("symbol", "biotype") +) +} + +} diff --git a/man/get_stars.Rd b/man/get_stars.Rd index a116fbb..3be68e0 100644 --- a/man/get_stars.Rd +++ b/man/get_stars.Rd @@ -16,3 +16,30 @@ get_stars(geneID, object, thresholds = c(0.001, 0.01, 0.1, 0.25)) \description{ This function will create asteriscs (*) from DESeq2 results objects to represent the significance of comparisons. } +\examples{ +data(deseq2_results) + +# get_stars expects a column named "ensembl" +res <- deseq2_results +colnames(res)[colnames(res) == "gene_id"] <- "ensembl" + +# Get significance stars for the most significant gene +get_stars( + geneID = res$ensembl[1], + object = res +) + +# Custom thresholds +get_stars( + geneID = res$ensembl[1], + object = res, + thresholds = c(0.001, 0.01, 0.05, 0.10) +) + +# Non-significant gene +get_stars( + geneID = res$ensembl[nrow(res)], + object = res +) + +} diff --git a/man/nice_PCA.Rd b/man/nice_PCA.Rd index a4beb55..939db01 100644 --- a/man/nice_PCA.Rd +++ b/man/nice_PCA.Rd @@ -83,3 +83,34 @@ But including some improvements made by David Requena. Now it allows: \item To provide the colors, shapes and fonts. } } +\examples{ +data(vst_counts) +data(sampledata) + +# nice_PCA joins by a column named "id" in annotations +sampledata_pca <- sampledata +colnames(sampledata_pca)[colnames(sampledata_pca) == "patient_id"] <- "id" + +nice_PCA( + object = vst_counts, + annotations = sampledata_pca, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD PCA" +) + +# Return PCA coordinates instead of plot +pca_data <- nice_PCA( + object = vst_counts, + annotations = sampledata_pca, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + returnData = TRUE +) +head(pca_data) + +} diff --git a/man/nice_UMAP.Rd b/man/nice_UMAP.Rd index 3f01428..a10fcec 100644 --- a/man/nice_UMAP.Rd +++ b/man/nice_UMAP.Rd @@ -77,3 +77,26 @@ nice_UMAP( \description{ Function to make UMAP plots. } +\examples{ +\dontrun{ +data(vst_counts) +data(sampledata) + +sampledata_u <- sampledata +colnames(sampledata_u)[colnames(sampledata_u) == "patient_id"] <- "id" + +nice_UMAP( + object = vst_counts, + annotations = sampledata_u, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD UMAP", + neighbors = 5, + epochs = 1000, + seed = 1905 +) +} + +} diff --git a/man/nice_VSB.Rd b/man/nice_VSB.Rd index c7ad3c4..f336f66 100644 --- a/man/nice_VSB.Rd +++ b/man/nice_VSB.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/nice_VSB.R \name{nice_VSB} \alias{nice_VSB} -\title{Function to make Violin-Scatter-Box plots.} +\title{Function to make Violin-Scatter-Box plots from data frames.} \usage{ nice_VSB( object = NULL, @@ -23,7 +23,7 @@ nice_VSB( ) } \arguments{ -\item{object}{A DEseq object already transformed with the variance stabilizing or rlog transformations.} +\item{object}{A data frame object with normalized counts genes(in rows) across samples(in columns).} \item{annotations}{Data frame with annotations.} @@ -57,3 +57,20 @@ nice_VSB( This function will make a Boxplot, using a DEseq object. It will show the data points on top with a small deviation (jitter) for a better visualization. } +\examples{ +data(norm_counts) +data(sampledata) + +nice_VSB( + object = norm_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + genename = rownames(norm_counts)[1], + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick"), + shapes = 21, + markersize = 3 +) + +} diff --git a/man/nice_VSB_DEseq2.Rd b/man/nice_VSB_DEseq2.Rd new file mode 100644 index 0000000..566e6f3 --- /dev/null +++ b/man/nice_VSB_DEseq2.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nice_VSB_DEseq2.R +\name{nice_VSB_DEseq2} +\alias{nice_VSB_DEseq2} +\title{Function to make Box-Scatter-Violin plots from DEseq2 output directly.} +\usage{ +nice_VSB_DEseq2( + object = NULL, + variables = c(fill = "VarFill", shape = "VarShape"), + genename = NULL, + symbol = NULL, + labels = c("N", "P", "R", "M"), + categories = c("normal", "primary", "recurrence", "metastasis"), + colors = NULL, + shapes = NULL, + markersize = NULL, + alpha = 0.8, + width = NULL, + height = NULL, + jitter = 0.2, + dpi = 150, + save = FALSE, + title_size = c(axis = 20, fig = 24), + label_size = c(x = 20, y = 16), + legend_size = c(title = 14, elements = 12) +) +} +\arguments{ +\item{object}{A DEseq object already transformed with the variance stabilizing or rlog transformations.} + +\item{variables}{To indicate the variables to be used as Shape and Fill of the markers.} + +\item{genename}{The gene name to be used for the plot.} + +\item{symbol}{The gene symbol to be used for the plot.} + +\item{labels}{A vector containing the x-labels of the box-plot. Default: c("N", "P", "R", "M").} + +\item{categories}{A vector containing the labels for the legend. Default: c("normal", "primary", "recurrence", "metastasis").} + +\item{colors}{Vector of colors to be used for the categories of the variable assigned as Marker Fill.} + +\item{shapes}{Vector of shapes to be used for the categories of the variable assigned as Marker Shape.} + +\item{markersize}{Size of the marker.} + +\item{alpha}{Transparency of the marker, which goes from 0 (transparent) to 1 (no transparent). Default: 0.8.} + +\item{width}{Width of the plot.} + +\item{height}{Height of the plot.} + +\item{jitter}{Random deviation added to the dots. Default: 0.2.} + +\item{dpi}{DPI of the plot. Default: 150.} + +\item{save}{To save the plot. Default: FALSE.} + +\item{title_size}{Font of the title and axis names. Default: c(axis = 20, fig = 24).} + +\item{label_size}{Font of the labels (x-axis) and numbers (y-axis). Default: c(x = 20, y = 16).} + +\item{legend_size}{Font of the title and elements of the legend. Default: c(title = 14, elements = 12).} +} +\description{ +This function will make a Boxplot, using a DEseq object. +It will show the data points on top with a small deviation (jitter) for a better visualization. +} +\examples{ +\dontrun{ +# requires a DESeq2 object + +data(sampledata) + +nice_VSB_DEseq2( + object = vst, + annotations = sampledata, + variables = c(fill = "sample_type"), + genename = rownames(norm_counts)[1], + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick"), + shapes = 21, + markersize = 3 +) +} +} diff --git a/man/nice_Volcano.Rd b/man/nice_Volcano.Rd index b40a3db..533e402 100644 --- a/man/nice_Volcano.Rd +++ b/man/nice_Volcano.Rd @@ -58,3 +58,29 @@ Volcano plot with configurable point shapes and threshold annotations: \item Vertical dashed lines at log-fold-change cutoffs, shown as custom x-axis ticks. } } +\examples{ +data(deseq2_results) + +nice_Volcano( + results = deseq2_results, + x_var = "log2FoldChange", + y_var = "padj", + label_var = "gene_id", + title = "TCGA-LUAD: Tumor vs Normal", + cutoff_y = 0.05, + cutoff_x = 1, + x_range = 8, + y_max = 10 +) + +# Highlight specific genes +nice_Volcano( + results = deseq2_results, + x_var = "log2FoldChange", + y_var = "padj", + label_var = "gene_id", + title = "TCGA-LUAD: Tumor vs Normal", + genes = deseq2_results$gene_id[1:5] +) + +} diff --git a/man/nice_tSNE.Rd b/man/nice_tSNE.Rd index 653913e..8442ce0 100644 --- a/man/nice_tSNE.Rd +++ b/man/nice_tSNE.Rd @@ -74,3 +74,27 @@ nice_tSNE( \description{ Function to make tSNE plots. } +\examples{ +\dontrun{ +data(vst_counts) +data(sampledata) + +sampledata_t <- sampledata +colnames(sampledata_t)[colnames(sampledata_t) == "patient_id"] <- "id" + +# perplexity must be < n_samples / 3; with 32 samples use perplexity = 5 +nice_tSNE( + object = vst_counts, + annotations = sampledata_t, + perplexity = 5, + max_iterations = 1000, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD tSNE", + seed = 1905 +) +} + +} diff --git a/man/norm_counts.Rd b/man/norm_counts.Rd new file mode 100644 index 0000000..be06c00 --- /dev/null +++ b/man/norm_counts.Rd @@ -0,0 +1,113 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2_results.R +\docType{data} +\name{norm_counts} +\alias{norm_counts} +\title{Normalized counts matrix for TCGA-LUAD} +\format{ +A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +\describe{ +\item{rows}{Ensembl gene IDs (e.g., \code{"ENSG00000141510"}).} +\item{columns}{Sample IDs matching the \code{patient_id} column of +\link{sampledata}.} +\item{values}{Non-negative numeric. Size-factor normalized counts. +Range: [0, 1,889,573].} +} +} +\source{ +TCGA-LUAD STAR counts downloaded from the GDC Data Portal +(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +Normalized with DESeq2::counts() (\code{normalized = TRUE}); generated by +\code{data-raw/deseq2_results.R}. +} +\usage{ +norm_counts +} +\description{ +DESeq2 size-factor normalized counts derived from the TCGA-LUAD RNA-seq +dataset (16 tumor samples, 16 normal samples). Counts are divided by +DESeq2 size factors to correct for differences in library size across +samples, but remain in counts scale (not log-transformed). +} +\details{ +Suitable as input for \code{\link[=nice_VSB]{nice_VSB()}}, \code{\link[=detect_filter]{detect_filter()}}, and +\code{\link[=add_annotations]{add_annotations()}}. For dimensionality reduction methods (\code{\link[=nice_PCA]{nice_PCA()}}, +\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}}) use \link{vst_counts} instead, which removes the +mean-variance dependence of RNA-seq data. +} +\examples{ +data(norm_counts) +data(sampledata) + +# Dimensions +dim(norm_counts) + +# Value range +range(norm_counts) + +# Expression of a specific gene across samples +norm_counts["ENSG00000141510", ] + +# Violin-Scatter-Box plot for one gene +nice_VSB( + object = norm_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + genename = "ENSG00000141510", + categories = c("normal", "tumor"), + labels = c("Normal", "Tumor"), + colors = c("steelblue", "firebrick") +) + +\dontrun{ +# detect_filter: (required: "ensembl" column in results) +deseq2_res <- deseq2_results +colnames(deseq2_res)[colnames(deseq2_res) == "gene_id"] <- "ensembl" +rownames(deseq2_res) <- deseq2_res$ensembl + +# Get sample IDs per group from sampledata +samples_normal <- sampledata$patient_id[sampledata$sample_type == "normal"] +samples_tumor <- sampledata$patient_id[sampledata$sample_type == "tumor"] + +detected <- detect_filter( + norm.counts = as.data.frame(norm_counts), + df.BvsA = deseq2_res, + samples.baseline = samples_normal, + samples.condition1 = samples_tumor, + cutoffs = c(50, 50, 0) +) + +# Number of detectable genes +length(detected$DetectGenes) + +# Subset results to detectable genes +head(detected$Comparison1) + +# add_annotations: add gene symbols +# Required: reference df with geneID + annotation columns +# Example using biomaRt to fetch gene symbols +library(biomaRt) +mart <- useEnsembl("ensembl", dataset = "hsapiens_gene_ensembl") +ref <- getBM( + attributes = c("ensembl_gene_id", "hgnc_symbol", "gene_biotype"), + filters = "ensembl_gene_id", + values = rownames(norm_counts), + mart = mart +) +colnames(ref)[1] <- "geneID" + +norm_counts_annot <- add_annotations( + object = norm_counts, + reference = ref, + variables = c("hgnc_symbol", "gene_biotype") +) + +head(norm_counts_annot[, c("geneID", "hgnc_symbol", "gene_biotype")]) +} + +} +\seealso{ +\link{vst_counts}, \link{deseq2_results}, \link{sampledata}, \code{\link[=nice_VSB]{nice_VSB()}}, +\code{\link[=detect_filter]{detect_filter()}}, \code{\link[=add_annotations]{add_annotations()}} +} +\keyword{datasets} diff --git a/man/power_analysis.Rd b/man/power_analysis.Rd index 3d054d0..9fd2c65 100644 --- a/man/power_analysis.Rd +++ b/man/power_analysis.Rd @@ -35,3 +35,25 @@ power_analysis( \description{ Power analysis for RNA-seq differential expression with optional plotting } +\examples{ +# Basic power analysis with default parameters +result <- power_analysis( + effect_size = 1, + dispersion = 0.1, + n_genes = 20000, + prop_de = 0.05, + alpha = 0.05, + power_target = 0.8, + max_n = 20 +) + +# Minimum sample size to reach 80\% power +result$min_sample_size + +# Full power table +head(result$power_table) + +# Higher effect size requires fewer samples +power_analysis(effect_size = 2, dispersion = 0.1, plot = FALSE)$min_sample_size + +} diff --git a/man/save_results.Rd b/man/save_results.Rd index 6ea21ed..b4c81f1 100644 --- a/man/save_results.Rd +++ b/man/save_results.Rd @@ -24,3 +24,21 @@ And will save 3 tables: \item A table including only the under-expressed genes } } +\examples{ +\dontrun{ +data(deseq2_results) + +# Save full results + over/under-expressed tables as .xlsx files +save_results( + df = deseq2_results, + name = "TCGA_LUAD_TumorVsNormal", + l2fc = 1, + cutoff_alpha = 0.05 +) +# Creates: +# TCGA_LUAD_TumorVsNormal_full.xlsx +# TCGA_LUAD_TumorVsNormal_Overexp.xlsx +# TCGA_LUAD_TumorVsNormal_Underexp.xlsx +} + +} diff --git a/man/split_cases.Rd b/man/split_cases.Rd index 450803d..524b649 100644 --- a/man/split_cases.Rd +++ b/man/split_cases.Rd @@ -38,3 +38,43 @@ including the baseline, there are 10 mutually exclusive cases where genes can fall into. This function allows us to obtain these 10 cases and saves them into a list. } +\examples{ +\dontrun{ +# split_cases requires three DESeq2 comparisons. +# Simulate a 3-phenotype study: Normal (A), Primary (B), Metastasis (C) +set.seed(174) +n_genes <- 500 + +make_res <- function(seed) { + set.seed(seed) + data.frame( + ensembl = paste0("ENSG", sprintf("\%011d", seq_len(n_genes))), + log2FoldChange = rnorm(n_genes, 0, 1.5), + padj = runif(n_genes, 0, 0.5), + stringsAsFactors = FALSE + ) +} + +df_BvsA <- make_res(1) +df_CvsA <- make_res(2) +df_BvsC <- make_res(3) + +cases <- split_cases( + df.BvsA = df_BvsA, + df.CvsA = df_CvsA, + df.BvsC = df_BvsC, + unique_id = "ensembl", + significance_var = "padj", + significance_cutoff = 0.25, + change_var = "log2FoldChange", + change_cutoff = 0 +) + +# Number of genes per case +sapply(cases, nrow) + +# Inspect Case 1 (ladder genes: significant in all 3 comparisons) +head(cases$Case1) +} + +} diff --git a/man/tpm.Rd b/man/tpm.Rd index f648a45..9572b48 100644 --- a/man/tpm.Rd +++ b/man/tpm.Rd @@ -20,3 +20,28 @@ The input table is numeric: The gene lengths are in a column of a dataframe with the same row order. } } +\examples{ +\dontrun{ +data(raw_counts) +data(deseq2_results) + +# Gene lengths are needed, retrieve from get_annotations() or use +# pre-fetched lengths. Here we use the gene_length column if available. +annotations <- get_annotations( + ensembl_ids = rownames(raw_counts), + mode = "genes" +) + +# Match gene lengths to raw_counts row order +gene_lengths <- annotations$gene_length[ + match(rownames(raw_counts), annotations$geneID) +] + +# Calculate TPM +tpm_matrix <- tpm(raw_counts, gene_lengths) + +# Check: column sums should all be 1,000,000 +round(colSums(tpm_matrix)[1:3]) +} + +} diff --git a/man/vst_counts.Rd b/man/vst_counts.Rd new file mode 100644 index 0000000..8811dc6 --- /dev/null +++ b/man/vst_counts.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deseq2_results.R +\docType{data} +\name{vst_counts} +\alias{vst_counts} +\title{Variance-stabilized counts matrix for TCGA-LUAD} +\format{ +A numeric matrix with 21,330 rows (genes) and 32 columns (samples): +\describe{ +\item{rows}{Ensembl gene IDs (e.g., \code{"ENSG00000141510"}).} +\item{columns}{Sample IDs matching the \code{patient_id} column of +\link{sampledata}.} +\item{values}{Numeric. VST-transformed expression values on a log2-like +scale. Range: [1.78, 20.85].} +} +} +\source{ +TCGA-LUAD STAR counts downloaded from the GDC Data Portal +(\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}). +Transformed with \code{\link[DESeq2:vst]{DESeq2::vst()}} (\code{blind = TRUE}); generated by +\code{data-raw/deseq2_results.R}. +} +\usage{ +vst_counts +} +\description{ +Variance Stabilizing Transformation (VST) applied to the TCGA-LUAD RNA-seq +dataset (16 tumor samples, 16 normal samples) using \code{\link[DESeq2:vst]{DESeq2::vst()}} with +\code{blind = TRUE}. VST removes the mean-variance dependence characteristic of +RNA-seq count data, placing all genes on a comparable log2-like scale. This +makes it the appropriate input for sample-level dimensionality reduction and +clustering methods. +} +\details{ +Suitable as input for \code{\link[=nice_PCA]{nice_PCA()}}, \code{\link[=nice_UMAP]{nice_UMAP()}}, and \code{\link[=nice_tSNE]{nice_tSNE()}}. For +gene-level expression plots (\code{\link[=nice_VSB]{nice_VSB()}}) or filtering (\code{\link[=detect_filter]{detect_filter()}}) +use \link{norm_counts} instead. +} +\examples{ +data(vst_counts) +data(sampledata) + +# Dimensions +dim(vst_counts) + +# Value range (log2-like scale) +range(vst_counts) + +# PCA plot colored by sample type +colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +nice_PCA( + object = vst_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD PCA" +) + +\dontrun{ +# UMAP plot +colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" + +nice_UMAP( + object = vst_counts, + annotations = sampledata, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD UMAP" +) + +# tSNE plot +# perplexity must be lower than the number of samples divided by 3 + +colnames(sampledata)[colnames(sampledata) == "patient_id"] <- "id" +nice_tSNE( + object = vst_counts, + annotations = sampledata, + perplexity = 5, + variables = c(fill = "sample_type"), + legend_names = c(fill = "Sample Type"), + colors = c("steelblue", "firebrick"), + shapes = c(21, 21), + title = "TCGA-LUAD tSNE" +) +} + +} +\seealso{ +\link{norm_counts}, \link{deseq2_results}, \link{sampledata}, \code{\link[=nice_PCA]{nice_PCA()}}, +\code{\link[=nice_UMAP]{nice_UMAP()}}, \code{\link[=nice_tSNE]{nice_tSNE()}} +} +\keyword{datasets}