v1xerunt · Jun 12, 2025
diff --git a/.gitignore b/.gitignore
@@ -42,6 +42,7 @@ vignettes/*.pdf
 .Renviron.local
 
 docs/
+output/
 
 # translation temp files
 po/*~

diff --git a/R/Dataset_BaseDataset.R b/R/Dataset_BaseDataset.R
@@ -258,7 +258,10 @@ BaseDataset <- R6::R6Class(
     #' @return List of \code{Patient} objects.
     iter_patients = function(df = NULL) {
       ids <- self$unique_patient_ids()
-      ids <- head(ids, 100)
+      if (self$dev) {
+        message("[dev] Limiting to 100 patients for rapid prototyping")
+        ids <- head(ids, 100)
+      }
 
       progressr::handlers(global = TRUE)
       p <- progressr::progressor(steps = length(ids))

diff --git a/README.Rmd b/README.Rmd
@@ -56,7 +56,7 @@ If you use RHealth in your research, please cite our work:
 
 Install the development version from [GitHub](https://github.com/):
 
-```r
+```{r, eval=FALSE}
 # install.packages("pak")
 pak::pak("v1xerunt/RHealth")
 
@@ -67,7 +67,7 @@ devtools::install_github("v1xerunt/RHealth")
 
 Once installed, load the package to access its functionalities:
 
-```r
+```{r, eval=FALSE}
 library(RHealth)
 ```
 
@@ -83,13 +83,13 @@ This standalone module helps map medical codes between different systems.
 
 **Code Lookup:**
 
-```r
+```{r, eval=FALSE}
 lookup_code(code = "428.0", system = "ICD9CM")
 ```
 
 **Find Ancestors/Descendants:**
 
-```r
+```{r, eval=FALSE}
 # Get all parent codes
 get_ancestors(code = "428.22", system = "ICD9CM")
 
@@ -99,7 +99,7 @@ get_descendants(code = "428", system = "ICD9CM")
 
 **Cross-System Mapping:**
 
-```r
+```{r, eval=FALSE}
 # Map from ICD-9 to CCS
 map_code(code = "428.0", from = "ICD9CM", to = "CCSCM")
 ```
@@ -115,20 +115,27 @@ The **Dataset** module is the foundation of RHealth. It transforms raw, multi-ta
   * **Built-in Caching**: Uses DuckDB for CSV → Parquet caching, enabling up to 10x faster reloads.
   * **Dev Mode**: Allows for lightning-fast iteration by using a small subset of patients.
 
+You can download a sample dataset (MIMIC-IV Demo, version 2.2) directly from PhysioNet using the following link:
+
+👉 https://physionet.org/content/mimic-iv-demo/2.2/#files-panel
+
+
 **Quick Start:**
 
 Define a dataset from your source files using a YAML configuration.
 
-```r
+```{r, eval=FALSE}
 # The YAML config defines tables, patient IDs, timestamps, and attributes
 # See the full documentation for details on the YAML structure.
 
 # Load the dataset
-ds <- BaseDataset$new(
-  root         = "path/to/mimic4",
-  tables       = c("patients", "admissions", "labevents"),
+data_dir <- "/Users/yourname/datasets/mimiciv/"
+
+ds <- MIMIC4EHRDataset$new(
+  root = data_dir,
+  tables = c("patients", "admissions", "diagnoses_icd", "procedures_icd", "prescriptions"),
   dataset_name = "mimic4_ehr",
-  dev          = TRUE  # limit to 1,000 patients for speed
+  dev = TRUE
 )
 
 ds$stats()
@@ -146,7 +153,7 @@ A task is defined by subclassing `BaseTask` and implementing the `call()` method
 
 **Example Task Definition:**
 
-```r
+```{r, eval=FALSE}
 MyReadmissionTask <- R6::R6Class(
   "MyReadmissionTask",
   inherit = BaseTask,
@@ -178,10 +185,9 @@ MyReadmissionTask <- R6::R6Class(
 
 Once a task is defined, use it with your dataset to create a `SampleDataset` compatible with `{torch}`.
 
-```r
+```{r, eval=FALSE}
 task    <- Readmission30DaysMIMIC4$new() # A built-in task
 samples <- ds$set_task(task)
-loader  <- dataloader(samples, batch_size = 64, shuffle = TRUE)
 ```
 
 ### 🧠 4. Model Module
@@ -192,7 +198,7 @@ The **Model** module provides ready-to-use neural network architectures. All mod
 
 RHealth includes reference implementations like `RNN`, which can be instantiated in one line:
 
-```r
+```{r, eval=FALSE}
 model <- RNN(
   dataset       = samples, # The SampleDataset from set_task()
   embedding_dim = 128,
@@ -204,7 +210,7 @@ model <- RNN(
 
 You can easily write your own model by inheriting from `BaseModel`.
 
-```r
+```{r, eval=FALSE}
 MyDenseNet <- torch::nn_module(
   "MyDenseNet",
   inherit = BaseModel,
@@ -238,13 +244,15 @@ The **Trainer** module provides a high-level, configurable training loop that ha
 
 **Example Training Workflow:**
 
-```r
+```{r, eval=FALSE}
 # 1. Create data loaders
-train_loader <- dataloader(train_samples, batch_size = 32, shuffle = TRUE)
-val_loader   <- dataloader(val_samples,   batch_size = 64)
+splits <- split_by_patient(samples, c(0.8, 0.1, 0.1))
+train_dl <- get_dataloader(splits[[1]], batch_size = 32, shuffle = TRUE)
+val_dl <- get_dataloader(splits[[2]], batch_size = 32)
+test_dl <- get_dataloader(splits[[3]], batch_size = 32)
 
 # 2. Instantiate a model
-model <- RNN(train_samples, embedding_dim = 128, hidden_dim = 128)
+model <- RNN(train_dl, embedding_dim = 128, hidden_dim = 128)
 
 # 3. Set up the trainer
 trainer <- Trainer$new(
@@ -256,13 +264,11 @@ trainer <- Trainer$new(
 
 # 4. Start training
 trainer$train(
-    train_dataloader  = train_loader,
-    val_dataloader    = val_loader,
-    epochs            = 10,
-    learning_rate     = 1e-3,
-    weight_decay      = 1e-4,
-    max_grad_norm     = 5.0,
-    monitor           = "auroc" # Save the best model based on AUROC
+  train_dataloader = train_dl,
+  val_dataloader = val_dl,
+  epochs = 10,
+  optimizer_params = list(lr = 1e-3),
+  monitor = "roc_auc"
 )
 ```
 

diff --git a/README.md b/README.md
@@ -120,6 +120,11 @@ tensors that any downstream model can consume.
 - **Dev Mode**: Allows for lightning-fast iteration by using a small
   subset of patients.
 
+You can download a sample dataset (MIMIC-IV Demo, version 2.2) directly
+from PhysioNet using the following link:
+
+👉 <https://physionet.org/content/mimic-iv-demo/2.2/#files-panel>
+
 **Quick Start:**
 
 Define a dataset from your source files using a YAML configuration.
@@ -129,11 +134,13 @@ Define a dataset from your source files using a YAML configuration.
 # See the full documentation for details on the YAML structure.
 
 # Load the dataset
-ds <- BaseDataset$new(
-  root         = "path/to/mimic4",
-  tables       = c("patients", "admissions", "labevents"),
+data_dir <- "/Users/yourname/datasets/mimiciv/"
+
+ds <- MIMIC4EHRDataset$new(
+  root = data_dir,
+  tables = c("patients", "admissions", "diagnoses_icd", "procedures_icd", "prescriptions"),
   dataset_name = "mimic4_ehr",
-  dev          = TRUE  # limit to 1,000 patients for speed
+  dev = TRUE
 )
 
 ds$stats()
@@ -190,7 +197,6 @@ Once a task is defined, use it with your dataset to create a
 ``` r
 task    <- Readmission30DaysMIMIC4$new() # A built-in task
 samples <- ds$set_task(task)
-loader  <- dataloader(samples, batch_size = 64, shuffle = TRUE)
 ```
 
 ### 🧠 4. Model Module
@@ -253,11 +259,13 @@ that handles logging, checkpointing, evaluation, and progress bars.
 
 ``` r
 # 1. Create data loaders
-train_loader <- dataloader(train_samples, batch_size = 32, shuffle = TRUE)
-val_loader   <- dataloader(val_samples,   batch_size = 64)
+splits <- split_by_patient(samples, c(0.8, 0.1, 0.1))
+train_dl <- get_dataloader(splits[[1]], batch_size = 32, shuffle = TRUE)
+val_dl <- get_dataloader(splits[[2]], batch_size = 32)
+test_dl <- get_dataloader(splits[[3]], batch_size = 32)
 
 # 2. Instantiate a model
-model <- RNN(train_samples, embedding_dim = 128, hidden_dim = 128)
+model <- RNN(train_dl, embedding_dim = 128, hidden_dim = 128)
 
 # 3. Set up the trainer
 trainer <- Trainer$new(
@@ -269,13 +277,11 @@ trainer <- Trainer$new(
 
 # 4. Start training
 trainer$train(
-    train_dataloader  = train_loader,
-    val_dataloader    = val_loader,
-    epochs            = 10,
-    learning_rate     = 1e-3,
-    weight_decay      = 1e-4,
-    max_grad_norm     = 5.0,
-    monitor           = "auroc" # Save the best model based on AUROC
+  train_dataloader = train_dl,
+  val_dataloader = val_dl,
+  epochs = 10,
+  optimizer_params = list(lr = 1e-3),
+  monitor = "roc_auc"
 )
 ```
 

diff --git a/output/20250611-194012/best.ckpt b/output/20250611-194012/best.ckpt
diff --git a/output/20250611-194012/last.ckpt b/output/20250611-194012/last.ckpt
diff --git a/output/20250611-194012/train.log b/output/20250611-194012/train.log
@@ -0,0 +1,23 @@
+INFO [2025-06-11 19:40:12] Logger initialized at D:/OneDrive - University of Edinburgh/Code/RHealth/output/20250611-194012/train.log
+INFO [2025-06-11 19:40:12] Initialised model on cpu
+INFO [2025-06-11 19:40:18] Epoch 1/10 | train loss 0.6436
+INFO [2025-06-11 19:40:18] Val scores: pr_auc=0.1370, roc_auc=0.7500, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.5624
+INFO [2025-06-11 19:40:18] New best roc_auc: 0.7500
+INFO [2025-06-11 19:40:22] Epoch 2/10 | train loss 0.4882
+INFO [2025-06-11 19:40:22] Val scores: pr_auc=0.1074, roc_auc=0.6667, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.3911
+INFO [2025-06-11 19:40:26] Epoch 3/10 | train loss 0.2972
+INFO [2025-06-11 19:40:26] Val scores: pr_auc=0.0653, roc_auc=0.5833, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.1603
+INFO [2025-06-11 19:40:28] Epoch 4/10 | train loss 0.0688
+INFO [2025-06-11 19:40:28] Val scores: pr_auc=0.0653, roc_auc=0.5833, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.0031
+INFO [2025-06-11 19:40:31] Epoch 5/10 | train loss 0.0014
+INFO [2025-06-11 19:40:31] Val scores: pr_auc=0.0685, roc_auc=0.5417, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.0000
+INFO [2025-06-11 19:40:34] Epoch 6/10 | train loss 0.0000
+INFO [2025-06-11 19:40:34] Val scores: pr_auc=0.1000, roc_auc=0.6250, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.0000
+INFO [2025-06-11 19:40:38] Epoch 7/10 | train loss 0.0000
+INFO [2025-06-11 19:40:38] Val scores: pr_auc=0.0909, roc_auc=0.5833, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.0000
+INFO [2025-06-11 19:40:41] Epoch 8/10 | train loss 0.0000
+INFO [2025-06-11 19:40:41] Val scores: pr_auc=0.0909, roc_auc=0.5833, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.0000
+INFO [2025-06-11 19:40:44] Epoch 9/10 | train loss 0.0000
+INFO [2025-06-11 19:40:44] Val scores: pr_auc=0.0833, roc_auc=0.5417, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.0000
+INFO [2025-06-11 19:40:47] Epoch 10/10 | train loss 0.0000
+INFO [2025-06-11 19:40:47] Val scores: pr_auc=0.0833, roc_auc=0.5417, f1=0.1429, accuracy=0.0769, precision=1.0000, recall=0.0769, loss=0.0000
diff --git a/.gitignore b/.gitignore
@@ -42,6 +42,7 @@ vignettes/*.pdf
 .Renviron.local
 
 docs/
+output/
 
 # translation temp files
 po/*~