From ac25e380481f61ab93d5c7ac4225297a3536ca40 Mon Sep 17 00:00:00 2001
From: Kaushik Srivatsan <kaushiksrivatsan2003@hotmail.com>
Date: Sat, 1 Mar 2025 19:30:09 +0530
Subject: [PATCH 1/2] Add device_map support to maestro model loaders:
 qwen_2.5, florence_2 & paligemma_2

---
 maestro/trainer/models/florence_2/checkpoints.py | 12 ++++++++----
 .../trainer/models/paligemma_2/checkpoints.py    | 16 +++++++++-------
 .../trainer/models/qwen_2_5_vl/checkpoints.py    | 15 ++++++++-------
 3 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/maestro/trainer/models/florence_2/checkpoints.py b/maestro/trainer/models/florence_2/checkpoints.py
index 0c93ba1c..9eaae01c 100644
--- a/maestro/trainer/models/florence_2/checkpoints.py
+++ b/maestro/trainer/models/florence_2/checkpoints.py
@@ -1,6 +1,6 @@
 import os
 from enum import Enum
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 from peft import LoraConfig, get_peft_model
@@ -23,7 +23,7 @@ class OptimizationStrategy(Enum):
 def load_model(
     model_id_or_path: str = DEFAULT_FLORENCE2_MODEL_ID,
     revision: str = DEFAULT_FLORENCE2_MODEL_REVISION,
-    device: str | torch.device = "auto",
+    device_map: Optional[str] = "auto",
     optimization_strategy: OptimizationStrategy = OptimizationStrategy.NONE,
     cache_dir: Optional[str] = None,
 ) -> tuple[AutoProcessor, AutoModelForCausalLM]:
@@ -32,7 +32,10 @@ def load_model(
     Args:
         model_id_or_path (str): The identifier or path of the Florence 2 model to load.
         revision (str): The specific model revision to use.
-        device (torch.device): The device to load the model onto.
+        device_map (Optional[Union[str, dict]]): Device map for the model:        
+            -"auto": Places model on single available device (default)
+            - String like "cpu", "cuda:0", or "mps" for a specific device
+            - Note: Florence-2 doesn't support dict mapping
         optimization_strategy (OptimizationStrategy): The optimization strategy to apply to the model.
         cache_dir (Optional[str]): Directory to cache the downloaded model files.
 
@@ -43,7 +46,8 @@ def load_model(
     Raises:
         ValueError: If the model or processor cannot be loaded.
     """
-    device = parse_device_spec(device)
+
+    device = parse_device_spec(device_map) 
     processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
 
     if optimization_strategy == OptimizationStrategy.LORA:
diff --git a/maestro/trainer/models/paligemma_2/checkpoints.py b/maestro/trainer/models/paligemma_2/checkpoints.py
index e7582c2d..fe2fe654 100644
--- a/maestro/trainer/models/paligemma_2/checkpoints.py
+++ b/maestro/trainer/models/paligemma_2/checkpoints.py
@@ -1,6 +1,6 @@
 import os
 from enum import Enum
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 from peft import LoraConfig, get_peft_model
@@ -24,7 +24,7 @@ class OptimizationStrategy(Enum):
 def load_model(
     model_id_or_path: str = DEFAULT_PALIGEMMA2_MODEL_ID,
     revision: str = DEFAULT_PALIGEMMA2_MODEL_REVISION,
-    device: str | torch.device = "auto",
+    device_map: Optional[Union[str, dict]] = None,
     optimization_strategy: OptimizationStrategy = OptimizationStrategy.NONE,
     cache_dir: Optional[str] = None,
 ) -> tuple[PaliGemmaProcessor, PaliGemmaForConditionalGeneration]:
@@ -33,7 +33,10 @@ def load_model(
     Args:
         model_id_or_path (str): The identifier or path of the model to load.
         revision (str): The specific model revision to use.
-        device (torch.device): The device to load the model onto.
+        device_map (Optional[Union[str, dict]]): Device map for the model:
+            - None: Uses "auto" for automatic distribution across available devices (default)
+            - String like "cpu", "cuda:0", or "mps" for a specific device
+            - Dict for custom module-to-device mapping (e.g., {"": "cuda:0"})
         optimization_strategy (OptimizationStrategy): The optimization strategy to apply to the model.
         cache_dir (Optional[str]): Directory to cache the downloaded model files.
 
@@ -44,7 +47,6 @@ def load_model(
     Raises:
         ValueError: If the model or processor cannot be loaded.
     """
-    device = parse_device_spec(device)
     processor = PaliGemmaProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
 
     if optimization_strategy in {OptimizationStrategy.LORA, OptimizationStrategy.QLORA}:
@@ -66,7 +68,7 @@ def load_model(
             pretrained_model_name_or_path=model_id_or_path,
             revision=revision,
             trust_remote_code=True,
-            device_map="auto",
+            device_map=device_map if device_map else "auto",
             quantization_config=bnb_config,
             torch_dtype=torch.bfloat16,
             cache_dir=cache_dir,
@@ -78,9 +80,9 @@ def load_model(
             pretrained_model_name_or_path=model_id_or_path,
             revision=revision,
             trust_remote_code=True,
-            device_map="auto",
+            device_map=device_map if device_map else "auto",
             cache_dir=cache_dir,
-        ).to(device)
+        )
 
         if optimization_strategy == OptimizationStrategy.FREEZE:
             for param in model.vision_tower.parameters():
diff --git a/maestro/trainer/models/qwen_2_5_vl/checkpoints.py b/maestro/trainer/models/qwen_2_5_vl/checkpoints.py
index 359d2a25..3045161e 100644
--- a/maestro/trainer/models/qwen_2_5_vl/checkpoints.py
+++ b/maestro/trainer/models/qwen_2_5_vl/checkpoints.py
@@ -1,6 +1,6 @@
 import os
 from enum import Enum
-from typing import Optional
+from typing import Optional, Union
 
 import torch
 from peft import LoraConfig, get_peft_model
@@ -23,7 +23,7 @@ class OptimizationStrategy(Enum):
 def load_model(
     model_id_or_path: str = DEFAULT_QWEN2_5_VL_MODEL_ID,
     revision: str = DEFAULT_QWEN2_5_VL_MODEL_REVISION,
-    device: str | torch.device = "auto",
+    device_map: Optional[Union[str, dict]] = None,
     optimization_strategy: OptimizationStrategy = OptimizationStrategy.NONE,
     cache_dir: Optional[str] = None,
     min_pixels: int = 256 * 28 * 28,
@@ -35,7 +35,10 @@ def load_model(
     Args:
         model_id_or_path (str): The model name or path.
         revision (str): The model revision to load.
-        device (str | torch.device): The device to load the model onto.
+        device_map (Optional[Union[str, dict]]): Device map for the model:
+            - None: Uses "auto" for automatic distribution across available devices (default)
+            - String like "cpu", "cuda:0", or "mps" for a specific device
+            - Dict for custom module-to-device mapping (e.g., {"": "cuda:0"})
         optimization_strategy (OptimizationStrategy): LORA, QLORA, or NONE.
         cache_dir (Optional[str]): Directory to cache downloaded model files.
         min_pixels (int): Minimum number of pixels allowed in the resized image.
@@ -45,7 +48,6 @@ def load_model(
         (Qwen2_5_VLProcessor, Qwen2_5_VLForConditionalGeneration):
             A tuple containing the loaded processor and model.
     """
-    device = parse_device_spec(device)
     processor = Qwen2_5_VLProcessor.from_pretrained(
         model_id_or_path,
         revision=revision,
@@ -82,7 +84,7 @@ def load_model(
             model_id_or_path,
             revision=revision,
             trust_remote_code=True,
-            device_map="auto",
+            device_map=device_map if device_map else "auto",
             quantization_config=bnb_config,
             torch_dtype=torch.bfloat16,
             cache_dir=cache_dir,
@@ -94,11 +96,10 @@ def load_model(
             model_id_or_path,
             revision=revision,
             trust_remote_code=True,
-            device_map="auto",
+            device_map=device_map if device_map else "auto",
             torch_dtype=torch.bfloat16,
             cache_dir=cache_dir,
         )
-        model.to(device)
 
     return processor, model
 

From 1a42148baa87597b04e3345022abe58d435bf222 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 1 Mar 2025 15:00:21 +0000
Subject: [PATCH 2/2] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?=
 =?UTF-8?q?format=20pre-commit=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 maestro/trainer/models/florence_2/checkpoints.py  | 7 +++----
 maestro/trainer/models/paligemma_2/checkpoints.py | 2 --
 maestro/trainer/models/qwen_2_5_vl/checkpoints.py | 2 --
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/maestro/trainer/models/florence_2/checkpoints.py b/maestro/trainer/models/florence_2/checkpoints.py
index 9eaae01c..39e19d46 100644
--- a/maestro/trainer/models/florence_2/checkpoints.py
+++ b/maestro/trainer/models/florence_2/checkpoints.py
@@ -1,8 +1,7 @@
 import os
 from enum import Enum
-from typing import Optional, Union
+from typing import Optional
 
-import torch
 from peft import LoraConfig, get_peft_model
 from transformers import AutoModelForCausalLM, AutoProcessor
 
@@ -32,7 +31,7 @@ def load_model(
     Args:
         model_id_or_path (str): The identifier or path of the Florence 2 model to load.
         revision (str): The specific model revision to use.
-        device_map (Optional[Union[str, dict]]): Device map for the model:        
+        device_map (Optional[str]): Device map for the model:
             -"auto": Places model on single available device (default)
             - String like "cpu", "cuda:0", or "mps" for a specific device
             - Note: Florence-2 doesn't support dict mapping
@@ -47,7 +46,7 @@ def load_model(
         ValueError: If the model or processor cannot be loaded.
     """
 
-    device = parse_device_spec(device_map) 
+    device = parse_device_spec(device_map)
     processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
 
     if optimization_strategy == OptimizationStrategy.LORA:
diff --git a/maestro/trainer/models/paligemma_2/checkpoints.py b/maestro/trainer/models/paligemma_2/checkpoints.py
index fe2fe654..a29a64da 100644
--- a/maestro/trainer/models/paligemma_2/checkpoints.py
+++ b/maestro/trainer/models/paligemma_2/checkpoints.py
@@ -6,8 +6,6 @@
 from peft import LoraConfig, get_peft_model
 from transformers import BitsAndBytesConfig, PaliGemmaForConditionalGeneration, PaliGemmaProcessor
 
-from maestro.trainer.common.utils.device import parse_device_spec
-
 DEFAULT_PALIGEMMA2_MODEL_ID = "google/paligemma2-3b-pt-224"
 DEFAULT_PALIGEMMA2_MODEL_REVISION = "refs/heads/main"
 
diff --git a/maestro/trainer/models/qwen_2_5_vl/checkpoints.py b/maestro/trainer/models/qwen_2_5_vl/checkpoints.py
index 3045161e..d40818bf 100644
--- a/maestro/trainer/models/qwen_2_5_vl/checkpoints.py
+++ b/maestro/trainer/models/qwen_2_5_vl/checkpoints.py
@@ -6,8 +6,6 @@
 from peft import LoraConfig, get_peft_model
 from transformers import BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLProcessor
 
-from maestro.trainer.common.utils.device import parse_device_spec
-
 DEFAULT_QWEN2_5_VL_MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
 DEFAULT_QWEN2_5_VL_MODEL_REVISION = "refs/heads/main"