From 4394a90eb5fef6c564755259b3fe1105d44900d5 Mon Sep 17 00:00:00 2001
From: AshAnand34 <aashishanand2019@gmail.com>
Date: Sat, 10 May 2025 01:24:49 -0700
Subject: [PATCH 01/92] Created SmolVLM2 model in maestro

---
 docs/models/smolvlm2.md                       |  99 ++++++++++
 maestro/cli/introspection.py                  |   7 +
 maestro/trainer/models/smolvlm2/__init__.py   |   0
 .../trainer/models/smolvlm2/checkpoints.py    |  64 +++++++
 maestro/trainer/models/smolvlm2/core.py       | 179 ++++++++++++++++++
 maestro/trainer/models/smolvlm2/detection.py  | 111 +++++++++++
 maestro/trainer/models/smolvlm2/entrypoint.py | 150 +++++++++++++++
 maestro/trainer/models/smolvlm2/inference.py  | 138 ++++++++++++++
 maestro/trainer/models/smolvlm2/loaders.py    | 115 +++++++++++
 mkdocs.yaml                                   |   1 +
 pyproject.toml                                |  71 ++-----
 11 files changed, 882 insertions(+), 53 deletions(-)
 create mode 100644 docs/models/smolvlm2.md
 create mode 100644 maestro/trainer/models/smolvlm2/__init__.py
 create mode 100644 maestro/trainer/models/smolvlm2/checkpoints.py
 create mode 100644 maestro/trainer/models/smolvlm2/core.py
 create mode 100644 maestro/trainer/models/smolvlm2/detection.py
 create mode 100644 maestro/trainer/models/smolvlm2/entrypoint.py
 create mode 100644 maestro/trainer/models/smolvlm2/inference.py
 create mode 100644 maestro/trainer/models/smolvlm2/loaders.py

diff --git a/docs/models/smolvlm2.md b/docs/models/smolvlm2.md
new file mode 100644
index 00000000..cebf95f7
--- /dev/null
+++ b/docs/models/smolvlm2.md
@@ -0,0 +1,99 @@
+---
+comments: true
+---
+
+## Overview
+
+SmolVLM2 is a lightweight vision-language model developed by Hugging Face. It offers impressive capabilities for multimodal understanding while maintaining a compact size compared to larger VLMs. The model excels at tasks such as image captioning, visual question answering, and object detection, making it accessible for applications with limited computational resources.
+
+Built to balance performance and efficiency, SmolVLM2 provides a valuable option for developers seeking to implement vision-language capabilities without the overhead of larger models. The 500M parameter variant delivers practical results while being significantly more resource-friendly than multi-billion parameter alternatives.
+
+## Install
+
+```bash
+pip install "maestro[smolvlm2]"
+```
+
+## Train
+
+The training routines support various optimization strategies such as LoRA, QLoRA, and freezing the vision encoder. Customize your fine-tuning process via CLI or Python to align with your dataset and task requirements.
+
+### CLI
+
+Kick off training from the command line by running the command below. Be sure to replace the dataset path and adjust the hyperparameters (such as epochs and batch size) to suit your needs.
+
+```bash
+maestro smolvlm2 train \
+  --dataset "dataset/location" \
+  --epochs 10 \
+  --batch-size 4 \
+  --optimization-strategy "qlora" \
+  --metrics "edit_distance"
+```
+
+### Python
+
+For more control, you can fine-tune SmolVLM2 using the Python API. Create a configuration dictionary with your training parameters and pass it to the train function to integrate the process into your custom workflow.
+
+```python
+from maestro.trainer.models.smolvlm2.core import train
+
+config = {
+    "dataset": "dataset/location",
+    "epochs": 10,
+    "batch_size": 4,
+    "optimization_strategy": "qlora",
+    "metrics": ["edit_distance"],
+}
+
+results = train(config)
+```
+
+## Inference
+
+Use SmolVLM2 for inference on images using either the CLI or Python API.
+
+### CLI
+
+```bash
+maestro smolvlm2 predict \
+  --image "path/to/image.jpg" \
+  --prompt "Describe this image"
+```
+
+### Python
+
+```python
+from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2
+
+model = SmolVLM2()
+result = model.generate(
+    images="path/to/image.jpg",
+    prompt="Describe this image",
+    max_new_tokens=512
+)
+
+print(result["text"])
+```
+
+## Object Detection
+
+SmolVLM2 can perform object detection on images, identifying and localizing objects with bounding boxes.
+
+```python
+from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2
+from maestro.trainer.models.smolvlm2.detection import result_to_detections_formatter
+
+model = SmolVLM2()
+result = model.generate(
+    images="path/to/image.jpg",
+    prompt="Detect the following objects: person, car, dog"
+)
+
+# Convert text output to detections format
+boxes, class_ids = result_to_detections_formatter(
+    text=result["text"],
+    resolution_wh=(640, 480),
+    classes=["person", "car", "dog"]
+)
+```
diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py
index 086a831b..95368685 100644
--- a/maestro/cli/introspection.py
+++ b/maestro/cli/introspection.py
@@ -28,6 +28,13 @@ def find_training_recipes(app: typer.Typer) -> None:
     except Exception:
         _warn_about_recipe_import_error(model_name="Qwen2.5-VL")
 
+    try:
+        from maestro.trainer.models.smolvlm2.entrypoint import smolvlm2_app
+
+        app.add_typer(smolvlm2_app, name="smolvlm2")
+    except Exception:
+        _warn_about_recipe_import_error(model_name="SmolVLM2")
+
 
 def _warn_about_recipe_import_error(model_name: str) -> None:
     disable_warnings = str2bool(
diff --git a/maestro/trainer/models/smolvlm2/__init__.py b/maestro/trainer/models/smolvlm2/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
new file mode 100644
index 00000000..dbfc8f71
--- /dev/null
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -0,0 +1,64 @@
+import os
+from typing import Dict, Optional
+
+import torch
+from transformers import AutoModelForVision2Seq, AutoProcessor
+
+
+def save_checkpoint(
+    model: AutoModelForVision2Seq,
+    processor: AutoProcessor,
+    path: str,
+    metadata: Optional[Dict] = None
+) -> None:
+    """
+    Save model checkpoint.
+    
+    Args:
+        model: Model to save
+        processor: Processor to save
+        path: Path to save checkpoint
+        metadata: Optional metadata to save
+    """
+    os.makedirs(path, exist_ok=True)
+
+    # Save model
+    model.save_pretrained(path)
+
+    # Save processor
+    processor.save_pretrained(path)
+
+    # Save metadata if provided
+    if metadata is not None:
+        torch.save(metadata, os.path.join(path, "metadata.pt"))
+
+def load_checkpoint(
+    path: str,
+    device: str = "cuda" if torch.cuda.is_available() else "cpu"
+) -> Dict:
+    """
+    Load model checkpoint.
+    
+    Args:
+        path: Path to checkpoint
+        device: Device to load model on
+        
+    Returns:
+        Dictionary containing model, processor, and metadata
+    """
+    # Load model
+    model = AutoModelForVision2Seq.from_pretrained(path)
+    model.to(device)
+
+    # Load processor
+    processor = AutoProcessor.from_pretrained(path)
+
+    # Load metadata if exists
+    metadata_path = os.path.join(path, "metadata.pt")
+    metadata = torch.load(metadata_path) if os.path.exists(metadata_path) else None
+
+    return {
+        "model": model,
+        "processor": processor,
+        "metadata": metadata
+    }
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
new file mode 100644
index 00000000..101918e1
--- /dev/null
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -0,0 +1,179 @@
+from typing import Optional, Union
+
+import torch
+from transformers import AutoModelForVision2Seq, AutoProcessor
+
+
+class SmolVLM2Core:
+    """Core SmolVLM2 model implementation."""
+
+    def __init__(
+        self,
+        model_name: str = "smol-ai/smolvlm2-500m",
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs
+    ):
+        """
+        Initialize SmolVLM2 model.
+
+        Args:
+            model_name: Name or path of the model to load
+            device: Device to run the model on
+            **kwargs: Additional arguments to pass to the model
+        """
+        self.model_name = model_name
+        self.device = device
+
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = AutoModelForVision2Seq.from_pretrained(model_name)
+        self.model.to(device)
+
+    def process_inputs(
+        self,
+        images: Union[str, list[str]],
+        prompt: Optional[str] = None
+    ) -> dict:
+        """Process input images and text."""
+        if isinstance(images, str):
+            images = [images]
+
+        return self.processor(
+            images=images,
+            text=prompt if prompt else "",
+            return_tensors="pt"
+        ).to(self.device)
+
+    def generate(
+        self,
+        inputs: dict,
+        max_new_tokens: int = 512,
+        **kwargs
+    ) -> torch.Tensor:
+        """Generate text from processed inputs."""
+        return self.model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+
+    def decode_outputs(
+        self,
+        outputs: torch.Tensor,
+        skip_special_tokens: bool = True
+    ) -> list[str]:
+        """Decode model outputs to text."""
+        return self.processor.batch_decode(
+            outputs,
+            skip_special_tokens=skip_special_tokens
+        )
+
+def train(config: dict) -> dict:
+    """
+    Train SmolVLM2 model with provided configuration.
+    
+    Args:
+        config: Dictionary containing training configuration
+            - dataset: Path to dataset directory or file
+            - epochs: Number of training epochs
+            - batch_size: Training batch size
+            - optimization_strategy: Strategy for optimization (qlora, lora, freeze_vision)
+            - metrics: List of metrics to evaluate during training
+            - output_dir: Directory to save trained model
+    Returns:
+        Dictionary containing training results and metrics
+    """
+    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+    from transformers import BitsAndBytesConfig, TrainingArguments
+
+    # Load dataset
+    dataset_path = config["dataset"]
+
+    # TODO: Implement proper dataset loading logic based on the dataset format
+    # For now, we'll use a placeholder implementation
+    
+    # Create model with the specified optimization strategy
+    model_name = config.get("model_name", "smol-ai/smolvlm2-500m")
+    strategy = config.get("optimization_strategy", "qlora")
+
+    if strategy == "qlora":
+        # Configure QLoRA
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+        )
+
+        model = AutoModelForVision2Seq.from_pretrained(
+            model_name,
+            quantization_config=bnb_config,
+            device_map="auto"
+        )
+        model = prepare_model_for_kbit_training(model)
+
+        lora_config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM"
+        )
+
+        model = get_peft_model(model, lora_config)
+
+    elif strategy == "lora":
+        # Configure LoRA without quantization
+        model = AutoModelForVision2Seq.from_pretrained(model_name)
+
+        lora_config = LoraConfig(
+            r=16,
+            lora_alpha=32,
+            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+            lora_dropout=0.05,
+            bias="none",
+            task_type="CAUSAL_LM"
+        )
+
+        model = get_peft_model(model, lora_config)
+
+    elif strategy == "freeze_vision":
+        # Freeze vision encoder, train only language model part
+        model = AutoModelForVision2Seq.from_pretrained(model_name)
+
+        # Freeze vision encoder parameters
+        for param in model.vision_model.parameters():
+            param.requires_grad = False
+
+    else:
+        raise ValueError(f"Unsupported optimization strategy: {strategy}")
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    # Set up training arguments
+    output_dir = config.get("output_dir", "./smolvlm2-finetuned")
+    training_args = TrainingArguments(
+        output_dir=output_dir,
+        num_train_epochs=config.get("epochs", 10),
+        per_device_train_batch_size=config.get("batch_size", 4),
+        gradient_accumulation_steps=4,
+        learning_rate=2e-5,
+        weight_decay=0.01,
+        warmup_steps=100,
+        save_strategy="epoch",
+        save_total_limit=2,
+        logging_steps=10,
+        remove_unused_columns=False,
+    )
+
+    # TODO: Implement full training logic with dataset loading
+    # This is a placeholder that returns a mock result
+
+    return {
+        "model_path": output_dir,
+        "metrics": {
+            "loss": 0.5,
+            "edit_distance": 0.2
+        },
+        "status": "Training implementation in progress"
+    }
diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py
new file mode 100644
index 00000000..a52fcfdf
--- /dev/null
+++ b/maestro/trainer/models/smolvlm2/detection.py
@@ -0,0 +1,111 @@
+import re
+from typing import Optional
+
+import numpy as np
+
+
+def result_to_detections_formatter(
+    text: str,
+    resolution_wh: tuple[int, int],
+    classes: Optional[list[str]] = None
+) -> tuple[np.ndarray, np.ndarray]:
+    """Converts SmolVLM2 text output into detection format.
+
+    SmolVLM2 outputs text in a format like:
+    "a person standing in front of a car [x1, y1, x2, y2]"
+
+    Args:
+        text: SmolVLM2 output text
+        resolution_wh: Target image resolution (width, height)
+        classes: Optional list of valid class names
+
+    Returns:
+        Tuple of (boxes, class_ids) where:
+        - boxes is a float32 array of shape (N, 4) with xyxy coordinates
+        - class_ids is an int32 array of shape (N,) with class IDs
+    """
+    # Extract bounding boxes using regex
+    box_pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
+    matches = re.finditer(box_pattern, text)
+
+    boxes_list = []
+    class_ids_list = []
+
+    # Create class mapping if provided
+    if classes is not None:
+        name_to_index = {cls_name: idx for idx, cls_name in enumerate(classes)}
+    else:
+        name_to_index = None
+
+    for match in matches:
+        x_min, y_min, x_max, y_max = map(float, match.groups())
+
+        # Extract class name from text before the box
+        text_before = text[:match.start()].strip()
+        class_name = text_before.split()[-1] if text_before else "unknown"
+
+        if name_to_index is not None:
+            if class_name not in name_to_index:
+                continue
+            current_class_id = name_to_index[class_name]
+        else:
+            current_class_id = -1
+
+        boxes_list.append([x_min, y_min, x_max, y_max])
+        class_ids_list.append(current_class_id)
+
+    boxes = np.array(boxes_list, dtype=np.float32).reshape(-1, 4)
+    class_ids = np.array(class_ids_list, dtype=np.int32)
+
+    return boxes, class_ids
+
+def detections_to_text_formatter(
+    xyxy: np.ndarray,
+    class_id: np.ndarray,
+    classes: list[str],
+    resolution_wh: tuple[int, int]
+) -> str:
+    """Converts detections to SmolVLM2 text format.
+
+    Args:
+        xyxy: Bounding boxes in xyxy format
+        class_id: Class IDs for each box
+        classes: List of class names
+        resolution_wh: Image resolution (width, height)
+
+    Returns:
+        Formatted text string for SmolVLM2
+    """
+    text_parts = []
+
+    for i in range(len(xyxy)):
+        cls_name = classes[class_id[i]]
+        x_min, y_min, x_max, y_max = map(int, xyxy[i])
+        box_text = f"{cls_name} [{x_min}, {y_min}, {x_max}, {y_max}]"
+        text_parts.append(box_text)
+
+    return " ".join(text_parts)
+
+def format_prompt_for_detection(
+    prompt: str,
+    xyxy: Optional[np.ndarray] = None,
+    class_id: Optional[np.ndarray] = None,
+    classes: Optional[list[str]] = None,
+    resolution_wh: Optional[tuple[int, int]] = None
+) -> str:
+    """Formats a prompt for object detection with SmolVLM2.
+
+    Args:
+        prompt: Base prompt
+        xyxy: Optional bounding boxes
+        class_id: Optional class IDs
+        classes: Optional class names
+        resolution_wh: Optional image resolution
+
+    Returns:
+        Formatted prompt string
+    """
+    if all(x is not None for x in [xyxy, class_id, classes, resolution_wh]):
+        detection_text = detections_to_text_formatter(xyxy, class_id, classes, resolution_wh)
+        return f"{prompt} {detection_text}"
+    return prompt
diff --git a/maestro/trainer/models/smolvlm2/entrypoint.py b/maestro/trainer/models/smolvlm2/entrypoint.py
new file mode 100644
index 00000000..102c544e
--- /dev/null
+++ b/maestro/trainer/models/smolvlm2/entrypoint.py
@@ -0,0 +1,150 @@
+from pathlib import Path
+from typing import Optional, Union
+
+import torch
+import typer
+
+from .inference import SmolVLM2Inference
+
+smolvlm2_app = typer.Typer()
+
+class SmolVLM2:
+    """Main entrypoint for SmolVLM2 model."""
+
+    def __init__(
+        self,
+        model_name: str = "smol-ai/smolvlm2-500m",
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs
+    ):
+        """Initialize SmolVLM2 model."""
+        self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs)
+
+    def generate(
+        self,
+        images: Union[str, list[str]],
+        prompt: Optional[str] = None,
+        max_new_tokens: int = 512,
+        **kwargs
+    ) -> dict:
+        """
+        Generate text from images.
+
+        Args:
+            images: Path(s) to image(s)
+            prompt: Optional prompt to guide generation
+            max_new_tokens: Maximum number of tokens to generate
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Dictionary containing generated text and other outputs
+        """
+        return self.inference.generate(
+            images=images,
+            prompt=prompt,
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+
+@smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model")
+def info() -> None:
+    """Get information about the SmolVLM2 model."""
+    try:
+        model = SmolVLM2()
+        info = model.inference.get_model_info()
+        typer.echo(f"Model Name: {info['model_name']}")
+        typer.echo(f"Model Size: {info['model_size']}")
+        typer.echo(f"Device: {info['device']}")
+        typer.echo(f"Tokenizer: {info['tokenizer']}")
+    except Exception as e:
+        typer.echo(f"Error retrieving model info: {e!s}", err=True)
+        raise typer.Exit(code=1)
+
+@smolvlm2_app.command(name="predict", help="Run inference on one or more images")
+def predict(
+    image: list[Path] = typer.Option(
+        ..., "--image", "-i", help="Path to image(s) for prediction"
+    ),
+    prompt: Optional[str] = typer.Option(
+        None, "--prompt", "-p", help="Optional prompt to guide generation"
+    ),
+    max_new_tokens: int = typer.Option(
+        512, "--max-new-tokens", help="Maximum new tokens to generate"
+    ),
+    output: Optional[Path] = typer.Option(
+        None, "--output", "-o", help="Output file path to save results"
+    ),
+) -> None:
+    """Run inference on images using SmolVLM2."""
+    try:
+        model = SmolVLM2()
+        result = model.generate(
+            images=[str(img) for img in image],
+            prompt=prompt,
+            max_new_tokens=max_new_tokens
+        )
+
+        if output:
+            import json
+            with open(output, "w") as f:
+                json.dump(result, f, indent=2)
+            typer.echo(f"Results saved to {output}")
+        else:
+            typer.echo(f"Generated text: {result['text']}")
+
+    except Exception as e:
+        typer.echo(f"Error during prediction: {e!s}", err=True)
+        raise typer.Exit(code=1)
+
+@smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model")
+def train(
+    dataset: Path = typer.Option(
+        ..., "--dataset", "-d", help="Path to dataset directory or file"
+    ),
+    epochs: int = typer.Option(
+        10, "--epochs", "-e", help="Number of training epochs"
+    ),
+    batch_size: int = typer.Option(
+        4, "--batch-size", "-b", help="Training batch size"
+    ),
+    optimization_strategy: str = typer.Option(
+        "qlora", "--optimization-strategy", "-o",
+        help="Optimization strategy (qlora, lora, freeze_vision)"
+    ),
+    metrics: list[str] = typer.Option(
+        ["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"
+    ),
+    output_dir: Optional[Path] = typer.Option(
+        None, "--output-dir", help="Directory to save trained model"
+    ),
+) -> None:
+    """Fine-tune the SmolVLM2 model on a dataset."""
+    try:
+        typer.echo("Starting SmolVLM2 fine-tuning...")
+
+        if output_dir is None:
+            import tempfile
+            output_dir = Path(tempfile.mkdtemp())
+            typer.echo(f"No output directory specified, using temporary directory: {output_dir}")
+
+        # Create configuration for training
+        config = {
+            "dataset": str(dataset),
+            "epochs": epochs,
+            "batch_size": batch_size,
+            "optimization_strategy": optimization_strategy,
+            "metrics": metrics,
+            "output_dir": str(output_dir)
+        }
+
+        # Import the train function here to avoid circular imports
+        from .core import train as train_model
+
+        results = train_model(config)
+
+        typer.echo(f"Training complete! Model saved to {output_dir}")
+        typer.echo(f"Final metrics: {results.get('metrics', {})}")
+
+    except Exception as e:
+        typer.echo(f"Error during training: {e!s}", err=True)
+        raise typer.Exit(code=1)
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
new file mode 100644
index 00000000..9ed2c631
--- /dev/null
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -0,0 +1,138 @@
+from typing import Optional, Union
+
+import torch
+from transformers import AutoModelForVision2Seq, AutoProcessor
+
+
+class SmolVLM2Inference:
+    """Inference interface for SmolVLM2 model."""
+
+    def __init__(
+        self,
+        model_name: str = "smol-ai/smolvlm2-500m",
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        **kwargs
+    ):
+        """Initialize inference interface."""
+        self.model = AutoModelForVision2Seq.from_pretrained(model_name)
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.device = device
+
+    def generate(
+        self,
+        images: Union[str, list[str]],
+        prompt: Optional[str] = None,
+        max_new_tokens: int = 512,
+        **kwargs
+    ) -> dict:
+        """
+        Generate text from images.
+
+        Args:
+            images: Path(s) to image(s)
+            prompt: Optional prompt to guide generation
+            max_new_tokens: Maximum number of tokens to generate
+            **kwargs: Additional generation parameters
+
+        Returns:
+            Dictionary containing generated text and other outputs
+        """
+        # Process inputs
+        inputs = self.processor(
+            images=images,
+            text=prompt if prompt else "",
+            return_tensors="pt"
+        )
+
+        # Generate
+        outputs = self.model.generate(
+            input_ids=inputs["input_ids"].to(self.device),
+            pixel_values=inputs["pixel_values"].to(self.device),
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+
+        # Decode outputs
+        generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)
+
+        return {
+            "generated_text": generated_text,
+            "model_outputs": outputs
+        }
+
+def predict_with_inputs(
+    model: AutoModelForVision2Seq,
+    processor: AutoProcessor,
+    input_ids: torch.Tensor,
+    pixel_values: torch.Tensor,
+    device: Union[str, torch.device],
+    max_new_tokens: int = 512,
+    **kwargs
+) -> list[str]:
+    """
+    Generate text predictions using the model.
+
+    Args:
+        model: The SmolVLM2 model
+        processor: The model's processor
+        input_ids: Input token IDs
+        pixel_values: Input image pixel values
+        device: Device to run inference on
+        max_new_tokens: Maximum number of tokens to generate
+        **kwargs: Additional generation parameters
+
+    Returns:
+        List of generated text strings
+    """
+    model.eval()
+    with torch.no_grad():
+        outputs = model.generate(
+            input_ids=input_ids.to(device),
+            pixel_values=pixel_values.to(device),
+            max_new_tokens=max_new_tokens,
+            **kwargs
+        )
+    return processor.batch_decode(outputs, skip_special_tokens=True)
+
+def predict_with_images(
+    model: AutoModelForVision2Seq,
+    processor: AutoProcessor,
+    images: Union[str, list[str]],
+    prompt: Optional[str] = None,
+    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
+    max_new_tokens: int = 512,
+    **kwargs
+) -> list[str]:
+    """
+    Generate text predictions from images.
+
+    Args:
+        model: The SmolVLM2 model
+        processor: The model's processor
+        images: Path(s) to image(s)
+        prompt: Optional prompt to guide generation
+        device: Device to run inference on
+        max_new_tokens: Maximum number of tokens to generate
+        **kwargs: Additional generation parameters
+
+    Returns:
+        List of generated text strings
+    """
+    if isinstance(images, str):
+        images = [images]
+
+    inputs = processor(
+        images=images,
+        text=prompt if prompt else "",
+        return_tensors="pt"
+    )
+
+    return predict_with_inputs(
+        model=model,
+        processor=processor,
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        device=device,
+        max_new_tokens=max_new_tokens,
+        **kwargs
+    )
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
new file mode 100644
index 00000000..4f9bc8b9
--- /dev/null
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -0,0 +1,115 @@
+from typing import Optional
+
+import torch
+from PIL import Image
+from torch.utils.data import DataLoader, Dataset
+from transformers import AutoProcessor
+
+
+class SmolVLM2Dataset(Dataset):
+    """Dataset for SmolVLM2 model."""
+
+    def __init__(
+        self,
+        image_paths: list[str],
+        texts: Optional[list[str]] = None,
+        processor: Optional[AutoProcessor] = None
+    ):
+        """
+        Initialize dataset.
+
+        Args:
+            image_paths: List of paths to images
+            texts: Optional list of corresponding texts
+            processor: Model processor for preprocessing
+        """
+        self.image_paths = image_paths
+        self.texts = texts
+        self.processor = processor
+
+    def __len__(self) -> int:
+        return len(self.image_paths)
+
+    def __getitem__(self, idx: int) -> dict:
+        """Get a single item from the dataset."""
+        image = Image.open(self.image_paths[idx])
+
+        if self.texts is not None:
+            text = self.texts[idx]
+        else:
+            text = ""
+
+        if self.processor is not None:
+            return self.processor(
+                images=image,
+                text=text,
+                return_tensors="pt"
+            )
+        else:
+            return {
+                "image": image,
+                "text": text
+            }
+
+def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Collate function for training data.
+
+    Args:
+        batch: List of processed samples
+
+    Returns:
+        Tuple of (input_ids, pixel_values, labels)
+    """
+    input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
+    pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
+    labels = torch.stack([item["labels"].squeeze(0) for item in batch])
+
+    return input_ids, pixel_values, labels
+
+def evaluation_collate_fn(
+    batch: list[dict]
+) -> tuple[torch.Tensor, torch.Tensor, list[Image.Image], list[str], list[str]]:
+    """
+    Collate function for evaluation data.
+
+    Args:
+        batch: List of processed samples
+
+    Returns:
+        Tuple of (input_ids, pixel_values, images, prompts, targets)
+    """
+    input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
+    pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
+    images = [item["image"] for item in batch]
+    prompts = [item["text"] for item in batch]
+    targets = [item["text"] for item in batch]  # In evaluation, target is same as prompt
+
+    return input_ids, pixel_values, images, prompts, targets
+
+def create_dataloader(
+    dataset: Dataset,
+    batch_size: int = 8,
+    num_workers: int = 4,
+    shuffle: bool = True,
+    collate_fn = None
+) -> DataLoader:
+    """
+    Create a DataLoader for the dataset.
+
+    Args:
+        dataset: Dataset to create loader for
+        batch_size: Batch size
+        num_workers: Number of worker processes
+        shuffle: Whether to shuffle the data
+        collate_fn: Optional collate function
+    Returns:
+        DataLoader instance
+    """
+    return DataLoader(
+        dataset,
+        batch_size=batch_size,
+        num_workers=num_workers,
+        shuffle=shuffle,
+        collate_fn=collate_fn
+    )
diff --git a/mkdocs.yaml b/mkdocs.yaml
index 3f476c12..f232889b 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -27,6 +27,7 @@ nav:
     - Florence-2: models/florence_2.md
     - PaliGemma 2: models/paligemma_2.md
     - Qwen2.5-VL: models/qwen_2_5_vl.md
+    - SmolVLM2: models/smolvlm2.md
   - Datasets:
     - JSONL: datasets/jsonl.md
 
diff --git a/pyproject.toml b/pyproject.toml
index da476b4f..2aefbc3f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -91,6 +91,14 @@ qwen_2_5_vl = [
     "bitsandbytes>=0.45.0",
     "qwen-vl-utils>=0.0.8"
 ]
+smolvlm2 = [
+    "accelerate>=1.2.1",
+    "peft>=0.12",
+    "torch>=2.4.0",
+    "torchvision>=0.20.0",
+    "transformers>=4.49.0",
+    "bitsandbytes>=0.45.0"
+]
 
 [project.scripts]
 maestro = "maestro.cli.main:app"
@@ -147,62 +155,19 @@ line-length = 120
 indent-width = 4
 
 [tool.ruff.lint]
-
-# Enable pycodestyle (`E`)
-select = ["E", "F", "I", "A", "Q", "W", "N", "T", "Q","TRY","UP","C90","RUF","NPY"]
-ignore = ["T201","TRY003","NPY201"]
-
-# Allow autofix for all enabled rules (when `--fix`) is provided.
-fixable = [
-    "A",
-    "B",
-    "C",
-    "D",
-    "E",
-    "F",
-    "G",
-    "I",
-    "N",
-    "Q",
-    "S",
-    "T",
-    "W",
-    "ANN",
-    "ARG",
-    "BLE",
-    "COM",
-    "DJ",
-    "DTZ",
-    "EM",
-    "ERA",
-    "EXE",
-    "FBT",
-    "ICN",
-    "INP",
-    "ISC",
-    "NPY",
-    "PD",
-    "PGH",
-    "PIE",
-    "PL",
-    "PT",
-    "PTH",
-    "PYI",
-    "RET",
-    "RSE",
-    "RUF",
-    "SIM",
-    "SLF",
-    "TCH",
-    "TID",
-    "TRY",
-    "UP",
-    "YTT",
-]
+select = ["E", "F", "I", "A", "Q", "W", "N", "T", "TRY", "UP", "C90", "RUF", "NPY"]
+ignore = ["T201", "TRY003", "NPY201"]
+fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", 
+    "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", 
+    "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", 
+    "TCH", "TID", "TRY", "UP", "YTT"]
 unfixable = []
+
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
-pylint.max-args = 20
+
+[tool.ruff.lint.pylint]
+max-args = 20
 
 [tool.ruff.lint.flake8-quotes]
 inline-quotes = "double"

From 0518c67038e1059c032b700d8269232332234a48 Mon Sep 17 00:00:00 2001
From: AshAnand34 <aashishanand2019@gmail.com>
Date: Sat, 10 May 2025 12:26:34 -0700
Subject: [PATCH 02/92] Fixing lint errors and created trainer for training
 dataset in smolvlm2

---
 .../trainer/models/smolvlm2/checkpoints.py    | 12 ++--
 maestro/trainer/models/smolvlm2/core.py       | 58 ++++++++++++++-----
 pyproject.toml                                | 50 ++++++++++++++--
 3 files changed, 96 insertions(+), 24 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index dbfc8f71..afa4c3c0 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -1,5 +1,5 @@
 import os
-from typing import Dict, Optional
+from typing import Optional
 
 import torch
 from transformers import AutoModelForVision2Seq, AutoProcessor
@@ -9,11 +9,11 @@ def save_checkpoint(
     model: AutoModelForVision2Seq,
     processor: AutoProcessor,
     path: str,
-    metadata: Optional[Dict] = None
+    metadata: Optional[dict] = None
 ) -> None:
     """
     Save model checkpoint.
-    
+
     Args:
         model: Model to save
         processor: Processor to save
@@ -35,14 +35,14 @@ def save_checkpoint(
 def load_checkpoint(
     path: str,
     device: str = "cuda" if torch.cuda.is_available() else "cpu"
-) -> Dict:
+) -> dict:
     """
     Load model checkpoint.
-    
+
     Args:
         path: Path to checkpoint
         device: Device to load model on
-        
+
     Returns:
         Dictionary containing model, processor, and metadata
     """
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 101918e1..475d60c7 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -1,7 +1,8 @@
+import os
 from typing import Optional, Union
 
 import torch
-from transformers import AutoModelForVision2Seq, AutoProcessor
+from transformers import AutoModelForVision2Seq, AutoProcessor, Trainer
 
 
 class SmolVLM2Core:
@@ -70,7 +71,7 @@ def decode_outputs(
 def train(config: dict) -> dict:
     """
     Train SmolVLM2 model with provided configuration.
-    
+
     Args:
         config: Dictionary containing training configuration
             - dataset: Path to dataset directory or file
@@ -82,15 +83,19 @@ def train(config: dict) -> dict:
     Returns:
         Dictionary containing training results and metrics
     """
+    from functools import partial
+
     from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
     from transformers import BitsAndBytesConfig, TrainingArguments
 
+    from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
+    from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn
     # Load dataset
     dataset_path = config["dataset"]
+    dataset_location = resolve_dataset_path(dataset_path)
+    if dataset_location is None:
+        return {"error": "Dataset not found"}
 
-    # TODO: Implement proper dataset loading logic based on the dataset format
-    # For now, we'll use a placeholder implementation
-    
     # Create model with the specified optimization strategy
     model_name = config.get("model_name", "smol-ai/smolvlm2-500m")
     strategy = config.get("optimization_strategy", "qlora")
@@ -147,15 +152,27 @@ def train(config: dict) -> dict:
 
     else:
         raise ValueError(f"Unsupported optimization strategy: {strategy}")
-
     processor = AutoProcessor.from_pretrained(model_name)
 
+    # Load datasets
+    train_loader, valid_loader, test_loader = create_data_loaders(
+        dataset_location=dataset_location,
+        train_batch_size=config.get("batch_size", 4),
+        train_collect_fn=partial(train_collate_fn, processor=processor),
+        train_num_workers=config.get("num_workers", 0),
+        test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
+        test_collect_fn=partial(evaluation_collate_fn, processor=processor),
+        test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),    )
+
     # Set up training arguments
     output_dir = config.get("output_dir", "./smolvlm2-finetuned")
+    os.makedirs(output_dir, exist_ok=True)
+
     training_args = TrainingArguments(
         output_dir=output_dir,
         num_train_epochs=config.get("epochs", 10),
         per_device_train_batch_size=config.get("batch_size", 4),
+        per_device_eval_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
         gradient_accumulation_steps=4,
         learning_rate=2e-5,
         weight_decay=0.01,
@@ -163,17 +180,30 @@ def train(config: dict) -> dict:
         save_strategy="epoch",
         save_total_limit=2,
         logging_steps=10,
-        remove_unused_columns=False,
+        evaluation_strategy="epoch",
+        load_best_model_at_end=True,
+        remove_unused_columns=False
+    )
+
+    # Set up trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_loader.dataset,
+        eval_dataset=valid_loader.dataset,
+        data_collator=lambda batch: train_collate_fn(batch, processor)
     )
 
-    # TODO: Implement full training logic with dataset loading
-    # This is a placeholder that returns a mock result
+    # Train model
+    trainer.train()
+
+    # Save model and processor
+    model.save_pretrained(output_dir)
+    processor.save_pretrained(output_dir)
 
+    # Return results
     return {
         "model_path": output_dir,
-        "metrics": {
-            "loss": 0.5,
-            "edit_distance": 0.2
-        },
-        "status": "Training implementation in progress"
+        "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"},
+        "status": "Training completed"
     }
diff --git a/pyproject.toml b/pyproject.toml
index 2aefbc3f..f94e8c04 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -157,10 +157,52 @@ indent-width = 4
 [tool.ruff.lint]
 select = ["E", "F", "I", "A", "Q", "W", "N", "T", "TRY", "UP", "C90", "RUF", "NPY"]
 ignore = ["T201", "TRY003", "NPY201"]
-fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", 
-    "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", 
-    "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", 
-    "TCH", "TID", "TRY", "UP", "YTT"]
+fixable = [
+    "A",
+    "B",
+    "C",
+    "D",
+    "E",
+    "F",
+    "G",
+    "I",
+    "N",
+    "Q",
+    "S",
+    "T",
+    "W",
+    "ANN",
+    "ARG", 
+    "BLE",
+    "COM",
+    "DJ",
+    "DTZ",
+    "EM",
+    "ERA",
+    "EXE",
+    "FBT",
+    "ICN",
+    "INP",
+    "ISC",
+    "NPY", 
+    "PD",
+    "PGH",
+    "PIE",
+    "PL",
+    "PT",
+    "PTH",
+    "PYI",
+    "RET",
+    "RSE",
+    "RUF",
+    "SIM",
+    "SLF", 
+    "TCH",
+    "TID",
+    "TRY",
+    "UP",
+    "YTT"
+]
 unfixable = []
 
 # Allow unused variables when underscore-prefixed.

From 727e01b57429c0f66d3c452653312b5b3812afa4 Mon Sep 17 00:00:00 2001
From: AshAnand34 <aashishanand2019@gmail.com>
Date: Sat, 10 May 2025 12:36:16 -0700
Subject: [PATCH 03/92] SmolVLM2 documented

---
 docs/index.md | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index f6266980..82dd2518 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -69,10 +69,16 @@ we recommend creating a dedicated Python environment for each model.
     pip install "maestro[qwen_2_5_vl]"
     ```
 
+=== "SmolVLM2"
+
+    ```bash
+    pip install "maestro[smolvlm2]"
+    ```
+
 ### CLI
 
 Kick off fine-tuning with our command-line interface, which leverages the configuration
-and training routines defined in each model’s core module. Simply specify key parameters such as
+and training routines defined in each model's core module. Simply specify key parameters such as
 the dataset location, number of epochs, batch size, optimization strategy, and metrics.
 
 === "Florence-2"
@@ -108,6 +114,17 @@ the dataset location, number of epochs, batch size, optimization strategy, and m
       --metrics "edit_distance"
     ```
 
+=== "SmolVLM2"
+
+    ```bash
+    maestro smolvlm2 train \
+      --dataset "dataset/location" \
+      --epochs 10 \
+      --batch-size 4 \
+      --optimization_strategy "lora" \
+      --metrics "edit_distance"
+    ```
+
 ### Python
 
 For greater control, use the Python API to fine-tune your models.
@@ -148,7 +165,6 @@ and training setup.
     ```
 
 === "Qwen2.5-VL"
-
     ```python
     from maestro.trainer.models.qwen_2_5_vl.core import train
 
@@ -162,3 +178,18 @@ and training setup.
 
     train(config)
     ```
+
+=== "SmolVLM2"
+    ```python
+    from maestro.trainer.models.smolvlm2.core import train
+
+    config = {
+        "dataset": "dataset/location",
+        "epochs": 10,
+        "batch_size": 4,
+        "optimization_strategy": "lora",
+        "metrics": ["edit_distance"],
+    }
+
+    train(config)
+    ```

From d074602f3be86de09fec48025a9db2817cc5ea90 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 10 May 2025 19:46:10 +0000
Subject: [PATCH 04/92] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?=
 =?UTF-8?q?=20format=20pre-commit=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../trainer/models/smolvlm2/checkpoints.py    | 17 ++---
 maestro/trainer/models/smolvlm2/core.py       | 59 +++++-----------
 maestro/trainer/models/smolvlm2/detection.py  | 15 ++--
 maestro/trainer/models/smolvlm2/entrypoint.py | 68 ++++++-------------
 maestro/trainer/models/smolvlm2/inference.py  | 37 ++++------
 maestro/trainer/models/smolvlm2/loaders.py    | 35 +++-------
 pyproject.toml                                |  6 +-
 7 files changed, 72 insertions(+), 165 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index afa4c3c0..87d1aea8 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -6,10 +6,7 @@
 
 
 def save_checkpoint(
-    model: AutoModelForVision2Seq,
-    processor: AutoProcessor,
-    path: str,
-    metadata: Optional[dict] = None
+    model: AutoModelForVision2Seq, processor: AutoProcessor, path: str, metadata: Optional[dict] = None
 ) -> None:
     """
     Save model checkpoint.
@@ -32,10 +29,8 @@ def save_checkpoint(
     if metadata is not None:
         torch.save(metadata, os.path.join(path, "metadata.pt"))
 
-def load_checkpoint(
-    path: str,
-    device: str = "cuda" if torch.cuda.is_available() else "cpu"
-) -> dict:
+
+def load_checkpoint(path: str, device: str = "cuda" if torch.cuda.is_available() else "cpu") -> dict:
     """
     Load model checkpoint.
 
@@ -57,8 +52,4 @@ def load_checkpoint(
     metadata_path = os.path.join(path, "metadata.pt")
     metadata = torch.load(metadata_path) if os.path.exists(metadata_path) else None
 
-    return {
-        "model": model,
-        "processor": processor,
-        "metadata": metadata
-    }
+    return {"model": model, "processor": processor, "metadata": metadata}
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 475d60c7..ffc34b2b 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -12,7 +12,7 @@ def __init__(
         self,
         model_name: str = "smol-ai/smolvlm2-500m",
         device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs
+        **kwargs,
     ):
         """
         Initialize SmolVLM2 model.
@@ -29,44 +29,21 @@ def __init__(
         self.model = AutoModelForVision2Seq.from_pretrained(model_name)
         self.model.to(device)
 
-    def process_inputs(
-        self,
-        images: Union[str, list[str]],
-        prompt: Optional[str] = None
-    ) -> dict:
+    def process_inputs(self, images: Union[str, list[str]], prompt: Optional[str] = None) -> dict:
         """Process input images and text."""
         if isinstance(images, str):
             images = [images]
 
-        return self.processor(
-            images=images,
-            text=prompt if prompt else "",
-            return_tensors="pt"
-        ).to(self.device)
+        return self.processor(images=images, text=prompt if prompt else "", return_tensors="pt").to(self.device)
 
-    def generate(
-        self,
-        inputs: dict,
-        max_new_tokens: int = 512,
-        **kwargs
-    ) -> torch.Tensor:
+    def generate(self, inputs: dict, max_new_tokens: int = 512, **kwargs) -> torch.Tensor:
         """Generate text from processed inputs."""
-        return self.model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            **kwargs
-        )
+        return self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs)
 
-    def decode_outputs(
-        self,
-        outputs: torch.Tensor,
-        skip_special_tokens: bool = True
-    ) -> list[str]:
+    def decode_outputs(self, outputs: torch.Tensor, skip_special_tokens: bool = True) -> list[str]:
         """Decode model outputs to text."""
-        return self.processor.batch_decode(
-            outputs,
-            skip_special_tokens=skip_special_tokens
-        )
+        return self.processor.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
+
 
 def train(config: dict) -> dict:
     """
@@ -90,6 +67,7 @@ def train(config: dict) -> dict:
 
     from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
     from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn
+
     # Load dataset
     dataset_path = config["dataset"]
     dataset_location = resolve_dataset_path(dataset_path)
@@ -109,11 +87,7 @@ def train(config: dict) -> dict:
             bnb_4bit_use_double_quant=True,
         )
 
-        model = AutoModelForVision2Seq.from_pretrained(
-            model_name,
-            quantization_config=bnb_config,
-            device_map="auto"
-        )
+        model = AutoModelForVision2Seq.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
         model = prepare_model_for_kbit_training(model)
 
         lora_config = LoraConfig(
@@ -122,7 +96,7 @@ def train(config: dict) -> dict:
             target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
             lora_dropout=0.05,
             bias="none",
-            task_type="CAUSAL_LM"
+            task_type="CAUSAL_LM",
         )
 
         model = get_peft_model(model, lora_config)
@@ -137,7 +111,7 @@ def train(config: dict) -> dict:
             target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
             lora_dropout=0.05,
             bias="none",
-            task_type="CAUSAL_LM"
+            task_type="CAUSAL_LM",
         )
 
         model = get_peft_model(model, lora_config)
@@ -162,7 +136,8 @@ def train(config: dict) -> dict:
         train_num_workers=config.get("num_workers", 0),
         test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
         test_collect_fn=partial(evaluation_collate_fn, processor=processor),
-        test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),    )
+        test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),
+    )
 
     # Set up training arguments
     output_dir = config.get("output_dir", "./smolvlm2-finetuned")
@@ -182,7 +157,7 @@ def train(config: dict) -> dict:
         logging_steps=10,
         evaluation_strategy="epoch",
         load_best_model_at_end=True,
-        remove_unused_columns=False
+        remove_unused_columns=False,
     )
 
     # Set up trainer
@@ -191,7 +166,7 @@ def train(config: dict) -> dict:
         args=training_args,
         train_dataset=train_loader.dataset,
         eval_dataset=valid_loader.dataset,
-        data_collator=lambda batch: train_collate_fn(batch, processor)
+        data_collator=lambda batch: train_collate_fn(batch, processor),
     )
 
     # Train model
@@ -205,5 +180,5 @@ def train(config: dict) -> dict:
     return {
         "model_path": output_dir,
         "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"},
-        "status": "Training completed"
+        "status": "Training completed",
     }
diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py
index a52fcfdf..d78e2681 100644
--- a/maestro/trainer/models/smolvlm2/detection.py
+++ b/maestro/trainer/models/smolvlm2/detection.py
@@ -5,9 +5,7 @@
 
 
 def result_to_detections_formatter(
-    text: str,
-    resolution_wh: tuple[int, int],
-    classes: Optional[list[str]] = None
+    text: str, resolution_wh: tuple[int, int], classes: Optional[list[str]] = None
 ) -> tuple[np.ndarray, np.ndarray]:
     """Converts SmolVLM2 text output into detection format.
 
@@ -41,7 +39,7 @@ def result_to_detections_formatter(
         x_min, y_min, x_max, y_max = map(float, match.groups())
 
         # Extract class name from text before the box
-        text_before = text[:match.start()].strip()
+        text_before = text[: match.start()].strip()
         class_name = text_before.split()[-1] if text_before else "unknown"
 
         if name_to_index is not None:
@@ -59,11 +57,9 @@ def result_to_detections_formatter(
 
     return boxes, class_ids
 
+
 def detections_to_text_formatter(
-    xyxy: np.ndarray,
-    class_id: np.ndarray,
-    classes: list[str],
-    resolution_wh: tuple[int, int]
+    xyxy: np.ndarray, class_id: np.ndarray, classes: list[str], resolution_wh: tuple[int, int]
 ) -> str:
     """Converts detections to SmolVLM2 text format.
 
@@ -86,12 +82,13 @@ def detections_to_text_formatter(
 
     return " ".join(text_parts)
 
+
 def format_prompt_for_detection(
     prompt: str,
     xyxy: Optional[np.ndarray] = None,
     class_id: Optional[np.ndarray] = None,
     classes: Optional[list[str]] = None,
-    resolution_wh: Optional[tuple[int, int]] = None
+    resolution_wh: Optional[tuple[int, int]] = None,
 ) -> str:
     """Formats a prompt for object detection with SmolVLM2.
 
diff --git a/maestro/trainer/models/smolvlm2/entrypoint.py b/maestro/trainer/models/smolvlm2/entrypoint.py
index 102c544e..a18ff74e 100644
--- a/maestro/trainer/models/smolvlm2/entrypoint.py
+++ b/maestro/trainer/models/smolvlm2/entrypoint.py
@@ -8,6 +8,7 @@
 
 smolvlm2_app = typer.Typer()
 
+
 class SmolVLM2:
     """Main entrypoint for SmolVLM2 model."""
 
@@ -15,17 +16,13 @@ def __init__(
         self,
         model_name: str = "smol-ai/smolvlm2-500m",
         device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs
+        **kwargs,
     ):
         """Initialize SmolVLM2 model."""
         self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs)
 
     def generate(
-        self,
-        images: Union[str, list[str]],
-        prompt: Optional[str] = None,
-        max_new_tokens: int = 512,
-        **kwargs
+        self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs
     ) -> dict:
         """
         Generate text from images.
@@ -39,12 +36,8 @@ def generate(
         Returns:
             Dictionary containing generated text and other outputs
         """
-        return self.inference.generate(
-            images=images,
-            prompt=prompt,
-            max_new_tokens=max_new_tokens,
-            **kwargs
-        )
+        return self.inference.generate(images=images, prompt=prompt, max_new_tokens=max_new_tokens, **kwargs)
+
 
 @smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model")
 def info() -> None:
@@ -60,32 +53,22 @@ def info() -> None:
         typer.echo(f"Error retrieving model info: {e!s}", err=True)
         raise typer.Exit(code=1)
 
+
 @smolvlm2_app.command(name="predict", help="Run inference on one or more images")
 def predict(
-    image: list[Path] = typer.Option(
-        ..., "--image", "-i", help="Path to image(s) for prediction"
-    ),
-    prompt: Optional[str] = typer.Option(
-        None, "--prompt", "-p", help="Optional prompt to guide generation"
-    ),
-    max_new_tokens: int = typer.Option(
-        512, "--max-new-tokens", help="Maximum new tokens to generate"
-    ),
-    output: Optional[Path] = typer.Option(
-        None, "--output", "-o", help="Output file path to save results"
-    ),
+    image: list[Path] = typer.Option(..., "--image", "-i", help="Path to image(s) for prediction"),
+    prompt: Optional[str] = typer.Option(None, "--prompt", "-p", help="Optional prompt to guide generation"),
+    max_new_tokens: int = typer.Option(512, "--max-new-tokens", help="Maximum new tokens to generate"),
+    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output file path to save results"),
 ) -> None:
     """Run inference on images using SmolVLM2."""
     try:
         model = SmolVLM2()
-        result = model.generate(
-            images=[str(img) for img in image],
-            prompt=prompt,
-            max_new_tokens=max_new_tokens
-        )
+        result = model.generate(images=[str(img) for img in image], prompt=prompt, max_new_tokens=max_new_tokens)
 
         if output:
             import json
+
             with open(output, "w") as f:
                 json.dump(result, f, indent=2)
             typer.echo(f"Results saved to {output}")
@@ -96,27 +79,17 @@ def predict(
         typer.echo(f"Error during prediction: {e!s}", err=True)
         raise typer.Exit(code=1)
 
+
 @smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model")
 def train(
-    dataset: Path = typer.Option(
-        ..., "--dataset", "-d", help="Path to dataset directory or file"
-    ),
-    epochs: int = typer.Option(
-        10, "--epochs", "-e", help="Number of training epochs"
-    ),
-    batch_size: int = typer.Option(
-        4, "--batch-size", "-b", help="Training batch size"
-    ),
+    dataset: Path = typer.Option(..., "--dataset", "-d", help="Path to dataset directory or file"),
+    epochs: int = typer.Option(10, "--epochs", "-e", help="Number of training epochs"),
+    batch_size: int = typer.Option(4, "--batch-size", "-b", help="Training batch size"),
     optimization_strategy: str = typer.Option(
-        "qlora", "--optimization-strategy", "-o",
-        help="Optimization strategy (qlora, lora, freeze_vision)"
-    ),
-    metrics: list[str] = typer.Option(
-        ["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"
-    ),
-    output_dir: Optional[Path] = typer.Option(
-        None, "--output-dir", help="Directory to save trained model"
+        "qlora", "--optimization-strategy", "-o", help="Optimization strategy (qlora, lora, freeze_vision)"
     ),
+    metrics: list[str] = typer.Option(["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"),
+    output_dir: Optional[Path] = typer.Option(None, "--output-dir", help="Directory to save trained model"),
 ) -> None:
     """Fine-tune the SmolVLM2 model on a dataset."""
     try:
@@ -124,6 +97,7 @@ def train(
 
         if output_dir is None:
             import tempfile
+
             output_dir = Path(tempfile.mkdtemp())
             typer.echo(f"No output directory specified, using temporary directory: {output_dir}")
 
@@ -134,7 +108,7 @@ def train(
             "batch_size": batch_size,
             "optimization_strategy": optimization_strategy,
             "metrics": metrics,
-            "output_dir": str(output_dir)
+            "output_dir": str(output_dir),
         }
 
         # Import the train function here to avoid circular imports
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 9ed2c631..47654d34 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -11,7 +11,7 @@ def __init__(
         self,
         model_name: str = "smol-ai/smolvlm2-500m",
         device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs
+        **kwargs,
     ):
         """Initialize inference interface."""
         self.model = AutoModelForVision2Seq.from_pretrained(model_name)
@@ -19,11 +19,7 @@ def __init__(
         self.device = device
 
     def generate(
-        self,
-        images: Union[str, list[str]],
-        prompt: Optional[str] = None,
-        max_new_tokens: int = 512,
-        **kwargs
+        self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs
     ) -> dict:
         """
         Generate text from images.
@@ -38,27 +34,21 @@ def generate(
             Dictionary containing generated text and other outputs
         """
         # Process inputs
-        inputs = self.processor(
-            images=images,
-            text=prompt if prompt else "",
-            return_tensors="pt"
-        )
+        inputs = self.processor(images=images, text=prompt if prompt else "", return_tensors="pt")
 
         # Generate
         outputs = self.model.generate(
             input_ids=inputs["input_ids"].to(self.device),
             pixel_values=inputs["pixel_values"].to(self.device),
             max_new_tokens=max_new_tokens,
-            **kwargs
+            **kwargs,
         )
 
         # Decode outputs
         generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)
 
-        return {
-            "generated_text": generated_text,
-            "model_outputs": outputs
-        }
+        return {"generated_text": generated_text, "model_outputs": outputs}
+
 
 def predict_with_inputs(
     model: AutoModelForVision2Seq,
@@ -67,7 +57,7 @@ def predict_with_inputs(
     pixel_values: torch.Tensor,
     device: Union[str, torch.device],
     max_new_tokens: int = 512,
-    **kwargs
+    **kwargs,
 ) -> list[str]:
     """
     Generate text predictions using the model.
@@ -90,10 +80,11 @@ def predict_with_inputs(
             input_ids=input_ids.to(device),
             pixel_values=pixel_values.to(device),
             max_new_tokens=max_new_tokens,
-            **kwargs
+            **kwargs,
         )
     return processor.batch_decode(outputs, skip_special_tokens=True)
 
+
 def predict_with_images(
     model: AutoModelForVision2Seq,
     processor: AutoProcessor,
@@ -101,7 +92,7 @@ def predict_with_images(
     prompt: Optional[str] = None,
     device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
     max_new_tokens: int = 512,
-    **kwargs
+    **kwargs,
 ) -> list[str]:
     """
     Generate text predictions from images.
@@ -121,11 +112,7 @@ def predict_with_images(
     if isinstance(images, str):
         images = [images]
 
-    inputs = processor(
-        images=images,
-        text=prompt if prompt else "",
-        return_tensors="pt"
-    )
+    inputs = processor(images=images, text=prompt if prompt else "", return_tensors="pt")
 
     return predict_with_inputs(
         model=model,
@@ -134,5 +121,5 @@ def predict_with_images(
         pixel_values=inputs["pixel_values"],
         device=device,
         max_new_tokens=max_new_tokens,
-        **kwargs
+        **kwargs,
     )
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 4f9bc8b9..a99af163 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -10,10 +10,7 @@ class SmolVLM2Dataset(Dataset):
     """Dataset for SmolVLM2 model."""
 
     def __init__(
-        self,
-        image_paths: list[str],
-        texts: Optional[list[str]] = None,
-        processor: Optional[AutoProcessor] = None
+        self, image_paths: list[str], texts: Optional[list[str]] = None, processor: Optional[AutoProcessor] = None
     ):
         """
         Initialize dataset.
@@ -40,16 +37,10 @@ def __getitem__(self, idx: int) -> dict:
             text = ""
 
         if self.processor is not None:
-            return self.processor(
-                images=image,
-                text=text,
-                return_tensors="pt"
-            )
+            return self.processor(images=image, text=text, return_tensors="pt")
         else:
-            return {
-                "image": image,
-                "text": text
-            }
+            return {"image": image, "text": text}
+
 
 def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
@@ -67,8 +58,9 @@ def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, tor
 
     return input_ids, pixel_values, labels
 
+
 def evaluation_collate_fn(
-    batch: list[dict]
+    batch: list[dict],
 ) -> tuple[torch.Tensor, torch.Tensor, list[Image.Image], list[str], list[str]]:
     """
     Collate function for evaluation data.
@@ -87,12 +79,9 @@ def evaluation_collate_fn(
 
     return input_ids, pixel_values, images, prompts, targets
 
+
 def create_dataloader(
-    dataset: Dataset,
-    batch_size: int = 8,
-    num_workers: int = 4,
-    shuffle: bool = True,
-    collate_fn = None
+    dataset: Dataset, batch_size: int = 8, num_workers: int = 4, shuffle: bool = True, collate_fn=None
 ) -> DataLoader:
     """
     Create a DataLoader for the dataset.
@@ -106,10 +95,4 @@ def create_dataloader(
     Returns:
         DataLoader instance
     """
-    return DataLoader(
-        dataset,
-        batch_size=batch_size,
-        num_workers=num_workers,
-        shuffle=shuffle,
-        collate_fn=collate_fn
-    )
+    return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn)
diff --git a/pyproject.toml b/pyproject.toml
index f94e8c04..82f15aeb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -172,7 +172,7 @@ fixable = [
     "T",
     "W",
     "ANN",
-    "ARG", 
+    "ARG",
     "BLE",
     "COM",
     "DJ",
@@ -184,7 +184,7 @@ fixable = [
     "ICN",
     "INP",
     "ISC",
-    "NPY", 
+    "NPY",
     "PD",
     "PGH",
     "PIE",
@@ -196,7 +196,7 @@ fixable = [
     "RSE",
     "RUF",
     "SIM",
-    "SLF", 
+    "SLF",
     "TCH",
     "TID",
     "TRY",

From 6c35bef457f0806926a92c7f1402e6ecd4abd037 Mon Sep 17 00:00:00 2001
From: AshAnand34 <aashishanand2019@gmail.com>
Date: Sat, 10 May 2025 15:08:42 -0700
Subject: [PATCH 05/92] fixing errors in smolvlm2 interpretation

---
 maestro/trainer/models/smolvlm2/core.py      | 31 +++++++++++++-------
 maestro/trainer/models/smolvlm2/detection.py | 13 +++++---
 maestro/trainer/models/smolvlm2/inference.py | 29 ++++++++++++++++++
 3 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 475d60c7..4ffcc243 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -152,16 +152,27 @@ def train(config: dict) -> dict:
 
     else:
         raise ValueError(f"Unsupported optimization strategy: {strategy}")
-    processor = AutoProcessor.from_pretrained(model_name)
-
-    # Load datasets
+    processor = AutoProcessor.from_pretrained(model_name)    # Load datasets
+    
+    # Create processor wrapper to preprocess data before collating
+    def process_batch(batch):
+        processed_batch = []
+        for item in batch:
+            processed_item = processor(
+                images=item.get("image"),
+                text=item.get("text", ""),
+                return_tensors="pt"
+            )
+            processed_batch.append(processed_item)
+        return processed_batch
+        
     train_loader, valid_loader, test_loader = create_data_loaders(
         dataset_location=dataset_location,
         train_batch_size=config.get("batch_size", 4),
-        train_collect_fn=partial(train_collate_fn, processor=processor),
+        train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)),
         train_num_workers=config.get("num_workers", 0),
         test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
-        test_collect_fn=partial(evaluation_collate_fn, processor=processor),
+        test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)),
         test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),    )
 
     # Set up training arguments
@@ -183,15 +194,13 @@ def train(config: dict) -> dict:
         evaluation_strategy="epoch",
         load_best_model_at_end=True,
         remove_unused_columns=False
-    )
-
-    # Set up trainer
+    )    # Set up trainer
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=train_loader.dataset,
-        eval_dataset=valid_loader.dataset,
-        data_collator=lambda batch: train_collate_fn(batch, processor)
+        train_dataset=train_loader.dataset if train_loader is not None else None,
+        eval_dataset=valid_loader.dataset if valid_loader is not None else None,
+        tokenizer=processor
     )
 
     # Train model
diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py
index a52fcfdf..333297a9 100644
--- a/maestro/trainer/models/smolvlm2/detection.py
+++ b/maestro/trainer/models/smolvlm2/detection.py
@@ -99,13 +99,18 @@ def format_prompt_for_detection(
         prompt: Base prompt
         xyxy: Optional bounding boxes
         class_id: Optional class IDs
-        classes: Optional class names
-        resolution_wh: Optional image resolution
-
+        classes: Optional class names        resolution_wh: Optional image resolution
+        
     Returns:
         Formatted prompt string
     """
     if all(x is not None for x in [xyxy, class_id, classes, resolution_wh]):
-        detection_text = detections_to_text_formatter(xyxy, class_id, classes, resolution_wh)
+        # Type-cast to the expected types before passing to formatter
+        detection_text = detections_to_text_formatter(
+            xyxy, 
+            class_id if class_id is not None else [],
+            classes if classes is not None else [],
+            resolution_wh if resolution_wh is not None else (0, 0)
+        )
         return f"{prompt} {detection_text}"
     return prompt
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 9ed2c631..540d6b2a 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -17,6 +17,35 @@ def __init__(
         self.model = AutoModelForVision2Seq.from_pretrained(model_name)
         self.processor = AutoProcessor.from_pretrained(model_name)
         self.device = device
+        self.model_name = model_name
+
+    def get_model_info(self) -> dict:
+        """
+        Get information about the loaded model.
+
+        Returns:
+            Dictionary containing model information
+        """
+        # Extract model size from model name (e.g., smolvlm2-500m -> 500M)
+        size_info = "unknown"
+        if "-" in self.model_name:
+            parts = self.model_name.split("-")
+            if len(parts) > 1 and parts[-1].endswith("m"):
+                size_info = parts[-1].upper()
+
+        # Get total parameters
+        total_params = sum(p.numel() for p in self.model.parameters())
+        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
+
+        return {
+            "model_name": self.model_name,
+            "model_size": size_info,
+            "device": self.device,
+            "total_parameters": f"{total_params:,}",
+            "trainable_parameters": f"{trainable_params:,}",
+            "architecture": "Vision-Language Model (VLM)",
+            "framework": "PyTorch/Transformers"
+        }
 
     def generate(
         self,

From 8fe68f2495f0d82025b6b5db34bd06a64d608184 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 10 May 2025 22:10:53 +0000
Subject: [PATCH 06/92] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?=
 =?UTF-8?q?=20format=20pre-commit=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 maestro/trainer/models/smolvlm2/core.py      | 12 ++++--------
 maestro/trainer/models/smolvlm2/detection.py |  6 +++---
 maestro/trainer/models/smolvlm2/inference.py |  2 +-
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 2a8c392d..7bb32488 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -126,20 +126,16 @@ def train(config: dict) -> dict:
 
     else:
         raise ValueError(f"Unsupported optimization strategy: {strategy}")
-    processor = AutoProcessor.from_pretrained(model_name)    # Load datasets
-    
+    processor = AutoProcessor.from_pretrained(model_name)  # Load datasets
+
     # Create processor wrapper to preprocess data before collating
     def process_batch(batch):
         processed_batch = []
         for item in batch:
-            processed_item = processor(
-                images=item.get("image"),
-                text=item.get("text", ""),
-                return_tensors="pt"
-            )
+            processed_item = processor(images=item.get("image"), text=item.get("text", ""), return_tensors="pt")
             processed_batch.append(processed_item)
         return processed_batch
-        
+
     train_loader, valid_loader, test_loader = create_data_loaders(
         dataset_location=dataset_location,
         train_batch_size=config.get("batch_size", 4),
diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py
index 4642fd96..c03126a5 100644
--- a/maestro/trainer/models/smolvlm2/detection.py
+++ b/maestro/trainer/models/smolvlm2/detection.py
@@ -97,17 +97,17 @@ def format_prompt_for_detection(
         xyxy: Optional bounding boxes
         class_id: Optional class IDs
         classes: Optional class names        resolution_wh: Optional image resolution
-        
+
     Returns:
         Formatted prompt string
     """
     if all(x is not None for x in [xyxy, class_id, classes, resolution_wh]):
         # Type-cast to the expected types before passing to formatter
         detection_text = detections_to_text_formatter(
-            xyxy, 
+            xyxy,
             class_id if class_id is not None else [],
             classes if classes is not None else [],
-            resolution_wh if resolution_wh is not None else (0, 0)
+            resolution_wh if resolution_wh is not None else (0, 0),
         )
         return f"{prompt} {detection_text}"
     return prompt
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index c302640b..31834b13 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -44,7 +44,7 @@ def get_model_info(self) -> dict:
             "total_parameters": f"{total_params:,}",
             "trainable_parameters": f"{trainable_params:,}",
             "architecture": "Vision-Language Model (VLM)",
-            "framework": "PyTorch/Transformers"
+            "framework": "PyTorch/Transformers",
         }
 
     def generate(

From 4ff3d638fa71ebc3e9a1cb92b4fa554470b10e70 Mon Sep 17 00:00:00 2001
From: AshAnand34 <aashishanand2019@gmail.com>
Date: Sat, 10 May 2025 15:36:37 -0700
Subject: [PATCH 07/92] Fixing more errors with core.py

---
 maestro/trainer/models/smolvlm2/core.py | 28 +++++++++++++++++--------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 7bb32488..31ab6ca0 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -123,11 +123,12 @@ def train(config: dict) -> dict:
         # Freeze vision encoder parameters
         for param in model.vision_model.parameters():
             param.requires_grad = False
-
     else:
         raise ValueError(f"Unsupported optimization strategy: {strategy}")
-    processor = AutoProcessor.from_pretrained(model_name)  # Load datasets
-
+    
+    # Load processor and datasets
+    processor = AutoProcessor.from_pretrained(model_name)
+    
     # Create processor wrapper to preprocess data before collating
     def process_batch(batch):
         processed_batch = []
@@ -142,7 +143,7 @@ def process_batch(batch):
         train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)),
         train_num_workers=config.get("num_workers", 0),
         test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
-        test_collect_fn=partial(evaluation_collate_fn, processor=processor),
+        test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)),
         test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),
     )
 
@@ -166,14 +167,23 @@ def process_batch(batch):
         load_best_model_at_end=True,
         remove_unused_columns=False,
     )
-
-    # Set up trainer
+    
+    # Safely handle potential None loaders by directly checking train_loader/valid_loader before accessing dataset attribute
+    train_dataset = None
+    if train_loader is not None:
+        train_dataset = train_loader.dataset
+    
+    eval_dataset = None
+    if valid_loader is not None:
+        eval_dataset = valid_loader.dataset
+    
+    # Create data_collator that matches the train_collate_fn signature (doesn't pass processor)
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=train_loader.dataset,
-        eval_dataset=valid_loader.dataset,
-        data_collator=lambda batch: train_collate_fn(batch, processor),
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        data_collator=lambda batch: train_collate_fn(process_batch(batch)),
     )
 
     # Train model

From 0e1804eae7f197a42442b957dd52f334aef1cad5 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 10 May 2025 22:37:01 +0000
Subject: [PATCH 08/92] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto?=
 =?UTF-8?q?=20format=20pre-commit=20hooks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 maestro/trainer/models/smolvlm2/core.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 31ab6ca0..14b38bd4 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -60,7 +60,6 @@ def train(config: dict) -> dict:
     Returns:
         Dictionary containing training results and metrics
     """
-    from functools import partial
 
     from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
     from transformers import BitsAndBytesConfig, TrainingArguments
@@ -125,10 +124,10 @@ def train(config: dict) -> dict:
             param.requires_grad = False
     else:
         raise ValueError(f"Unsupported optimization strategy: {strategy}")
-    
+
     # Load processor and datasets
     processor = AutoProcessor.from_pretrained(model_name)
-    
+
     # Create processor wrapper to preprocess data before collating
     def process_batch(batch):
         processed_batch = []
@@ -167,16 +166,16 @@ def process_batch(batch):
         load_best_model_at_end=True,
         remove_unused_columns=False,
     )
-    
+
     # Safely handle potential None loaders by directly checking train_loader/valid_loader before accessing dataset attribute
     train_dataset = None
     if train_loader is not None:
         train_dataset = train_loader.dataset
-    
+
     eval_dataset = None
     if valid_loader is not None:
         eval_dataset = valid_loader.dataset
-    
+
     # Create data_collator that matches the train_collate_fn signature (doesn't pass processor)
     trainer = Trainer(
         model=model,

From 3ea55447ce1d00fedc167c6cca0674b14304d5f3 Mon Sep 17 00:00:00 2001
From: AshAnand34 <aashishanand2019@gmail.com>
Date: Sat, 10 May 2025 15:38:51 -0700
Subject: [PATCH 09/92] Fixed Ruff error with too long line

---
 maestro/trainer/models/smolvlm2/core.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 14b38bd4..e300982f 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -167,7 +167,8 @@ def process_batch(batch):
         remove_unused_columns=False,
     )
 
-    # Safely handle potential None loaders by directly checking train_loader/valid_loader before accessing dataset attribute
+    # Safely handle potential None loaders by directly checking
+    # train_loader/valid_loader before accessing dataset attribute
     train_dataset = None
     if train_loader is not None:
         train_dataset = train_loader.dataset

From 4039ff489a526cb45803d35e531913444051f3b1 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 10:13:07 -0300
Subject: [PATCH 10/92] first attempt of smolvlm

---
 .../trainer/models/smolvlm2/checkpoints.py    | 153 ++++-
 maestro/trainer/models/smolvlm2/core.py       | 595 +++++++++++++-----
 maestro/trainer/models/smolvlm2/entrypoint.py | 350 +++++++----
 maestro/trainer/models/smolvlm2/inference.py  | 169 ++---
 maestro/trainer/models/smolvlm2/loaders.py    | 203 +++---
 5 files changed, 992 insertions(+), 478 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index 87d1aea8..f90f887d 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -1,8 +1,26 @@
 import os
 from typing import Optional
+from enum import Enum
 
 import torch
 from transformers import AutoModelForVision2Seq, AutoProcessor
+from maestro.trainer.common.utils.device import parse_device_spec
+from maestro.trainer.logger import get_maestro_logger
+from peft import LoraConfig, get_peft_model
+from transformers import BitsAndBytesConfig
+
+DEFAULT_SMOLVLM2_MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"#"smol-ai/smolvlm2-500m"
+DEFAULT_SMOLVLM2_MODEL_REVISION = "refs/heads/main"
+DEFAULT_SMOLVLM2_PEFT_PARAMS = {
+    "r": 8,
+    "lora_alpha": 16,
+    "lora_dropout": 0.05,
+    "bias": "none",
+    "target_modules": ["q_proj", "o_proj", "k_proj", "v_proj", "linear", "Conv2d", "lm_head", "fc2"],
+    "task_type": "CAUSAL_LM",
+}
+
+logger = get_maestro_logger()
 
 
 def save_checkpoint(
@@ -29,27 +47,134 @@ def save_checkpoint(
     if metadata is not None:
         torch.save(metadata, os.path.join(path, "metadata.pt"))
 
+def save_model(
+    target_dir: str,
+    processor: AutoProcessor,
+    model: AutoModelForVision2Seq,
+) -> None:
+    """
+    Save a PaliGemma 2 model and its processor to disk.
 
-def load_checkpoint(path: str, device: str = "cuda" if torch.cuda.is_available() else "cpu") -> dict:
+    Args:
+        target_dir: Directory path where the model and processor will be saved.
+            Will be created if it doesn't exist.
+        processor: The PaliGemma 2 processor to save.
+        model: The PaliGemma 2model to save.
     """
-    Load model checkpoint.
+    os.makedirs(target_dir, exist_ok=True)
+    processor.save_pretrained(target_dir)
+    model.save_pretrained(target_dir)
+
+# def load_checkpoint(path: str, device: str = "cuda" if torch.cuda.is_available() else "cpu") -> dict:
+#     """
+#     Load model checkpoint.
+
+#     Args:
+#         path: Path to checkpoint
+#         device: Device to load model on
+
+#     Returns:
+#         Dictionary containing model, processor, and metadata
+#     """
+#     # Load model
+#     model = AutoModelForVision2Seq.from_pretrained(path)
+#     model.to(device)
+
+#     # Load processor
+#     processor = AutoProcessor.from_pretrained(path)
+
+#     # Load metadata if exists
+#     metadata_path = os.path.join(path, "metadata.pt")
+#     metadata = torch.load(metadata_path) if os.path.exists(metadata_path) else None
+
+#     return {"model": model, "processor": processor, "metadata": metadata}
+
+class OptimizationStrategy(Enum):
+    """Enumeration for optimization strategies."""
+
+    LORA = "lora"
+    QLORA = "qlora"
+    FREEZE = "freeze"
+    NONE = "none"
+def load_model(
+    model_id_or_path: str = DEFAULT_SMOLVLM2_MODEL_ID,
+    revision: str = DEFAULT_SMOLVLM2_MODEL_REVISION,
+    device: str | torch.device = "auto",
+    optimization_strategy: OptimizationStrategy = OptimizationStrategy.NONE,
+    peft_advanced_params: Optional[dict] = None,
+    cache_dir: Optional[str] = None,
+) -> tuple[AutoProcessor, AutoModelForVision2Seq]:
+    """Loads a PaliGemma 2 model and its associated processor.
 
     Args:
-        path: Path to checkpoint
-        device: Device to load model on
+        model_id_or_path (str): The identifier or path of the model to load.
+        revision (str): The specific model revision to use.
+        device (torch.device): The device to load the model onto.
+        optimization_strategy (OptimizationStrategy): The optimization strategy to apply to the model.
+        peft_advanced_params: custom lora configuration
+        cache_dir (Optional[str]): Directory to cache the downloaded model files.
 
     Returns:
-        Dictionary containing model, processor, and metadata
+        (PaliGemmaProcessor, PaliGemmaForConditionalGeneration):
+            A tuple containing the loaded processor and model.
+
+    Raises:
+        ValueError: If the model or processor cannot be loaded.
     """
-    # Load model
-    model = AutoModelForVision2Seq.from_pretrained(path)
-    model.to(device)
+    device = parse_device_spec(device)
+    #processor = PaliGemmaProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
+    processor = AutoProcessor.from_pretrained(model_id_or_path)
+
+    if optimization_strategy in {OptimizationStrategy.LORA, OptimizationStrategy.QLORA}:
+        default_params = DEFAULT_SMOLVLM2_PEFT_PARAMS
+        if peft_advanced_params is not None:
+            default_params.update(peft_advanced_params)
+            try:
+                lora_config = LoraConfig(**default_params)
+                logger.info("Successfully created LoraConfig")
+            except TypeError:
+                logger.exception("Invalid parameters for LoraConfig")
+                raise
+        
+        else:
+            logger.info("No LoRA parameters provided. Using default configuration.")
+            lora_config = LoraConfig(**default_params)
+        
+        bnb_config = (BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+        ) if optimization_strategy == OptimizationStrategy.QLORA
+            else None)
+        
+        model = AutoModelForVision2Seq.from_pretrained(
+            model_id_or_path,
+            revision=revision,
+            trust_remote_code=True,
+            quantization_config=bnb_config,
+            cache_dir=cache_dir,
+            torch_dtype=torch.bfloat16,
+        )
+        model = get_peft_model(model, lora_config)
+        model.print_trainable_parameters()
+    else:
+
+        model = AutoModelForVision2Seq.from_pretrained(
+            model_id_or_path,
+            revision=revision,
+            trust_remote_code=True,
+            cache_dir=cache_dir,).to(device)
+
+        if optimization_strategy == OptimizationStrategy.FREEZE:
+            # Freeze vision encoder parameters
+            for param in model.vision_model.parameters():
+                param.requires_grad = False
+
+            # for param in model.multi_modal_projector.parameters():
+            #     param.requires_grad = False
+
 
-    # Load processor
-    processor = AutoProcessor.from_pretrained(path)
 
-    # Load metadata if exists
-    metadata_path = os.path.join(path, "metadata.pt")
-    metadata = torch.load(metadata_path) if os.path.exists(metadata_path) else None
+    return processor, model
 
-    return {"model": model, "processor": processor, "metadata": metadata}
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index e300982f..d17366d9 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -4,198 +4,467 @@
 import torch
 from transformers import AutoModelForVision2Seq, AutoProcessor, Trainer
 
+import lightning
+import dacite
+from functools import partial
+
+
+import numpy as np
+import supervision as sv
+from maestro.trainer.common.callbacks import SaveCheckpoint
+from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
+from maestro.trainer.common.metrics import BaseMetric, MetricsTracker, parse_metrics, save_metric_plots
+from maestro.trainer.common.training import MaestroTrainer
+from maestro.trainer.common.utils.device import device_is_available, parse_device_spec
+from maestro.trainer.common.utils.path import create_new_run_directory
+from maestro.trainer.common.utils.seed import ensure_reproducibility
+from maestro.trainer.logger import get_maestro_logger
+from maestro.trainer.models.smolvlm2.checkpoints import (
+    DEFAULT_SMOLVLM2_MODEL_ID,
+    DEFAULT_SMOLVLM2_MODEL_REVISION,
+    OptimizationStrategy,
+    load_model,
+    save_model,
+)
+from maestro.trainer.models.smolvlm2.inference import predict_with_inputs
+from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn
+from typing import Literal, Optional
+from dataclasses import dataclass, field, replace
+from torch.utils.data import DataLoader
+from torch.optim import AdamW
+from maestro.trainer.common.metrics import (
+    BaseMetric,
+    MeanAveragePrecisionMetric,
+    MetricsTracker,
+    parse_metrics,
+    save_metric_plots,
+)
+from maestro.trainer.models.florence_2.detection import (
+    detections_to_prefix_formatter,
+    detections_to_suffix_formatter,
+    result_to_detections_formatter,
+)
+logger = get_maestro_logger()
+
+
+@dataclass()
+class SmolVLM2Configuration:
+    """
+    Configuration for training the SmolVLM2 model.
+
+    Attributes:
+        dataset (str):
+            Local path or Roboflow identifier. If not found locally, it will be resolved (and downloaded) automatically.
+        model_id (str):
+            Identifier for the PaliGemma2 model.
+        revision (str):
+            Model revision to use.
+        device (str | torch.device):
+            Device to run training on. Can be a ``torch.device`` or a string such as
+            "auto", "cpu", "cuda", or "mps". If "auto", the code will pick the best
+            available device.
+        optimization_strategy (Literal["lora", "qlora", "freeze", "none"]):
+            Strategy for optimizing the model parameters.
+        cache_dir (Optional[str]):
+            Directory to cache the model weights locally.
+        epochs (int):
+            Number of training epochs.
+        lr (float):
+            Learning rate for training.
+        batch_size (int):
+            Training batch size.
+        accumulate_grad_batches (int):
+            Number of batches to accumulate before performing a gradient update.
+        val_batch_size (Optional[int]):
+            Validation batch size. If None, defaults to the training batch size.
+        num_workers (int):
+            Number of workers for data loading.
+        val_num_workers (Optional[int]):
+            Number of workers for validation data loading. If None, defaults to num_workers.
+        output_dir (str):
+            Directory to store training outputs.
+        metrics (list[BaseMetric] | list[str]):
+            Metrics to track during training. Can be a list of metric objects or metric names.
+        max_new_tokens (int):
+            Maximum number of new tokens generated during inference.
+        random_seed (Optional[int]):
+            Random seed for ensuring reproducibility. If None, no seeding is applied.
+        peft_advanced_params (Optional[dict]):
+            Custom LoRA configuration . If None, default configuration is applied.
+    """
 
-class SmolVLM2Core:
-    """Core SmolVLM2 model implementation."""
+    dataset: str
+    model_id: str = DEFAULT_SMOLVLM2_MODEL_ID
+    revision: str = DEFAULT_SMOLVLM2_MODEL_REVISION
+    device: str | torch.device = "auto"
+    optimization_strategy: Literal["lora", "qlora", "freeze", "none"] = "lora"
+    cache_dir: Optional[str] = None
+    epochs: int = 10
+    lr: float = 2e-5
+    batch_size: int = 4
+    accumulate_grad_batches: int = 4
+    val_batch_size: Optional[int] = None
+    num_workers: int = 0
+    val_num_workers: Optional[int] = None
+    output_dir: str = "./training/smol_vlm_2"
+    metrics: list[BaseMetric] | list[str] = field(default_factory=list)
+    max_new_tokens: int = 512
+    random_seed: Optional[int] = None
+    peft_advanced_params: Optional[dict] = None
+
+    def __post_init__(self):
+        if self.val_batch_size is None:
+            self.val_batch_size = self.batch_size
+
+        if self.val_num_workers is None:
+            self.val_num_workers = self.num_workers
+
+        if isinstance(self.metrics, list) and all(isinstance(m, str) for m in self.metrics):
+            self.metrics = parse_metrics(self.metrics)
+
+        self.device = parse_device_spec(self.device)
+        if not device_is_available(self.device):
+            raise ValueError(f"Requested device '{self.device}' is not available.")
+
+
+class SmolVLM2Trainer(MaestroTrainer):
+    """
+    Trainer for fine-tuning the SmolVLM-2 model.
+
+    Attributes:
+        processor (AutoProcessor): Processor for model inputs.
+        model (AutoModelForCausalLM): The SmolVLM-2 model.
+        train_loader (DataLoader): DataLoader for training data.
+        valid_loader (DataLoader): DataLoader for validation data.
+        config (SmolVLM2Configuration): Configuration object with training parameters.
+    """
 
     def __init__(
         self,
-        model_name: str = "smol-ai/smolvlm2-500m",
-        device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs,
+        processor: AutoProcessor,
+        model: AutoModelForVision2Seq,
+        train_loader: DataLoader,
+        valid_loader: DataLoader,
+        config: SmolVLM2Configuration,
     ):
-        """
-        Initialize SmolVLM2 model.
+        super().__init__(processor, model, train_loader, valid_loader)
+        self.config = config
+
+        # TODO: Redesign metric tracking system
+        self.train_metrics_tracker = MetricsTracker.init(metrics=["loss"])
+        metrics = ["loss"]
+        for metric in config.metrics:
+            if isinstance(metric, BaseMetric):
+                metrics += metric.describe()
+        self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
+
+    def training_step(self, batch, batch_idx):
+        input_ids, pixel_values, labels = batch
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            labels=labels,
+        )
+        loss = outputs.loss
+        self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=self.config.batch_size)
+        self.train_metrics_tracker.register("loss", epoch=self.current_epoch, step=batch_idx, value=loss.item())
+        return loss
+
+    def validation_step(self, batch, batch_idx):
+        input_ids, pixel_values, images, prefixes, suffixes = batch
+        generated_suffixes = predict_with_inputs(
+            model=self.model,
+            processor=self.processor,
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            device=self.config.device,
+            max_new_tokens=self.config.max_new_tokens,
+        )
 
-        Args:
-            model_name: Name or path of the model to load
-            device: Device to run the model on
-            **kwargs: Additional arguments to pass to the model
-        """
-        self.model_name = model_name
-        self.device = device
+        if batch_idx == 0:
+            logger.info(f"sample valid prefix: {prefixes[0]}")
+            logger.info(f"sample valid suffix: {suffixes[0]}")
+            logger.info(f"sample generated suffix: {generated_suffixes[0]}")
+
+        for metric in self.config.metrics:
+            result = metric.compute(predictions=generated_suffixes, targets=suffixes)
+            for key, value in result.items():
+                self.valid_metrics_tracker.register(
+                    metric=key,
+                    epoch=self.current_epoch,
+                    step=batch_idx,
+                    value=value,
+                )
+                self.log(key, value, prog_bar=True, logger=True, batch_size=self.config.val_batch_size)
+
+    def configure_optimizers(self):
+        optimizer = AdamW(self.model.parameters(), lr=self.config.lr)
+        return optimizer
+
+    def on_fit_end(self) -> None:
+        save_metrics_path = os.path.join(self.config.output_dir, "metrics")
+        save_metric_plots(
+            training_tracker=self.train_metrics_tracker,
+            validation_tracker=self.valid_metrics_tracker,
+            output_dir=save_metrics_path,
+        )
 
-        self.processor = AutoProcessor.from_pretrained(model_name)
-        self.model = AutoModelForVision2Seq.from_pretrained(model_name)
-        self.model.to(device)
 
-    def process_inputs(self, images: Union[str, list[str]], prompt: Optional[str] = None) -> dict:
-        """Process input images and text."""
-        if isinstance(images, str):
-            images = [images]
 
-        return self.processor(images=images, text=prompt if prompt else "", return_tensors="pt").to(self.device)
 
-    def generate(self, inputs: dict, max_new_tokens: int = 512, **kwargs) -> torch.Tensor:
-        """Generate text from processed inputs."""
-        return self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs)
 
-    def decode_outputs(self, outputs: torch.Tensor, skip_special_tokens: bool = True) -> list[str]:
-        """Decode model outputs to text."""
-        return self.processor.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
 
+def train(config: SmolVLM2Configuration | dict) -> None:  # fine-tune SmolVLM2 from a configuration object or an equivalent dict
+    if isinstance(config, dict):
+        config = dacite.from_dict(data_class=SmolVLM2Configuration, data=config)  # build the dataclass from the plain dict
+    assert isinstance(config, SmolVLM2Configuration)  # ensure mypy understands it's not a dict
 
-def train(config: dict) -> dict:
-    """
-    Train SmolVLM2 model with provided configuration.
-
-    Args:
-        config: Dictionary containing training configuration
-            - dataset: Path to dataset directory or file
-            - epochs: Number of training epochs
-            - batch_size: Training batch size
-            - optimization_strategy: Strategy for optimization (qlora, lora, freeze_vision)
-            - metrics: List of metrics to evaluate during training
-            - output_dir: Directory to save trained model
-    Returns:
-        Dictionary containing training results and metrics
-    """
+    ensure_reproducibility(seed=config.random_seed, avoid_non_deterministic_algorithms=False)
+    run_dir = create_new_run_directory(base_output_dir=config.output_dir)
+    config = replace(config, output_dir=run_dir)  # from here on config.output_dir is the new run directory
 
-    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-    from transformers import BitsAndBytesConfig, TrainingArguments
+    processor, model = load_model(
+        model_id_or_path=config.model_id,
+        revision=config.revision,
+        device=config.device,
+        optimization_strategy=OptimizationStrategy(config.optimization_strategy),
+        peft_advanced_params=config.peft_advanced_params,
+        cache_dir=config.cache_dir,
+    )
+    dataset_location = resolve_dataset_path(config.dataset)
+    if dataset_location is None:
+        return  # no dataset location was resolved: exit without training (model and run dir already created above)
+    train_loader, valid_loader, test_loader = create_data_loaders(  # test_loader is not used below
+        dataset_location=dataset_location,
+        train_batch_size=config.batch_size,
+        train_collect_fn=partial(train_collate_fn, processor=processor, max_length=config.max_new_tokens),
+        train_num_workers=config.num_workers,
+        test_batch_size=config.val_batch_size,
+        test_collect_fn=partial(evaluation_collate_fn, processor=processor),
+        test_num_workers=config.val_num_workers,
+    )
 
-    from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
-    from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn
+    _, train_entry = train_loader.dataset[0]  # first training item, used only for the two log lines below
+    logger.info(f"sample train prefix: {train_entry['prefix']}")
+    logger.info(f"sample train suffix: {train_entry['suffix']}")
 
-    # Load dataset
-    dataset_path = config["dataset"]
-    dataset_location = resolve_dataset_path(dataset_path)
-    if dataset_location is None:
-        return {"error": "Dataset not found"}
-
-    # Create model with the specified optimization strategy
-    model_name = config.get("model_name", "smol-ai/smolvlm2-500m")
-    strategy = config.get("optimization_strategy", "qlora")
-
-    if strategy == "qlora":
-        # Configure QLoRA
-        bnb_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True,
-        )
+    pl_module = SmolVLM2Trainer(
+        processor=processor, model=model, train_loader=train_loader, valid_loader=valid_loader, config=config
+    )
+    save_checkpoints_path = os.path.join(config.output_dir, "checkpoints")  # "checkpoints" folder inside the run directory
+    save_checkpoint_callback = SaveCheckpoint(result_path=save_checkpoints_path, save_model_callback=save_model)
+    trainer = lightning.Trainer(
+        max_epochs=config.epochs,
+        accumulate_grad_batches=config.accumulate_grad_batches,
+        check_val_every_n_epoch=1,  # validate after every epoch
+        limit_val_batches=1,  # an int limit: each validation pass uses one batch
+        log_every_n_steps=10,
+        callbacks=[save_checkpoint_callback],
+    )
+    trainer.fit(pl_module)
 
-        model = AutoModelForVision2Seq.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
-        model = prepare_model_for_kbit_training(model)
 
-        lora_config = LoraConfig(
-            r=16,
-            lora_alpha=32,
-            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-            lora_dropout=0.05,
-            bias="none",
-            task_type="CAUSAL_LM",
-        )
 
-        model = get_peft_model(model, lora_config)
 
-    elif strategy == "lora":
-        # Configure LoRA without quantization
-        model = AutoModelForVision2Seq.from_pretrained(model_name)
 
-        lora_config = LoraConfig(
-            r=16,
-            lora_alpha=32,
-            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-            lora_dropout=0.05,
-            bias="none",
-            task_type="CAUSAL_LM",
-        )
 
-        model = get_peft_model(model, lora_config)
 
-    elif strategy == "freeze_vision":
-        # Freeze vision encoder, train only language model part
-        model = AutoModelForVision2Seq.from_pretrained(model_name)
 
-        # Freeze vision encoder parameters
-        for param in model.vision_model.parameters():
-            param.requires_grad = False
-    else:
-        raise ValueError(f"Unsupported optimization strategy: {strategy}")
 
-    # Load processor and datasets
-    processor = AutoProcessor.from_pretrained(model_name)
 
-    # Create processor wrapper to preprocess data before collating
-    def process_batch(batch):
-        processed_batch = []
-        for item in batch:
-            processed_item = processor(images=item.get("image"), text=item.get("text", ""), return_tensors="pt")
-            processed_batch.append(processed_item)
-        return processed_batch
 
-    train_loader, valid_loader, test_loader = create_data_loaders(
-        dataset_location=dataset_location,
-        train_batch_size=config.get("batch_size", 4),
-        train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)),
-        train_num_workers=config.get("num_workers", 0),
-        test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
-        test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)),
-        test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),
-    )
 
-    # Set up training arguments
-    output_dir = config.get("output_dir", "./smolvlm2-finetuned")
-    os.makedirs(output_dir, exist_ok=True)
-
-    training_args = TrainingArguments(
-        output_dir=output_dir,
-        num_train_epochs=config.get("epochs", 10),
-        per_device_train_batch_size=config.get("batch_size", 4),
-        per_device_eval_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
-        gradient_accumulation_steps=4,
-        learning_rate=2e-5,
-        weight_decay=0.01,
-        warmup_steps=100,
-        save_strategy="epoch",
-        save_total_limit=2,
-        logging_steps=10,
-        evaluation_strategy="epoch",
-        load_best_model_at_end=True,
-        remove_unused_columns=False,
-    )
+# class SmolVLM2Core:
+#     """Core SmolVLM2 model implementation."""
 
-    # Safely handle potential None loaders by directly checking
-    # train_loader/valid_loader before accessing dataset attribute
-    train_dataset = None
-    if train_loader is not None:
-        train_dataset = train_loader.dataset
-
-    eval_dataset = None
-    if valid_loader is not None:
-        eval_dataset = valid_loader.dataset
-
-    # Create data_collator that matches the train_collate_fn signature (doesn't pass processor)
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        data_collator=lambda batch: train_collate_fn(process_batch(batch)),
-    )
+#     def __init__(
+#         self,
+#         model_name: str = "smol-ai/smolvlm2-500m",
+#         device: str = "cuda" if torch.cuda.is_available() else "cpu",
+#         **kwargs,
+#     ):
+#         """
+#         Initialize SmolVLM2 model.
+
+#         Args:
+#             model_name: Name or path of the model to load
+#             device: Device to run the model on
+#             **kwargs: Additional arguments to pass to the model
+#         """
+#         self.model_name = model_name
+#         self.device = device
+
+#         self.processor = AutoProcessor.from_pretrained(model_name)
+#         self.model = AutoModelForVision2Seq.from_pretrained(model_name)
+#         self.model.to(device)
+
+#     def process_inputs(self, images: Union[str, list[str]], prompt: Optional[str] = None) -> dict:
+#         """Process input images and text."""
+#         if isinstance(images, str):
+#             images = [images]
+
+#         return self.processor(images=images, text=prompt if prompt else "", return_tensors="pt").to(self.device)
+
+#     def generate(self, inputs: dict, max_new_tokens: int = 512, **kwargs) -> torch.Tensor:
+#         """Generate text from processed inputs."""
+#         return self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs)
+
+#     def decode_outputs(self, outputs: torch.Tensor, skip_special_tokens: bool = True) -> list[str]:
+#         """Decode model outputs to text."""
+#         return self.processor.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
 
-    # Train model
-    trainer.train()
 
-    # Save model and processor
-    model.save_pretrained(output_dir)
-    processor.save_pretrained(output_dir)
+# def train(config: dict) -> dict:
+#     """
+#     Train SmolVLM2 model with provided configuration.
 
-    # Return results
-    return {
-        "model_path": output_dir,
-        "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"},
-        "status": "Training completed",
-    }
+#     Args:
+#         config: Dictionary containing training configuration
+#             - dataset: Path to dataset directory or file
+#             - epochs: Number of training epochs
+#             - batch_size: Training batch size
+#             - optimization_strategy: Strategy for optimization (qlora, lora, freeze_vision)
+#             - metrics: List of metrics to evaluate during training
+#             - output_dir: Directory to save trained model
+#     Returns:
+#         Dictionary containing training results and metrics
+#     """
+
+#     from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+#     from transformers import BitsAndBytesConfig, TrainingArguments
+
+#     from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
+#     from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn
+
+#     # Load dataset
+#     dataset_path = config["dataset"]
+#     dataset_location = resolve_dataset_path(dataset_path)
+#     if dataset_location is None:
+#         return {"error": "Dataset not found"}
+
+#     # Create model with the specified optimization strategy
+#     model_name = config.get("model_name", "smol-ai/smolvlm2-500m")
+#     strategy = config.get("optimization_strategy", "qlora")
+
+#     if strategy == "qlora":
+#         # Configure QLoRA
+#         bnb_config = BitsAndBytesConfig(
+#             load_in_4bit=True,
+#             bnb_4bit_quant_type="nf4",
+#             bnb_4bit_compute_dtype=torch.float16,
+#             bnb_4bit_use_double_quant=True,
+#         )
+
+#         model = AutoModelForVision2Seq.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
+#         model = prepare_model_for_kbit_training(model)
+
+#         lora_config = LoraConfig(
+#             r=16,
+#             lora_alpha=32,
+#             target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+#             lora_dropout=0.05,
+#             bias="none",
+#             task_type="CAUSAL_LM",
+#         )
+
+#         model = get_peft_model(model, lora_config)
+
+#     elif strategy == "lora":
+#         # Configure LoRA without quantization
+#         model = AutoModelForVision2Seq.from_pretrained(model_name)
+
+#         lora_config = LoraConfig(
+#             r=16,
+#             lora_alpha=32,
+#             target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+#             lora_dropout=0.05,
+#             bias="none",
+#             task_type="CAUSAL_LM",
+#         )
+
+#         model = get_peft_model(model, lora_config)
+
+#     elif strategy == "freeze_vision":
+#         # Freeze vision encoder, train only language model part
+#         model = AutoModelForVision2Seq.from_pretrained(model_name)
+
+#         # Freeze vision encoder parameters
+#         for param in model.vision_model.parameters():
+#             param.requires_grad = False
+#     else:
+#         raise ValueError(f"Unsupported optimization strategy: {strategy}")
+
+#     # Load processor and datasets
+#     processor = AutoProcessor.from_pretrained(model_name)
+
+#     # Create processor wrapper to preprocess data before collating
+#     def process_batch(batch):
+#         processed_batch = []
+#         for item in batch:
+#             processed_item = processor(images=item.get("image"), text=item.get("text", ""), return_tensors="pt")
+#             processed_batch.append(processed_item)
+#         return processed_batch
+
+#     train_loader, valid_loader, test_loader = create_data_loaders(
+#         dataset_location=dataset_location,
+#         train_batch_size=config.get("batch_size", 4),
+#         train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)),
+#         train_num_workers=config.get("num_workers", 0),
+#         test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
+#         test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)),
+#         test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),
+#     )
+
+#     # Set up training arguments
+#     output_dir = config.get("output_dir", "./smolvlm2-finetuned")
+#     os.makedirs(output_dir, exist_ok=True)
+
+#     training_args = TrainingArguments(
+#         output_dir=output_dir,
+#         num_train_epochs=config.get("epochs", 10),
+#         per_device_train_batch_size=config.get("batch_size", 4),
+#         per_device_eval_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
+#         gradient_accumulation_steps=4,
+#         learning_rate=2e-5,
+#         weight_decay=0.01,
+#         warmup_steps=100,
+#         save_strategy="epoch",
+#         save_total_limit=2,
+#         logging_steps=10,
+#         evaluation_strategy="epoch",
+#         load_best_model_at_end=True,
+#         remove_unused_columns=False,
+#     )
+
+#     # Safely handle potential None loaders by directly checking
+#     # train_loader/valid_loader before accessing dataset attribute
+#     train_dataset = None
+#     if train_loader is not None:
+#         train_dataset = train_loader.dataset
+
+#     eval_dataset = None
+#     if valid_loader is not None:
+#         eval_dataset = valid_loader.dataset
+
+#     # Create data_collator that matches the train_collate_fn signature (doesn't pass processor)
+#     trainer = Trainer(
+#         model=model,
+#         args=training_args,
+#         train_dataset=train_dataset,
+#         eval_dataset=eval_dataset,
+#         data_collator=lambda batch: train_collate_fn(process_batch(batch)),
+#     )
+
+#     # Train model
+#     trainer.train()
+
+#     # Save model and processor
+#     model.save_pretrained(output_dir)
+#     processor.save_pretrained(output_dir)
+
+#     # Return results
+#     return {
+#         "model_path": output_dir,
+#         "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"},
+#         "status": "Training completed",
+#     }
diff --git a/maestro/trainer/models/smolvlm2/entrypoint.py b/maestro/trainer/models/smolvlm2/entrypoint.py
index a18ff74e..d6c09dd7 100644
--- a/maestro/trainer/models/smolvlm2/entrypoint.py
+++ b/maestro/trainer/models/smolvlm2/entrypoint.py
@@ -1,124 +1,244 @@
-from pathlib import Path
-from typing import Optional, Union
+import dataclasses
+import json
+from typing import Annotated, Any, Optional
 
-import torch
+import rich
 import typer
 
-from .inference import SmolVLM2Inference
-
-smolvlm2_app = typer.Typer()
-
-
-class SmolVLM2:
-    """Main entrypoint for SmolVLM2 model."""
-
-    def __init__(
-        self,
-        model_name: str = "smol-ai/smolvlm2-500m",
-        device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs,
-    ):
-        """Initialize SmolVLM2 model."""
-        self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs)
-
-    def generate(
-        self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs
-    ) -> dict:
-        """
-        Generate text from images.
-
-        Args:
-            images: Path(s) to image(s)
-            prompt: Optional prompt to guide generation
-            max_new_tokens: Maximum number of tokens to generate
-            **kwargs: Additional generation parameters
-
-        Returns:
-            Dictionary containing generated text and other outputs
-        """
-        return self.inference.generate(images=images, prompt=prompt, max_new_tokens=max_new_tokens, **kwargs)
-
-
-@smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model")
-def info() -> None:
-    """Get information about the SmolVLM2 model."""
-    try:
-        model = SmolVLM2()
-        info = model.inference.get_model_info()
-        typer.echo(f"Model Name: {info['model_name']}")
-        typer.echo(f"Model Size: {info['model_size']}")
-        typer.echo(f"Device: {info['device']}")
-        typer.echo(f"Tokenizer: {info['tokenizer']}")
-    except Exception as e:
-        typer.echo(f"Error retrieving model info: {e!s}", err=True)
-        raise typer.Exit(code=1)
-
-
-@smolvlm2_app.command(name="predict", help="Run inference on one or more images")
-def predict(
-    image: list[Path] = typer.Option(..., "--image", "-i", help="Path to image(s) for prediction"),
-    prompt: Optional[str] = typer.Option(None, "--prompt", "-p", help="Optional prompt to guide generation"),
-    max_new_tokens: int = typer.Option(512, "--max-new-tokens", help="Maximum new tokens to generate"),
-    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output file path to save results"),
-) -> None:
-    """Run inference on images using SmolVLM2."""
-    try:
-        model = SmolVLM2()
-        result = model.generate(images=[str(img) for img in image], prompt=prompt, max_new_tokens=max_new_tokens)
-
-        if output:
-            import json
-
-            with open(output, "w") as f:
-                json.dump(result, f, indent=2)
-            typer.echo(f"Results saved to {output}")
-        else:
-            typer.echo(f"Generated text: {result['text']}")
+from maestro.trainer.logger import get_maestro_logger
+from maestro.trainer.models.smolvlm2.checkpoints import DEFAULT_SMOLVLM2_MODEL_ID, DEFAULT_SMOLVLM2_MODEL_REVISION
+from maestro.trainer.models.smolvlm2.core import SmolVLM2Configuration
+from maestro.trainer.models.smolvlm2.core import train as smolvlm2_train
 
-    except Exception as e:
-        typer.echo(f"Error during prediction: {e!s}", err=True)
-        raise typer.Exit(code=1)
+logger = get_maestro_logger()
+smolvlm2_app = typer.Typer(help="Fine-tune and evaluate SmolVLM2 model")
 
 
-@smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model")
+@smolvlm2_app.command(
+    help="Train SmolVLM2 model",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
 def train(
-    dataset: Path = typer.Option(..., "--dataset", "-d", help="Path to dataset directory or file"),
-    epochs: int = typer.Option(10, "--epochs", "-e", help="Number of training epochs"),
-    batch_size: int = typer.Option(4, "--batch-size", "-b", help="Training batch size"),
-    optimization_strategy: str = typer.Option(
-        "qlora", "--optimization-strategy", "-o", help="Optimization strategy (qlora, lora, freeze_vision)"
-    ),
-    metrics: list[str] = typer.Option(["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"),
-    output_dir: Optional[Path] = typer.Option(None, "--output-dir", help="Directory to save trained model"),
+    dataset: Annotated[
+        str,
+        typer.Option(
+            "--dataset",
+            help="Local path or Roboflow identifier. If not found locally, it will be resolved (and downloaded) "
+            "automatically",
+        ),
+    ],
+    model_id: Annotated[
+        str, typer.Option("--model_id", help="Identifier for the SmolVLM2 model")
+    ] = DEFAULT_SMOLVLM2_MODEL_ID,
+    revision: Annotated[
+        str, typer.Option("--revision", help="Model revision to use")
+    ] = DEFAULT_SMOLVLM2_MODEL_REVISION,
+    device: Annotated[str, typer.Option("--device", help="Device to use for training")] = "auto",
+    optimization_strategy: Annotated[
+        str, typer.Option("--optimization_strategy", help="Optimization strategy: lora, freeze, or none")
+    ] = "lora",
+    cache_dir: Annotated[
+        Optional[str], typer.Option("--cache_dir", help="Directory to cache the model weights locally")
+    ] = None,
+    epochs: Annotated[int, typer.Option("--epochs", help="Number of training epochs")] = 10,
+    lr: Annotated[float, typer.Option("--lr", help="Learning rate for training")] = 1e-5,
+    batch_size: Annotated[int, typer.Option("--batch_size", help="Training batch size")] = 4,
+    accumulate_grad_batches: Annotated[
+        int, typer.Option("--accumulate_grad_batches", help="Number of batches to accumulate for gradient updates")
+    ] = 8,
+    val_batch_size: Annotated[Optional[int], typer.Option("--val_batch_size", help="Validation batch size")] = None,
+    num_workers: Annotated[int, typer.Option("--num_workers", help="Number of workers for data loading")] = 0,
+    val_num_workers: Annotated[
+        Optional[int], typer.Option("--val_num_workers", help="Number of workers for validation data loading")
+    ] = None,
+    output_dir: Annotated[
+        str, typer.Option("--output_dir", help="Directory to store training outputs")
+    ] = "./training/smolvlm2",
+    metrics: Annotated[list[str], typer.Option("--metrics", help="List of metrics to track during training")] = [],
+    max_new_tokens: Annotated[
+        int,
+        typer.Option("--max_new_tokens", help="Maximum number of new tokens generated during inference"),
+    ] = 1024,
+    random_seed: Annotated[
+        Optional[int],
+        typer.Option("--random_seed", help="Random seed for ensuring reproducibility. If None, no seed is set"),
+    ] = None,
+    peft_advanced_params: Annotated[
+        Optional[str],
+        typer.Option("--peft_advanced_params", help="custom LoRA config. If None, default LoRA config is set"),
+    ] = None,
 ) -> None:
-    """Fine-tune the SmolVLM2 model on a dataset."""
-    try:
-        typer.echo("Starting SmolVLM2 fine-tuning...")
-
-        if output_dir is None:
-            import tempfile
-
-            output_dir = Path(tempfile.mkdtemp())
-            typer.echo(f"No output directory specified, using temporary directory: {output_dir}")
-
-        # Create configuration for training
-        config = {
-            "dataset": str(dataset),
-            "epochs": epochs,
-            "batch_size": batch_size,
-            "optimization_strategy": optimization_strategy,
-            "metrics": metrics,
-            "output_dir": str(output_dir),
-        }
-
-        # Import the train function here to avoid circular imports
-        from .core import train as train_model
-
-        results = train_model(config)
-
-        typer.echo(f"Training complete! Model saved to {output_dir}")
-        typer.echo(f"Final metrics: {results.get('metrics', {})}")
+    """Build a `SmolVLM2Configuration` from the CLI options and run training.
+
+    `peft_advanced_params` is an optional JSON object string; malformed JSON
+    or a non-object value is logged and re-raised.
+    """
+
+    def parse_lora_params(param_str: str) -> dict[str, Any]:
+        parsed_params = json.loads(param_str)
+        if not isinstance(parsed_params, dict):
+            raise TypeError("Parsed JSON is not a dictionary")
+        return parsed_params
+
+    # Must be bound even when the option is omitted; it is always read below.
+    peft_advanced_params_dict: Optional[dict[str, Any]] = None
+    if peft_advanced_params is not None:
+        try:
+            peft_advanced_params_dict = parse_lora_params(peft_advanced_params)
+            logger.info(f"Parsed LoRA parameters: {peft_advanced_params_dict}")
+        except json.JSONDecodeError:
+            logger.exception("Failed to parse JSON")
+            raise
+        except TypeError:
+            logger.exception("Invalid LoRA parameter format")
+            raise
+
+    config = SmolVLM2Configuration(
+        dataset=dataset,
+        model_id=model_id,
+        revision=revision,
+        device=device,
+        optimization_strategy=optimization_strategy,  # type: ignore
+        cache_dir=cache_dir,
+        epochs=epochs,
+        lr=lr,
+        batch_size=batch_size,
+        accumulate_grad_batches=accumulate_grad_batches,
+        val_batch_size=val_batch_size,
+        num_workers=num_workers,
+        val_num_workers=val_num_workers,
+        output_dir=output_dir,
+        metrics=list(metrics),  # copy: never hand out the shared default list
+        max_new_tokens=max_new_tokens,
+        random_seed=random_seed,
+        peft_advanced_params=peft_advanced_params_dict,
+    )
+    typer.echo(typer.style("Training configuration", fg=typer.colors.BRIGHT_GREEN, bold=True))
+    rich.print(dataclasses.asdict(config))
+    smolvlm2_train(config=config)
+
+
+# from pathlib import Path
+# from typing import Optional, Union
+
+# import torch
+# import typer
+
+# from .inference import SmolVLM2Inference
+
+# smolvlm2_app = typer.Typer()
+
+
+# class SmolVLM2:
+#     """Main entrypoint for SmolVLM2 model."""
+
+#     def __init__(
+#         self,
+#         model_name: str = "smol-ai/smolvlm2-500m",
+#         device: str = "cuda" if torch.cuda.is_available() else "cpu",
+#         **kwargs,
+#     ):
+#         """Initialize SmolVLM2 model."""
+#         self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs)
+
+#     def generate(
+#         self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs
+#     ) -> dict:
+#         """
+#         Generate text from images.
+
+#         Args:
+#             images: Path(s) to image(s)
+#             prompt: Optional prompt to guide generation
+#             max_new_tokens: Maximum number of tokens to generate
+#             **kwargs: Additional generation parameters
+
+#         Returns:
+#             Dictionary containing generated text and other outputs
+#         """
+#         return self.inference.generate(images=images, prompt=prompt, max_new_tokens=max_new_tokens, **kwargs)
+
+
+# @smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model")
+# def info() -> None:
+#     """Get information about the SmolVLM2 model."""
+#     try:
+#         model = SmolVLM2()
+#         info = model.inference.get_model_info()
+#         typer.echo(f"Model Name: {info['model_name']}")
+#         typer.echo(f"Model Size: {info['model_size']}")
+#         typer.echo(f"Device: {info['device']}")
+#         typer.echo(f"Tokenizer: {info['tokenizer']}")
+#     except Exception as e:
+#         typer.echo(f"Error retrieving model info: {e!s}", err=True)
+#         raise typer.Exit(code=1)
+
+
+# @smolvlm2_app.command(name="predict", help="Run inference on one or more images")
+# def predict(
+#     image: list[Path] = typer.Option(..., "--image", "-i", help="Path to image(s) for prediction"),
+#     prompt: Optional[str] = typer.Option(None, "--prompt", "-p", help="Optional prompt to guide generation"),
+#     max_new_tokens: int = typer.Option(512, "--max-new-tokens", help="Maximum new tokens to generate"),
+#     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output file path to save results"),
+# ) -> None:
+#     """Run inference on images using SmolVLM2."""
+#     try:
+#         model = SmolVLM2()
+#         result = model.generate(images=[str(img) for img in image], prompt=prompt, max_new_tokens=max_new_tokens)
+
+#         if output:
+#             import json
+
+#             with open(output, "w") as f:
+#                 json.dump(result, f, indent=2)
+#             typer.echo(f"Results saved to {output}")
+#         else:
+#             typer.echo(f"Generated text: {result['text']}")
+
+#     except Exception as e:
+#         typer.echo(f"Error during prediction: {e!s}", err=True)
+#         raise typer.Exit(code=1)
+
 
-    except Exception as e:
-        typer.echo(f"Error during training: {e!s}", err=True)
-        raise typer.Exit(code=1)
+# @smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model")
+# def train(
+#     dataset: Path = typer.Option(..., "--dataset", "-d", help="Path to dataset directory or file"),
+#     epochs: int = typer.Option(10, "--epochs", "-e", help="Number of training epochs"),
+#     batch_size: int = typer.Option(4, "--batch-size", "-b", help="Training batch size"),
+#     optimization_strategy: str = typer.Option(
+#         "qlora", "--optimization-strategy", "-o", help="Optimization strategy (qlora, lora, freeze_vision)"
+#     ),
+#     metrics: list[str] = typer.Option(["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"),
+#     output_dir: Optional[Path] = typer.Option(None, "--output-dir", help="Directory to save trained model"),
+# ) -> None:
+#     """Fine-tune the SmolVLM2 model on a dataset."""
+#     try:
+#         typer.echo("Starting SmolVLM2 fine-tuning...")
+
+#         if output_dir is None:
+#             import tempfile
+
+#             output_dir = Path(tempfile.mkdtemp())
+#             typer.echo(f"No output directory specified, using temporary directory: {output_dir}")
+
+#         # Create configuration for training
+#         config = {
+#             "dataset": str(dataset),
+#             "epochs": epochs,
+#             "batch_size": batch_size,
+#             "optimization_strategy": optimization_strategy,
+#             "metrics": metrics,
+#             "output_dir": str(output_dir),
+#         }
+
+#         # Import the train function here to avoid circular imports
+#         from .core import train as train_model
+
+#         results = train_model(config)
+
+#         typer.echo(f"Training complete! Model saved to {output_dir}")
+#         typer.echo(f"Final metrics: {results.get('metrics', {})}")
+
+#     except Exception as e:
+#         typer.echo(f"Error during training: {e!s}", err=True)
+#         raise typer.Exit(code=1)
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 31834b13..5303cfe2 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -1,82 +1,9 @@
 from typing import Optional, Union
+from PIL import Image
 
 import torch
 from transformers import AutoModelForVision2Seq, AutoProcessor
-
-
-class SmolVLM2Inference:
-    """Inference interface for SmolVLM2 model."""
-
-    def __init__(
-        self,
-        model_name: str = "smol-ai/smolvlm2-500m",
-        device: str = "cuda" if torch.cuda.is_available() else "cpu",
-        **kwargs,
-    ):
-        """Initialize inference interface."""
-        self.model = AutoModelForVision2Seq.from_pretrained(model_name)
-        self.processor = AutoProcessor.from_pretrained(model_name)
-        self.device = device
-        self.model_name = model_name
-
-    def get_model_info(self) -> dict:
-        """
-        Get information about the loaded model.
-
-        Returns:
-            Dictionary containing model information
-        """
-        # Extract model size from model name (e.g., smolvlm2-500m -> 500M)
-        size_info = "unknown"
-        if "-" in self.model_name:
-            parts = self.model_name.split("-")
-            if len(parts) > 1 and parts[-1].endswith("m"):
-                size_info = parts[-1].upper()
-
-        # Get total parameters
-        total_params = sum(p.numel() for p in self.model.parameters())
-        trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
-
-        return {
-            "model_name": self.model_name,
-            "model_size": size_info,
-            "device": self.device,
-            "total_parameters": f"{total_params:,}",
-            "trainable_parameters": f"{trainable_params:,}",
-            "architecture": "Vision-Language Model (VLM)",
-            "framework": "PyTorch/Transformers",
-        }
-
-    def generate(
-        self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs
-    ) -> dict:
-        """
-        Generate text from images.
-
-        Args:
-            images: Path(s) to image(s)
-            prompt: Optional prompt to guide generation
-            max_new_tokens: Maximum number of tokens to generate
-            **kwargs: Additional generation parameters
-
-        Returns:
-            Dictionary containing generated text and other outputs
-        """
-        # Process inputs
-        inputs = self.processor(images=images, text=prompt if prompt else "", return_tensors="pt")
-
-        # Generate
-        outputs = self.model.generate(
-            input_ids=inputs["input_ids"].to(self.device),
-            pixel_values=inputs["pixel_values"].to(self.device),
-            max_new_tokens=max_new_tokens,
-            **kwargs,
-        )
-
-        # Decode outputs
-        generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)
-
-        return {"generated_text": generated_text, "model_outputs": outputs}
+from maestro.trainer.common.utils.device import parse_device_spec
 
 
 def predict_with_inputs(
@@ -103,52 +30,84 @@ def predict_with_inputs(
     Returns:
         List of generated text strings
     """
-    model.eval()
     with torch.no_grad():
-        outputs = model.generate(
+        generated_ids = model.generate(
             input_ids=input_ids.to(device),
             pixel_values=pixel_values.to(device),
             max_new_tokens=max_new_tokens,
-            **kwargs,
+            do_sample=False,
+            num_beams=3,
         )
-    return processor.batch_decode(outputs, skip_special_tokens=True)
+    return processor.batch_decode(generated_ids, skip_special_tokens=False)
 
 
-def predict_with_images(
+def predict(
     model: AutoModelForVision2Seq,
     processor: AutoProcessor,
-    images: Union[str, list[str]],
-    prompt: Optional[str] = None,
-    device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
-    max_new_tokens: int = 512,
-    **kwargs,
-) -> list[str]:
-    """
-    Generate text predictions from images.
+    image: Image.Image,
+    prefix: str,
+    device: str | torch.device = "auto",
+    max_new_tokens: int = 1024,
+) -> str:
+    """Generate a text prediction for a single image and text prefix.
 
     Args:
-        model: The SmolVLM2 model
-        processor: The model's processor
-        images: Path(s) to image(s)
-        prompt: Optional prompt to guide generation
-        device: Device to run inference on
-        max_new_tokens: Maximum number of tokens to generate
-        **kwargs: Additional generation parameters
+        model (AutoModelForCausalLM): The Florence-2 model for conditional text generation.
+        processor (AutoProcessor): Processor for model inputs and outputs, handling tokenization and decoding.
+        image (str | bytes | Image.Image): Input image as a file path, raw bytes, or a PIL Image.
+        prefix (str): Text prefix to condition the generated output.
+        device (str | torch.device): Device on which to run inference (e.g., "auto", "cpu", "cuda").
+        max_new_tokens (int): Maximum number of tokens to generate.
 
     Returns:
-        List of generated text strings
+        str: The generated text prediction.
     """
-    if isinstance(images, str):
-        images = [images]
-
-    inputs = processor(images=images, text=prompt if prompt else "", return_tensors="pt")
-
+    device = parse_device_spec(device)
+    inputs = processor(text=prefix, images=image, return_tensors="pt", padding=True)
     return predict_with_inputs(
-        model=model,
-        processor=processor,
         input_ids=inputs["input_ids"],
         pixel_values=inputs["pixel_values"],
+        model=model,
+        processor=processor,
         device=device,
         max_new_tokens=max_new_tokens,
-        **kwargs,
-    )
+    )[0]
+
+# def predict_with_images(
+#     model: AutoModelForVision2Seq,
+#     processor: AutoProcessor,
+#     images: Union[str, list[str]],
+#     prompt: Optional[str] = None,
+#     device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
+#     max_new_tokens: int = 512,
+#     **kwargs,
+# ) -> list[str]:
+#     """
+#     Generate text predictions from images.
+
+#     Args:
+#         model: The SmolVLM2 model
+#         processor: The model's processor
+#         images: Path(s) to image(s)
+#         prompt: Optional prompt to guide generation
+#         device: Device to run inference on
+#         max_new_tokens: Maximum number of tokens to generate
+#         **kwargs: Additional generation parameters
+
+#     Returns:
+#         List of generated text strings
+#     """
+#     if isinstance(images, str):
+#         images = [images]
+
+#     inputs = processor(images=images, text=prompt if prompt else "", return_tensors="pt")
+
+#     return predict_with_inputs(
+#         model=model,
+#         processor=processor,
+#         input_ids=inputs["input_ids"],
+#         pixel_values=inputs["pixel_values"],
+#         device=device,
+#         max_new_tokens=max_new_tokens,
+#         **kwargs,
+#     )
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index a99af163..bdb70a6e 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -1,98 +1,139 @@
-from typing import Optional
+from typing import Any
 
-import torch
 from PIL import Image
-from torch.utils.data import DataLoader, Dataset
-from transformers import AutoProcessor
+from transformers import  AutoProcessor
 
 
-class SmolVLM2Dataset(Dataset):
-    """Dataset for SmolVLM2 model."""
+def train_collate_fn(
+    batch: list[tuple[Image.Image, dict[str, Any]]],
+      processor: AutoProcessor ):
+    images, data = zip(*batch)
+    prefixes = [entry["prefix"] for entry in data]
+    suffixes = [entry["suffix"] for entry in data]
+    inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
-    def __init__(
-        self, image_paths: list[str], texts: Optional[list[str]] = None, processor: Optional[AutoProcessor] = None
-    ):
-        """
-        Initialize dataset.
+    input_ids = inputs["input_ids"]
+    pixel_values = inputs["pixel_values"]
 
-        Args:
-            image_paths: List of paths to images
-            texts: Optional list of corresponding texts
-            processor: Model processor for preprocessing
-        """
-        self.image_paths = image_paths
-        self.texts = texts
-        self.processor = processor
+    labels = processor.tokenizer(
+        text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False
+    ).input_ids
 
-    def __len__(self) -> int:
-        return len(self.image_paths)
+    return input_ids, pixel_values, labels
 
-    def __getitem__(self, idx: int) -> dict:
-        """Get a single item from the dataset."""
-        image = Image.open(self.image_paths[idx])
+def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor):
+    images, data = zip(*batch)
+    prefixes = [entry["prefix"] for entry in data]
+    suffixes = [entry["suffix"] for entry in data]
+    inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
-        if self.texts is not None:
-            text = self.texts[idx]
-        else:
-            text = ""
+    input_ids = inputs["input_ids"]
+    pixel_values = inputs["pixel_values"]
+    return input_ids, pixel_values, images, prefixes, suffixes
 
-        if self.processor is not None:
-            return self.processor(images=image, text=text, return_tensors="pt")
-        else:
-            return {"image": image, "text": text}
 
 
-def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    """
-    Collate function for training data.
 
-    Args:
-        batch: List of processed samples
 
-    Returns:
-        Tuple of (input_ids, pixel_values, labels)
-    """
-    input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
-    pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
-    labels = torch.stack([item["labels"].squeeze(0) for item in batch])
 
-    return input_ids, pixel_values, labels
 
 
-def evaluation_collate_fn(
-    batch: list[dict],
-) -> tuple[torch.Tensor, torch.Tensor, list[Image.Image], list[str], list[str]]:
-    """
-    Collate function for evaluation data.
-
-    Args:
-        batch: List of processed samples
-
-    Returns:
-        Tuple of (input_ids, pixel_values, images, prompts, targets)
-    """
-    input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
-    pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
-    images = [item["image"] for item in batch]
-    prompts = [item["text"] for item in batch]
-    targets = [item["text"] for item in batch]  # In evaluation, target is same as prompt
-
-    return input_ids, pixel_values, images, prompts, targets
-
-
-def create_dataloader(
-    dataset: Dataset, batch_size: int = 8, num_workers: int = 4, shuffle: bool = True, collate_fn=None
-) -> DataLoader:
-    """
-    Create a DataLoader for the dataset.
-
-    Args:
-        dataset: Dataset to create loader for
-        batch_size: Batch size
-        num_workers: Number of worker processes
-        shuffle: Whether to shuffle the data
-        collate_fn: Optional collate function
-    Returns:
-        DataLoader instance
-    """
-    return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn)
+
+# from typing import Optional
+
+# import torch
+# from PIL import Image
+# from torch.utils.data import DataLoader, Dataset
+# from transformers import AutoProcessor
+
+
+# class SmolVLM2Dataset(Dataset):
+#     """Dataset for SmolVLM2 model."""
+
+#     def __init__(
+#         self, image_paths: list[str], texts: Optional[list[str]] = None, processor: Optional[AutoProcessor] = None
+#     ):
+#         """
+#         Initialize dataset.
+
+#         Args:
+#             image_paths: List of paths to images
+#             texts: Optional list of corresponding texts
+#             processor: Model processor for preprocessing
+#         """
+#         self.image_paths = image_paths
+#         self.texts = texts
+#         self.processor = processor
+
+#     def __len__(self) -> int:
+#         return len(self.image_paths)
+
+#     def __getitem__(self, idx: int) -> dict:
+#         """Get a single item from the dataset."""
+#         image = Image.open(self.image_paths[idx])
+
+#         if self.texts is not None:
+#             text = self.texts[idx]
+#         else:
+#             text = ""
+
+#         if self.processor is not None:
+#             return self.processor(images=image, text=text, return_tensors="pt")
+#         else:
+#             return {"image": image, "text": text}
+
+
+# def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+#     """
+#     Collate function for training data.
+
+#     Args:
+#         batch: List of processed samples
+
+#     Returns:
+#         Tuple of (input_ids, pixel_values, labels)
+#     """
+#     input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
+#     pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
+#     labels = torch.stack([item["labels"].squeeze(0) for item in batch])
+
+#     return input_ids, pixel_values, labels
+
+
+# def evaluation_collate_fn(
+#     batch: list[dict],
+# ) -> tuple[torch.Tensor, torch.Tensor, list[Image.Image], list[str], list[str]]:
+#     """
+#     Collate function for evaluation data.
+
+#     Args:
+#         batch: List of processed samples
+
+#     Returns:
+#         Tuple of (input_ids, pixel_values, images, prompts, targets)
+#     """
+#     input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
+#     pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
+#     images = [item["image"] for item in batch]
+#     prompts = [item["text"] for item in batch]
+#     targets = [item["text"] for item in batch]  # In evaluation, target is same as prompt
+
+#     return input_ids, pixel_values, images, prompts, targets
+
+
+# def create_dataloader(
+#     dataset: Dataset, batch_size: int = 8, num_workers: int = 4, shuffle: bool = True, collate_fn=None
+# ) -> DataLoader:
+#     """
+#     Create a DataLoader for the dataset.
+
+#     Args:
+#         dataset: Dataset to create loader for
+#         batch_size: Batch size
+#         num_workers: Number of worker processes
+#         shuffle: Whether to shuffle the data
+#         collate_fn: Optional collate function
+#     Returns:
+#         DataLoader instance
+#     """
+#     return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn)

From fd521f5577ab0a42852d772d002156d35fcfbd48 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 10:22:01 -0300
Subject: [PATCH 11/92] first attempt of smolvlm

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index bdb70a6e..fe7bd39c 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -21,7 +21,7 @@ def train_collate_fn(
 
     return input_ids, pixel_values, labels
 
-def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor):
+def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):
     images, data = zip(*batch)
     prefixes = [entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]

From d46e1f53fc1a69aa941439fa679d22599b236351 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 11:12:55 -0300
Subject: [PATCH 12/92] changed model type to AutoModelForImageTextToText

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 14 +++++++-------
 maestro/trainer/models/smolvlm2/core.py        |  2 +-
 maestro/trainer/models/smolvlm2/inference.py   |  8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index f90f887d..ed8e2b12 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -3,7 +3,7 @@
 from enum import Enum
 
 import torch
-from transformers import AutoModelForVision2Seq, AutoProcessor
+from transformers import AutoModelForImageTextToText, AutoProcessor
 from maestro.trainer.common.utils.device import parse_device_spec
 from maestro.trainer.logger import get_maestro_logger
 from peft import LoraConfig, get_peft_model
@@ -24,7 +24,7 @@
 
 
 def save_checkpoint(
-    model: AutoModelForVision2Seq, processor: AutoProcessor, path: str, metadata: Optional[dict] = None
+    model: AutoModelForImageTextToText, processor: AutoProcessor, path: str, metadata: Optional[dict] = None
 ) -> None:
     """
     Save model checkpoint.
@@ -50,7 +50,7 @@ def save_checkpoint(
 def save_model(
     target_dir: str,
     processor: AutoProcessor,
-    model: AutoModelForVision2Seq,
+    model: AutoModelForImageTextToText,
 ) -> None:
     """
     Save a PaliGemma 2 model and its processor to disk.
@@ -77,7 +77,7 @@ def save_model(
 #         Dictionary containing model, processor, and metadata
 #     """
 #     # Load model
-#     model = AutoModelForVision2Seq.from_pretrained(path)
+#     model = AutoModelForImageTextToText.from_pretrained(path)
 #     model.to(device)
 
 #     # Load processor
@@ -103,7 +103,7 @@ def load_model(
     optimization_strategy: OptimizationStrategy = OptimizationStrategy.NONE,
     peft_advanced_params: Optional[dict] = None,
     cache_dir: Optional[str] = None,
-) -> tuple[AutoProcessor, AutoModelForVision2Seq]:
+) -> tuple[AutoProcessor, AutoModelForImageTextToText]:
     """Loads a PaliGemma 2 model and its associated processor.
 
     Args:
@@ -148,7 +148,7 @@ def load_model(
         ) if optimization_strategy == OptimizationStrategy.QLORA
             else None)
         
-        model = AutoModelForVision2Seq.from_pretrained(
+        model = AutoModelForImageTextToText.from_pretrained(
             model_id_or_path,
             revision=revision,
             trust_remote_code=True,
@@ -160,7 +160,7 @@ def load_model(
         model.print_trainable_parameters()
     else:
 
-        model = AutoModelForVision2Seq.from_pretrained(
+        model = AutoModelForImageTextToText.from_pretrained(
             model_id_or_path,
             revision=revision,
             trust_remote_code=True,
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index d17366d9..bb257a98 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -133,7 +133,7 @@ class SmolVLM2Trainer(MaestroTrainer):
 
     Attributes:
         processor (AutoProcessor): Processor for model inputs.
-        model (AutoModelForCausalLM): The SmolVLM-2 model.
+        model (AutoModelForImageTextToText): The SmolVLM-2 model.
         train_loader (DataLoader): DataLoader for training data.
         valid_loader (DataLoader): DataLoader for validation data.
         config (SmolVLM2Configuration): Configuration object with training parameters.
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 5303cfe2..dbdee318 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -2,12 +2,12 @@
 from PIL import Image
 
 import torch
-from transformers import AutoModelForVision2Seq, AutoProcessor
+from transformers import AutoModelForImageTextToText, AutoProcessor
 from maestro.trainer.common.utils.device import parse_device_spec
 
 
 def predict_with_inputs(
-    model: AutoModelForVision2Seq,
+    model: AutoModelForImageTextToText,
     processor: AutoProcessor,
     input_ids: torch.Tensor,
     pixel_values: torch.Tensor,
@@ -42,7 +42,7 @@ def predict_with_inputs(
 
 
 def predict(
-    model: AutoModelForVision2Seq,
+    model: AutoModelForImageTextToText,
     processor: AutoProcessor,
     image: Image.Image,
     prefix: str,
@@ -74,7 +74,7 @@ def predict(
     )[0]
 
 # def predict_with_images(
-#     model: AutoModelForVision2Seq,
+#     model: AutoModelForImageTextToText,
 #     processor: AutoProcessor,
 #     images: Union[str, list[str]],
 #     prompt: Optional[str] = None,

From 7a6747f16053314e137890717e333ca784960e06 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 11:26:30 -0300
Subject: [PATCH 13/92] updated train_collate_fn format

---
 maestro/trainer/models/smolvlm2/loaders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index fe7bd39c..61432e3e 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -8,7 +8,7 @@ def train_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],
       processor: AutoProcessor ):
     images, data = zip(*batch)
-    prefixes = [entry["prefix"] for entry in data]
+    prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
     inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
@@ -23,7 +23,7 @@ def train_collate_fn(
 
 def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):
     images, data = zip(*batch)
-    prefixes = [entry["prefix"] for entry in data]
+    prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
     inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 

From 1c7d6a6759f42b498b64a8adf9f8e20cd6621749 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 11:36:35 -0300
Subject: [PATCH 14/92] trying casting model to bfloat 16

---
 maestro/trainer/models/smolvlm2/core.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index bb257a98..df138dec 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -247,6 +247,12 @@ def train(config: SmolVLM2Configuration | dict) -> None:
     _, train_entry = train_loader.dataset[0]
     logger.info(f"sample train prefix: {train_entry['prefix']}")
     logger.info(f"sample train suffix: {train_entry['suffix']}")
+    if config.device.type == "cuda":  # Or check for 'cpu' if you intend to use BF16 on CPU
+        logger.info(f"Casting model to {torch.bfloat16} for training on {config.device}")
+        model = model.to(torch.bfloat16)
+    else:
+        logger.info(f"Using default precision for training on {config.device}")
+
 
     pl_module = SmolVLM2Trainer(
         processor=processor, model=model, train_loader=train_loader, valid_loader=valid_loader, config=config

From a28af8f0cb5c36b3f3039791e319669df5de35d0 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 11:51:09 -0300
Subject: [PATCH 15/92] trying casting model to device

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 4 ++--
 maestro/trainer/models/smolvlm2/core.py        | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index ed8e2b12..f0d5dedb 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -154,8 +154,8 @@ def load_model(
             trust_remote_code=True,
             quantization_config=bnb_config,
             cache_dir=cache_dir,
-            torch_dtype=torch.bfloat16,
-        )
+            torch_dtype=torch.bfloat16, 
+        ).to(device)
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()
     else:
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index df138dec..bb257a98 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -247,12 +247,6 @@ def train(config: SmolVLM2Configuration | dict) -> None:
     _, train_entry = train_loader.dataset[0]
     logger.info(f"sample train prefix: {train_entry['prefix']}")
     logger.info(f"sample train suffix: {train_entry['suffix']}")
-    if config.device.type == "cuda":  # Or check for 'cpu' if you intend to use BF16 on CPU
-        logger.info(f"Casting model to {torch.bfloat16} for training on {config.device}")
-        model = model.to(torch.bfloat16)
-    else:
-        logger.info(f"Using default precision for training on {config.device}")
-
 
     pl_module = SmolVLM2Trainer(
         processor=processor, model=model, train_loader=train_loader, valid_loader=valid_loader, config=config

From a767b8dffa4087cbad3585d745cd436b70d7f93f Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 12:10:04 -0300
Subject: [PATCH 16/92] trying changing the freezing

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index f0d5dedb..3bb4ec3b 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -168,7 +168,7 @@ def load_model(
 
         if optimization_strategy == OptimizationStrategy.FREEZE:
             # Freeze vision encoder parameters
-            for param in model.vision_model.parameters():
+            for param in model.model.vision_model.parameters():
                 param.requires_grad = False
 
             # for param in model.multi_modal_projector.parameters():

From f90613488c83282b7fd08c331689119fa82889d6 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 12:18:01 -0300
Subject: [PATCH 17/92] removed max length

---
 maestro/trainer/models/smolvlm2/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index bb257a98..517a7d98 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -237,7 +237,7 @@ def train(config: SmolVLM2Configuration | dict) -> None:
     train_loader, valid_loader, test_loader = create_data_loaders(
         dataset_location=dataset_location,
         train_batch_size=config.batch_size,
-        train_collect_fn=partial(train_collate_fn, processor=processor, max_length=config.max_new_tokens),
+        train_collect_fn=partial(train_collate_fn, processor=processor),
         train_num_workers=config.num_workers,
         test_batch_size=config.val_batch_size,
         test_collect_fn=partial(evaluation_collate_fn, processor=processor),

From a814cb2ad6788b041aa1a2cad9bca2646257918c Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 13:00:31 -0300
Subject: [PATCH 18/92] added attention masks

---
 maestro/trainer/models/smolvlm2/core.py    | 6 ++++--
 maestro/trainer/models/smolvlm2/loaders.py | 7 +++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 517a7d98..0fc961d7 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -159,9 +159,10 @@ def __init__(
         self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
 
     def training_step(self, batch, batch_idx):
-        input_ids, pixel_values, labels = batch
+        input_ids, attention_mask, pixel_values, labels = batch
         outputs = self.model(
             input_ids=input_ids,
+            attention_mask=attention_mask,
             pixel_values=pixel_values,
             labels=labels,
         )
@@ -171,11 +172,12 @@ def training_step(self, batch, batch_idx):
         return loss
 
     def validation_step(self, batch, batch_idx):
-        input_ids, pixel_values, images, prefixes, suffixes = batch
+        input_ids,attention_mask, pixel_values, images, prefixes, suffixes = batch
         generated_suffixes = predict_with_inputs(
             model=self.model,
             processor=self.processor,
             input_ids=input_ids,
+            attention_mask=attention_mask,
             pixel_values=pixel_values,
             device=self.config.device,
             max_new_tokens=self.config.max_new_tokens,
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 61432e3e..f25722e5 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -14,12 +14,13 @@ def train_collate_fn(
 
     input_ids = inputs["input_ids"]
     pixel_values = inputs["pixel_values"]
+    attention_mask = inputs["attention_mask"]
 
     labels = processor.tokenizer(
         text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False
     ).input_ids
 
-    return input_ids, pixel_values, labels
+    return input_ids,attention_mask, pixel_values, labels
 
 def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):
     images, data = zip(*batch)
@@ -29,7 +30,9 @@ def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], proce
 
     input_ids = inputs["input_ids"]
     pixel_values = inputs["pixel_values"]
-    return input_ids, pixel_values, images, prefixes, suffixes
+    attention_mask = inputs["attention_mask"]
+
+    return input_ids, attention_mask, pixel_values, images, prefixes, suffixes
 
 
 

From 23fa5a1810fd0bb3447597e5303a2aa0b11fa55d Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 15:30:54 -0300
Subject: [PATCH 19/92] trying different train collate fn

---
 maestro/trainer/models/smolvlm2/inference.py |  2 +-
 maestro/trainer/models/smolvlm2/loaders.py   | 37 ++++++++++++++++----
 2 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index dbdee318..9649ac67 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -52,7 +52,7 @@ def predict(
     """Generate a text prediction for a single image and text prefix.
 
     Args:
-        model (AutoModelForCausalLM): The Florence-2 model for conditional text generation.
+        model (AutoModelForImageTextToText): The Florence-2 model for conditional text generation.
         processor (AutoProcessor): Processor for model inputs and outputs, handling tokenization and decoding.
         image (str | bytes | Image.Image): Input image as a file path, raw bytes, or a PIL Image.
         prefix (str): Text prefix to condition the generated output.
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index f25722e5..b6b68ffd 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -2,19 +2,44 @@
 
 from PIL import Image
 from transformers import  AutoProcessor
-
+import supervision as sv
+from torch.nn.utils.rnn import pad_sequence
 
 def train_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],
       processor: AutoProcessor ):
     images, data = zip(*batch)
-    prefixes = ["<image>" + entry["prefix"] for entry in data]
+    instances = []
+
+    for i in range(len(images)) in images:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": Image.open(images[i])},
+                    {"type": "text", "text": data[i]["prefix"]},
+                ]
+            },
+        ]
+
+        instance = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        instances.append(instance)
+
+
+
+    #prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
-    inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
+    #inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
-    input_ids = inputs["input_ids"]
-    pixel_values = inputs["pixel_values"]
-    attention_mask = inputs["attention_mask"]
+    input_ids = [i["inputs_ids"] for i in instances]#inputs["input_ids"]
+    pixel_values = [i["pixel_values"] for i in instances]#inputs["pixel_values"]
+    attention_mask = [i["attention_mask"] for i in instances]#inputs["attention_mask"]
 
     labels = processor.tokenizer(
         text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False

From 023d817770d0934d4968b78789208861ab43beb6 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 15:37:39 -0300
Subject: [PATCH 20/92] trying different train collate fn

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index b6b68ffd..9af8a91e 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -11,7 +11,7 @@ def train_collate_fn(
     images, data = zip(*batch)
     instances = []
 
-    for i in range(len(images)) in images:
+    for i in range(len(images)):
         messages = [
             {
                 "role": "user",

From 4135537976bf59eeb6fdbf6440c4163a349392c4 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 15:43:30 -0300
Subject: [PATCH 21/92] added debugging prints

---
 maestro/trainer/models/smolvlm2/loaders.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 9af8a91e..aab5b5ff 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -10,13 +10,14 @@ def train_collate_fn(
       processor: AutoProcessor ):
     images, data = zip(*batch)
     instances = []
-
+    print(type(images))
     for i in range(len(images)):
+        print(i)
         messages = [
             {
                 "role": "user",
                 "content": [
-                    {"type": "image", "image": Image.open(images[i])},
+                    {"type": "image", "image": images[i]},
                     {"type": "text", "text": data[i]["prefix"]},
                 ]
             },

From 022c61f7b353a62d484d2fcf350f2ab45aa55c5c Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 15:45:02 -0300
Subject: [PATCH 22/92] removed debugging prints

---
 maestro/trainer/models/smolvlm2/loaders.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index aab5b5ff..15d80304 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -10,9 +10,7 @@ def train_collate_fn(
       processor: AutoProcessor ):
     images, data = zip(*batch)
     instances = []
-    print(type(images))
     for i in range(len(images)):
-        print(i)
         messages = [
             {
                 "role": "user",
@@ -38,7 +36,7 @@ def train_collate_fn(
     suffixes = [entry["suffix"] for entry in data]
     #inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
-    input_ids = [i["inputs_ids"] for i in instances]#inputs["input_ids"]
+    input_ids = [i["input_ids"] for i in instances]#inputs["input_ids"]
     pixel_values = [i["pixel_values"] for i in instances]#inputs["pixel_values"]
     attention_mask = [i["attention_mask"] for i in instances]#inputs["attention_mask"]
 

From 5891867cee968d07bd7c2140f66a01b5131072fb Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 15:50:28 -0300
Subject: [PATCH 23/92] trying train collate from SmolVLM2_video_FT

---
 maestro/trainer/models/smolvlm2/loaders.py | 50 +++++++++++++++++++---
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 15d80304..e7c64b20 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -4,7 +4,7 @@
 from transformers import  AutoProcessor
 import supervision as sv
 from torch.nn.utils.rnn import pad_sequence
-
+import torch
 def train_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],
       processor: AutoProcessor ):
@@ -31,14 +31,54 @@ def train_collate_fn(
         instances.append(instance)
 
 
-
+    input_ids = pad_sequence(
+        [inst["input_ids"].squeeze(0) for inst in instances],
+        batch_first=True,
+        padding_value=processor.tokenizer.pad_token_id
+    )
+    attention_mask = pad_sequence(
+        [inst["attention_mask"].squeeze(0) for inst in instances],
+        batch_first=True,
+        padding_value=0
+    )
+
+    # Step 1: figure out maximum frames, height, width across the batch
+    pvs = [inst["pixel_values"].squeeze(0) for inst in instances if "pixel_values" in inst]
+    if pvs:  # there is at least one non-None pixel_values
+        max_frames = max(pv.shape[0] for pv in pvs)
+        max_h = max(pv.shape[-2] for pv in pvs)
+        max_w = max(pv.shape[-1] for pv in pvs)
+    else:
+        max_h = max_w = processor.video_size['longest_edge']
+        max_frames = 1
+
+    padded_pixel_values_list = []
+    for ex in instances:
+        pv = ex.get("pixel_values", None).squeeze(0)
+
+        if pv is None:
+            # text-only => fill pixel data + mask with zeros
+            shape_pv = (max_frames, 3, max_h, max_w)
+            padded_pv = torch.zeros(shape_pv, dtype=torch.float32)
+        else:
+            f, c, h, w = pv.shape
+            # Prepare final storage
+            padded_pv = torch.zeros(
+                (max_frames, c, max_h, max_w),
+                dtype=pv.dtype,
+                device=pv.device
+            )
+            padded_pv[:f, :, :h, :w] = pv
+        padded_pixel_values_list.append(padded_pv)
+
+    pixel_values = torch.stack(padded_pixel_values_list, dim=0)
     #prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
     #inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
-    input_ids = [i["input_ids"] for i in instances]#inputs["input_ids"]
-    pixel_values = [i["pixel_values"] for i in instances]#inputs["pixel_values"]
-    attention_mask = [i["attention_mask"] for i in instances]#inputs["attention_mask"]
+    # input_ids = [i["input_ids"] for i in instances]#inputs["input_ids"]
+    # pixel_values = [i["pixel_values"] for i in instances]#inputs["pixel_values"]
+    # attention_mask = [i["attention_mask"] for i in instances]#inputs["attention_mask"]
 
     labels = processor.tokenizer(
         text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False

From 309894bc5ed925442f0d9484607c4fe054e0a087 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 15:54:05 -0300
Subject: [PATCH 24/92] added debugging prints

---
 maestro/trainer/models/smolvlm2/loaders.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index e7c64b20..5ba9eb5b 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -83,7 +83,8 @@ def train_collate_fn(
     labels = processor.tokenizer(
         text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False
     ).input_ids
-
+    print(labels.shape)
+    print(input_ids.shape)
     return input_ids,attention_mask, pixel_values, labels
 
 def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):

From 3c4e2ceadfcd689cda9fbe1f6adcc7ce86272d75 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 16:07:44 -0300
Subject: [PATCH 25/92] try padding labels

---
 maestro/trainer/models/smolvlm2/loaders.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 5ba9eb5b..516b86bd 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -83,6 +83,12 @@ def train_collate_fn(
     labels = processor.tokenizer(
         text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False
     ).input_ids
+    labels = pad_sequence(
+        labels,
+        batch_first=True,
+        padding_value=-100
+    )
+
     print(labels.shape)
     print(input_ids.shape)
     return input_ids,attention_mask, pixel_values, labels

From b3109a323c516a853b7c4ba42d4899d0cf5bd28a Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 16:26:11 -0300
Subject: [PATCH 26/92] trying charqa format

---
 maestro/trainer/models/smolvlm2/loaders.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 516b86bd..b3726b9c 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -80,14 +80,15 @@ def train_collate_fn(
     # pixel_values = [i["pixel_values"] for i in instances]#inputs["pixel_values"]
     # attention_mask = [i["attention_mask"] for i in instances]#inputs["attention_mask"]
 
-    labels = processor.tokenizer(
-        text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False
-    ).input_ids
-    labels = pad_sequence(
-        labels,
-        batch_first=True,
-        padding_value=-100
-    )
+    # labels = processor.tokenizer(
+    #     text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False
+    # ).input_ids
+    image_token_id = processor.tokenizer.additional_special_tokens_ids[
+    processor.tokenizer.additional_special_tokens.index("<image>")
+    ]   
+    labels = batch["input_ids"].clone()
+    labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
+    labels[labels == image_token_id] = -100  # Mask image token IDs in labels
 
     print(labels.shape)
     print(input_ids.shape)

From acaa5aab92c60e862297bf8ca4b715345dd81486 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 16:27:45 -0300
Subject: [PATCH 27/92] trying charqa format

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index b3726b9c..5f66fac1 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -86,7 +86,7 @@ def train_collate_fn(
     image_token_id = processor.tokenizer.additional_special_tokens_ids[
     processor.tokenizer.additional_special_tokens.index("<image>")
     ]   
-    labels = batch["input_ids"].clone()
+    labels = input_ids.clone()
     labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
     labels[labels == image_token_id] = -100  # Mask image token IDs in labels
 

From 87c5a26962f5b2093079d0b02c7e6e569a2d6ded Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 19:51:42 -0300
Subject: [PATCH 28/92] printing last ids

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 5f66fac1..a1e7a779 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -90,7 +90,7 @@ def train_collate_fn(
     labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
     labels[labels == image_token_id] = -100  # Mask image token IDs in labels
 
-    print(labels.shape)
+    print(labels[0,-10:])
     print(input_ids.shape)
     return input_ids,attention_mask, pixel_values, labels
 

From b2544163950874bff2dd33728798f53e72ad9fc6 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 20:35:39 -0300
Subject: [PATCH 29/92] potential solution

---
 maestro/trainer/models/smolvlm2/loaders.py | 134 ++++++++-------------
 1 file changed, 53 insertions(+), 81 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index a1e7a779..2aeda61a 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -5,94 +5,66 @@
 import supervision as sv
 from torch.nn.utils.rnn import pad_sequence
 import torch
+
+def format_data(image, prefix, suffix):
+    return [
+
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {
+                    "type": "text",
+                    "text": prefix,
+                },
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [{"type": "text", "text": suffix}],
+        },
+    ]
+
 def train_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],
       processor: AutoProcessor ):
     images, data = zip(*batch)
-    instances = []
+
+    messages = []
+    suffixes = []
     for i in range(len(images)):
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": images[i]},
-                    {"type": "text", "text": data[i]["prefix"]},
-                ]
-            },
-        ]
-
-        instance = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        )
-        instances.append(instance)
-
-
-    input_ids = pad_sequence(
-        [inst["input_ids"].squeeze(0) for inst in instances],
-        batch_first=True,
-        padding_value=processor.tokenizer.pad_token_id
-    )
-    attention_mask = pad_sequence(
-        [inst["attention_mask"].squeeze(0) for inst in instances],
-        batch_first=True,
-        padding_value=0
-    )
-
-    # Step 1: figure out maximum frames, height, width across the batch
-    pvs = [inst["pixel_values"].squeeze(0) for inst in instances if "pixel_values" in inst]
-    if pvs:  # there is at least one non-None pixel_values
-        max_frames = max(pv.shape[0] for pv in pvs)
-        max_h = max(pv.shape[-2] for pv in pvs)
-        max_w = max(pv.shape[-1] for pv in pvs)
-    else:
-        max_h = max_w = processor.video_size['longest_edge']
-        max_frames = 1
-
-    padded_pixel_values_list = []
-    for ex in instances:
-        pv = ex.get("pixel_values", None).squeeze(0)
-
-        if pv is None:
-            # text-only => fill pixel data + mask with zeros
-            shape_pv = (max_frames, 3, max_h, max_w)
-            padded_pv = torch.zeros(shape_pv, dtype=torch.float32)
-        else:
-            f, c, h, w = pv.shape
-            # Prepare final storage
-            padded_pv = torch.zeros(
-                (max_frames, c, max_h, max_w),
-                dtype=pv.dtype,
-                device=pv.device
-            )
-            padded_pv[:f, :, :h, :w] = pv
-        padded_pixel_values_list.append(padded_pv)
-
-    pixel_values = torch.stack(padded_pixel_values_list, dim=0)
-    #prefixes = ["<image>" + entry["prefix"] for entry in data]
-    suffixes = [entry["suffix"] for entry in data]
-    #inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
-
-    # input_ids = [i["input_ids"] for i in instances]#inputs["input_ids"]
-    # pixel_values = [i["pixel_values"] for i in instances]#inputs["pixel_values"]
-    # attention_mask = [i["attention_mask"] for i in instances]#inputs["attention_mask"]
-
-    # labels = processor.tokenizer(
-    #     text=suffixes, return_tensors="pt", padding=True, return_token_type_ids=False
-    # ).input_ids
-    image_token_id = processor.tokenizer.additional_special_tokens_ids[
-    processor.tokenizer.additional_special_tokens.index("<image>")
-    ]   
+        messages.append(format_data(images[i], data[i]["prefix"], data[i]["suffix"]))
+        suffixes.append(data[i]["suffix"])
+
+    # Apply chat template WITHOUT tokenization
+    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
+
+    # Tokenize and encode images
+    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    input_ids = batch_enc["input_ids"]
+    attention_mask = batch_enc["attention_mask"]
+    pixel_values = batch_enc["pixel_values"]
+
+    # Clone input_ids to labels and mask out everything except suffix
     labels = input_ids.clone()
-    labels[labels == processor.tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
-    labels[labels == image_token_id] = -100  # Mask image token IDs in labels
 
-    print(labels[0,-10:])
-    print(input_ids.shape)
-    return input_ids,attention_mask, pixel_values, labels
+    # Mask pad tokens
+    labels[labels == processor.tokenizer.pad_token_id] = -100
+
+    # Mask <image> tokens
+    image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
+    labels[labels == image_token_id] = -100
+
+    # Mask prefix tokens: keep only suffix as target
+    for i, suffix in enumerate(suffixes):
+        suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
+        # Only keep the last len(suffix_ids) tokens in labels
+        labels[i, :-len(suffix_ids)] = -100
+
+    return input_ids, attention_mask, pixel_values, labels
 
 def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):
     images, data = zip(*batch)

From beb1b4f4ed600507d3945fe736bd3472a64f8590 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 20:42:38 -0300
Subject: [PATCH 30/92] casting pixels to bfloat 16

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 2aeda61a..b377b207 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -64,7 +64,7 @@ def train_collate_fn(
         # Only keep the last len(suffix_ids) tokens in labels
         labels[i, :-len(suffix_ids)] = -100
 
-    return input_ids, attention_mask, pixel_values, labels
+    return input_ids, attention_mask, pixel_values.to(dtype=torch.bfloat16), labels
 
 def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):
     images, data = zip(*batch)

From 7788dfbaf33c556c6343b2172793ed2422941e8f Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Wed, 28 May 2025 20:45:52 -0300
Subject: [PATCH 31/92] trying autocast

---
 maestro/trainer/models/smolvlm2/core.py    | 16 ++++++++++------
 maestro/trainer/models/smolvlm2/loaders.py |  2 +-
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 0fc961d7..615d1e5d 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -39,6 +39,8 @@
     parse_metrics,
     save_metric_plots,
 )
+from torch.cuda.amp import autocast
+
 from maestro.trainer.models.florence_2.detection import (
     detections_to_prefix_formatter,
     detections_to_suffix_formatter,
@@ -160,12 +162,14 @@ def __init__(
 
     def training_step(self, batch, batch_idx):
         input_ids, attention_mask, pixel_values, labels = batch
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            pixel_values=pixel_values,
-            labels=labels,
-        )
+        with autocast(dtype=torch.bfloat16):
+
+            outputs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                pixel_values=pixel_values,
+                labels=labels,
+            )
         loss = outputs.loss
         self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=self.config.batch_size)
         self.train_metrics_tracker.register("loss", epoch=self.current_epoch, step=batch_idx, value=loss.item())
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index b377b207..2aeda61a 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -64,7 +64,7 @@ def train_collate_fn(
         # Only keep the last len(suffix_ids) tokens in labels
         labels[i, :-len(suffix_ids)] = -100
 
-    return input_ids, attention_mask, pixel_values.to(dtype=torch.bfloat16), labels
+    return input_ids, attention_mask, pixel_values, labels
 
 def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):
     images, data = zip(*batch)

From a8a27ca0ddd8139d6b5d84cf250fc8d2ab91569b Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 10:50:51 -0300
Subject: [PATCH 32/92] removing bfloat 16 from load model

---
 maestro/trainer/models/smolvlm2/checkpoints.py |  4 ++--
 maestro/trainer/models/smolvlm2/core.py        | 16 ++++++----------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index 3bb4ec3b..08fa4933 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -143,7 +143,7 @@ def load_model(
         bnb_config = (BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
+            #bnb_4bit_compute_dtype=torch.float16,
             bnb_4bit_use_double_quant=True,
         ) if optimization_strategy == OptimizationStrategy.QLORA
             else None)
@@ -154,7 +154,7 @@ def load_model(
             trust_remote_code=True,
             quantization_config=bnb_config,
             cache_dir=cache_dir,
-            torch_dtype=torch.bfloat16, 
+            #torch_dtype=torch.bfloat16, 
         ).to(device)
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 615d1e5d..0fc961d7 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -39,8 +39,6 @@
     parse_metrics,
     save_metric_plots,
 )
-from torch.cuda.amp import autocast
-
 from maestro.trainer.models.florence_2.detection import (
     detections_to_prefix_formatter,
     detections_to_suffix_formatter,
@@ -162,14 +160,12 @@ def __init__(
 
     def training_step(self, batch, batch_idx):
         input_ids, attention_mask, pixel_values, labels = batch
-        with autocast(dtype=torch.bfloat16):
-
-            outputs = self.model(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                pixel_values=pixel_values,
-                labels=labels,
-            )
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            pixel_values=pixel_values,
+            labels=labels,
+        )
         loss = outputs.loss
         self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=self.config.batch_size)
         self.train_metrics_tracker.register("loss", epoch=self.current_epoch, step=batch_idx, value=loss.item())

From 7e34098814d69c7767aa80ae77d6923521d98db7 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 11:13:14 -0300
Subject: [PATCH 33/92] small optimization and added evaluation collate_fn

---
 maestro/trainer/models/smolvlm2/loaders.py | 51 +++++++++++++++-------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 2aeda61a..1a4ac18d 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -33,12 +33,12 @@ def train_collate_fn(
       processor: AutoProcessor ):
     images, data = zip(*batch)
 
-    messages = []
-    suffixes = []
-    for i in range(len(images)):
-        messages.append(format_data(images[i], data[i]["prefix"], data[i]["suffix"]))
-        suffixes.append(data[i]["suffix"])
-
+    messages = [
+        format_data(image, entry["prefix"], entry["suffix"])
+        for image, entry in zip(images, data)
+    ]
+    suffixes = [entry["suffix"] for entry in data]
+    
     # Apply chat template WITHOUT tokenization
     texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
 
@@ -65,20 +65,41 @@ def train_collate_fn(
         labels[i, :-len(suffix_ids)] = -100
 
     return input_ids, attention_mask, pixel_values, labels
-
-def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: AutoProcessor):
+def evaluation_collate_fn(
+    batch: list[tuple[Image.Image, dict[str, Any]]],
+    processor: AutoProcessor
+):
     images, data = zip(*batch)
-    prefixes = ["<image>" + entry["prefix"] for entry in data]
-    suffixes = [entry["suffix"] for entry in data]
-    inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
-    input_ids = inputs["input_ids"]
-    pixel_values = inputs["pixel_values"]
-    attention_mask = inputs["attention_mask"]
+    # Format inputs: <image> token + prefix as text, image will be passed separately
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": entry["prefix"]},
+                ],
+            }
+        ]
+        for image, entry in zip(images, data)
+    ]
 
-    return input_ids, attention_mask, pixel_values, images, prefixes, suffixes
+    # Apply chat template without tokenizing to get clean prompt strings
+    texts = [processor.apply_chat_template(msg, tokenize=False) for msg in messages]
 
+    # Tokenize with processor (includes image + text)
+    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+
+    input_ids = batch_enc["input_ids"]
+    attention_mask = batch_enc["attention_mask"]
+    pixel_values = batch_enc["pixel_values"]
+
+    # Optionally return raw text + images for later reference/evaluation
+    prefixes = [entry["prefix"] for entry in data]
+    suffixes = [entry["suffix"] for entry in data]
 
+    return input_ids, attention_mask, pixel_values, images, prefixes, suffixes
 
 
 

From 1750334de682df42f328cd73f9455df5fc875258 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 11:13:52 -0300
Subject: [PATCH 34/92] trying float 16

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index 08fa4933..5a620f4e 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -143,7 +143,7 @@ def load_model(
         bnb_config = (BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_quant_type="nf4",
-            #bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_compute_dtype=torch.float16,
             bnb_4bit_use_double_quant=True,
         ) if optimization_strategy == OptimizationStrategy.QLORA
             else None)
@@ -154,7 +154,7 @@ def load_model(
             trust_remote_code=True,
             quantization_config=bnb_config,
             cache_dir=cache_dir,
-            #torch_dtype=torch.bfloat16, 
+            torch_dtype=torch.float16, 
         ).to(device)
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()

From 0b810f2ece9a5d1cf7e0ff017237363e51db23a6 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 11:16:40 -0300
Subject: [PATCH 35/92] removing float type

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index 5a620f4e..08fa4933 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -143,7 +143,7 @@ def load_model(
         bnb_config = (BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_quant_type="nf4",
-            bnb_4bit_compute_dtype=torch.float16,
+            #bnb_4bit_compute_dtype=torch.float16,
             bnb_4bit_use_double_quant=True,
         ) if optimization_strategy == OptimizationStrategy.QLORA
             else None)
@@ -154,7 +154,7 @@ def load_model(
             trust_remote_code=True,
             quantization_config=bnb_config,
             cache_dir=cache_dir,
-            torch_dtype=torch.float16, 
+            #torch_dtype=torch.bfloat16, 
         ).to(device)
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()

From 5023150dfb94d983624f122e2437c293d9274e9c Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 11:38:36 -0300
Subject: [PATCH 36/92] changed evaluation collate

---
 maestro/trainer/models/smolvlm2/loaders.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 1a4ac18d..f2418310 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -95,8 +95,7 @@ def evaluation_collate_fn(
     attention_mask = batch_enc["attention_mask"]
     pixel_values = batch_enc["pixel_values"]
 
-    # Optionally return raw text + images for later reference/evaluation
-    prefixes = [entry["prefix"] for entry in data]
+    prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
 
     return input_ids, attention_mask, pixel_values, images, prefixes, suffixes

From fae5ff08db34c4c551645d5515ea066db4d8ba7a Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 11:41:34 -0300
Subject: [PATCH 37/92] evaluation collate as train collate

---
 maestro/trainer/models/smolvlm2/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 0fc961d7..3fb63c39 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -242,7 +242,7 @@ def train(config: SmolVLM2Configuration | dict) -> None:
         train_collect_fn=partial(train_collate_fn, processor=processor),
         train_num_workers=config.num_workers,
         test_batch_size=config.val_batch_size,
-        test_collect_fn=partial(evaluation_collate_fn, processor=processor),
+        #test_collect_fn=partial(evaluation_collate_fn, processor=processor),
         test_num_workers=config.val_num_workers,
     )
 

From 257dc78bc158027e864c0b451b1691dc208ee562 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 12:01:49 -0300
Subject: [PATCH 38/92] evaluation collate updated

---
 maestro/trainer/models/smolvlm2/core.py    |  4 ++--
 maestro/trainer/models/smolvlm2/loaders.py | 19 ++++++++++++-------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 3fb63c39..424ac63b 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -182,7 +182,7 @@ def validation_step(self, batch, batch_idx):
             device=self.config.device,
             max_new_tokens=self.config.max_new_tokens,
         )
-
+        print("generated_suffixes", generated_suffixes)
         if batch_idx == 0:
             logger.info(f"sample valid prefix: {prefixes[0]}")
             logger.info(f"sample valid suffix: {suffixes[0]}")
@@ -242,7 +242,7 @@ def train(config: SmolVLM2Configuration | dict) -> None:
         train_collect_fn=partial(train_collate_fn, processor=processor),
         train_num_workers=config.num_workers,
         test_batch_size=config.val_batch_size,
-        #test_collect_fn=partial(evaluation_collate_fn, processor=processor),
+        test_collect_fn=partial(evaluation_collate_fn, processor=processor),
         test_num_workers=config.val_num_workers,
     )
 
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index f2418310..dad7edec 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -71,7 +71,6 @@ def evaluation_collate_fn(
 ):
     images, data = zip(*batch)
 
-    # Format inputs: <image> token + prefix as text, image will be passed separately
     messages = [
         [
             {
@@ -85,15 +84,21 @@ def evaluation_collate_fn(
         for image, entry in zip(images, data)
     ]
 
-    # Apply chat template without tokenizing to get clean prompt strings
-    texts = [processor.apply_chat_template(msg, tokenize=False) for msg in messages]
+    texts = [processor.apply_chat_template(msg, tokenize=True, 
+                                           add_generation_prompt=True,
+                                           return_dict=True,
+                                            )
+                                            for msg in messages]
 
     # Tokenize with processor (includes image + text)
-    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    #batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
 
-    input_ids = batch_enc["input_ids"]
-    attention_mask = batch_enc["attention_mask"]
-    pixel_values = batch_enc["pixel_values"]
+    input_ids = [t["input_ids"] for t in texts]#batch_enc["input_ids"]
+    attention_mask = [t["attention_mask"] for t in texts]
+    pixel_values = [t["pixel_values"] for t in texts]
+
+    #attention_mask = batch_enc["attention_mask"]
+    #pixel_values = batch_enc["pixel_values"]
 
     prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]

From b6c1d1873af9cd143536b1f2cccedda4daf18587 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 12:35:14 -0300
Subject: [PATCH 39/92] casting to tensors

---
 maestro/trainer/models/smolvlm2/loaders.py | 110 +--------------------
 1 file changed, 3 insertions(+), 107 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index dad7edec..0bfc8292 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -93,9 +93,9 @@ def evaluation_collate_fn(
     # Tokenize with processor (includes image + text)
     #batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
 
-    input_ids = [t["input_ids"] for t in texts]#batch_enc["input_ids"]
-    attention_mask = [t["attention_mask"] for t in texts]
-    pixel_values = [t["pixel_values"] for t in texts]
+    input_ids = torch.Tensor([t["input_ids"] for t in texts])#batch_enc["input_ids"]
+    attention_mask = torch.Tensor([t["attention_mask"] for t in texts])
+    pixel_values =torch.Tensor( [t["pixel_values"] for t in texts])
 
     #attention_mask = batch_enc["attention_mask"]
     #pixel_values = batch_enc["pixel_values"]
@@ -105,107 +105,3 @@ def evaluation_collate_fn(
 
     return input_ids, attention_mask, pixel_values, images, prefixes, suffixes
 
-
-
-
-
-
-
-# from typing import Optional
-
-# import torch
-# from PIL import Image
-# from torch.utils.data import DataLoader, Dataset
-# from transformers import AutoProcessor
-
-
-# class SmolVLM2Dataset(Dataset):
-#     """Dataset for SmolVLM2 model."""
-
-#     def __init__(
-#         self, image_paths: list[str], texts: Optional[list[str]] = None, processor: Optional[AutoProcessor] = None
-#     ):
-#         """
-#         Initialize dataset.
-
-#         Args:
-#             image_paths: List of paths to images
-#             texts: Optional list of corresponding texts
-#             processor: Model processor for preprocessing
-#         """
-#         self.image_paths = image_paths
-#         self.texts = texts
-#         self.processor = processor
-
-#     def __len__(self) -> int:
-#         return len(self.image_paths)
-
-#     def __getitem__(self, idx: int) -> dict:
-#         """Get a single item from the dataset."""
-#         image = Image.open(self.image_paths[idx])
-
-#         if self.texts is not None:
-#             text = self.texts[idx]
-#         else:
-#             text = ""
-
-#         if self.processor is not None:
-#             return self.processor(images=image, text=text, return_tensors="pt")
-#         else:
-#             return {"image": image, "text": text}
-
-
-# def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-#     """
-#     Collate function for training data.
-
-#     Args:
-#         batch: List of processed samples
-
-#     Returns:
-#         Tuple of (input_ids, pixel_values, labels)
-#     """
-#     input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
-#     pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
-#     labels = torch.stack([item["labels"].squeeze(0) for item in batch])
-
-#     return input_ids, pixel_values, labels
-
-
-# def evaluation_collate_fn(
-#     batch: list[dict],
-# ) -> tuple[torch.Tensor, torch.Tensor, list[Image.Image], list[str], list[str]]:
-#     """
-#     Collate function for evaluation data.
-
-#     Args:
-#         batch: List of processed samples
-
-#     Returns:
-#         Tuple of (input_ids, pixel_values, images, prompts, targets)
-#     """
-#     input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch])
-#     pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch])
-#     images = [item["image"] for item in batch]
-#     prompts = [item["text"] for item in batch]
-#     targets = [item["text"] for item in batch]  # In evaluation, target is same as prompt
-
-#     return input_ids, pixel_values, images, prompts, targets
-
-
-# def create_dataloader(
-#     dataset: Dataset, batch_size: int = 8, num_workers: int = 4, shuffle: bool = True, collate_fn=None
-# ) -> DataLoader:
-#     """
-#     Create a DataLoader for the dataset.
-
-#     Args:
-#         dataset: Dataset to create loader for
-#         batch_size: Batch size
-#         num_workers: Number of worker processes
-#         shuffle: Whether to shuffle the data
-#         collate_fn: Optional collate function
-#     Returns:
-#         DataLoader instance
-#     """
-#     return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn)

From 934e3de064f6847f4996133cd7f5f1728f19d118 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 12:45:00 -0300
Subject: [PATCH 40/92] added debugging print

---
 maestro/trainer/models/smolvlm2/core.py    | 199 ---------------------
 maestro/trainer/models/smolvlm2/loaders.py |   2 +-
 2 files changed, 1 insertion(+), 200 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 424ac63b..418886d2 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -271,202 +271,3 @@ def train(config: SmolVLM2Configuration | dict) -> None:
 
 
 
-
-
-
-
-
-# class SmolVLM2Core:
-#     """Core SmolVLM2 model implementation."""
-
-#     def __init__(
-#         self,
-#         model_name: str = "smol-ai/smolvlm2-500m",
-#         device: str = "cuda" if torch.cuda.is_available() else "cpu",
-#         **kwargs,
-#     ):
-#         """
-#         Initialize SmolVLM2 model.
-
-#         Args:
-#             model_name: Name or path of the model to load
-#             device: Device to run the model on
-#             **kwargs: Additional arguments to pass to the model
-#         """
-#         self.model_name = model_name
-#         self.device = device
-
-#         self.processor = AutoProcessor.from_pretrained(model_name)
-#         self.model = AutoModelForVision2Seq.from_pretrained(model_name)
-#         self.model.to(device)
-
-#     def process_inputs(self, images: Union[str, list[str]], prompt: Optional[str] = None) -> dict:
-#         """Process input images and text."""
-#         if isinstance(images, str):
-#             images = [images]
-
-#         return self.processor(images=images, text=prompt if prompt else "", return_tensors="pt").to(self.device)
-
-#     def generate(self, inputs: dict, max_new_tokens: int = 512, **kwargs) -> torch.Tensor:
-#         """Generate text from processed inputs."""
-#         return self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs)
-
-#     def decode_outputs(self, outputs: torch.Tensor, skip_special_tokens: bool = True) -> list[str]:
-#         """Decode model outputs to text."""
-#         return self.processor.batch_decode(outputs, skip_special_tokens=skip_special_tokens)
-
-
-# def train(config: dict) -> dict:
-#     """
-#     Train SmolVLM2 model with provided configuration.
-
-#     Args:
-#         config: Dictionary containing training configuration
-#             - dataset: Path to dataset directory or file
-#             - epochs: Number of training epochs
-#             - batch_size: Training batch size
-#             - optimization_strategy: Strategy for optimization (qlora, lora, freeze_vision)
-#             - metrics: List of metrics to evaluate during training
-#             - output_dir: Directory to save trained model
-#     Returns:
-#         Dictionary containing training results and metrics
-#     """
-
-#     from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-#     from transformers import BitsAndBytesConfig, TrainingArguments
-
-#     from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
-#     from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn
-
-#     # Load dataset
-#     dataset_path = config["dataset"]
-#     dataset_location = resolve_dataset_path(dataset_path)
-#     if dataset_location is None:
-#         return {"error": "Dataset not found"}
-
-#     # Create model with the specified optimization strategy
-#     model_name = config.get("model_name", "smol-ai/smolvlm2-500m")
-#     strategy = config.get("optimization_strategy", "qlora")
-
-#     if strategy == "qlora":
-#         # Configure QLoRA
-#         bnb_config = BitsAndBytesConfig(
-#             load_in_4bit=True,
-#             bnb_4bit_quant_type="nf4",
-#             bnb_4bit_compute_dtype=torch.float16,
-#             bnb_4bit_use_double_quant=True,
-#         )
-
-#         model = AutoModelForVision2Seq.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
-#         model = prepare_model_for_kbit_training(model)
-
-#         lora_config = LoraConfig(
-#             r=16,
-#             lora_alpha=32,
-#             target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-#             lora_dropout=0.05,
-#             bias="none",
-#             task_type="CAUSAL_LM",
-#         )
-
-#         model = get_peft_model(model, lora_config)
-
-#     elif strategy == "lora":
-#         # Configure LoRA without quantization
-#         model = AutoModelForVision2Seq.from_pretrained(model_name)
-
-#         lora_config = LoraConfig(
-#             r=16,
-#             lora_alpha=32,
-#             target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
-#             lora_dropout=0.05,
-#             bias="none",
-#             task_type="CAUSAL_LM",
-#         )
-
-#         model = get_peft_model(model, lora_config)
-
-#     elif strategy == "freeze_vision":
-#         # Freeze vision encoder, train only language model part
-#         model = AutoModelForVision2Seq.from_pretrained(model_name)
-
-#         # Freeze vision encoder parameters
-#         for param in model.vision_model.parameters():
-#             param.requires_grad = False
-#     else:
-#         raise ValueError(f"Unsupported optimization strategy: {strategy}")
-
-#     # Load processor and datasets
-#     processor = AutoProcessor.from_pretrained(model_name)
-
-#     # Create processor wrapper to preprocess data before collating
-#     def process_batch(batch):
-#         processed_batch = []
-#         for item in batch:
-#             processed_item = processor(images=item.get("image"), text=item.get("text", ""), return_tensors="pt")
-#             processed_batch.append(processed_item)
-#         return processed_batch
-
-#     train_loader, valid_loader, test_loader = create_data_loaders(
-#         dataset_location=dataset_location,
-#         train_batch_size=config.get("batch_size", 4),
-#         train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)),
-#         train_num_workers=config.get("num_workers", 0),
-#         test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
-#         test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)),
-#         test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)),
-#     )
-
-#     # Set up training arguments
-#     output_dir = config.get("output_dir", "./smolvlm2-finetuned")
-#     os.makedirs(output_dir, exist_ok=True)
-
-#     training_args = TrainingArguments(
-#         output_dir=output_dir,
-#         num_train_epochs=config.get("epochs", 10),
-#         per_device_train_batch_size=config.get("batch_size", 4),
-#         per_device_eval_batch_size=config.get("val_batch_size", config.get("batch_size", 4)),
-#         gradient_accumulation_steps=4,
-#         learning_rate=2e-5,
-#         weight_decay=0.01,
-#         warmup_steps=100,
-#         save_strategy="epoch",
-#         save_total_limit=2,
-#         logging_steps=10,
-#         evaluation_strategy="epoch",
-#         load_best_model_at_end=True,
-#         remove_unused_columns=False,
-#     )
-
-#     # Safely handle potential None loaders by directly checking
-#     # train_loader/valid_loader before accessing dataset attribute
-#     train_dataset = None
-#     if train_loader is not None:
-#         train_dataset = train_loader.dataset
-
-#     eval_dataset = None
-#     if valid_loader is not None:
-#         eval_dataset = valid_loader.dataset
-
-#     # Create data_collator that matches the train_collate_fn signature (doesn't pass processor)
-#     trainer = Trainer(
-#         model=model,
-#         args=training_args,
-#         train_dataset=train_dataset,
-#         eval_dataset=eval_dataset,
-#         data_collator=lambda batch: train_collate_fn(process_batch(batch)),
-#     )
-
-#     # Train model
-#     trainer.train()
-
-#     # Save model and processor
-#     model.save_pretrained(output_dir)
-#     processor.save_pretrained(output_dir)
-
-#     # Return results
-#     return {
-#         "model_path": output_dir,
-#         "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"},
-#         "status": "Training completed",
-#     }
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 0bfc8292..3da2ab98 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -96,7 +96,7 @@ def evaluation_collate_fn(
     input_ids = torch.Tensor([t["input_ids"] for t in texts])#batch_enc["input_ids"]
     attention_mask = torch.Tensor([t["attention_mask"] for t in texts])
     pixel_values =torch.Tensor( [t["pixel_values"] for t in texts])
-
+    print(input_ids)
     #attention_mask = batch_enc["attention_mask"]
     #pixel_values = batch_enc["pixel_values"]
 

From 877d79943aae66d80dfae1ff0daf1bd45c79de55 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 12:49:21 -0300
Subject: [PATCH 41/92] changed processor

---
 maestro/trainer/models/smolvlm2/core.py    |  2 +-
 maestro/trainer/models/smolvlm2/loaders.py | 22 ++++++++--------------
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 418886d2..1d2752bb 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -172,7 +172,7 @@ def training_step(self, batch, batch_idx):
         return loss
 
     def validation_step(self, batch, batch_idx):
-        input_ids,attention_mask, pixel_values, images, prefixes, suffixes = batch
+        input_ids,attention_mask, pixel_values, prefixes, suffixes = batch
         generated_suffixes = predict_with_inputs(
             model=self.model,
             processor=self.processor,
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 3da2ab98..44d371d7 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -84,24 +84,18 @@ def evaluation_collate_fn(
         for image, entry in zip(images, data)
     ]
 
-    texts = [processor.apply_chat_template(msg, tokenize=True, 
-                                           add_generation_prompt=True,
-                                           return_dict=True,
-                                            )
-                                            for msg in messages]
-
-    # Tokenize with processor (includes image + text)
-    #batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
-
-    input_ids = torch.Tensor([t["input_ids"] for t in texts])#batch_enc["input_ids"]
-    attention_mask = torch.Tensor([t["attention_mask"] for t in texts])
-    pixel_values =torch.Tensor( [t["pixel_values"] for t in texts])
-    print(input_ids)
+    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
+
+    # Tokenize and encode images
+    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    input_ids = batch_enc["input_ids"]
+    attention_mask = batch_enc["attention_mask"]
+    pixel_values = batch_enc["pixel_values"]
     #attention_mask = batch_enc["attention_mask"]
     #pixel_values = batch_enc["pixel_values"]
 
     prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
 
-    return input_ids, attention_mask, pixel_values, images, prefixes, suffixes
+    return input_ids, attention_mask, pixel_values, prefixes, suffixes
 

From 518c3ca5d5305094437e498a2df1ff2fd2dbbb72 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:07:44 -0300
Subject: [PATCH 42/92] changes in collates

---
 maestro/trainer/models/smolvlm2/inference.py | 3 +--
 maestro/trainer/models/smolvlm2/loaders.py   | 7 +++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 9649ac67..f3dbcfb0 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -36,9 +36,8 @@ def predict_with_inputs(
             pixel_values=pixel_values.to(device),
             max_new_tokens=max_new_tokens,
             do_sample=False,
-            num_beams=3,
         )
-    return processor.batch_decode(generated_ids, skip_special_tokens=False)
+    return processor.batch_decode(generated_ids, skip_special_tokens=True)
 
 
 def predict(
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 44d371d7..a1a79a02 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -40,7 +40,7 @@ def train_collate_fn(
     suffixes = [entry["suffix"] for entry in data]
     
     # Apply chat template WITHOUT tokenization
-    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
+    texts = [processor.apply_chat_template(m,  add_generation_prompt=False) for m in messages]
 
     # Tokenize and encode images
     batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
@@ -61,7 +61,6 @@ def train_collate_fn(
     # Mask prefix tokens: keep only suffix as target
     for i, suffix in enumerate(suffixes):
         suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
-        # Only keep the last len(suffix_ids) tokens in labels
         labels[i, :-len(suffix_ids)] = -100
 
     return input_ids, attention_mask, pixel_values, labels
@@ -85,7 +84,7 @@ def evaluation_collate_fn(
     ]
 
     texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
-
+    print(texts)
     # Tokenize and encode images
     batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]
@@ -96,6 +95,6 @@ def evaluation_collate_fn(
 
     prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
-
+    print(suffixes)
     return input_ids, attention_mask, pixel_values, prefixes, suffixes
 

From 2769c96005c9cdfab5809f759fdf3a6fce683c0d Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:13:29 -0300
Subject: [PATCH 43/92] trying again paligemma collates

---
 maestro/trainer/models/smolvlm2/loaders.py | 82 +++++++---------------
 1 file changed, 25 insertions(+), 57 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index a1a79a02..dae7c7a0 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -29,72 +29,40 @@ def format_data(image, prefix, suffix):
     ]
 
 def train_collate_fn(
-    batch: list[tuple[Image.Image, dict[str, Any]]],
-      processor: AutoProcessor ):
+    batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor, max_length: int = 512
+):
     images, data = zip(*batch)
-
-    messages = [
-        format_data(image, entry["prefix"], entry["suffix"])
-        for image, entry in zip(images, data)
-    ]
+    prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
-    
-    # Apply chat template WITHOUT tokenization
-    texts = [processor.apply_chat_template(m,  add_generation_prompt=False) for m in messages]
-
-    # Tokenize and encode images
-    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
-    input_ids = batch_enc["input_ids"]
-    attention_mask = batch_enc["attention_mask"]
-    pixel_values = batch_enc["pixel_values"]
 
-    # Clone input_ids to labels and mask out everything except suffix
-    labels = input_ids.clone()
+    inputs = processor(
+        text=prefixes,
+        images=images,
+        return_tensors="pt",
+        suffix=suffixes,
+        padding=True,
+        truncation="only_second",
+        max_length=max_length,
+    )
 
-    # Mask pad tokens
-    labels[labels == processor.tokenizer.pad_token_id] = -100
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
+    token_type_ids = inputs["token_type_ids"]
+    pixel_values = inputs["pixel_values"]
+    labels = inputs["labels"]
 
-    # Mask <image> tokens
-    image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
-    labels[labels == image_token_id] = -100
+    return input_ids, attention_mask, token_type_ids, pixel_values, labels
 
-    # Mask prefix tokens: keep only suffix as target
-    for i, suffix in enumerate(suffixes):
-        suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
-        labels[i, :-len(suffix_ids)] = -100
 
-    return input_ids, attention_mask, pixel_values, labels
-def evaluation_collate_fn(
-    batch: list[tuple[Image.Image, dict[str, Any]]],
-    processor: AutoProcessor
-):
+def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor):
     images, data = zip(*batch)
+    prefixes = ["<image>" + entry["prefix"] for entry in data]
+    suffixes = [entry["suffix"] for entry in data]
 
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": entry["prefix"]},
-                ],
-            }
-        ]
-        for image, entry in zip(images, data)
-    ]
+    inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
 
-    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
-    print(texts)
-    # Tokenize and encode images
-    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
-    input_ids = batch_enc["input_ids"]
-    attention_mask = batch_enc["attention_mask"]
-    pixel_values = batch_enc["pixel_values"]
-    #attention_mask = batch_enc["attention_mask"]
-    #pixel_values = batch_enc["pixel_values"]
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
+    pixel_values = inputs["pixel_values"]
 
-    prefixes = ["<image>" + entry["prefix"] for entry in data]
-    suffixes = [entry["suffix"] for entry in data]
-    print(suffixes)
     return input_ids, attention_mask, pixel_values, prefixes, suffixes
-

From 62efcde212f5ec7bf002eeb6dec6121129e14690 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:13:41 -0300
Subject: [PATCH 44/92] trying again paligemma collates

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index dae7c7a0..00fe6f9d 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -51,7 +51,7 @@ def train_collate_fn(
     pixel_values = inputs["pixel_values"]
     labels = inputs["labels"]
 
-    return input_ids, attention_mask, token_type_ids, pixel_values, labels
+    return input_ids, attention_mask, pixel_values, labels
 
 
 def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor):

From 7ebb56dab0a0667e6b5d088abd95c88b5a7a3997 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:14:48 -0300
Subject: [PATCH 45/92] trying again paligemma collates

---
 maestro/trainer/models/smolvlm2/loaders.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 00fe6f9d..99df793c 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -29,7 +29,7 @@ def format_data(image, prefix, suffix):
     ]
 
 def train_collate_fn(
-    batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor, max_length: int = 512
+    batch: list[tuple[Image.Image, dict[str, Any]]], processor
 ):
     images, data = zip(*batch)
     prefixes = ["<image>" + entry["prefix"] for entry in data]
@@ -42,7 +42,6 @@ def train_collate_fn(
         suffix=suffixes,
         padding=True,
         truncation="only_second",
-        max_length=max_length,
     )
 
     input_ids = inputs["input_ids"]

From 434d725fbdd5c08cc718051a05cc3957aa039927 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:16:25 -0300
Subject: [PATCH 46/92] trying again paligemma collates

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 99df793c..9522c678 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -53,7 +53,7 @@ def train_collate_fn(
     return input_ids, attention_mask, pixel_values, labels
 
 
-def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor):
+def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor):
     images, data = zip(*batch)
     prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]

From 537286748885736a1c596104b3cb2e526aadff20 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:29:23 -0300
Subject: [PATCH 47/92] trying again paligemma collates

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 5 +++--
 maestro/trainer/models/smolvlm2/core.py        | 4 +++-
 maestro/trainer/models/smolvlm2/loaders.py     | 8 +++++---
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index 08fa4933..88dadfb2 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -8,6 +8,7 @@
 from maestro.trainer.logger import get_maestro_logger
 from peft import LoraConfig, get_peft_model
 from transformers import BitsAndBytesConfig
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
 
 DEFAULT_SMOLVLM2_MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"#"smol-ai/smolvlm2-500m"
 DEFAULT_SMOLVLM2_MODEL_REVISION = "refs/heads/main"
@@ -122,8 +123,8 @@ def load_model(
         ValueError: If the model or processor cannot be loaded.
     """
     device = parse_device_spec(device)
-    #processor = PaliGemmaProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
-    processor = AutoProcessor.from_pretrained(model_id_or_path)
+    #processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
+    processor = PaliGemmaProcessor.from_pretrained(model_id_or_path)
 
     if optimization_strategy in {OptimizationStrategy.LORA, OptimizationStrategy.QLORA}:
         default_params = DEFAULT_SMOLVLM2_PEFT_PARAMS
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 1d2752bb..4e612208 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -159,10 +159,12 @@ def __init__(
         self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
 
     def training_step(self, batch, batch_idx):
-        input_ids, attention_mask, pixel_values, labels = batch
+        input_ids, attention_mask, token_type_ids, pixel_values, labels = batch
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+
             pixel_values=pixel_values,
             labels=labels,
         )
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 9522c678..a2c66c0c 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -29,7 +29,7 @@ def format_data(image, prefix, suffix):
     ]
 
 def train_collate_fn(
-    batch: list[tuple[Image.Image, dict[str, Any]]], processor
+    batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor, max_length: int = 512
 ):
     images, data = zip(*batch)
     prefixes = ["<image>" + entry["prefix"] for entry in data]
@@ -42,6 +42,7 @@ def train_collate_fn(
         suffix=suffixes,
         padding=True,
         truncation="only_second",
+        max_length=max_length,
     )
 
     input_ids = inputs["input_ids"]
@@ -50,10 +51,11 @@ def train_collate_fn(
     pixel_values = inputs["pixel_values"]
     labels = inputs["labels"]
 
-    return input_ids, attention_mask, pixel_values, labels
+    return input_ids, attention_mask, token_type_ids, pixel_values, labels
 
 
-def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor):
+
+def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor):
     images, data = zip(*batch)
     prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]

From 1681dce47c2e238df4245b4a1ff44251e44421c8 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:36:06 -0300
Subject: [PATCH 48/92] going back to custom collators

---
 maestro/trainer/models/smolvlm2/core.py    |  4 +-
 maestro/trainer/models/smolvlm2/loaders.py | 77 +++++++++++++++-------
 2 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 4e612208..1d2752bb 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -159,12 +159,10 @@ def __init__(
         self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
 
     def training_step(self, batch, batch_idx):
-        input_ids, attention_mask, token_type_ids, pixel_values, labels = batch
+        input_ids, attention_mask, pixel_values, labels = batch
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-
             pixel_values=pixel_values,
             labels=labels,
         )
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index a2c66c0c..8d73b945 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -29,41 +29,68 @@ def format_data(image, prefix, suffix):
     ]
 
 def train_collate_fn(
-    batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor, max_length: int = 512
-):
+    batch: list[tuple[Image.Image, dict[str, Any]]],
+      processor: AutoProcessor ):
     images, data = zip(*batch)
-    prefixes = ["<image>" + entry["prefix"] for entry in data]
+
+    messages = [
+        format_data(image, entry["prefix"], entry["suffix"])
+        for image, entry in zip(images, data)
+    ]
     suffixes = [entry["suffix"] for entry in data]
+    
+    # Apply chat template WITHOUT tokenization
+    texts = [processor.apply_chat_template(m,  add_generation_prompt=False) for m in messages]
 
-    inputs = processor(
-        text=prefixes,
-        images=images,
-        return_tensors="pt",
-        suffix=suffixes,
-        padding=True,
-        truncation="only_second",
-        max_length=max_length,
-    )
+    # Tokenize and encode images
+    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    input_ids = batch_enc["input_ids"]
+    attention_mask = batch_enc["attention_mask"]
+    pixel_values = batch_enc["pixel_values"]
 
-    input_ids = inputs["input_ids"]
-    attention_mask = inputs["attention_mask"]
-    token_type_ids = inputs["token_type_ids"]
-    pixel_values = inputs["pixel_values"]
-    labels = inputs["labels"]
+    # Clone input_ids to labels and mask out everything except suffix
+    labels = input_ids.clone()
 
-    return input_ids, attention_mask, token_type_ids, pixel_values, labels
+    # Mask pad tokens
+    labels[labels == processor.tokenizer.pad_token_id] = -100
 
+    # Mask <image> tokens
+    image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
+    labels[labels == image_token_id] = -100
 
+    # Mask prefix tokens: keep only suffix as target
+    for i, suffix in enumerate(suffixes):
+        suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
+        labels[i, :-len(suffix_ids)] = -100
 
-def evaluation_collate_fn(batch: list[tuple[Image.Image, dict[str, Any]]], processor: PaliGemmaProcessor):
+    return input_ids, attention_mask, pixel_values, labels
+def evaluation_collate_fn(
+    batch: list[tuple[Image.Image, dict[str, Any]]],
+    processor: AutoProcessor
+):
     images, data = zip(*batch)
-    prefixes = ["<image>" + entry["prefix"] for entry in data]
-    suffixes = [entry["suffix"] for entry in data]
 
-    inputs = processor(text=prefixes, images=images, return_tensors="pt", padding=True)
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": entry["prefix"]},
+                ],
+            }
+        ]
+        for image, entry in zip(images, data)
+    ]
 
-    input_ids = inputs["input_ids"]
-    attention_mask = inputs["attention_mask"]
-    pixel_values = inputs["pixel_values"]
+    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
+    # Tokenize and encode images
+    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    input_ids = batch_enc["input_ids"]
+    attention_mask = batch_enc["attention_mask"]
+    pixel_values = batch_enc["pixel_values"]
 
+    prefixes = ["<image>" + entry["prefix"] for entry in data]
+    suffixes = [entry["suffix"] for entry in data]
     return input_ids, attention_mask, pixel_values, prefixes, suffixes
+

From 3698feba5cbc093fd95e6c5ee578ee7cb203cda8 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 13:39:35 -0300
Subject: [PATCH 49/92] going back to custom collators

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index 88dadfb2..304f4608 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -123,8 +123,7 @@ def load_model(
         ValueError: If the model or processor cannot be loaded.
     """
     device = parse_device_spec(device)
-    #processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
-    processor = PaliGemmaProcessor.from_pretrained(model_id_or_path)
+    processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
 
     if optimization_strategy in {OptimizationStrategy.LORA, OptimizationStrategy.QLORA}:
         default_params = DEFAULT_SMOLVLM2_PEFT_PARAMS

From 3afba9816c8e82159988f72b1084f350eee9a911 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:10:50 -0300
Subject: [PATCH 50/92] removing images from processor

---
 maestro/trainer/models/smolvlm2/loaders.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 8d73b945..da38fcaa 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -40,10 +40,10 @@ def train_collate_fn(
     suffixes = [entry["suffix"] for entry in data]
     
     # Apply chat template WITHOUT tokenization
-    texts = [processor.apply_chat_template(m,  add_generation_prompt=False) for m in messages]
+    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
 
     # Tokenize and encode images
-    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    batch_enc = processor(text=texts, return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]
     attention_mask = batch_enc["attention_mask"]
     pixel_values = batch_enc["pixel_values"]
@@ -85,7 +85,7 @@ def evaluation_collate_fn(
 
     texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
     # Tokenize and encode images
-    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    batch_enc = processor(text=texts,  return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]
     attention_mask = batch_enc["attention_mask"]
     pixel_values = batch_enc["pixel_values"]

From 974f15763ab8a99a2f75e32a82664741bef6499a Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:15:16 -0300
Subject: [PATCH 51/92] removing images from processor

---
 maestro/trainer/models/smolvlm2/loaders.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index da38fcaa..51576b1b 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -14,7 +14,7 @@ def format_data(image, prefix, suffix):
             "content": [
                 {
                     "type": "image",
-                    "image": image,
+                    "image": "<image>",
                 },
                 {
                     "type": "text",
@@ -43,7 +43,7 @@ def train_collate_fn(
     texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
 
     # Tokenize and encode images
-    batch_enc = processor(text=texts, return_tensors="pt", padding=True)
+    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]
     attention_mask = batch_enc["attention_mask"]
     pixel_values = batch_enc["pixel_values"]
@@ -75,7 +75,7 @@ def evaluation_collate_fn(
             {
                 "role": "user",
                 "content": [
-                    {"type": "image", "image": image},
+                    {"type": "image", "image": "<image>"},
                     {"type": "text", "text": entry["prefix"]},
                 ],
             }
@@ -85,7 +85,7 @@ def evaluation_collate_fn(
 
     texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
     # Tokenize and encode images
-    batch_enc = processor(text=texts,  return_tensors="pt", padding=True)
+    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]
     attention_mask = batch_enc["attention_mask"]
     pixel_values = batch_enc["pixel_values"]

From a2597395cdaceb539868dfefd0ee778c05c90da4 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:16:20 -0300
Subject: [PATCH 52/92] rollback

---
 maestro/trainer/models/smolvlm2/loaders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 51576b1b..a9d7ce42 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -14,7 +14,7 @@ def format_data(image, prefix, suffix):
             "content": [
                 {
                     "type": "image",
-                    "image": "<image>",
+                    "image": image,
                 },
                 {
                     "type": "text",
@@ -75,7 +75,7 @@ def evaluation_collate_fn(
             {
                 "role": "user",
                 "content": [
-                    {"type": "image", "image": "<image>"},
+                    {"type": "image", "image": image},
                     {"type": "text", "text": entry["prefix"]},
                 ],
             }

From e35e1c4cb01faeb0083e14bc48fb20d05340fdd9 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:25:41 -0300
Subject: [PATCH 53/92] trying to apply chat template to all messages at once

---
 maestro/trainer/models/smolvlm2/loaders.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index a9d7ce42..3337a7f6 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -40,10 +40,11 @@ def train_collate_fn(
     suffixes = [entry["suffix"] for entry in data]
     
     # Apply chat template WITHOUT tokenization
-    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
+    #texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
+    batch_enc = processor.apply_chat_template(messages, tokenize=True)
 
     # Tokenize and encode images
-    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    #batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]
     attention_mask = batch_enc["attention_mask"]
     pixel_values = batch_enc["pixel_values"]
@@ -83,9 +84,11 @@ def evaluation_collate_fn(
         for image, entry in zip(images, data)
     ]
 
-    texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
-    # Tokenize and encode images
-    batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    # texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
+    # # Tokenize and encode images
+    # batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
+    batch_enc = processor.apply_chat_template(messages, tokenize=True)
+
     input_ids = batch_enc["input_ids"]
     attention_mask = batch_enc["attention_mask"]
     pixel_values = batch_enc["pixel_values"]

From d8f58b9482c3e20c954675d9adce602eb9187836 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:27:39 -0300
Subject: [PATCH 54/92] added debug print

---
 maestro/trainer/models/smolvlm2/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 3337a7f6..69a8d045 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -42,7 +42,7 @@ def train_collate_fn(
     # Apply chat template WITHOUT tokenization
     #texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
     batch_enc = processor.apply_chat_template(messages, tokenize=True)
-
+    print(batch_enc)
     # Tokenize and encode images
     #batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]

From d598759b36bf205edcb6febc41c93a06cd9bb42f Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:29:13 -0300
Subject: [PATCH 55/92] added debug print

---
 maestro/trainer/models/smolvlm2/loaders.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 69a8d045..10ae7bb4 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -88,6 +88,7 @@ def evaluation_collate_fn(
     # # Tokenize and encode images
     # batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     batch_enc = processor.apply_chat_template(messages, tokenize=True)
+    print(batch_enc)
 
     input_ids = batch_enc["input_ids"]
     attention_mask = batch_enc["attention_mask"]

From 09e749830a2cb00b263704fb403ea2e43b5f8c5e Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:39:10 -0300
Subject: [PATCH 56/92] testing different things in valid step

---
 maestro/trainer/models/smolvlm2/core.py    | 23 +++++++++++++---------
 maestro/trainer/models/smolvlm2/loaders.py | 11 +++++------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 1d2752bb..adb73366 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -172,15 +172,20 @@ def training_step(self, batch, batch_idx):
         return loss
 
     def validation_step(self, batch, batch_idx):
-        input_ids,attention_mask, pixel_values, prefixes, suffixes = batch
-        generated_suffixes = predict_with_inputs(
-            model=self.model,
-            processor=self.processor,
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            pixel_values=pixel_values,
-            device=self.config.device,
-            max_new_tokens=self.config.max_new_tokens,
+        inputs, prefixes, suffixes = batch
+        # generated_suffixes = predict_with_inputs(
+        #     model=self.model,
+        #     processor=self.processor,
+        #     input_ids=input_ids,
+        #     attention_mask=attention_mask,
+        #     pixel_values=pixel_values,
+        #     device=self.config.device,
+        #     max_new_tokens=self.config.max_new_tokens,
+        # )
+        generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=64)
+        generated_suffixes = self.processor.batch_decode(
+            generated_ids,
+            skip_special_tokens=True,
         )
         print("generated_suffixes", generated_suffixes)
         if batch_idx == 0:
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 10ae7bb4..fbc2cde3 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -42,7 +42,6 @@ def train_collate_fn(
     # Apply chat template WITHOUT tokenization
     #texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
     batch_enc = processor.apply_chat_template(messages, tokenize=True)
-    print(batch_enc)
     # Tokenize and encode images
     #batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     input_ids = batch_enc["input_ids"]
@@ -88,13 +87,13 @@ def evaluation_collate_fn(
     # # Tokenize and encode images
     # batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     batch_enc = processor.apply_chat_template(messages, tokenize=True)
-    print(batch_enc)
+    #print(batch_enc)
 
-    input_ids = batch_enc["input_ids"]
-    attention_mask = batch_enc["attention_mask"]
-    pixel_values = batch_enc["pixel_values"]
+    # input_ids = batch_enc["input_ids"]
+    # attention_mask = batch_enc["attention_mask"]
+    # pixel_values = batch_enc["pixel_values"]
 
     prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
-    return input_ids, attention_mask, pixel_values, prefixes, suffixes
+    return batch_enc, prefixes, suffixes
 

From b20455d3e157ffdbacdeab3b86a71b6fb907e218 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:41:21 -0300
Subject: [PATCH 57/92] testing different things in valid step

---
 maestro/trainer/models/smolvlm2/loaders.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index fbc2cde3..7883eab0 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -86,7 +86,8 @@ def evaluation_collate_fn(
     # texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
     # # Tokenize and encode images
     # batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
-    batch_enc = processor.apply_chat_template(messages, tokenize=True)
+    batch_enc = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
+            return_tensors="pt",)
     #print(batch_enc)
 
     # input_ids = batch_enc["input_ids"]

From 52994612d30138725dca2b2b67dec97a0a673be4 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:52:55 -0300
Subject: [PATCH 58/92] testing different things in train step

---
 maestro/trainer/models/smolvlm2/loaders.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 7883eab0..b2bdf483 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -41,15 +41,11 @@ def train_collate_fn(
     
     # Apply chat template WITHOUT tokenization
     #texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
-    batch_enc = processor.apply_chat_template(messages, tokenize=True)
-    # Tokenize and encode images
-    #batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
-    input_ids = batch_enc["input_ids"]
-    attention_mask = batch_enc["attention_mask"]
-    pixel_values = batch_enc["pixel_values"]
+    batch_enc = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
+            return_tensors="pt",)
 
     # Clone input_ids to labels and mask out everything except suffix
-    labels = input_ids.clone()
+    labels = batch_enc.input_ids.clone()
 
     # Mask pad tokens
     labels[labels == processor.tokenizer.pad_token_id] = -100
@@ -63,7 +59,7 @@ def train_collate_fn(
         suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
         labels[i, :-len(suffix_ids)] = -100
 
-    return input_ids, attention_mask, pixel_values, labels
+    return batch_enc, labels
 def evaluation_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],
     processor: AutoProcessor

From 476e120fb4de9002f063055659d9221e500d6ba4 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 16:54:53 -0300
Subject: [PATCH 59/92] testing different things in train step

---
 maestro/trainer/models/smolvlm2/core.py    | 6 ++----
 maestro/trainer/models/smolvlm2/loaders.py | 4 ++--
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index adb73366..c8ddf84e 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -159,11 +159,9 @@ def __init__(
         self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
 
     def training_step(self, batch, batch_idx):
-        input_ids, attention_mask, pixel_values, labels = batch
+        inputs, labels = batch
         outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            pixel_values=pixel_values,
+            **inputs,
             labels=labels,
         )
         loss = outputs.loss
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index b2bdf483..635a9bd8 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -42,7 +42,7 @@ def train_collate_fn(
     # Apply chat template WITHOUT tokenization
     #texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
     batch_enc = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
-            return_tensors="pt",)
+            return_tensors="pt",padding = True)
 
     # Clone input_ids to labels and mask out everything except suffix
     labels = batch_enc.input_ids.clone()
@@ -83,7 +83,7 @@ def evaluation_collate_fn(
     # # Tokenize and encode images
     # batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
     batch_enc = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
-            return_tensors="pt",)
+            return_tensors="pt",padding = True)
     #print(batch_enc)
 
     # input_ids = batch_enc["input_ids"]

From b488446a068b05d74b33083b0e6c925dca557ddc Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 17:43:55 -0300
Subject: [PATCH 60/92] validation using predict with inputs

---
 maestro/trainer/models/smolvlm2/core.py       |  21 +--
 maestro/trainer/models/smolvlm2/entrypoint.py | 133 ------------------
 maestro/trainer/models/smolvlm2/inference.py  |  77 +---------
 maestro/trainer/models/smolvlm2/loaders.py    |  20 +--
 4 files changed, 13 insertions(+), 238 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index c8ddf84e..665688ac 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -171,21 +171,12 @@ def training_step(self, batch, batch_idx):
 
     def validation_step(self, batch, batch_idx):
         inputs, prefixes, suffixes = batch
-        # generated_suffixes = predict_with_inputs(
-        #     model=self.model,
-        #     processor=self.processor,
-        #     input_ids=input_ids,
-        #     attention_mask=attention_mask,
-        #     pixel_values=pixel_values,
-        #     device=self.config.device,
-        #     max_new_tokens=self.config.max_new_tokens,
-        # )
-        generated_ids = self.model.generate(**inputs, do_sample=False, max_new_tokens=64)
-        generated_suffixes = self.processor.batch_decode(
-            generated_ids,
-            skip_special_tokens=True,
-        )
-        print("generated_suffixes", generated_suffixes)
+
+        generated_suffixes = predict_with_inputs(self.model,
+                                            self.processor,
+                                            inputs,
+                                            max_new_tokens=self.config.max_new_tokens )
+
         if batch_idx == 0:
             logger.info(f"sample valid prefix: {prefixes[0]}")
             logger.info(f"sample valid suffix: {suffixes[0]}")
diff --git a/maestro/trainer/models/smolvlm2/entrypoint.py b/maestro/trainer/models/smolvlm2/entrypoint.py
index d6c09dd7..8205e664 100644
--- a/maestro/trainer/models/smolvlm2/entrypoint.py
+++ b/maestro/trainer/models/smolvlm2/entrypoint.py
@@ -109,136 +109,3 @@ def parse_lora_params(param_str) -> dict[str, Any]:
     rich.print(dataclasses.asdict(config))
     smolvlm2_train(config=config)
 
-
-
-
-
-
-
-
-
-
-# from pathlib import Path
-# from typing import Optional, Union
-
-# import torch
-# import typer
-
-# from .inference import SmolVLM2Inference
-
-# smolvlm2_app = typer.Typer()
-
-
-# class SmolVLM2:
-#     """Main entrypoint for SmolVLM2 model."""
-
-#     def __init__(
-#         self,
-#         model_name: str = "smol-ai/smolvlm2-500m",
-#         device: str = "cuda" if torch.cuda.is_available() else "cpu",
-#         **kwargs,
-#     ):
-#         """Initialize SmolVLM2 model."""
-#         self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs)
-
-#     def generate(
-#         self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs
-#     ) -> dict:
-#         """
-#         Generate text from images.
-
-#         Args:
-#             images: Path(s) to image(s)
-#             prompt: Optional prompt to guide generation
-#             max_new_tokens: Maximum number of tokens to generate
-#             **kwargs: Additional generation parameters
-
-#         Returns:
-#             Dictionary containing generated text and other outputs
-#         """
-#         return self.inference.generate(images=images, prompt=prompt, max_new_tokens=max_new_tokens, **kwargs)
-
-
-# @smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model")
-# def info() -> None:
-#     """Get information about the SmolVLM2 model."""
-#     try:
-#         model = SmolVLM2()
-#         info = model.inference.get_model_info()
-#         typer.echo(f"Model Name: {info['model_name']}")
-#         typer.echo(f"Model Size: {info['model_size']}")
-#         typer.echo(f"Device: {info['device']}")
-#         typer.echo(f"Tokenizer: {info['tokenizer']}")
-#     except Exception as e:
-#         typer.echo(f"Error retrieving model info: {e!s}", err=True)
-#         raise typer.Exit(code=1)
-
-
-# @smolvlm2_app.command(name="predict", help="Run inference on one or more images")
-# def predict(
-#     image: list[Path] = typer.Option(..., "--image", "-i", help="Path to image(s) for prediction"),
-#     prompt: Optional[str] = typer.Option(None, "--prompt", "-p", help="Optional prompt to guide generation"),
-#     max_new_tokens: int = typer.Option(512, "--max-new-tokens", help="Maximum new tokens to generate"),
-#     output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output file path to save results"),
-# ) -> None:
-#     """Run inference on images using SmolVLM2."""
-#     try:
-#         model = SmolVLM2()
-#         result = model.generate(images=[str(img) for img in image], prompt=prompt, max_new_tokens=max_new_tokens)
-
-#         if output:
-#             import json
-
-#             with open(output, "w") as f:
-#                 json.dump(result, f, indent=2)
-#             typer.echo(f"Results saved to {output}")
-#         else:
-#             typer.echo(f"Generated text: {result['text']}")
-
-#     except Exception as e:
-#         typer.echo(f"Error during prediction: {e!s}", err=True)
-#         raise typer.Exit(code=1)
-
-
-# @smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model")
-# def train(
-#     dataset: Path = typer.Option(..., "--dataset", "-d", help="Path to dataset directory or file"),
-#     epochs: int = typer.Option(10, "--epochs", "-e", help="Number of training epochs"),
-#     batch_size: int = typer.Option(4, "--batch-size", "-b", help="Training batch size"),
-#     optimization_strategy: str = typer.Option(
-#         "qlora", "--optimization-strategy", "-o", help="Optimization strategy (qlora, lora, freeze_vision)"
-#     ),
-#     metrics: list[str] = typer.Option(["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"),
-#     output_dir: Optional[Path] = typer.Option(None, "--output-dir", help="Directory to save trained model"),
-# ) -> None:
-#     """Fine-tune the SmolVLM2 model on a dataset."""
-#     try:
-#         typer.echo("Starting SmolVLM2 fine-tuning...")
-
-#         if output_dir is None:
-#             import tempfile
-
-#             output_dir = Path(tempfile.mkdtemp())
-#             typer.echo(f"No output directory specified, using temporary directory: {output_dir}")
-
-#         # Create configuration for training
-#         config = {
-#             "dataset": str(dataset),
-#             "epochs": epochs,
-#             "batch_size": batch_size,
-#             "optimization_strategy": optimization_strategy,
-#             "metrics": metrics,
-#             "output_dir": str(output_dir),
-#         }
-
-#         # Import the train function here to avoid circular imports
-#         from .core import train as train_model
-
-#         results = train_model(config)
-
-#         typer.echo(f"Training complete! Model saved to {output_dir}")
-#         typer.echo(f"Final metrics: {results.get('metrics', {})}")
-
-#     except Exception as e:
-#         typer.echo(f"Error during training: {e!s}", err=True)
-#         raise typer.Exit(code=1)
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index f3dbcfb0..78b91a37 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -9,8 +9,7 @@
 def predict_with_inputs(
     model: AutoModelForImageTextToText,
     processor: AutoProcessor,
-    input_ids: torch.Tensor,
-    pixel_values: torch.Tensor,
+    inputs: torch.Tensor,
     device: Union[str, torch.device],
     max_new_tokens: int = 512,
     **kwargs,
@@ -32,81 +31,9 @@ def predict_with_inputs(
     """
     with torch.no_grad():
         generated_ids = model.generate(
-            input_ids=input_ids.to(device),
-            pixel_values=pixel_values.to(device),
+            **inputs,
             max_new_tokens=max_new_tokens,
             do_sample=False,
         )
     return processor.batch_decode(generated_ids, skip_special_tokens=True)
 
-
-def predict(
-    model: AutoModelForImageTextToText,
-    processor: AutoProcessor,
-    image: Image.Image,
-    prefix: str,
-    device: str | torch.device = "auto",
-    max_new_tokens: int = 1024,
-) -> str:
-    """Generate a text prediction for a single image and text prefix.
-
-    Args:
-        model (AutoModelForImageTextToText): The Florence-2 model for conditional text generation.
-        processor (AutoProcessor): Processor for model inputs and outputs, handling tokenization and decoding.
-        image (str | bytes | Image.Image): Input image as a file path, raw bytes, or a PIL Image.
-        prefix (str): Text prefix to condition the generated output.
-        device (str | torch.device): Device on which to run inference (e.g., "auto", "cpu", "cuda").
-        max_new_tokens (int): Maximum number of tokens to generate.
-
-    Returns:
-        str: The generated text prediction.
-    """
-    device = parse_device_spec(device)
-    inputs = processor(text=prefix, images=image, return_tensors="pt", padding=True)
-    return predict_with_inputs(
-        input_ids=inputs["input_ids"],
-        pixel_values=inputs["pixel_values"],
-        model=model,
-        processor=processor,
-        device=device,
-        max_new_tokens=max_new_tokens,
-    )[0]
-
-# def predict_with_images(
-#     model: AutoModelForImageTextToText,
-#     processor: AutoProcessor,
-#     images: Union[str, list[str]],
-#     prompt: Optional[str] = None,
-#     device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu",
-#     max_new_tokens: int = 512,
-#     **kwargs,
-# ) -> list[str]:
-#     """
-#     Generate text predictions from images.
-
-#     Args:
-#         model: The SmolVLM2 model
-#         processor: The model's processor
-#         images: Path(s) to image(s)
-#         prompt: Optional prompt to guide generation
-#         device: Device to run inference on
-#         max_new_tokens: Maximum number of tokens to generate
-#         **kwargs: Additional generation parameters
-
-#     Returns:
-#         List of generated text strings
-#     """
-#     if isinstance(images, str):
-#         images = [images]
-
-#     inputs = processor(images=images, text=prompt if prompt else "", return_tensors="pt")
-
-#     return predict_with_inputs(
-#         model=model,
-#         processor=processor,
-#         input_ids=inputs["input_ids"],
-#         pixel_values=inputs["pixel_values"],
-#         device=device,
-#         max_new_tokens=max_new_tokens,
-#         **kwargs,
-#     )
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 635a9bd8..7e0efa50 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -39,13 +39,11 @@ def train_collate_fn(
     ]
     suffixes = [entry["suffix"] for entry in data]
     
-    # Apply chat template WITHOUT tokenization
-    #texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
-    batch_enc = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
+    inputs = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
             return_tensors="pt",padding = True)
 
     # Clone input_ids to labels and mask out everything except suffix
-    labels = batch_enc.input_ids.clone()
+    labels = inputs.input_ids.clone()
 
     # Mask pad tokens
     labels[labels == processor.tokenizer.pad_token_id] = -100
@@ -59,7 +57,7 @@ def train_collate_fn(
         suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
         labels[i, :-len(suffix_ids)] = -100
 
-    return batch_enc, labels
+    return inputs, labels
 def evaluation_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],
     processor: AutoProcessor
@@ -78,19 +76,11 @@ def evaluation_collate_fn(
         ]
         for image, entry in zip(images, data)
     ]
-
-    # texts = [processor.apply_chat_template(m, tokenize=False) for m in messages]
-    # # Tokenize and encode images
-    # batch_enc = processor(text=texts, images=images, return_tensors="pt", padding=True)
-    batch_enc = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
+    inputs = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
             return_tensors="pt",padding = True)
-    #print(batch_enc)
 
-    # input_ids = batch_enc["input_ids"]
-    # attention_mask = batch_enc["attention_mask"]
-    # pixel_values = batch_enc["pixel_values"]
 
     prefixes = ["<image>" + entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
-    return batch_enc, prefixes, suffixes
+    return inputs, prefixes, suffixes
 

From a600a1e51aca8e1114c034414e00d7c1b99dca2d Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 17:54:57 -0300
Subject: [PATCH 61/92] added device

---
 maestro/trainer/models/smolvlm2/core.py      | 1 +
 maestro/trainer/models/smolvlm2/inference.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 665688ac..45d56616 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -175,6 +175,7 @@ def validation_step(self, batch, batch_idx):
         generated_suffixes = predict_with_inputs(self.model,
                                             self.processor,
                                             inputs,
+                                            device = self.config.device,
                                             max_new_tokens=self.config.max_new_tokens )
 
         if batch_idx == 0:
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 78b91a37..062e6d2e 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -31,7 +31,7 @@ def predict_with_inputs(
     """
     with torch.no_grad():
         generated_ids = model.generate(
-            **inputs,
+            **inputs.to(device),
             max_new_tokens=max_new_tokens,
             do_sample=False,
         )

From 4667b3aa9ffa5d4aeadb6ba858f2fc7da44d38c7 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 18:33:20 -0300
Subject: [PATCH 62/92] nicer code

---
 maestro/trainer/models/smolvlm2/checkpoints.py | 15 +++++++--------
 maestro/trainer/models/smolvlm2/core.py        |  4 ----
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index 304f4608..a6625a77 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -8,7 +8,6 @@
 from maestro.trainer.logger import get_maestro_logger
 from peft import LoraConfig, get_peft_model
 from transformers import BitsAndBytesConfig
-from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
 
 DEFAULT_SMOLVLM2_MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"#"smol-ai/smolvlm2-500m"
 DEFAULT_SMOLVLM2_MODEL_REVISION = "refs/heads/main"
@@ -54,13 +53,13 @@ def save_model(
     model: AutoModelForImageTextToText,
 ) -> None:
     """
-    Save a PaliGemma 2 model and its processor to disk.
+    Save a SmolVLM 2 model and its processor to disk.
 
     Args:
         target_dir: Directory path where the model and processor will be saved.
             Will be created if it doesn't exist.
-        processor: The PaliGemma 2 processor to save.
-        model: The PaliGemma 2model to save.
+        processor: The SmolVLM 2 processor to save.
+        model: The SmolVLM 2model to save.
     """
     os.makedirs(target_dir, exist_ok=True)
     processor.save_pretrained(target_dir)
@@ -105,7 +104,7 @@ def load_model(
     peft_advanced_params: Optional[dict] = None,
     cache_dir: Optional[str] = None,
 ) -> tuple[AutoProcessor, AutoModelForImageTextToText]:
-    """Loads a PaliGemma 2 model and its associated processor.
+    """Loads a SmolVLM 2 model and its associated processor.
 
     Args:
         model_id_or_path (str): The identifier or path of the model to load.
@@ -116,7 +115,7 @@ def load_model(
         cache_dir (Optional[str]): Directory to cache the downloaded model files.
 
     Returns:
-        (PaliGemmaProcessor, PaliGemmaForConditionalGeneration):
+        (SmolVLM2Processor, SmolVLM2ForConditionalGeneration):
             A tuple containing the loaded processor and model.
 
     Raises:
@@ -125,6 +124,7 @@ def load_model(
     device = parse_device_spec(device)
     processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
 
+    # TODO: QLORA IS NOT WORKING, MAYBE THE SOLUTION IS CAST THE INPUTS TO blfloat16
     if optimization_strategy in {OptimizationStrategy.LORA, OptimizationStrategy.QLORA}:
         default_params = DEFAULT_SMOLVLM2_PEFT_PARAMS
         if peft_advanced_params is not None:
@@ -171,10 +171,9 @@ def load_model(
             for param in model.model.vision_model.parameters():
                 param.requires_grad = False
 
+            # TODO: check if there are more weights to freeze, like:
             # for param in model.multi_modal_projector.parameters():
             #     param.requires_grad = False
 
-
-
     return processor, model
 
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 45d56616..eeb78049 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -207,10 +207,6 @@ def on_fit_end(self) -> None:
         )
 
 
-
-
-
-
 def train(config: SmolVLM2Configuration | dict) -> None:
     if isinstance(config, dict):
         config = dacite.from_dict(data_class=SmolVLM2Configuration, data=config)

From 508f397bac5bb948ae2d159807359fc85e153a2e Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 19:23:09 -0300
Subject: [PATCH 63/92] added predict

---
 maestro/trainer/models/smolvlm2/inference.py | 27 ++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 062e6d2e..c5f991be 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -37,3 +37,30 @@ def predict_with_inputs(
         )
     return processor.batch_decode(generated_ids, skip_special_tokens=True)
 
+def predict(
+    model: AutoModelForImageTextToText,
+    processor: AutoProcessor,
+    image: str | bytes | Image.Image,
+    prefix: str,
+    device: str | torch.device = "auto",
+    max_new_tokens: int = 1024,
+) -> str:
+    """Generate a text prediction for a single image and text prefix.
+
+    Args:
+        model (AutoModelForImageTextToText): The PaliGemma model for generation.
+        processor (AutoProcessor): Tokenizer and processor for model inputs/outputs.
+        image (str | bytes | Image.Image): Input image as a file path, bytes, or PIL Image.
+        prefix (str): Text prefix to condition the generation.
+        device (str | torch.device): Device to run inference on.
+        max_new_tokens (int): Maximum number of new tokens to generate.
+
+    Returns:
+        str: Generated text prediction.
+    """
+    device = parse_device_spec(device)
+    text = "<image>" + prefix
+    inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
+    return predict_with_inputs(
+        **inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens
+    )[0]
\ No newline at end of file

From ed27b5a6d30aececb83945c615518c8393a2dcf7 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 19:24:13 -0300
Subject: [PATCH 64/92] added predict

---
 maestro/trainer/models/smolvlm2/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index c5f991be..52c030b4 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -62,5 +62,5 @@ def predict(
     text = "<image>" + prefix
     inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
     return predict_with_inputs(
-        **inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens
+        inputs = inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens
     )[0]
\ No newline at end of file

From f9f6c2f6adc3ae19c070527a9489047f39f35835 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 19:29:19 -0300
Subject: [PATCH 65/92] added predict

---
 maestro/trainer/models/smolvlm2/inference.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 52c030b4..a9d525cc 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -59,8 +59,23 @@ def predict(
         str: Generated text prediction.
     """
     device = parse_device_spec(device)
-    text = "<image>" + prefix
-    inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
+    messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image":image},
+            {"type": "text", "text": prefix},
+        ]
+    },
+    ]
+    inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device, dtype=torch.bfloat16)
+    #inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
     return predict_with_inputs(
         inputs = inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens
     )[0]
\ No newline at end of file

From e4fed49a94c38881d348947a58ae054e7221b54e Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Thu, 29 May 2025 19:33:44 -0300
Subject: [PATCH 66/92] added predict

---
 maestro/trainer/models/smolvlm2/inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index a9d525cc..c681377a 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -74,7 +74,7 @@ def predict(
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
-        ).to(model.device, dtype=torch.bfloat16)
+        )#.to(model.device, dtype=torch.bfloat16)
     #inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
     return predict_with_inputs(
         inputs = inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens

From d7e40f7031364f50af221901fc1d80a4f2b1b3ae Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 09:34:52 -0300
Subject: [PATCH 67/92] updated predict with input to only return the suffix

---
 maestro/trainer/models/smolvlm2/inference.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index c681377a..06ee59fa 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -35,6 +35,10 @@ def predict_with_inputs(
             max_new_tokens=max_new_tokens,
             do_sample=False,
         )
+        prefix_length = inputs['input_ids'].shape[-1]
+
+        generated_ids = generated_ids[:, prefix_length:]
+
     return processor.batch_decode(generated_ids, skip_special_tokens=True)
 
 def predict(

From 94d905b4b51017b26a3c628043be4ccac6405de8 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 10:08:44 -0300
Subject: [PATCH 68/92] fix validation output

---
 maestro/trainer/models/smolvlm2/inference.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 06ee59fa..23d58480 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -35,11 +35,11 @@ def predict_with_inputs(
             max_new_tokens=max_new_tokens,
             do_sample=False,
         )
-        prefix_length = inputs['input_ids'].shape[-1]
+        # Trim the generated ids to remove the input ids
+        generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
 
-        generated_ids = generated_ids[:, prefix_length:]
 
-    return processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 
 def predict(
     model: AutoModelForImageTextToText,
@@ -79,7 +79,6 @@ def predict(
             return_dict=True,
             return_tensors="pt",
         )#.to(model.device, dtype=torch.bfloat16)
-    #inputs = processor(text=text, images=image, return_tensors="pt", padding=True)
     return predict_with_inputs(
         inputs = inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens
     )[0]
\ No newline at end of file

From ba855a7f1974975a54dbca80b6e45f5f14d07d46 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 11:46:39 -0300
Subject: [PATCH 69/92] trying collator from HF

---
 maestro/trainer/models/smolvlm2/core.py      | 14 ++++++++++----
 maestro/trainer/models/smolvlm2/inference.py |  2 +-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index eeb78049..f7e8cd65 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -45,6 +45,7 @@
     result_to_detections_formatter,
 )
 logger = get_maestro_logger()
+from transformers import DataCollatorForCompletionOnlyLM
 
 
 @dataclass()
@@ -159,10 +160,13 @@ def __init__(
         self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
 
     def training_step(self, batch, batch_idx):
-        inputs, labels = batch
+        batch = {k: v.to(self.config.device) for k, v in batch.items()}
+
+        # Forward pass
         outputs = self.model(
-            **inputs,
-            labels=labels,
+            input_ids=batch["input_ids"],
+            attention_mask=batch["attention_mask"],
+            labels=batch["labels"],
         )
         loss = outputs.loss
         self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=self.config.batch_size)
@@ -227,10 +231,12 @@ def train(config: SmolVLM2Configuration | dict) -> None:
     dataset_location = resolve_dataset_path(config.dataset)
     if dataset_location is None:
         return
+    response_template = "### Assistant:"
+    data_collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
     train_loader, valid_loader, test_loader = create_data_loaders(
         dataset_location=dataset_location,
         train_batch_size=config.batch_size,
-        train_collect_fn=partial(train_collate_fn, processor=processor),
+        train_collect_fn=partial(data_collator, processor=processor),
         train_num_workers=config.num_workers,
         test_batch_size=config.val_batch_size,
         test_collect_fn=partial(evaluation_collate_fn, processor=processor),
diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
index 23d58480..4dd9fa49 100644
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ b/maestro/trainer/models/smolvlm2/inference.py
@@ -77,7 +77,7 @@ def predict(
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
-            return_tensors="pt",
+            return_tensors="pt",    
         )#.to(model.device, dtype=torch.bfloat16)
     return predict_with_inputs(
         inputs = inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens

From 2b13c6964e29f73db4b91944777fc49a8fb3ff44 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 11:53:38 -0300
Subject: [PATCH 70/92] trying collator from HF

---
 maestro/trainer/models/smolvlm2/core.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index f7e8cd65..d54c64cd 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -45,7 +45,7 @@
     result_to_detections_formatter,
 )
 logger = get_maestro_logger()
-from transformers import DataCollatorForCompletionOnlyLM
+from trl import  DataCollatorForCompletionOnlyLM
 
 
 @dataclass()
@@ -232,11 +232,12 @@ def train(config: SmolVLM2Configuration | dict) -> None:
     if dataset_location is None:
         return
     response_template = "### Assistant:"
-    data_collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)
+    data_collator = DataCollatorForCompletionOnlyLM(response_template=response_template,
+                                                     tokenizer=processor.tokenizer)
     train_loader, valid_loader, test_loader = create_data_loaders(
         dataset_location=dataset_location,
         train_batch_size=config.batch_size,
-        train_collect_fn=partial(data_collator, processor=processor),
+        train_collect_fn=data_collator,
         train_num_workers=config.num_workers,
         test_batch_size=config.val_batch_size,
         test_collect_fn=partial(evaluation_collate_fn, processor=processor),

From 00f3860eb28ab1878ba5295a3bdf6a359697d6b7 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 12:03:12 -0300
Subject: [PATCH 71/92] rollback

---
 maestro/trainer/models/smolvlm2/core.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index d54c64cd..677e870f 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -45,7 +45,6 @@
     result_to_detections_formatter,
 )
 logger = get_maestro_logger()
-from trl import  DataCollatorForCompletionOnlyLM
 
 
 @dataclass()
@@ -160,13 +159,10 @@ def __init__(
         self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
 
     def training_step(self, batch, batch_idx):
-        batch = {k: v.to(self.config.device) for k, v in batch.items()}
-
-        # Forward pass
+        inputs, labels = batch
         outputs = self.model(
-            input_ids=batch["input_ids"],
-            attention_mask=batch["attention_mask"],
-            labels=batch["labels"],
+            **inputs,
+            labels=labels,
         )
         loss = outputs.loss
         self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=self.config.batch_size)
@@ -231,13 +227,11 @@ def train(config: SmolVLM2Configuration | dict) -> None:
     dataset_location = resolve_dataset_path(config.dataset)
     if dataset_location is None:
         return
-    response_template = "### Assistant:"
-    data_collator = DataCollatorForCompletionOnlyLM(response_template=response_template,
-                                                     tokenizer=processor.tokenizer)
+            
     train_loader, valid_loader, test_loader = create_data_loaders(
         dataset_location=dataset_location,
         train_batch_size=config.batch_size,
-        train_collect_fn=data_collator,
+        train_collect_fn=partial(train_collate_fn, processor=processor),
         train_num_workers=config.num_workers,
         test_batch_size=config.val_batch_size,
         test_collect_fn=partial(evaluation_collate_fn, processor=processor),

From 6c8129898b0d294e5672d4077fb32f9b6b6f84b1 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 12:28:20 -0300
Subject: [PATCH 72/92] possible fix for masking

---
 maestro/trainer/models/smolvlm2/loaders.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 7e0efa50..afea7120 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -55,9 +55,17 @@ def train_collate_fn(
     # Mask prefix tokens: keep only suffix as target
     for i, suffix in enumerate(suffixes):
         suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
-        labels[i, :-len(suffix_ids)] = -100
+        #labels[i, :-len(suffix_ids)] = -100
+
+        # Try to find the start index of the suffix tokens in the full input sequence
+        sequence = inputs.input_ids[i].tolist()
+        for j in range(len(sequence) - len(suffix_ids) + 1):
+            if sequence[j:j + len(suffix_ids)] == suffix_ids:
+                labels[i, :j] = -100
+                labels[i, j + len(suffix_ids):] = -100
+                break
+        return inputs, labels
 
-    return inputs, labels
 def evaluation_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],
     processor: AutoProcessor

From 1e13d190c2f4b91569671ef0ba0d4a45b7b261cd Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 12:39:34 -0300
Subject: [PATCH 73/92] bug fix for masking

---
 maestro/trainer/models/smolvlm2/loaders.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index afea7120..403334e5 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -55,8 +55,6 @@ def train_collate_fn(
     # Mask prefix tokens: keep only suffix as target
     for i, suffix in enumerate(suffixes):
         suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
-        #labels[i, :-len(suffix_ids)] = -100
-
         # Try to find the start index of the suffix tokens in the full input sequence
         sequence = inputs.input_ids[i].tolist()
         for j in range(len(sequence) - len(suffix_ids) + 1):
@@ -64,7 +62,7 @@ def train_collate_fn(
                 labels[i, :j] = -100
                 labels[i, j + len(suffix_ids):] = -100
                 break
-        return inputs, labels
+    return inputs, labels
 
 def evaluation_collate_fn(
     batch: list[tuple[Image.Image, dict[str, Any]]],

From e92064e189bd472510266d3ef8ccbebe179c6ba1 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 12:49:14 -0300
Subject: [PATCH 74/92] debug print

---
 maestro/trainer/models/smolvlm2/loaders.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 403334e5..830249b1 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -59,6 +59,7 @@ def train_collate_fn(
         sequence = inputs.input_ids[i].tolist()
         for j in range(len(sequence) - len(suffix_ids) + 1):
             if sequence[j:j + len(suffix_ids)] == suffix_ids:
+                print("here")
                 labels[i, :j] = -100
                 labels[i, j + len(suffix_ids):] = -100
                 break

From 57634e09ad51a456904159565bcb1496b84cb398 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 12:53:53 -0300
Subject: [PATCH 75/92] trying without masking the input

---
 maestro/trainer/models/smolvlm2/loaders.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 830249b1..839bdd47 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -52,17 +52,16 @@ def train_collate_fn(
     image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
     labels[labels == image_token_id] = -100
 
-    # Mask prefix tokens: keep only suffix as target
-    for i, suffix in enumerate(suffixes):
-        suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
-        # Try to find the start index of the suffix tokens in the full input sequence
-        sequence = inputs.input_ids[i].tolist()
-        for j in range(len(sequence) - len(suffix_ids) + 1):
-            if sequence[j:j + len(suffix_ids)] == suffix_ids:
-                print("here")
-                labels[i, :j] = -100
-                labels[i, j + len(suffix_ids):] = -100
-                break
+    # # Mask prefix tokens: keep only suffix as target
+    # for i, suffix in enumerate(suffixes):
+    #     suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
+    #     # Try to find the start index of the suffix tokens in the full input sequence
+    #     sequence = inputs.input_ids[i].tolist()
+    #     for j in range(len(sequence) - len(suffix_ids) + 1):
+    #         if sequence[j:j + len(suffix_ids)] == suffix_ids:
+    #             labels[i, :j] = -100
+    #             labels[i, j + len(suffix_ids):] = -100
+    #             break
     return inputs, labels
 
 def evaluation_collate_fn(

From 9af7a2b4f0a4a58ae0f629827e39eb709352a82f Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Fri, 30 May 2025 13:14:46 -0300
Subject: [PATCH 76/92] removed comments

---
 .../trainer/models/smolvlm2/checkpoints.py    | 24 -------------------
 maestro/trainer/models/smolvlm2/core.py       |  6 +----
 maestro/trainer/models/smolvlm2/loaders.py    | 15 ------------
 3 files changed, 1 insertion(+), 44 deletions(-)

diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py
index a6625a77..5ad23082 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm2/checkpoints.py
@@ -65,30 +65,6 @@ def save_model(
     processor.save_pretrained(target_dir)
     model.save_pretrained(target_dir)
 
-# def load_checkpoint(path: str, device: str = "cuda" if torch.cuda.is_available() else "cpu") -> dict:
-#     """
-#     Load model checkpoint.
-
-#     Args:
-#         path: Path to checkpoint
-#         device: Device to load model on
-
-#     Returns:
-#         Dictionary containing model, processor, and metadata
-#     """
-#     # Load model
-#     model = AutoModelForImageTextToText.from_pretrained(path)
-#     model.to(device)
-
-#     # Load processor
-#     processor = AutoProcessor.from_pretrained(path)
-
-#     # Load metadata if exists
-#     metadata_path = os.path.join(path, "metadata.pt")
-#     metadata = torch.load(metadata_path) if os.path.exists(metadata_path) else None
-
-#     return {"model": model, "processor": processor, "metadata": metadata}
-
 class OptimizationStrategy(Enum):
     """Enumeration for optimization strategies."""
 
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py
index 677e870f..1c77988e 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm2/core.py
@@ -8,9 +8,6 @@
 import dacite
 from functools import partial
 
-
-import numpy as np
-import supervision as sv
 from maestro.trainer.common.callbacks import SaveCheckpoint
 from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
 from maestro.trainer.common.metrics import BaseMetric, MetricsTracker, parse_metrics, save_metric_plots
@@ -34,7 +31,6 @@
 from torch.optim import AdamW
 from maestro.trainer.common.metrics import (
     BaseMetric,
-    MeanAveragePrecisionMetric,
     MetricsTracker,
     parse_metrics,
     save_metric_plots,
@@ -227,7 +223,7 @@ def train(config: SmolVLM2Configuration | dict) -> None:
     dataset_location = resolve_dataset_path(config.dataset)
     if dataset_location is None:
         return
-            
+
     train_loader, valid_loader, test_loader = create_data_loaders(
         dataset_location=dataset_location,
         train_batch_size=config.batch_size,
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
index 839bdd47..ff96357c 100644
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ b/maestro/trainer/models/smolvlm2/loaders.py
@@ -2,9 +2,6 @@
 
 from PIL import Image
 from transformers import  AutoProcessor
-import supervision as sv
-from torch.nn.utils.rnn import pad_sequence
-import torch
 
 def format_data(image, prefix, suffix):
     return [
@@ -37,7 +34,6 @@ def train_collate_fn(
         format_data(image, entry["prefix"], entry["suffix"])
         for image, entry in zip(images, data)
     ]
-    suffixes = [entry["suffix"] for entry in data]
     
     inputs = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
             return_tensors="pt",padding = True)
@@ -51,17 +47,6 @@ def train_collate_fn(
     # Mask <image> tokens
     image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
     labels[labels == image_token_id] = -100
-
-    # # Mask prefix tokens: keep only suffix as target
-    # for i, suffix in enumerate(suffixes):
-    #     suffix_ids = processor.tokenizer(suffix, add_special_tokens=False).input_ids
-    #     # Try to find the start index of the suffix tokens in the full input sequence
-    #     sequence = inputs.input_ids[i].tolist()
-    #     for j in range(len(sequence) - len(suffix_ids) + 1):
-    #         if sequence[j:j + len(suffix_ids)] == suffix_ids:
-    #             labels[i, :j] = -100
-    #             labels[i, j + len(suffix_ids):] = -100
-    #             break
     return inputs, labels
 
 def evaluation_collate_fn(

From 1f57cb156bbdc03c7500dd5814a1e49ba821aa71 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 11:35:39 -0300
Subject: [PATCH 77/92] Changed with piotr colab suggestions

---
 maestro/trainer/models/smolvlm2/inference.py  |  84 -------------
 maestro/trainer/models/smolvlm2/loaders.py    |  77 ------------
 .../{smolvlm2 => smolvlm_2}/__init__.py       |   0
 .../{smolvlm2 => smolvlm_2}/checkpoints.py    | 104 ++++++++--------
 .../models/{smolvlm2 => smolvlm_2}/core.py    |  88 ++++----------
 .../{smolvlm2 => smolvlm_2}/detection.py      |  12 +-
 .../{smolvlm2 => smolvlm_2}/entrypoint.py     |  22 ++--
 maestro/trainer/models/smolvlm_2/inference.py |  58 +++++++++
 maestro/trainer/models/smolvlm_2/loaders.py   | 115 ++++++++++++++++++
 9 files changed, 267 insertions(+), 293 deletions(-)
 delete mode 100644 maestro/trainer/models/smolvlm2/inference.py
 delete mode 100644 maestro/trainer/models/smolvlm2/loaders.py
 rename maestro/trainer/models/{smolvlm2 => smolvlm_2}/__init__.py (100%)
 rename maestro/trainer/models/{smolvlm2 => smolvlm_2}/checkpoints.py (59%)
 rename maestro/trainer/models/{smolvlm2 => smolvlm_2}/core.py (71%)
 rename maestro/trainer/models/{smolvlm2 => smolvlm_2}/detection.py (91%)
 rename maestro/trainer/models/{smolvlm2 => smolvlm_2}/entrypoint.py (88%)
 create mode 100644 maestro/trainer/models/smolvlm_2/inference.py
 create mode 100644 maestro/trainer/models/smolvlm_2/loaders.py

diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py
deleted file mode 100644
index 4dd9fa49..00000000
--- a/maestro/trainer/models/smolvlm2/inference.py
+++ /dev/null
@@ -1,84 +0,0 @@
-from typing import Optional, Union
-from PIL import Image
-
-import torch
-from transformers import AutoModelForImageTextToText, AutoProcessor
-from maestro.trainer.common.utils.device import parse_device_spec
-
-
-def predict_with_inputs(
-    model: AutoModelForImageTextToText,
-    processor: AutoProcessor,
-    inputs: torch.Tensor,
-    device: Union[str, torch.device],
-    max_new_tokens: int = 512,
-    **kwargs,
-) -> list[str]:
-    """
-    Generate text predictions using the model.
-
-    Args:
-        model: The SmolVLM2 model
-        processor: The model's processor
-        input_ids: Input token IDs
-        pixel_values: Input image pixel values
-        device: Device to run inference on
-        max_new_tokens: Maximum number of tokens to generate
-        **kwargs: Additional generation parameters
-
-    Returns:
-        List of generated text strings
-    """
-    with torch.no_grad():
-        generated_ids = model.generate(
-            **inputs.to(device),
-            max_new_tokens=max_new_tokens,
-            do_sample=False,
-        )
-        # Trim the generated ids to remove the input ids
-        generated_ids = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-
-
-    return processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-
-def predict(
-    model: AutoModelForImageTextToText,
-    processor: AutoProcessor,
-    image: str | bytes | Image.Image,
-    prefix: str,
-    device: str | torch.device = "auto",
-    max_new_tokens: int = 1024,
-) -> str:
-    """Generate a text prediction for a single image and text prefix.
-
-    Args:
-        model (AutoModelForImageTextToText): The PaliGemma model for generation.
-        processor (AutoProcessor): Tokenizer and processor for model inputs/outputs.
-        image (str | bytes | Image.Image): Input image as a file path, bytes, or PIL Image.
-        prefix (str): Text prefix to condition the generation.
-        device (str | torch.device): Device to run inference on.
-        max_new_tokens (int): Maximum number of new tokens to generate.
-
-    Returns:
-        str: Generated text prediction.
-    """
-    device = parse_device_spec(device)
-    messages = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image", "image":image},
-            {"type": "text", "text": prefix},
-        ]
-    },
-    ]
-    inputs = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",    
-        )#.to(model.device, dtype=torch.bfloat16)
-    return predict_with_inputs(
-        inputs = inputs, model=model, processor=processor, device=device, max_new_tokens=max_new_tokens
-    )[0]
\ No newline at end of file
diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py
deleted file mode 100644
index ff96357c..00000000
--- a/maestro/trainer/models/smolvlm2/loaders.py
+++ /dev/null
@@ -1,77 +0,0 @@
-from typing import Any
-
-from PIL import Image
-from transformers import  AutoProcessor
-
-def format_data(image, prefix, suffix):
-    return [
-
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
-                {
-                    "type": "text",
-                    "text": prefix,
-                },
-            ],
-        },
-        {
-            "role": "assistant",
-            "content": [{"type": "text", "text": suffix}],
-        },
-    ]
-
-def train_collate_fn(
-    batch: list[tuple[Image.Image, dict[str, Any]]],
-      processor: AutoProcessor ):
-    images, data = zip(*batch)
-
-    messages = [
-        format_data(image, entry["prefix"], entry["suffix"])
-        for image, entry in zip(images, data)
-    ]
-    
-    inputs = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
-            return_tensors="pt",padding = True)
-
-    # Clone input_ids to labels and mask out everything except suffix
-    labels = inputs.input_ids.clone()
-
-    # Mask pad tokens
-    labels[labels == processor.tokenizer.pad_token_id] = -100
-
-    # Mask <image> tokens
-    image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
-    labels[labels == image_token_id] = -100
-    return inputs, labels
-
-def evaluation_collate_fn(
-    batch: list[tuple[Image.Image, dict[str, Any]]],
-    processor: AutoProcessor
-):
-    images, data = zip(*batch)
-
-    messages = [
-        [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": entry["prefix"]},
-                ],
-            }
-        ]
-        for image, entry in zip(images, data)
-    ]
-    inputs = processor.apply_chat_template(messages, tokenize=True,return_dict=True,
-            return_tensors="pt",padding = True)
-
-
-    prefixes = ["<image>" + entry["prefix"] for entry in data]
-    suffixes = [entry["suffix"] for entry in data]
-    return inputs, prefixes, suffixes
-
diff --git a/maestro/trainer/models/smolvlm2/__init__.py b/maestro/trainer/models/smolvlm_2/__init__.py
similarity index 100%
rename from maestro/trainer/models/smolvlm2/__init__.py
rename to maestro/trainer/models/smolvlm_2/__init__.py
diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm_2/checkpoints.py
similarity index 59%
rename from maestro/trainer/models/smolvlm2/checkpoints.py
rename to maestro/trainer/models/smolvlm_2/checkpoints.py
index 5ad23082..b3a0fc50 100644
--- a/maestro/trainer/models/smolvlm2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm_2/checkpoints.py
@@ -9,17 +9,26 @@
 from peft import LoraConfig, get_peft_model
 from transformers import BitsAndBytesConfig
 
-DEFAULT_SMOLVLM2_MODEL_ID = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"#"smol-ai/smolvlm2-500m"
-DEFAULT_SMOLVLM2_MODEL_REVISION = "refs/heads/main"
-DEFAULT_SMOLVLM2_PEFT_PARAMS = {
+DEFAULT_SMOLVLM_2_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"#"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+DEFAULT_SMOLVLM_2_MODEL_REVISION = "refs/heads/main"
+DEFAULT_SMOLVLM_2_LORA_PARAMS = {
     "r": 8,
-    "lora_alpha": 16,
-    "lora_dropout": 0.05,
+    "lora_alpha": 8,
+    "lora_dropout": 0.1,
     "bias": "none",
-    "target_modules": ["q_proj", "o_proj", "k_proj", "v_proj", "linear", "Conv2d", "lm_head", "fc2"],
-    "task_type": "CAUSAL_LM",
+    "target_modules": ['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
+    "init_lora_weights": "gaussian",
+    "use_dora": True
+}
+DEFAULT_SMOLVLM_2_QLORA_PARAMS = {
+    "r": 8,
+    "lora_alpha": 8,
+    "lora_dropout": 0.1,
+    "bias": "none",
+    "target_modules": ['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
+    "init_lora_weights": "gaussian",
+    "use_dora": False
 }
-
 logger = get_maestro_logger()
 
 
@@ -73,36 +82,24 @@ class OptimizationStrategy(Enum):
     FREEZE = "freeze"
     NONE = "none"
 def load_model(
-    model_id_or_path: str = DEFAULT_SMOLVLM2_MODEL_ID,
-    revision: str = DEFAULT_SMOLVLM2_MODEL_REVISION,
+    model_id_or_path: str = DEFAULT_SMOLVLM_2_MODEL_ID,
+    revision: str = DEFAULT_SMOLVLM_2_MODEL_REVISION,
     device: str | torch.device = "auto",
     optimization_strategy: OptimizationStrategy = OptimizationStrategy.NONE,
     peft_advanced_params: Optional[dict] = None,
     cache_dir: Optional[str] = None,
+    longest_edge: int = 512
 ) -> tuple[AutoProcessor, AutoModelForImageTextToText]:
-    """Loads a SmolVLM 2 model and its associated processor.
-
-    Args:
-        model_id_or_path (str): The identifier or path of the model to load.
-        revision (str): The specific model revision to use.
-        device (torch.device): The device to load the model onto.
-        optimization_strategy (OptimizationStrategy): The optimization strategy to apply to the model.
-        peft_advanced_params: custom lora configuration
-        cache_dir (Optional[str]): Directory to cache the downloaded model files.
-
-    Returns:
-        (SmolVLM2Processor, SmolVLM2ForConditionalGeneration):
-            A tuple containing the loaded processor and model.
-
-    Raises:
-        ValueError: If the model or processor cannot be loaded.
-    """
     device = parse_device_spec(device)
-    processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True, revision=revision)
+    processor = AutoProcessor.from_pretrained(
+        model_id_or_path,
+        do_resize=True, size={"longest_edge": longest_edge},
+        trust_remote_code=True,
+        revision=revision
+    )
 
-    # TODO: QLORA IS NOT WORKING, MAYBE THE SOLUTION IS CAST THE INPUTS TO blfloat16
     if optimization_strategy in {OptimizationStrategy.LORA, OptimizationStrategy.QLORA}:
-        default_params = DEFAULT_SMOLVLM2_PEFT_PARAMS
+        default_params = DEFAULT_SMOLVLM_2_QLORA_PARAMS if optimization_strategy == OptimizationStrategy.QLORA else DEFAULT_SMOLVLM_2_LORA_PARAMS
         if peft_advanced_params is not None:
             default_params.update(peft_advanced_params)
             try:
@@ -111,45 +108,46 @@ def load_model(
             except TypeError:
                 logger.exception("Invalid parameters for LoraConfig")
                 raise
-        
         else:
-            logger.info("No LoRA parameters provided. Using default configuration.")
+            logger.info("No additiopnal LoRA parameters provided. Using default configuration.")
             lora_config = LoraConfig(**default_params)
-        
-        bnb_config = (BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            #bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True,
-        ) if optimization_strategy == OptimizationStrategy.QLORA
-            else None)
-        
+
+        bnb_config = (
+            BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16
+            )
+            if optimization_strategy == OptimizationStrategy.QLORA
+            else None
+        )
+
         model = AutoModelForImageTextToText.from_pretrained(
-            model_id_or_path,
+            pretrained_model_name_or_path=model_id_or_path,
             revision=revision,
             trust_remote_code=True,
+            device_map="auto",
             quantization_config=bnb_config,
+            torch_dtype=torch.bfloat16,
             cache_dir=cache_dir,
-            #torch_dtype=torch.bfloat16, 
-        ).to(device)
+            _attn_implementation="flash_attention_2",
+        )
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()
     else:
-
         model = AutoModelForImageTextToText.from_pretrained(
-            model_id_or_path,
+            pretrained_model_name_or_path=model_id_or_path,
             revision=revision,
             trust_remote_code=True,
-            cache_dir=cache_dir,).to(device)
+            device_map="auto",
+            cache_dir=cache_dir,
+            torch_dtype=torch.bfloat16,
+            _attn_implementation="flash_attention_2"
+        ).to(device)
 
         if optimization_strategy == OptimizationStrategy.FREEZE:
-            # Freeze vision encoder parameters
             for param in model.model.vision_model.parameters():
                 param.requires_grad = False
 
-            # TODO: check if there are more weights to freeze, like:
-            # for param in model.multi_modal_projector.parameters():
-            #     param.requires_grad = False
-
     return processor, model
-
diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm_2/core.py
similarity index 71%
rename from maestro/trainer/models/smolvlm2/core.py
rename to maestro/trainer/models/smolvlm_2/core.py
index 1c77988e..ecd16830 100644
--- a/maestro/trainer/models/smolvlm2/core.py
+++ b/maestro/trainer/models/smolvlm_2/core.py
@@ -16,15 +16,15 @@
 from maestro.trainer.common.utils.path import create_new_run_directory
 from maestro.trainer.common.utils.seed import ensure_reproducibility
 from maestro.trainer.logger import get_maestro_logger
-from maestro.trainer.models.smolvlm2.checkpoints import (
-    DEFAULT_SMOLVLM2_MODEL_ID,
-    DEFAULT_SMOLVLM2_MODEL_REVISION,
+from maestro.trainer.models.smolvlm_2.checkpoints import (
+    DEFAULT_SMOLVLM_2_MODEL_ID,
+    DEFAULT_SMOLVLM_2_MODEL_REVISION,
     OptimizationStrategy,
     load_model,
     save_model,
 )
-from maestro.trainer.models.smolvlm2.inference import predict_with_inputs
-from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn
+from maestro.trainer.models.smolvlm_2.inference import predict_with_inputs
+from maestro.trainer.models.smolvlm_2.loaders import evaluation_collate_fn, train_collate_fn
 from typing import Literal, Optional
 from dataclasses import dataclass, field, replace
 from torch.utils.data import DataLoader
@@ -43,60 +43,17 @@
 logger = get_maestro_logger()
 
 
+
 @dataclass()
 class SmolVLM2Configuration:
-    """
-    Configuration for training the SmolVLM2 model.
-
-    Attributes:
-        dataset (str):
-            Local path or Roboflow identifier. If not found locally, it will be resolved (and downloaded) automatically.
-        model_id (str):
-            Identifier for the PaliGemma2 model.
-        revision (str):
-            Model revision to use.
-        device (str | torch.device):
-            Device to run training on. Can be a ``torch.device`` or a string such as
-            "auto", "cpu", "cuda", or "mps". If "auto", the code will pick the best
-            available device.
-        optimization_strategy (Literal["lora", "qlora", "freeze", "none"]):
-            Strategy for optimizing the model parameters.
-        cache_dir (Optional[str]):
-            Directory to cache the model weights locally.
-        epochs (int):
-            Number of training epochs.
-        lr (float):
-            Learning rate for training.
-        batch_size (int):
-            Training batch size.
-        accumulate_grad_batches (int):
-            Number of batches to accumulate before performing a gradient update.
-        val_batch_size (Optional[int]):
-            Validation batch size. If None, defaults to the training batch size.
-        num_workers (int):
-            Number of workers for data loading.
-        val_num_workers (Optional[int]):
-            Number of workers for validation data loading. If None, defaults to num_workers.
-        output_dir (str):
-            Directory to store training outputs.
-        metrics (list[BaseMetric] | list[str]):
-            Metrics to track during training. Can be a list of metric objects or metric names.
-        max_new_tokens (int):
-            Maximum number of new tokens generated during inference.
-        random_seed (Optional[int]):
-            Random seed for ensuring reproducibility. If None, no seeding is applied.
-        peft_advanced_params (Optional[dict]):
-            Custom LoRA configuration . If None, default configuration is applied.
-    """
-
     dataset: str
-    model_id: str = DEFAULT_SMOLVLM2_MODEL_ID
-    revision: str = DEFAULT_SMOLVLM2_MODEL_REVISION
+    model_id: str = DEFAULT_SMOLVLM_2_MODEL_ID
+    revision: str = DEFAULT_SMOLVLM_2_MODEL_REVISION
     device: str | torch.device = "auto"
     optimization_strategy: Literal["lora", "qlora", "freeze", "none"] = "lora"
     cache_dir: Optional[str] = None
     epochs: int = 10
-    lr: float = 2e-5
+    lr: float = 1e-4
     batch_size: int = 4
     accumulate_grad_batches: int = 4
     val_batch_size: Optional[int] = None
@@ -104,7 +61,8 @@ class SmolVLM2Configuration:
     val_num_workers: Optional[int] = None
     output_dir: str = "./training/smol_vlm_2"
     metrics: list[BaseMetric] | list[str] = field(default_factory=list)
-    max_new_tokens: int = 512
+    system_message: Optional[str] = None
+    max_new_tokens: int = 64
     random_seed: Optional[int] = None
     peft_advanced_params: Optional[dict] = None
 
@@ -155,9 +113,12 @@ def __init__(
         self.valid_metrics_tracker = MetricsTracker.init(metrics=metrics)
 
     def training_step(self, batch, batch_idx):
-        inputs, labels = batch
+        input_ids, attention_mask, pixel_values, pixel_attention_mask, labels = batch
         outputs = self.model(
-            **inputs,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask,
             labels=labels,
         )
         loss = outputs.loss
@@ -165,14 +126,17 @@ def training_step(self, batch, batch_idx):
         self.train_metrics_tracker.register("loss", epoch=self.current_epoch, step=batch_idx, value=loss.item())
         return loss
 
-    def validation_step(self, batch, batch_idx):
-        inputs, prefixes, suffixes = batch
 
-        generated_suffixes = predict_with_inputs(self.model,
-                                            self.processor,
-                                            inputs,
-                                            device = self.config.device,
-                                            max_new_tokens=self.config.max_new_tokens )
+    def validation_step(self, batch, batch_idx):
+        input_ids, attention_mask, pixel_values, pixel_attention_mask, images, prefixes, suffixes = batch
+        generated_suffixes = predict_with_inputs(
+            model=self.model,
+            processor=self.processor,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask
+        )
 
         if batch_idx == 0:
             logger.info(f"sample valid prefix: {prefixes[0]}")
diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm_2/detection.py
similarity index 91%
rename from maestro/trainer/models/smolvlm2/detection.py
rename to maestro/trainer/models/smolvlm_2/detection.py
index c03126a5..15879aac 100644
--- a/maestro/trainer/models/smolvlm2/detection.py
+++ b/maestro/trainer/models/smolvlm_2/detection.py
@@ -7,13 +7,13 @@
 def result_to_detections_formatter(
     text: str, resolution_wh: tuple[int, int], classes: Optional[list[str]] = None
 ) -> tuple[np.ndarray, np.ndarray]:
-    """Converts SmolVLM2 text output into detection format.
+    """Converts SmolVLM_2 text output into detection format.
 
-    SmolVLM2 outputs text in a format like:
+    SmolVLM_2 outputs text in a format like:
     "a person standing in front of a car [x1, y1, x2, y2]"
 
     Args:
-        text: SmolVLM2 output text
+        text: SmolVLM_2 output text
         resolution_wh: Target image resolution (width, height)
         classes: Optional list of valid class names
 
@@ -61,7 +61,7 @@ def result_to_detections_formatter(
 def detections_to_text_formatter(
     xyxy: np.ndarray, class_id: np.ndarray, classes: list[str], resolution_wh: tuple[int, int]
 ) -> str:
-    """Converts detections to SmolVLM2 text format.
+    """Converts detections to SmolVLM_2 text format.
 
     Args:
         xyxy: Bounding boxes in xyxy format
@@ -70,7 +70,7 @@ def detections_to_text_formatter(
         resolution_wh: Image resolution (width, height)
 
     Returns:
-        Formatted text string for SmolVLM2
+        Formatted text string for SmolVLM_2
     """
     text_parts = []
 
@@ -90,7 +90,7 @@ def format_prompt_for_detection(
     classes: Optional[list[str]] = None,
     resolution_wh: Optional[tuple[int, int]] = None,
 ) -> str:
-    """Formats a prompt for object detection with SmolVLM2.
+    """Formats a prompt for object detection with SmolVLM_2.
 
     Args:
         prompt: Base prompt
diff --git a/maestro/trainer/models/smolvlm2/entrypoint.py b/maestro/trainer/models/smolvlm_2/entrypoint.py
similarity index 88%
rename from maestro/trainer/models/smolvlm2/entrypoint.py
rename to maestro/trainer/models/smolvlm_2/entrypoint.py
index 8205e664..f49c22e6 100644
--- a/maestro/trainer/models/smolvlm2/entrypoint.py
+++ b/maestro/trainer/models/smolvlm_2/entrypoint.py
@@ -6,16 +6,16 @@
 import typer
 
 from maestro.trainer.logger import get_maestro_logger
-from maestro.trainer.models.smolvlm2.checkpoints import DEFAULT_SMOLVLM2_MODEL_ID, DEFAULT_SMOLVLM2_MODEL_REVISION
-from maestro.trainer.models.smolvlm2.core import SmolVLM2Configuration
-from maestro.trainer.models.smolvlm2.core import train as smolvlm2_train
+from maestro.trainer.models.smolvlm_2.checkpoints import DEFAULT_SMOLVLM_2_MODEL_ID, DEFAULT_SMOLVLM_2_MODEL_REVISION
+from maestro.trainer.models.smolvlm_2.core import SmolVLM2Configuration
+from maestro.trainer.models.smolvlm_2.core import train as smolvlm_2_train
 
 logger = get_maestro_logger()
-smolvlm2_app = typer.Typer(help="Fine-tune and evaluate SmolVLM2 model")
+smolvlm_2_app = typer.Typer(help="Fine-tune and evaluate SmolVLM_2 model")
 
 
-@smolvlm2_app.command(
-    help="Train SmolVLM2 model",
+@smolvlm_2_app.command(
+    help="Train SmolVLM_2 model",
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
 def train(
@@ -28,11 +28,11 @@ def train(
         ),
     ],
     model_id: Annotated[
-        str, typer.Option("--model_id", help="Identifier for the SmolVLM2 model")
-    ] = DEFAULT_SMOLVLM2_MODEL_ID,
+        str, typer.Option("--model_id", help="Identifier for the SmolVLM_2 model")
+    ] = DEFAULT_SMOLVLM_2_MODEL_ID,
     revision: Annotated[
         str, typer.Option("--revision", help="Model revision to use")
-    ] = DEFAULT_SMOLVLM2_MODEL_REVISION,
+    ] = DEFAULT_SMOLVLM_2_MODEL_REVISION,
     device: Annotated[str, typer.Option("--device", help="Device to use for training")] = "auto",
     optimization_strategy: Annotated[
         str, typer.Option("--optimization_strategy", help="Optimization strategy: lora, freeze, or none")
@@ -53,7 +53,7 @@ def train(
     ] = None,
     output_dir: Annotated[
         str, typer.Option("--output_dir", help="Directory to store training outputs")
-    ] = "./training/smolvlm2",
+    ] = "./training/smolvlm_2",
     metrics: Annotated[list[str], typer.Option("--metrics", help="List of metrics to track during training")] = [],
     max_new_tokens: Annotated[
         int,
@@ -107,5 +107,5 @@ def parse_lora_params(param_str) -> dict[str, Any]:
     )
     typer.echo(typer.style("Training configuration", fg=typer.colors.BRIGHT_GREEN, bold=True))
     rich.print(dataclasses.asdict(config))
-    smolvlm2_train(config=config)
+    smolvlm_2_train(config=config)
 
diff --git a/maestro/trainer/models/smolvlm_2/inference.py b/maestro/trainer/models/smolvlm_2/inference.py
new file mode 100644
index 00000000..b1af9fac
--- /dev/null
+++ b/maestro/trainer/models/smolvlm_2/inference.py
@@ -0,0 +1,58 @@
+from typing import Optional, Union
+from PIL import Image
+
+import torch
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from maestro.trainer.common.utils.device import parse_device_spec
+
+def predict_with_inputs(
+    model: AutoModelForImageTextToText,
+    processor: AutoProcessor,
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    pixel_values: torch.Tensor,
+    pixel_attention_mask: torch.Tensor,
+    max_new_tokens: int = 64,
+) -> list[str]:
+    """Generate without sampling and decode only the tokens produced after the prompt."""
+    generation_kwargs = {
+        "pixel_values": pixel_values,
+        "input_ids": input_ids,
+        "attention_mask": attention_mask,
+        "pixel_attention_mask": pixel_attention_mask,
+        "do_sample": False,
+        "max_new_tokens": max_new_tokens,
+    }
+    with torch.no_grad():
+        # Columns before input_ids.shape[-1] are sliced away so the prompt positions are not decoded.
+        return processor.batch_decode(model.generate(**generation_kwargs)[:, input_ids.shape[-1]:], skip_special_tokens=True)
+
+
+def predict(
+    model: AutoModelForImageTextToText,
+    processor: AutoProcessor,
+    image: str | bytes | Image.Image,
+    prefix: str,
+    device: str | torch.device = "auto",
+    max_new_tokens: int = 64,
+) -> str:
+    """Generate a text prediction for a single image and text prefix.
+
+    Args:
+        model: Model whose ``generate`` is invoked through ``predict_with_inputs``.
+        processor: Processor used to build the chat-template inputs and decode the output.
+        image: Image placed in the user turn of the conversation.
+        prefix: Text placed after the image in the user turn.
+        device: Device spec resolved with ``parse_device_spec``; the inputs are moved there.
+        max_new_tokens: Upper bound on generated tokens, forwarded to ``predict_with_inputs``.
+
+    Returns:
+        The first decoded string returned by ``predict_with_inputs``.
+    """
+    target_device = parse_device_spec(device)
+    user_content = [{"type": "image", "image": image}, {"type": "text", "text": prefix}]
+    chat_inputs = processor.apply_chat_template(
+        [{"role": "user", "content": user_content}],
+        add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
+    ).to(target_device, dtype=torch.bfloat16)
+    return predict_with_inputs(**chat_inputs, model=model, processor=processor, max_new_tokens=max_new_tokens)[0]
\ No newline at end of file
diff --git a/maestro/trainer/models/smolvlm_2/loaders.py b/maestro/trainer/models/smolvlm_2/loaders.py
new file mode 100644
index 00000000..59ce2fe4
--- /dev/null
+++ b/maestro/trainer/models/smolvlm_2/loaders.py
@@ -0,0 +1,115 @@
+from typing import Any
+from maestro.trainer.common.utils.device import parse_device_spec
+import torch
+from PIL import Image
+from transformers import  AutoProcessor
+
+def format_conversation(
+    image: str | bytes | Image.Image, prefix: str, suffix: str | None = None, system_message: str | None = None
+) -> list[dict]:
+    """Build the chat-style message list for one image/text sample.
+
+    The collate functions in this module pass the result to ``processor.apply_chat_template``.
+
+    Args:
+        image: Image stored in the user turn.
+        prefix: Prompt text stored after the image in the user turn.
+        suffix: Target text; when not None it is appended as an assistant turn.
+        system_message: When not None, emitted as a leading system turn.
+
+    Returns:
+        Messages ordered as system (optional), user, assistant (optional).
+    """
+
+    def text_turn(role: str, text: str) -> dict:
+        # Turn whose content is a single text part.
+        return {"role": role, "content": [{"type": "text", "text": text}]}
+
+    # The user turn always carries the image first, then the prompt text.
+    user_turn = {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": prefix},
+        ],
+    }
+
+    # Assemble in conversation order.
+    messages = []
+    if system_message is not None:
+        messages.append(text_turn("system", system_message))
+    messages.append(user_turn)
+    if suffix is not None:
+        messages.append(text_turn("assistant", suffix))
+    return messages
+
+
+def train_collate_fn(
+    batch: list[tuple[Image.Image, dict[str, Any]]],
+    processor: AutoProcessor,
+    system_message: str | None = None,
+    device: str | torch.device = "auto",
+):
+    """Collate (image, entry) samples into model inputs and prompt-masked labels.
+
+    Args:
+        batch: Samples as (image, entry) pairs; each entry provides "prefix" and "suffix".
+        processor: Processor used for chat templating and tokenization.
+        system_message: Optional system turn prepended to every conversation.
+        device: Device spec resolved with ``parse_device_spec``; tensors are moved there.
+
+    Returns:
+        Tuple ``(input_ids, attention_mask, pixel_values, pixel_attention_mask, labels)``.
+        ``labels`` copies ``input_ids`` with padding and prompt positions set to -100.
+    """
+    device = parse_device_spec(device)
+    images, data = zip(*batch)
+    texts, user_texts = [], []
+    for image, entry in zip(images, data):
+        full = format_conversation(image, entry["prefix"], entry["suffix"], system_message)
+        # Pass system_message by keyword: the third positional parameter is ``suffix``.
+        prompt = format_conversation(image, entry["prefix"], system_message=system_message)
+        texts.append(processor.apply_chat_template(conversation=full, add_generation_prompt=False).strip())
+        user_texts.append(processor.apply_chat_template(conversation=prompt, add_generation_prompt=False).strip())
+    images = [[image] for image in images]
+    model_inputs = processor(text=texts, images=images, return_tensors="pt", padding=True).to(device, dtype=torch.bfloat16)
+    user_model_inputs = processor(text=user_texts, images=images, return_tensors="pt", padding=True).to(device, dtype=torch.bfloat16)
+
+    input_ids, attention_mask = model_inputs["input_ids"], model_inputs["attention_mask"]
+    labels = input_ids.clone()
+    labels[attention_mask == 0] = -100  # padding positions must not contribute to the loss
+    # Use each row's unpadded prompt length; the padded tensor width is the same for every row.
+    user_lengths = user_model_inputs["attention_mask"].sum(dim=1).tolist()
+    for index, user_length in enumerate(user_lengths):
+        start = int(attention_mask[index].argmax())  # first attended position; 0 unless left-padded
+        labels[index, start:start + user_length] = -100
+    return input_ids, attention_mask, model_inputs["pixel_values"], model_inputs["pixel_attention_mask"], labels
+
+
+def evaluation_collate_fn(
+    batch: list[tuple[Image.Image, dict[str, Any]]],
+    processor: AutoProcessor,
+    system_message: str | None = None,
+    device: str | torch.device = "auto",
+):
+    """Collate (image, entry) samples into prompt-only model inputs plus reference texts.
+
+    Args:
+        batch: Samples as (image, entry) pairs; each entry provides "prefix" and "suffix".
+        processor: Processor used for chat templating and tokenization.
+        system_message: Optional system turn prepended to every conversation.
+        device: Device spec resolved with ``parse_device_spec``; tensors are moved there.
+
+    Returns:
+        Tuple ``(input_ids, attention_mask, pixel_values, pixel_attention_mask, images, prefixes, suffixes)``.
+    """
+    device = parse_device_spec(device)
+    images, data = zip(*batch)
+    prefixes = [entry["prefix"] for entry in data]
+    suffixes = [entry["suffix"] for entry in data]
+    # Pass system_message by keyword: the third positional parameter of format_conversation is ``suffix``.
+    user_conversations = [format_conversation(image, prefix, system_message=system_message) for image, prefix in zip(images, prefixes)]
+    user_texts = [processor.apply_chat_template(conversation=c, add_generation_prompt=False).strip() for c in user_conversations]
+    images = [[image] for image in images]  # one single-image list per sample, as passed to the processor
+    inputs = processor(text=user_texts, images=images, return_tensors="pt", padding=True).to(device, dtype=torch.bfloat16)
+    return inputs["input_ids"], inputs["attention_mask"], inputs["pixel_values"], inputs["pixel_attention_mask"], images, prefixes, suffixes
\ No newline at end of file

From bb04f69af410a245e84a12ebba619b96568452a8 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 11:40:40 -0300
Subject: [PATCH 78/92] removed flash attn

---
 maestro/trainer/models/smolvlm_2/checkpoints.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maestro/trainer/models/smolvlm_2/checkpoints.py b/maestro/trainer/models/smolvlm_2/checkpoints.py
index b3a0fc50..da30505c 100644
--- a/maestro/trainer/models/smolvlm_2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm_2/checkpoints.py
@@ -131,7 +131,7 @@ def load_model(
             quantization_config=bnb_config,
             torch_dtype=torch.bfloat16,
             cache_dir=cache_dir,
-            _attn_implementation="flash_attention_2",
+            #_attn_implementation="flash_attention_2",
         )
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()
@@ -143,7 +143,7 @@ def load_model(
             device_map="auto",
             cache_dir=cache_dir,
             torch_dtype=torch.bfloat16,
-            _attn_implementation="flash_attention_2"
+            #_attn_implementation="flash_attention_2"
         ).to(device)
 
         if optimization_strategy == OptimizationStrategy.FREEZE:

From 869048aa7b65c5385b6e6dfef6aa18342a789897 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 11:54:06 -0300
Subject: [PATCH 79/92] removed underscore from default output_dir

---
 maestro/trainer/models/smolvlm_2/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm_2/core.py b/maestro/trainer/models/smolvlm_2/core.py
index ecd16830..70dd2716 100644
--- a/maestro/trainer/models/smolvlm_2/core.py
+++ b/maestro/trainer/models/smolvlm_2/core.py
@@ -59,7 +59,7 @@ class SmolVLM2Configuration:
     val_batch_size: Optional[int] = None
     num_workers: int = 0
     val_num_workers: Optional[int] = None
-    output_dir: str = "./training/smol_vlm_2"
+    output_dir: str = "./training/smolvlm_2"
     metrics: list[BaseMetric] | list[str] = field(default_factory=list)
     system_message: Optional[str] = None
     max_new_tokens: int = 64

From 9ff898d5751e5dc68cf4368c1e27735f9a8a48cd Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 12:49:39 -0300
Subject: [PATCH 80/92] added max image size

---
 .../trainer/models/smolvlm_2/checkpoints.py   |   2 +-
 maestro/trainer/models/smolvlm_2/core.py      |   8 +-
 maestro/trainer/models/smolvlm_2/detection.py | 113 ------------------
 maestro/trainer/models/smolvlm_2/inference.py |   1 -
 4 files changed, 3 insertions(+), 121 deletions(-)
 delete mode 100644 maestro/trainer/models/smolvlm_2/detection.py

diff --git a/maestro/trainer/models/smolvlm_2/checkpoints.py b/maestro/trainer/models/smolvlm_2/checkpoints.py
index da30505c..ee475a6b 100644
--- a/maestro/trainer/models/smolvlm_2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm_2/checkpoints.py
@@ -93,7 +93,7 @@ def load_model(
     device = parse_device_spec(device)
     processor = AutoProcessor.from_pretrained(
         model_id_or_path,
-        do_resize=True, size={"longest_edge": longest_edge},
+        do_resize=True, size={"longest_edge": longest_edge}, max_image_size = {"longest_edge": longest_edge},
         trust_remote_code=True,
         revision=revision
     )
diff --git a/maestro/trainer/models/smolvlm_2/core.py b/maestro/trainer/models/smolvlm_2/core.py
index 70dd2716..89a9df77 100644
--- a/maestro/trainer/models/smolvlm_2/core.py
+++ b/maestro/trainer/models/smolvlm_2/core.py
@@ -2,7 +2,7 @@
 from typing import Optional, Union
 
 import torch
-from transformers import AutoModelForVision2Seq, AutoProcessor, Trainer
+from transformers import AutoModelForVision2Seq, AutoProcessor
 
 import lightning
 import dacite
@@ -35,11 +35,7 @@
     parse_metrics,
     save_metric_plots,
 )
-from maestro.trainer.models.florence_2.detection import (
-    detections_to_prefix_formatter,
-    detections_to_suffix_formatter,
-    result_to_detections_formatter,
-)
+
 logger = get_maestro_logger()
 
 
diff --git a/maestro/trainer/models/smolvlm_2/detection.py b/maestro/trainer/models/smolvlm_2/detection.py
deleted file mode 100644
index 15879aac..00000000
--- a/maestro/trainer/models/smolvlm_2/detection.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import re
-from typing import Optional
-
-import numpy as np
-
-
-def result_to_detections_formatter(
-    text: str, resolution_wh: tuple[int, int], classes: Optional[list[str]] = None
-) -> tuple[np.ndarray, np.ndarray]:
-    """Converts SmolVLM_2 text output into detection format.
-
-    SmolVLM_2 outputs text in a format like:
-    "a person standing in front of a car [x1, y1, x2, y2]"
-
-    Args:
-        text: SmolVLM_2 output text
-        resolution_wh: Target image resolution (width, height)
-        classes: Optional list of valid class names
-
-    Returns:
-        Tuple of (boxes, class_ids) where:
-        - boxes is a float32 array of shape (N, 4) with xyxy coordinates
-        - class_ids is an int32 array of shape (N,) with class IDs
-    """
-    # Extract bounding boxes using regex
-    box_pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]"
-    matches = re.finditer(box_pattern, text)
-
-    boxes_list = []
-    class_ids_list = []
-
-    # Create class mapping if provided
-    if classes is not None:
-        name_to_index = {cls_name: idx for idx, cls_name in enumerate(classes)}
-    else:
-        name_to_index = None
-
-    for match in matches:
-        x_min, y_min, x_max, y_max = map(float, match.groups())
-
-        # Extract class name from text before the box
-        text_before = text[: match.start()].strip()
-        class_name = text_before.split()[-1] if text_before else "unknown"
-
-        if name_to_index is not None:
-            if class_name not in name_to_index:
-                continue
-            current_class_id = name_to_index[class_name]
-        else:
-            current_class_id = -1
-
-        boxes_list.append([x_min, y_min, x_max, y_max])
-        class_ids_list.append(current_class_id)
-
-    boxes = np.array(boxes_list, dtype=np.float32).reshape(-1, 4)
-    class_ids = np.array(class_ids_list, dtype=np.int32)
-
-    return boxes, class_ids
-
-
-def detections_to_text_formatter(
-    xyxy: np.ndarray, class_id: np.ndarray, classes: list[str], resolution_wh: tuple[int, int]
-) -> str:
-    """Converts detections to SmolVLM_2 text format.
-
-    Args:
-        xyxy: Bounding boxes in xyxy format
-        class_id: Class IDs for each box
-        classes: List of class names
-        resolution_wh: Image resolution (width, height)
-
-    Returns:
-        Formatted text string for SmolVLM_2
-    """
-    text_parts = []
-
-    for i in range(len(xyxy)):
-        cls_name = classes[class_id[i]]
-        x_min, y_min, x_max, y_max = map(int, xyxy[i])
-        box_text = f"{cls_name} [{x_min}, {y_min}, {x_max}, {y_max}]"
-        text_parts.append(box_text)
-
-    return " ".join(text_parts)
-
-
-def format_prompt_for_detection(
-    prompt: str,
-    xyxy: Optional[np.ndarray] = None,
-    class_id: Optional[np.ndarray] = None,
-    classes: Optional[list[str]] = None,
-    resolution_wh: Optional[tuple[int, int]] = None,
-) -> str:
-    """Formats a prompt for object detection with SmolVLM_2.
-
-    Args:
-        prompt: Base prompt
-        xyxy: Optional bounding boxes
-        class_id: Optional class IDs
-        classes: Optional class names        resolution_wh: Optional image resolution
-
-    Returns:
-        Formatted prompt string
-    """
-    if all(x is not None for x in [xyxy, class_id, classes, resolution_wh]):
-        # Type-cast to the expected types before passing to formatter
-        detection_text = detections_to_text_formatter(
-            xyxy,
-            class_id if class_id is not None else [],
-            classes if classes is not None else [],
-            resolution_wh if resolution_wh is not None else (0, 0),
-        )
-        return f"{prompt} {detection_text}"
-    return prompt
diff --git a/maestro/trainer/models/smolvlm_2/inference.py b/maestro/trainer/models/smolvlm_2/inference.py
index b1af9fac..1106feeb 100644
--- a/maestro/trainer/models/smolvlm_2/inference.py
+++ b/maestro/trainer/models/smolvlm_2/inference.py
@@ -1,4 +1,3 @@
-from typing import Optional, Union
 from PIL import Image
 
 import torch

From c57e61cef06c9976e912491c129ac4b5bb806d06 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 12:54:03 -0300
Subject: [PATCH 81/92] added video sampling size

---
 maestro/trainer/models/smolvlm_2/checkpoints.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/maestro/trainer/models/smolvlm_2/checkpoints.py b/maestro/trainer/models/smolvlm_2/checkpoints.py
index ee475a6b..86e2ab80 100644
--- a/maestro/trainer/models/smolvlm_2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm_2/checkpoints.py
@@ -94,6 +94,13 @@ def load_model(
     processor = AutoProcessor.from_pretrained(
         model_id_or_path,
         do_resize=True, size={"longest_edge": longest_edge}, max_image_size = {"longest_edge": longest_edge},
+        video_sampling= {
+            "fps": 1,
+            "max_frames": 64,
+            "video_size": {
+            "longest_edge": longest_edge
+            }
+        }
         trust_remote_code=True,
         revision=revision
     )

From 33e28d360064e24a8d19fc747da9003b23075799 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 12:58:28 -0300
Subject: [PATCH 82/92] added missing comma after video_sampling argument

---
 maestro/trainer/models/smolvlm_2/checkpoints.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/trainer/models/smolvlm_2/checkpoints.py b/maestro/trainer/models/smolvlm_2/checkpoints.py
index 86e2ab80..3f1cbd4c 100644
--- a/maestro/trainer/models/smolvlm_2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm_2/checkpoints.py
@@ -100,7 +100,7 @@ def load_model(
             "video_size": {
             "longest_edge": longest_edge
             }
-        }
+        },
         trust_remote_code=True,
         revision=revision
     )

From 749a1c952f0394d82884e54283e36d499e31edd7 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 13:27:04 -0300
Subject: [PATCH 83/92] rollback and added modules to toml

---
 maestro/trainer/models/smolvlm_2/checkpoints.py | 9 +--------
 pyproject.toml                                  | 5 +++--
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/maestro/trainer/models/smolvlm_2/checkpoints.py b/maestro/trainer/models/smolvlm_2/checkpoints.py
index 3f1cbd4c..da30505c 100644
--- a/maestro/trainer/models/smolvlm_2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm_2/checkpoints.py
@@ -93,14 +93,7 @@ def load_model(
     device = parse_device_spec(device)
     processor = AutoProcessor.from_pretrained(
         model_id_or_path,
-        do_resize=True, size={"longest_edge": longest_edge}, max_image_size = {"longest_edge": longest_edge},
-        video_sampling= {
-            "fps": 1,
-            "max_frames": 64,
-            "video_size": {
-            "longest_edge": longest_edge
-            }
-        },
+        do_resize=True, size={"longest_edge": longest_edge},
         trust_remote_code=True,
         revision=revision
     )
diff --git a/pyproject.toml b/pyproject.toml
index 82f15aeb..8fc0d27b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -91,13 +91,14 @@ qwen_2_5_vl = [
     "bitsandbytes>=0.45.0",
     "qwen-vl-utils>=0.0.8"
 ]
-smolvlm2 = [
+smolvlm_2 = [
     "accelerate>=1.2.1",
     "peft>=0.12",
     "torch>=2.4.0",
     "torchvision>=0.20.0",
     "transformers>=4.49.0",
-    "bitsandbytes>=0.45.0"
+    "bitsandbytes>=0.45.0",
+    num2words
 ]
 
 [project.scripts]

From 32e683e132d753ee0466b419b3664470601b73a8 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 13:31:22 -0300
Subject: [PATCH 84/92] added num2words version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8fc0d27b..4d91dc1b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,7 +98,7 @@ smolvlm_2 = [
     "torchvision>=0.20.0",
     "transformers>=4.49.0",
     "bitsandbytes>=0.45.0",
-    num2words
+    "num2words>=0.54.14"
 ]
 
 [project.scripts]

From 03a5b33057325c11efde9045ae9f65d57e775d26 Mon Sep 17 00:00:00 2001
From: AlexBodner <yodabodner@gmail.com>
Date: Mon, 2 Jun 2025 18:53:49 -0300
Subject: [PATCH 85/92] fixed ruff and mypy issues

---
 .../trainer/models/smolvlm_2/checkpoints.py   | 41 +++++++++++--------
 maestro/trainer/models/smolvlm_2/core.py      | 33 ++++-----------
 .../trainer/models/smolvlm_2/entrypoint.py    |  1 -
 maestro/trainer/models/smolvlm_2/inference.py | 11 +++--
 maestro/trainer/models/smolvlm_2/loaders.py   | 39 ++++++++++++------
 5 files changed, 63 insertions(+), 62 deletions(-)

diff --git a/maestro/trainer/models/smolvlm_2/checkpoints.py b/maestro/trainer/models/smolvlm_2/checkpoints.py
index da30505c..99ebafbf 100644
--- a/maestro/trainer/models/smolvlm_2/checkpoints.py
+++ b/maestro/trainer/models/smolvlm_2/checkpoints.py
@@ -1,33 +1,33 @@
 import os
-from typing import Optional
 from enum import Enum
+from typing import Optional
 
 import torch
-from transformers import AutoModelForImageTextToText, AutoProcessor
+from peft import LoraConfig, get_peft_model
+from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig
+
 from maestro.trainer.common.utils.device import parse_device_spec
 from maestro.trainer.logger import get_maestro_logger
-from peft import LoraConfig, get_peft_model
-from transformers import BitsAndBytesConfig
 
-DEFAULT_SMOLVLM_2_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"#"HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+DEFAULT_SMOLVLM_2_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"  # "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
 DEFAULT_SMOLVLM_2_MODEL_REVISION = "refs/heads/main"
 DEFAULT_SMOLVLM_2_LORA_PARAMS = {
     "r": 8,
     "lora_alpha": 8,
     "lora_dropout": 0.1,
     "bias": "none",
-    "target_modules": ['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
+    "target_modules": ["down_proj", "o_proj", "k_proj", "q_proj", "gate_proj", "up_proj", "v_proj"],
     "init_lora_weights": "gaussian",
-    "use_dora": True
+    "use_dora": True,
 }
 DEFAULT_SMOLVLM_2_QLORA_PARAMS = {
     "r": 8,
     "lora_alpha": 8,
     "lora_dropout": 0.1,
     "bias": "none",
-    "target_modules": ['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
+    "target_modules": ["down_proj", "o_proj", "k_proj", "q_proj", "gate_proj", "up_proj", "v_proj"],
     "init_lora_weights": "gaussian",
-    "use_dora": False
+    "use_dora": False,
 }
 logger = get_maestro_logger()
 
@@ -56,6 +56,7 @@ def save_checkpoint(
     if metadata is not None:
         torch.save(metadata, os.path.join(path, "metadata.pt"))
 
+
 def save_model(
     target_dir: str,
     processor: AutoProcessor,
@@ -74,6 +75,7 @@ def save_model(
     processor.save_pretrained(target_dir)
     model.save_pretrained(target_dir)
 
+
 class OptimizationStrategy(Enum):
     """Enumeration for optimization strategies."""
 
@@ -81,6 +83,8 @@ class OptimizationStrategy(Enum):
     QLORA = "qlora"
     FREEZE = "freeze"
     NONE = "none"
+
+
 def load_model(
     model_id_or_path: str = DEFAULT_SMOLVLM_2_MODEL_ID,
     revision: str = DEFAULT_SMOLVLM_2_MODEL_REVISION,
@@ -88,18 +92,19 @@ def load_model(
     optimization_strategy: OptimizationStrategy = OptimizationStrategy.NONE,
     peft_advanced_params: Optional[dict] = None,
     cache_dir: Optional[str] = None,
-    longest_edge: int = 512
+    longest_edge: int = 512,
 ) -> tuple[AutoProcessor, AutoModelForImageTextToText]:
     device = parse_device_spec(device)
     processor = AutoProcessor.from_pretrained(
-        model_id_or_path,
-        do_resize=True, size={"longest_edge": longest_edge},
-        trust_remote_code=True,
-        revision=revision
+        model_id_or_path, do_resize=True, size={"longest_edge": longest_edge}, trust_remote_code=True, revision=revision
     )
 
     if optimization_strategy in {OptimizationStrategy.LORA, OptimizationStrategy.QLORA}:
-        default_params = DEFAULT_SMOLVLM_2_QLORA_PARAMS if optimization_strategy == OptimizationStrategy.QLORA else DEFAULT_SMOLVLM_2_LORA_PARAMS
+        default_params = (
+            DEFAULT_SMOLVLM_2_QLORA_PARAMS
+            if optimization_strategy == OptimizationStrategy.QLORA
+            else DEFAULT_SMOLVLM_2_LORA_PARAMS
+        )
         if peft_advanced_params is not None:
             default_params.update(peft_advanced_params)
             try:
@@ -117,7 +122,7 @@ def load_model(
                 load_in_4bit=True,
                 bnb_4bit_use_double_quant=True,
                 bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.bfloat16
+                bnb_4bit_compute_dtype=torch.bfloat16,
             )
             if optimization_strategy == OptimizationStrategy.QLORA
             else None
@@ -131,7 +136,7 @@ def load_model(
             quantization_config=bnb_config,
             torch_dtype=torch.bfloat16,
             cache_dir=cache_dir,
-            #_attn_implementation="flash_attention_2",
+            # _attn_implementation="flash_attention_2",
         )
         model = get_peft_model(model, lora_config)
         model.print_trainable_parameters()
@@ -143,7 +148,7 @@ def load_model(
             device_map="auto",
             cache_dir=cache_dir,
             torch_dtype=torch.bfloat16,
-            #_attn_implementation="flash_attention_2"
+            # _attn_implementation="flash_attention_2"
         ).to(device)
 
         if optimization_strategy == OptimizationStrategy.FREEZE:
diff --git a/maestro/trainer/models/smolvlm_2/core.py b/maestro/trainer/models/smolvlm_2/core.py
index 89a9df77..d08a0ec5 100644
--- a/maestro/trainer/models/smolvlm_2/core.py
+++ b/maestro/trainer/models/smolvlm_2/core.py
@@ -1,13 +1,15 @@
 import os
-from typing import Optional, Union
+from dataclasses import dataclass, field, replace
+from functools import partial
+from typing import Literal, Optional
 
+import dacite
+import lightning
 import torch
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
 from transformers import AutoModelForVision2Seq, AutoProcessor
 
-import lightning
-import dacite
-from functools import partial
-
 from maestro.trainer.common.callbacks import SaveCheckpoint
 from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path
 from maestro.trainer.common.metrics import BaseMetric, MetricsTracker, parse_metrics, save_metric_plots
@@ -25,21 +27,10 @@
 )
 from maestro.trainer.models.smolvlm_2.inference import predict_with_inputs
 from maestro.trainer.models.smolvlm_2.loaders import evaluation_collate_fn, train_collate_fn
-from typing import Literal, Optional
-from dataclasses import dataclass, field, replace
-from torch.utils.data import DataLoader
-from torch.optim import AdamW
-from maestro.trainer.common.metrics import (
-    BaseMetric,
-    MetricsTracker,
-    parse_metrics,
-    save_metric_plots,
-)
 
 logger = get_maestro_logger()
 
 
-
 @dataclass()
 class SmolVLM2Configuration:
     dataset: str
@@ -122,7 +113,6 @@ def training_step(self, batch, batch_idx):
         self.train_metrics_tracker.register("loss", epoch=self.current_epoch, step=batch_idx, value=loss.item())
         return loss
 
-
     def validation_step(self, batch, batch_idx):
         input_ids, attention_mask, pixel_values, pixel_attention_mask, images, prefixes, suffixes = batch
         generated_suffixes = predict_with_inputs(
@@ -131,7 +121,7 @@ def validation_step(self, batch, batch_idx):
             input_ids=input_ids,
             attention_mask=attention_mask,
             pixel_values=pixel_values,
-            pixel_attention_mask=pixel_attention_mask
+            pixel_attention_mask=pixel_attention_mask,
         )
 
         if batch_idx == 0:
@@ -212,10 +202,3 @@ def train(config: SmolVLM2Configuration | dict) -> None:
         callbacks=[save_checkpoint_callback],
     )
     trainer.fit(pl_module)
-
-
-
-
-
-
-
diff --git a/maestro/trainer/models/smolvlm_2/entrypoint.py b/maestro/trainer/models/smolvlm_2/entrypoint.py
index f49c22e6..7f84e6a4 100644
--- a/maestro/trainer/models/smolvlm_2/entrypoint.py
+++ b/maestro/trainer/models/smolvlm_2/entrypoint.py
@@ -108,4 +108,3 @@ def parse_lora_params(param_str) -> dict[str, Any]:
     typer.echo(typer.style("Training configuration", fg=typer.colors.BRIGHT_GREEN, bold=True))
     rich.print(dataclasses.asdict(config))
     smolvlm_2_train(config=config)
-
diff --git a/maestro/trainer/models/smolvlm_2/inference.py b/maestro/trainer/models/smolvlm_2/inference.py
index 1106feeb..a4c707c4 100644
--- a/maestro/trainer/models/smolvlm_2/inference.py
+++ b/maestro/trainer/models/smolvlm_2/inference.py
@@ -1,9 +1,10 @@
-from PIL import Image
-
 import torch
+from PIL import Image
 from transformers import AutoModelForImageTextToText, AutoProcessor
+
 from maestro.trainer.common.utils.device import parse_device_spec
 
+
 def predict_with_inputs(
     model: AutoModelForImageTextToText,
     processor: AutoProcessor,
@@ -42,7 +43,7 @@ def predict(
             "content": [
                 {"type": "image", "image": image},
                 {"type": "text", "text": prefix},
-            ]
+            ],
         },
     ]
     inputs = processor.apply_chat_template(
@@ -52,6 +53,4 @@ def predict(
         return_dict=True,
         return_tensors="pt",
     ).to(device, dtype=torch.bfloat16)
-    return predict_with_inputs(
-        **inputs, model=model, processor=processor, max_new_tokens=max_new_tokens
-    )[0]
\ No newline at end of file
+    return predict_with_inputs(**inputs, model=model, processor=processor, max_new_tokens=max_new_tokens)[0]
diff --git a/maestro/trainer/models/smolvlm_2/loaders.py b/maestro/trainer/models/smolvlm_2/loaders.py
index 59ce2fe4..605db7e8 100644
--- a/maestro/trainer/models/smolvlm_2/loaders.py
+++ b/maestro/trainer/models/smolvlm_2/loaders.py
@@ -1,8 +1,11 @@
 from typing import Any
-from maestro.trainer.common.utils.device import parse_device_spec
+
 import torch
 from PIL import Image
-from transformers import  AutoProcessor
+from transformers import AutoProcessor
+
+from maestro.trainer.common.utils.device import parse_device_spec
+
 
 def format_conversation(
     image: str | bytes | Image.Image, prefix: str, suffix: str | None = None, system_message: str | None = None
@@ -61,16 +64,19 @@ def train_collate_fn(
         for conversation in conversations
     ]
     user_conversations = [
-        format_conversation(image, entry["prefix"], system_message)
-        for image, entry in zip(images, data)
+        format_conversation(image, entry["prefix"], system_message) for image, entry in zip(images, data)
     ]
     user_texts = [
         processor.apply_chat_template(conversation=user_conversation, add_generation_prompt=False).strip()
         for user_conversation in user_conversations
     ]
-    images = [[image] for image in images]
-    model_inputs = processor(text=texts, images=images, return_tensors="pt", padding=True).to(device, dtype=torch.bfloat16)
-    user_model_inputs = processor(text=user_texts, images=images, return_tensors="pt", padding=True).to(device, dtype=torch.bfloat16)
+    image_lists = [[image] for image in images]
+    model_inputs = processor(text=texts, images=image_lists, return_tensors="pt", padding=True).to(
+        device, dtype=torch.bfloat16
+    )
+    user_model_inputs = processor(text=user_texts, images=image_lists, return_tensors="pt", padding=True).to(
+        device, dtype=torch.bfloat16
+    )
 
     labels = model_inputs["input_ids"].clone()
     input_ids = model_inputs["input_ids"]
@@ -97,19 +103,28 @@ def evaluation_collate_fn(
     prefixes = [entry["prefix"] for entry in data]
     suffixes = [entry["suffix"] for entry in data]
     user_conversations = [
-        format_conversation(image, entry["prefix"], system_message)
-        for image, entry in zip(images, data)
+        format_conversation(image, entry["prefix"], system_message) for image, entry in zip(images, data)
     ]
     user_texts = [
         processor.apply_chat_template(conversation=user_conversation, add_generation_prompt=False).strip()
         for user_conversation in user_conversations
     ]
-    images = [[image] for image in images]
-    user_model_inputs = processor(text=user_texts, images=images, return_tensors="pt", padding=True).to(device, dtype=torch.bfloat16)
+    image_lists = [[image] for image in images]
+    user_model_inputs = processor(text=user_texts, images=image_lists, return_tensors="pt", padding=True).to(
+        device, dtype=torch.bfloat16
+    )
 
     user_input_ids = user_model_inputs["input_ids"]
     user_attention_mask = user_model_inputs["attention_mask"]
     user_pixel_values = user_model_inputs["pixel_values"]
     user_pixel_attention_mask = user_model_inputs["pixel_attention_mask"]
 
-    return user_input_ids, user_attention_mask, user_pixel_values, user_pixel_attention_mask, images, prefixes, suffixes
\ No newline at end of file
+    return (
+        user_input_ids,
+        user_attention_mask,
+        user_pixel_values,
+        user_pixel_attention_mask,
+        image_lists,
+        prefixes,
+        suffixes,
+    )

From 5c270a95895a4dd0a8d2e4471455c0ddd7ff94f3 Mon Sep 17 00:00:00 2001
From: Alexander Bodner <61150961+AlexBodner@users.noreply.github.com>
Date: Fri, 6 Jun 2025 09:23:28 -0300
Subject: [PATCH 86/92] Update smolvlm2.md

---
 docs/models/smolvlm2.md | 83 ++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/docs/models/smolvlm2.md b/docs/models/smolvlm2.md
index cebf95f7..b9925d06 100644
--- a/docs/models/smolvlm2.md
+++ b/docs/models/smolvlm2.md
@@ -11,7 +11,7 @@ Built to balance performance and efficiency, SmolVLM2 provides a valuable option
 ## Install
 
 ```bash
-pip install "maestro[smolvlm2]"
+pip install "maestro[smolvlm_2]"
 ```
 
 ## Train
@@ -23,77 +23,76 @@ The training routines support various optimization strategies such as LoRA, QLoR
 Kick off training from the command line by running the command below. Be sure to replace the dataset path and adjust the hyperparameters (such as epochs and batch size) to suit your needs.
 
 ```bash
-maestro smolvlm2 train \
+maestro smolvlm_2 train \
   --dataset "dataset/location" \
   --epochs 10 \
   --batch-size 4 \
-  --optimization_strategy "qlora" \
+  --accumulate_grad_batches 4 \
+  --optimization_strategy "lora" \
   --metrics "edit_distance"
 ```
 
-### Python
 
-For more control, you can fine-tune SmolVLM2 using the Python API. Create a configuration dictionary with your training parameters and pass it to the train function to integrate the process into your custom workflow.
 
+### Python
 ```python
-from maestro.trainer.models.smolvlm2.core import train
+from maestro.trainer.models.smovlm_2.core import train
 
 config = {
+    "model_id": "HuggingFaceTB/SmolVLM-500M-Instruct",
     "dataset": "dataset/location",
+    "lr": 2e-5,
     "epochs": 10,
     "batch_size": 4,
-    "optimization_strategy": "qlora",
+    "accumulate_grad_batches": 4,
+    "num_workers": 0,
+    "optimization_strategy": "lora",
     "metrics": ["edit_distance"],
+    "device": "cuda"
 }
 
-results = train(config)
-```
-
-## Inference
 
-Use SmolVLM2 for inference on images using either the CLI or Python API.
+train(config)
+```
 
-### CLI
 
-```bash
-maestro smolvlm2 predict \
-  --image "path/to/image.jpg" \
-  --prompt "Describe this image"
-```
+## Load
 
-### Python
+Load a pre-trained or fine-tuned SmolVLM2 model along with its processor using the `load_model` function. Specify your model's path and the desired optimization strategy.
 
 ```python
-from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2
-
-model = SmolVLM2()
-result = model.generate(
-    images="path/to/image.jpg",
-    prompt="Describe this image",
-    max_new_tokens=512
+from maestro.trainer.models.smolvlm_2.checkpoints import (
+    OptimizationStrategy, load_model
 )
 
-print(result["text"])
+processor, model = load_model(
+    model_id_or_path="model/location",
+    optimization_strategy=OptimizationStrategy.NONE
+)
 ```
+## Predict
 
-## Object Detection
-
-SmolVLM2 can perform object detection on images, identifying and localizing objects with bounding boxes.
+Perform inference with SmolVLM2 using the `predict` function. Supply an image and a text prefix to obtain predictions, such as object detection outputs or captions.
 
 ```python
-from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2
-from maestro.trainer.models.smolvlm2.detection import result_to_detections_formatter
+from maestro.trainer.common.datasets.jsonl import JSONLDataset
+from maestro.trainer.models.smolvlm_2.inference import predict
 
-model = SmolVLM2()
-result = model.generate(
-    images="path/to/image.jpg",
-    prompt="Detect the following objects: person, car, dog"
+ds = JSONLDataset(
+    jsonl_file_path="dataset/location/test/annotations.jsonl",
+    image_directory_path="dataset/location/test",
 )
 
-# Convert text output to detections format
-boxes, class_ids = result_to_detections_formatter(
-    text=result["text"],
-    resolution_wh=(640, 480),
-    classes=["person", "car", "dog"]
-)
+image, entry = ds[0]
+
+predict(model=model, processor=processor, image=image, prefix=entry["prefix"])
+```
+
+### CLI
+
+```bash
+maestro smolvlm_2 predict \
+  --image "path/to/image.jpg" \
+  --prompt "Describe this image"
 ```
+

From 045dcb3323582bae223fb07a97d75271b6b2daf3 Mon Sep 17 00:00:00 2001
From: Alexander Bodner <61150961+AlexBodner@users.noreply.github.com>
Date: Fri, 6 Jun 2025 10:00:10 -0300
Subject: [PATCH 87/92] Update smolvlm2.md

---
 docs/models/smolvlm2.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/models/smolvlm2.md b/docs/models/smolvlm2.md
index b9925d06..f999b3f3 100644
--- a/docs/models/smolvlm2.md
+++ b/docs/models/smolvlm2.md
@@ -4,7 +4,7 @@ comments: true
 
 ## Overview
 
-SmolVLM2 is a lightweight vision-language model developed by Smol AI. It offers impressive capabilities for multimodal understanding while maintaining a compact size compared to larger VLMs. The model excels at tasks such as image captioning, visual question answering, and object detection, making it accessible for applications with limited computational resources.
+SmolVLM2 is a lightweight vision-language model developed by Hugging Face. It offers impressive capabilities for multimodal understanding while maintaining a compact size compared to larger VLMs. The model excels at tasks such as image captioning, visual question answering, and object detection, making it accessible for applications with limited computational resources.
 
 Built to balance performance and efficiency, SmolVLM2 provides a valuable option for developers seeking to implement vision-language capabilities without the overhead of larger models. The 500M parameter variant delivers practical results while being significantly more resource-friendly than multi-billion parameter alternatives.
 
@@ -36,7 +36,7 @@ maestro smolvlm_2 train \
 
 ### Python
 ```python
-from maestro.trainer.models.smovlm_2.core import train
+from maestro.trainer.models.smolvlm_2.core import train
 
 config = {
     "model_id": "HuggingFaceTB/SmolVLM-500M-Instruct",

From fe1292d849ac5e2c23c7f32593715f8995afe99b Mon Sep 17 00:00:00 2001
From: Alexander Bodner <61150961+AlexBodner@users.noreply.github.com>
Date: Fri, 6 Jun 2025 10:19:49 -0300
Subject: [PATCH 88/92] Update smolvlm2.md

---
 docs/models/smolvlm2.md | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/docs/models/smolvlm2.md b/docs/models/smolvlm2.md
index f999b3f3..0a5d705a 100644
--- a/docs/models/smolvlm2.md
+++ b/docs/models/smolvlm2.md
@@ -88,11 +88,3 @@ image, entry = ds[0]
 predict(model=model, processor=processor, image=image, prefix=entry["prefix"])
 ```
 
-### CLI
-
-```bash
-maestro smolvlm_2 predict \
-  --image "path/to/image.jpg" \
-  --prompt "Describe this image"
-```
-

From 2255e93aab042fca1c2981130bbd415b210b9ed6 Mon Sep 17 00:00:00 2001
From: Alexander Bodner <61150961+AlexBodner@users.noreply.github.com>
Date: Fri, 6 Jun 2025 10:20:16 -0300
Subject: [PATCH 89/92] Update introspection.py

---
 maestro/cli/introspection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py
index 95368685..e8d551ca 100644
--- a/maestro/cli/introspection.py
+++ b/maestro/cli/introspection.py
@@ -31,7 +31,7 @@ def find_training_recipes(app: typer.Typer) -> None:
     try:
         from maestro.trainer.models.smolvlm2.entrypoint import smolvlm2_app
 
-        app.add_typer(smolvlm2_app, name="smolvlm2")
+        app.add_typer(smolvlm2_app, name="smolvlm_2")
     except Exception:
         _warn_about_recipe_import_error(model_name="SmolVLM2")
 

From a405fc2e6b94c04a16cde8d337754b1a6d702e01 Mon Sep 17 00:00:00 2001
From: Alexander Bodner <61150961+AlexBodner@users.noreply.github.com>
Date: Fri, 6 Jun 2025 10:21:01 -0300
Subject: [PATCH 90/92] Import SmolVLM2 entrypoint from smolvlm_2 package

---
 maestro/cli/introspection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py
index e8d551ca..a5c61504 100644
--- a/maestro/cli/introspection.py
+++ b/maestro/cli/introspection.py
@@ -29,7 +29,7 @@ def find_training_recipes(app: typer.Typer) -> None:
         _warn_about_recipe_import_error(model_name="Qwen2.5-VL")
 
     try:
-        from maestro.trainer.models.smolvlm2.entrypoint import smolvlm2_app
+        from maestro.trainer.models.smolvlm_2.entrypoint import smolvlm2_app
 
         app.add_typer(smolvlm2_app, name="smolvlm_2")
     except Exception:

From 9d6180afc670bc3c33d079b17abcc418cc52467b Mon Sep 17 00:00:00 2001
From: Alexander Bodner <61150961+AlexBodner@users.noreply.github.com>
Date: Fri, 6 Jun 2025 10:21:53 -0300
Subject: [PATCH 91/92] Import smolvlm_2_app in introspection.py

---
 maestro/cli/introspection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py
index a5c61504..f383429a 100644
--- a/maestro/cli/introspection.py
+++ b/maestro/cli/introspection.py
@@ -29,7 +29,7 @@ def find_training_recipes(app: typer.Typer) -> None:
         _warn_about_recipe_import_error(model_name="Qwen2.5-VL")
 
     try:
-        from maestro.trainer.models.smolvlm_2.entrypoint import smolvlm2_app
+        from maestro.trainer.models.smolvlm_2.entrypoint import smolvlm_2_app
 
         app.add_typer(smolvlm2_app, name="smolvlm_2")
     except Exception:

From 3787e4158a7969c162534d16b033942cc02c6212 Mon Sep 17 00:00:00 2001
From: Alexander Bodner <61150961+AlexBodner@users.noreply.github.com>
Date: Fri, 6 Jun 2025 10:50:03 -0300
Subject: [PATCH 92/92] Rename smolvlm2.md to smolvlm_2.md and add --model_id to CLI example

---
 docs/models/{smolvlm2.md => smolvlm_2.md} | 1 +
 1 file changed, 1 insertion(+)
 rename docs/models/{smolvlm2.md => smolvlm_2.md} (98%)

diff --git a/docs/models/smolvlm2.md b/docs/models/smolvlm_2.md
similarity index 98%
rename from docs/models/smolvlm2.md
rename to docs/models/smolvlm_2.md
index 0a5d705a..aa62b7d9 100644
--- a/docs/models/smolvlm2.md
+++ b/docs/models/smolvlm_2.md
@@ -24,6 +24,7 @@ Kick off training from the command line by running the command below. Be sure to
 
 ```bash
 maestro smolvlm_2 train \
+  --model_id "HuggingFaceTB/SmolVLM-500M-Instruct" \
   --dataset "dataset/location" \
   --epochs 10 \
   --batch-size 4 \