From 4394a90eb5fef6c564755259b3fe1105d44900d5 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Sat, 10 May 2025 01:24:49 -0700 Subject: [PATCH 1/9] Created SmolVLM2 model in maestro --- docs/models/smolvlm2.md | 99 ++++++++++ maestro/cli/introspection.py | 7 + maestro/trainer/models/smolvlm2/__init__.py | 0 .../trainer/models/smolvlm2/checkpoints.py | 64 +++++++ maestro/trainer/models/smolvlm2/core.py | 179 ++++++++++++++++++ maestro/trainer/models/smolvlm2/detection.py | 111 +++++++++++ maestro/trainer/models/smolvlm2/entrypoint.py | 150 +++++++++++++++ maestro/trainer/models/smolvlm2/inference.py | 138 ++++++++++++++ maestro/trainer/models/smolvlm2/loaders.py | 115 +++++++++++ mkdocs.yaml | 1 + pyproject.toml | 71 ++----- 11 files changed, 882 insertions(+), 53 deletions(-) create mode 100644 docs/models/smolvlm2.md create mode 100644 maestro/trainer/models/smolvlm2/__init__.py create mode 100644 maestro/trainer/models/smolvlm2/checkpoints.py create mode 100644 maestro/trainer/models/smolvlm2/core.py create mode 100644 maestro/trainer/models/smolvlm2/detection.py create mode 100644 maestro/trainer/models/smolvlm2/entrypoint.py create mode 100644 maestro/trainer/models/smolvlm2/inference.py create mode 100644 maestro/trainer/models/smolvlm2/loaders.py diff --git a/docs/models/smolvlm2.md b/docs/models/smolvlm2.md new file mode 100644 index 00000000..cebf95f7 --- /dev/null +++ b/docs/models/smolvlm2.md @@ -0,0 +1,99 @@ +--- +comments: true +--- + +## Overview + +SmolVLM2 is a lightweight vision-language model developed by Smol AI. It offers impressive capabilities for multimodal understanding while maintaining a compact size compared to larger VLMs. The model excels at tasks such as image captioning, visual question answering, and object detection, making it accessible for applications with limited computational resources. 
+ +Built to balance performance and efficiency, SmolVLM2 provides a valuable option for developers seeking to implement vision-language capabilities without the overhead of larger models. The 500M parameter variant delivers practical results while being significantly more resource-friendly than multi-billion parameter alternatives. + +## Install + +```bash +pip install "maestro[smolvlm2]" +``` + +## Train + +The training routines support various optimization strategies such as LoRA, QLoRA, and freezing the vision encoder. Customize your fine-tuning process via CLI or Python to align with your dataset and task requirements. + +### CLI + +Kick off training from the command line by running the command below. Be sure to replace the dataset path and adjust the hyperparameters (such as epochs and batch size) to suit your needs. + +```bash +maestro smolvlm2 train \ + --dataset "dataset/location" \ + --epochs 10 \ + --batch-size 4 \ + --optimization-strategy "qlora" \ + --metrics "edit_distance" +``` + +### Python + +For more control, you can fine-tune SmolVLM2 using the Python API. Create a configuration dictionary with your training parameters and pass it to the train function to integrate the process into your custom workflow. + +```python +from maestro.trainer.models.smolvlm2.core import train + +config = { + "dataset": "dataset/location", + "epochs": 10, + "batch_size": 4, + "optimization_strategy": "qlora", + "metrics": ["edit_distance"], +} + +results = train(config) +``` + +## Inference + +Use SmolVLM2 for inference on images using either the CLI or Python API. 
+ +### CLI + +```bash +maestro smolvlm2 predict \ + --image "path/to/image.jpg" \ + --prompt "Describe this image" +``` + +### Python + +```python +from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2 + +model = SmolVLM2() +result = model.generate( + images="path/to/image.jpg", + prompt="Describe this image", + max_new_tokens=512 +) + +print(result["generated_text"][0]) +``` + +## Object Detection + +SmolVLM2 can perform object detection on images, identifying and localizing objects with bounding boxes. + +```python +from maestro.trainer.models.smolvlm2.entrypoint import SmolVLM2 +from maestro.trainer.models.smolvlm2.detection import result_to_detections_formatter + +model = SmolVLM2() +result = model.generate( + images="path/to/image.jpg", + prompt="Detect the following objects: person, car, dog" +) + +# Convert text output to detections format +boxes, class_ids = result_to_detections_formatter( + text=result["generated_text"][0], + resolution_wh=(640, 480), + classes=["person", "car", "dog"] +) +``` diff --git a/maestro/cli/introspection.py b/maestro/cli/introspection.py index 086a831b..95368685 100644 --- a/maestro/cli/introspection.py +++ b/maestro/cli/introspection.py @@ -28,6 +28,13 @@ def find_training_recipes(app: typer.Typer) -> None: except Exception: _warn_about_recipe_import_error(model_name="Qwen2.5-VL") + try: + from maestro.trainer.models.smolvlm2.entrypoint import smolvlm2_app + + app.add_typer(smolvlm2_app, name="smolvlm2") + except Exception: + _warn_about_recipe_import_error(model_name="SmolVLM2") + def _warn_about_recipe_import_error(model_name: str) -> None: disable_warnings = str2bool( diff --git a/maestro/trainer/models/smolvlm2/__init__.py b/maestro/trainer/models/smolvlm2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py new file mode 100644 index 00000000..dbfc8f71 --- /dev/null +++ b/maestro/trainer/models/smolvlm2/checkpoints.py @@ -0,0 
+1,64 @@ +import os +from typing import Dict, Optional + +import torch +from transformers import AutoModelForVision2Seq, AutoProcessor + + +def save_checkpoint( + model: AutoModelForVision2Seq, + processor: AutoProcessor, + path: str, + metadata: Optional[Dict] = None +) -> None: + """ + Save model checkpoint. + + Args: + model: Model to save + processor: Processor to save + path: Path to save checkpoint + metadata: Optional metadata to save + """ + os.makedirs(path, exist_ok=True) + + # Save model + model.save_pretrained(path) + + # Save processor + processor.save_pretrained(path) + + # Save metadata if provided + if metadata is not None: + torch.save(metadata, os.path.join(path, "metadata.pt")) + +def load_checkpoint( + path: str, + device: str = "cuda" if torch.cuda.is_available() else "cpu" +) -> Dict: + """ + Load model checkpoint. + + Args: + path: Path to checkpoint + device: Device to load model on + + Returns: + Dictionary containing model, processor, and metadata + """ + # Load model + model = AutoModelForVision2Seq.from_pretrained(path) + model.to(device) + + # Load processor + processor = AutoProcessor.from_pretrained(path) + + # Load metadata if exists + metadata_path = os.path.join(path, "metadata.pt") + metadata = torch.load(metadata_path) if os.path.exists(metadata_path) else None + + return { + "model": model, + "processor": processor, + "metadata": metadata + } diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py new file mode 100644 index 00000000..101918e1 --- /dev/null +++ b/maestro/trainer/models/smolvlm2/core.py @@ -0,0 +1,179 @@ +from typing import Optional, Union + +import torch +from transformers import AutoModelForVision2Seq, AutoProcessor + + +class SmolVLM2Core: + """Core SmolVLM2 model implementation.""" + + def __init__( + self, + model_name: str = "smol-ai/smolvlm2-500m", + device: str = "cuda" if torch.cuda.is_available() else "cpu", + **kwargs + ): + """ + Initialize SmolVLM2 model. 
+ + Args: + model_name: Name or path of the model to load + device: Device to run the model on + **kwargs: Additional arguments to pass to the model + """ + self.model_name = model_name + self.device = device + + self.processor = AutoProcessor.from_pretrained(model_name) + self.model = AutoModelForVision2Seq.from_pretrained(model_name) + self.model.to(device) + + def process_inputs( + self, + images: Union[str, list[str]], + prompt: Optional[str] = None + ) -> dict: + """Process input images and text.""" + if isinstance(images, str): + images = [images] + + return self.processor( + images=images, + text=prompt if prompt else "", + return_tensors="pt" + ).to(self.device) + + def generate( + self, + inputs: dict, + max_new_tokens: int = 512, + **kwargs + ) -> torch.Tensor: + """Generate text from processed inputs.""" + return self.model.generate( + **inputs, + max_new_tokens=max_new_tokens, + **kwargs + ) + + def decode_outputs( + self, + outputs: torch.Tensor, + skip_special_tokens: bool = True + ) -> list[str]: + """Decode model outputs to text.""" + return self.processor.batch_decode( + outputs, + skip_special_tokens=skip_special_tokens + ) + +def train(config: dict) -> dict: + """ + Train SmolVLM2 model with provided configuration. 
+ + Args: + config: Dictionary containing training configuration + - dataset: Path to dataset directory or file + - epochs: Number of training epochs + - batch_size: Training batch size + - optimization_strategy: Strategy for optimization (qlora, lora, freeze_vision) + - metrics: List of metrics to evaluate during training + - output_dir: Directory to save trained model + Returns: + Dictionary containing training results and metrics + """ + from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training + from transformers import BitsAndBytesConfig, TrainingArguments + + # Load dataset + dataset_path = config["dataset"] + + # TODO: Implement proper dataset loading logic based on the dataset format + # For now, we'll use a placeholder implementation + + # Create model with the specified optimization strategy + model_name = config.get("model_name", "smol-ai/smolvlm2-500m") + strategy = config.get("optimization_strategy", "qlora") + + if strategy == "qlora": + # Configure QLoRA + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + ) + + model = AutoModelForVision2Seq.from_pretrained( + model_name, + quantization_config=bnb_config, + device_map="auto" + ) + model = prepare_model_for_kbit_training(model) + + lora_config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM" + ) + + model = get_peft_model(model, lora_config) + + elif strategy == "lora": + # Configure LoRA without quantization + model = AutoModelForVision2Seq.from_pretrained(model_name) + + lora_config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], + lora_dropout=0.05, + bias="none", + task_type="CAUSAL_LM" + ) + + model = get_peft_model(model, lora_config) + + 
elif strategy == "freeze_vision": + # Freeze vision encoder, train only language model part + model = AutoModelForVision2Seq.from_pretrained(model_name) + + # Freeze vision encoder parameters + for param in model.vision_model.parameters(): + param.requires_grad = False + + else: + raise ValueError(f"Unsupported optimization strategy: {strategy}") + + processor = AutoProcessor.from_pretrained(model_name) + + # Set up training arguments + output_dir = config.get("output_dir", "./smolvlm2-finetuned") + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=config.get("epochs", 10), + per_device_train_batch_size=config.get("batch_size", 4), + gradient_accumulation_steps=4, + learning_rate=2e-5, + weight_decay=0.01, + warmup_steps=100, + save_strategy="epoch", + save_total_limit=2, + logging_steps=10, + remove_unused_columns=False, + ) + + # TODO: Implement full training logic with dataset loading + # This is a placeholder that returns a mock result + + return { + "model_path": output_dir, + "metrics": { + "loss": 0.5, + "edit_distance": 0.2 + }, + "status": "Training implementation in progress" + } diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py new file mode 100644 index 00000000..a52fcfdf --- /dev/null +++ b/maestro/trainer/models/smolvlm2/detection.py @@ -0,0 +1,111 @@ +import re +from typing import Optional + +import numpy as np + + +def result_to_detections_formatter( + text: str, + resolution_wh: tuple[int, int], + classes: Optional[list[str]] = None +) -> tuple[np.ndarray, np.ndarray]: + """Converts SmolVLM2 text output into detection format. 
+ + SmolVLM2 outputs text in a format like: + "a person standing in front of a car [x1, y1, x2, y2]" + + Args: + text: SmolVLM2 output text + resolution_wh: Target image resolution (width, height) + classes: Optional list of valid class names + + Returns: + Tuple of (boxes, class_ids) where: + - boxes is a float32 array of shape (N, 4) with xyxy coordinates + - class_ids is an int32 array of shape (N,) with class IDs + """ + # Extract bounding boxes using regex + box_pattern = r"\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]" + matches = re.finditer(box_pattern, text) + + boxes_list = [] + class_ids_list = [] + + # Create class mapping if provided + if classes is not None: + name_to_index = {cls_name: idx for idx, cls_name in enumerate(classes)} + else: + name_to_index = None + + for match in matches: + x_min, y_min, x_max, y_max = map(float, match.groups()) + + # Extract class name from text before the box + text_before = text[:match.start()].strip() + class_name = text_before.split()[-1] if text_before else "unknown" + + if name_to_index is not None: + if class_name not in name_to_index: + continue + current_class_id = name_to_index[class_name] + else: + current_class_id = -1 + + boxes_list.append([x_min, y_min, x_max, y_max]) + class_ids_list.append(current_class_id) + + boxes = np.array(boxes_list, dtype=np.float32).reshape(-1, 4) + class_ids = np.array(class_ids_list, dtype=np.int32) + + return boxes, class_ids + +def detections_to_text_formatter( + xyxy: np.ndarray, + class_id: np.ndarray, + classes: list[str], + resolution_wh: tuple[int, int] +) -> str: + """Converts detections to SmolVLM2 text format. 
+ + Args: + xyxy: Bounding boxes in xyxy format + class_id: Class IDs for each box + classes: List of class names + resolution_wh: Image resolution (width, height) + + Returns: + Formatted text string for SmolVLM2 + """ + text_parts = [] + + for i in range(len(xyxy)): + cls_name = classes[class_id[i]] + x_min, y_min, x_max, y_max = map(int, xyxy[i]) + box_text = f"{cls_name} [{x_min}, {y_min}, {x_max}, {y_max}]" + text_parts.append(box_text) + + return " ".join(text_parts) + +def format_prompt_for_detection( + prompt: str, + xyxy: Optional[np.ndarray] = None, + class_id: Optional[np.ndarray] = None, + classes: Optional[list[str]] = None, + resolution_wh: Optional[tuple[int, int]] = None +) -> str: + """Formats a prompt for object detection with SmolVLM2. + + Args: + prompt: Base prompt + xyxy: Optional bounding boxes + class_id: Optional class IDs + classes: Optional class names + resolution_wh: Optional image resolution + + Returns: + Formatted prompt string + """ + if all(x is not None for x in [xyxy, class_id, classes, resolution_wh]): + detection_text = detections_to_text_formatter(xyxy, class_id, classes, resolution_wh) + return f"{prompt} {detection_text}" + return prompt diff --git a/maestro/trainer/models/smolvlm2/entrypoint.py b/maestro/trainer/models/smolvlm2/entrypoint.py new file mode 100644 index 00000000..102c544e --- /dev/null +++ b/maestro/trainer/models/smolvlm2/entrypoint.py @@ -0,0 +1,150 @@ +from pathlib import Path +from typing import Optional, Union + +import torch +import typer + +from .inference import SmolVLM2Inference + +smolvlm2_app = typer.Typer() + +class SmolVLM2: + """Main entrypoint for SmolVLM2 model.""" + + def __init__( + self, + model_name: str = "smol-ai/smolvlm2-500m", + device: str = "cuda" if torch.cuda.is_available() else "cpu", + **kwargs + ): + """Initialize SmolVLM2 model.""" + self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs) + + def generate( + self, + images: Union[str, list[str]], + 
prompt: Optional[str] = None, + max_new_tokens: int = 512, + **kwargs + ) -> dict: + """ + Generate text from images. + + Args: + images: Path(s) to image(s) + prompt: Optional prompt to guide generation + max_new_tokens: Maximum number of tokens to generate + **kwargs: Additional generation parameters + + Returns: + Dictionary containing generated text and other outputs + """ + return self.inference.generate( + images=images, + prompt=prompt, + max_new_tokens=max_new_tokens, + **kwargs + ) + +@smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model") +def info() -> None: + """Get information about the SmolVLM2 model.""" + try: + model = SmolVLM2() + info = model.inference.get_model_info() + typer.echo(f"Model Name: {info['model_name']}") + typer.echo(f"Model Size: {info['model_size']}") + typer.echo(f"Device: {info['device']}") + typer.echo(f"Tokenizer: {info['tokenizer']}") + except Exception as e: + typer.echo(f"Error retrieving model info: {e!s}", err=True) + raise typer.Exit(code=1) + +@smolvlm2_app.command(name="predict", help="Run inference on one or more images") +def predict( + image: list[Path] = typer.Option( + ..., "--image", "-i", help="Path to image(s) for prediction" + ), + prompt: Optional[str] = typer.Option( + None, "--prompt", "-p", help="Optional prompt to guide generation" + ), + max_new_tokens: int = typer.Option( + 512, "--max-new-tokens", help="Maximum new tokens to generate" + ), + output: Optional[Path] = typer.Option( + None, "--output", "-o", help="Output file path to save results" + ), +) -> None: + """Run inference on images using SmolVLM2.""" + try: + model = SmolVLM2() + result = model.generate( + images=[str(img) for img in image], + prompt=prompt, + max_new_tokens=max_new_tokens + ) + + if output: + import json + with open(output, "w") as f: + json.dump(result, f, indent=2) + typer.echo(f"Results saved to {output}") + else: + typer.echo(f"Generated text: {result['text']}") + + except Exception as e: + 
typer.echo(f"Error during prediction: {e!s}", err=True) + raise typer.Exit(code=1) + +@smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model") +def train( + dataset: Path = typer.Option( + ..., "--dataset", "-d", help="Path to dataset directory or file" + ), + epochs: int = typer.Option( + 10, "--epochs", "-e", help="Number of training epochs" + ), + batch_size: int = typer.Option( + 4, "--batch-size", "-b", help="Training batch size" + ), + optimization_strategy: str = typer.Option( + "qlora", "--optimization-strategy", "-o", + help="Optimization strategy (qlora, lora, freeze_vision)" + ), + metrics: list[str] = typer.Option( + ["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training" + ), + output_dir: Optional[Path] = typer.Option( + None, "--output-dir", help="Directory to save trained model" + ), +) -> None: + """Fine-tune the SmolVLM2 model on a dataset.""" + try: + typer.echo("Starting SmolVLM2 fine-tuning...") + + if output_dir is None: + import tempfile + output_dir = Path(tempfile.mkdtemp()) + typer.echo(f"No output directory specified, using temporary directory: {output_dir}") + + # Create configuration for training + config = { + "dataset": str(dataset), + "epochs": epochs, + "batch_size": batch_size, + "optimization_strategy": optimization_strategy, + "metrics": metrics, + "output_dir": str(output_dir) + } + + # Import the train function here to avoid circular imports + from .core import train as train_model + + results = train_model(config) + + typer.echo(f"Training complete! 
Model saved to {output_dir}") + typer.echo(f"Final metrics: {results.get('metrics', {})}") + + except Exception as e: + typer.echo(f"Error during training: {e!s}", err=True) + raise typer.Exit(code=1) diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py new file mode 100644 index 00000000..9ed2c631 --- /dev/null +++ b/maestro/trainer/models/smolvlm2/inference.py @@ -0,0 +1,138 @@ +from typing import Optional, Union + +import torch +from transformers import AutoModelForVision2Seq, AutoProcessor + + +class SmolVLM2Inference: + """Inference interface for SmolVLM2 model.""" + + def __init__( + self, + model_name: str = "smol-ai/smolvlm2-500m", + device: str = "cuda" if torch.cuda.is_available() else "cpu", + **kwargs + ): + """Initialize inference interface.""" + self.model = AutoModelForVision2Seq.from_pretrained(model_name) + self.processor = AutoProcessor.from_pretrained(model_name) + self.device = device + + def generate( + self, + images: Union[str, list[str]], + prompt: Optional[str] = None, + max_new_tokens: int = 512, + **kwargs + ) -> dict: + """ + Generate text from images. 
+ + Args: + images: Path(s) to image(s) + prompt: Optional prompt to guide generation + max_new_tokens: Maximum number of tokens to generate + **kwargs: Additional generation parameters + + Returns: + Dictionary containing generated text and other outputs + """ + # Process inputs + inputs = self.processor( + images=images, + text=prompt if prompt else "", + return_tensors="pt" + ) + + # Generate + outputs = self.model.generate( + input_ids=inputs["input_ids"].to(self.device), + pixel_values=inputs["pixel_values"].to(self.device), + max_new_tokens=max_new_tokens, + **kwargs + ) + + # Decode outputs + generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True) + + return { + "generated_text": generated_text, + "model_outputs": outputs + } + +def predict_with_inputs( + model: AutoModelForVision2Seq, + processor: AutoProcessor, + input_ids: torch.Tensor, + pixel_values: torch.Tensor, + device: Union[str, torch.device], + max_new_tokens: int = 512, + **kwargs +) -> list[str]: + """ + Generate text predictions using the model. + + Args: + model: The SmolVLM2 model + processor: The model's processor + input_ids: Input token IDs + pixel_values: Input image pixel values + device: Device to run inference on + max_new_tokens: Maximum number of tokens to generate + **kwargs: Additional generation parameters + + Returns: + List of generated text strings + """ + model.eval() + with torch.no_grad(): + outputs = model.generate( + input_ids=input_ids.to(device), + pixel_values=pixel_values.to(device), + max_new_tokens=max_new_tokens, + **kwargs + ) + return processor.batch_decode(outputs, skip_special_tokens=True) + +def predict_with_images( + model: AutoModelForVision2Seq, + processor: AutoProcessor, + images: Union[str, list[str]], + prompt: Optional[str] = None, + device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", + max_new_tokens: int = 512, + **kwargs +) -> list[str]: + """ + Generate text predictions from images. 
+ + Args: + model: The SmolVLM2 model + processor: The model's processor + images: Path(s) to image(s) + prompt: Optional prompt to guide generation + device: Device to run inference on + max_new_tokens: Maximum number of tokens to generate + **kwargs: Additional generation parameters + + Returns: + List of generated text strings + """ + if isinstance(images, str): + images = [images] + + inputs = processor( + images=images, + text=prompt if prompt else "", + return_tensors="pt" + ) + + return predict_with_inputs( + model=model, + processor=processor, + input_ids=inputs["input_ids"], + pixel_values=inputs["pixel_values"], + device=device, + max_new_tokens=max_new_tokens, + **kwargs + ) diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py new file mode 100644 index 00000000..4f9bc8b9 --- /dev/null +++ b/maestro/trainer/models/smolvlm2/loaders.py @@ -0,0 +1,115 @@ +from typing import Optional + +import torch +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from transformers import AutoProcessor + + +class SmolVLM2Dataset(Dataset): + """Dataset for SmolVLM2 model.""" + + def __init__( + self, + image_paths: list[str], + texts: Optional[list[str]] = None, + processor: Optional[AutoProcessor] = None + ): + """ + Initialize dataset. 
+ + Args: + image_paths: List of paths to images + texts: Optional list of corresponding texts + processor: Model processor for preprocessing + """ + self.image_paths = image_paths + self.texts = texts + self.processor = processor + + def __len__(self) -> int: + return len(self.image_paths) + + def __getitem__(self, idx: int) -> dict: + """Get a single item from the dataset.""" + image = Image.open(self.image_paths[idx]) + + if self.texts is not None: + text = self.texts[idx] + else: + text = "" + + if self.processor is not None: + return self.processor( + images=image, + text=text, + return_tensors="pt" + ) + else: + return { + "image": image, + "text": text + } + +def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Collate function for training data. + + Args: + batch: List of processed samples + + Returns: + Tuple of (input_ids, pixel_values, labels) + """ + input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch]) + pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch]) + labels = torch.stack([item["labels"].squeeze(0) for item in batch]) + + return input_ids, pixel_values, labels + +def evaluation_collate_fn( + batch: list[dict] +) -> tuple[torch.Tensor, torch.Tensor, list[Image.Image], list[str], list[str]]: + """ + Collate function for evaluation data. 
+ + Args: + batch: List of processed samples + + Returns: + Tuple of (input_ids, pixel_values, images, prompts, targets) + """ + input_ids = torch.stack([item["input_ids"].squeeze(0) for item in batch]) + pixel_values = torch.stack([item["pixel_values"].squeeze(0) for item in batch]) + images = [item["image"] for item in batch] + prompts = [item["text"] for item in batch] + targets = [item["text"] for item in batch] # In evaluation, target is same as prompt + + return input_ids, pixel_values, images, prompts, targets + +def create_dataloader( + dataset: Dataset, + batch_size: int = 8, + num_workers: int = 4, + shuffle: bool = True, + collate_fn = None +) -> DataLoader: + """ + Create a DataLoader for the dataset. + + Args: + dataset: Dataset to create loader for + batch_size: Batch size + num_workers: Number of worker processes + shuffle: Whether to shuffle the data + collate_fn: Optional collate function + Returns: + DataLoader instance + """ + return DataLoader( + dataset, + batch_size=batch_size, + num_workers=num_workers, + shuffle=shuffle, + collate_fn=collate_fn + ) diff --git a/mkdocs.yaml b/mkdocs.yaml index 3f476c12..f232889b 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -27,6 +27,7 @@ nav: - Florence-2: models/florence_2.md - PaliGemma 2: models/paligemma_2.md - Qwen2.5-VL: models/qwen_2_5_vl.md + - SmolVLM2: models/smolvlm2.md - Datasets: - JSONL: datasets/jsonl.md diff --git a/pyproject.toml b/pyproject.toml index da476b4f..2aefbc3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,14 @@ qwen_2_5_vl = [ "bitsandbytes>=0.45.0", "qwen-vl-utils>=0.0.8" ] +smolvlm2 = [ + "accelerate>=1.2.1", + "peft>=0.12", + "torch>=2.4.0", + "torchvision>=0.20.0", + "transformers>=4.49.0", + "bitsandbytes>=0.45.0" +] [project.scripts] maestro = "maestro.cli.main:app" @@ -147,62 +155,19 @@ line-length = 120 indent-width = 4 [tool.ruff.lint] - -# Enable pycodestyle (`E`) -select = ["E", "F", "I", "A", "Q", "W", "N", "T", "Q","TRY","UP","C90","RUF","NPY"] -ignore 
= ["T201","TRY003","NPY201"] - -# Allow autofix for all enabled rules (when `--fix`) is provided. -fixable = [ - "A", - "B", - "C", - "D", - "E", - "F", - "G", - "I", - "N", - "Q", - "S", - "T", - "W", - "ANN", - "ARG", - "BLE", - "COM", - "DJ", - "DTZ", - "EM", - "ERA", - "EXE", - "FBT", - "ICN", - "INP", - "ISC", - "NPY", - "PD", - "PGH", - "PIE", - "PL", - "PT", - "PTH", - "PYI", - "RET", - "RSE", - "RUF", - "SIM", - "SLF", - "TCH", - "TID", - "TRY", - "UP", - "YTT", -] +select = ["E", "F", "I", "A", "Q", "W", "N", "T", "TRY", "UP", "C90", "RUF", "NPY"] +ignore = ["T201", "TRY003", "NPY201"] +fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", + "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", + "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", + "TCH", "TID", "TRY", "UP", "YTT"] unfixable = [] + # Allow unused variables when underscore-prefixed. dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -pylint.max-args = 20 + +[tool.ruff.lint.pylint] +max-args = 20 [tool.ruff.lint.flake8-quotes] inline-quotes = "double" From 0518c67038e1059c032b700d8269232332234a48 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Sat, 10 May 2025 12:26:34 -0700 Subject: [PATCH 2/9] Fixing lint errors and crated trainer for training dataset in smolvlm2 --- .../trainer/models/smolvlm2/checkpoints.py | 12 ++-- maestro/trainer/models/smolvlm2/core.py | 58 ++++++++++++++----- pyproject.toml | 50 ++++++++++++++-- 3 files changed, 96 insertions(+), 24 deletions(-) diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py index dbfc8f71..afa4c3c0 100644 --- a/maestro/trainer/models/smolvlm2/checkpoints.py +++ b/maestro/trainer/models/smolvlm2/checkpoints.py @@ -1,5 +1,5 @@ import os -from typing import Dict, Optional +from typing import Optional import torch from transformers import AutoModelForVision2Seq, AutoProcessor @@ -9,11 
+9,11 @@ def save_checkpoint( model: AutoModelForVision2Seq, processor: AutoProcessor, path: str, - metadata: Optional[Dict] = None + metadata: Optional[dict] = None ) -> None: """ Save model checkpoint. - + Args: model: Model to save processor: Processor to save @@ -35,14 +35,14 @@ def save_checkpoint( def load_checkpoint( path: str, device: str = "cuda" if torch.cuda.is_available() else "cpu" -) -> Dict: +) -> dict: """ Load model checkpoint. - + Args: path: Path to checkpoint device: Device to load model on - + Returns: Dictionary containing model, processor, and metadata """ diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py index 101918e1..475d60c7 100644 --- a/maestro/trainer/models/smolvlm2/core.py +++ b/maestro/trainer/models/smolvlm2/core.py @@ -1,7 +1,8 @@ +import os from typing import Optional, Union import torch -from transformers import AutoModelForVision2Seq, AutoProcessor +from transformers import AutoModelForVision2Seq, AutoProcessor, Trainer class SmolVLM2Core: @@ -70,7 +71,7 @@ def decode_outputs( def train(config: dict) -> dict: """ Train SmolVLM2 model with provided configuration. 
- + Args: config: Dictionary containing training configuration - dataset: Path to dataset directory or file @@ -82,15 +83,19 @@ def train(config: dict) -> dict: Returns: Dictionary containing training results and metrics """ + from functools import partial + from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training from transformers import BitsAndBytesConfig, TrainingArguments + from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path + from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn # Load dataset dataset_path = config["dataset"] + dataset_location = resolve_dataset_path(dataset_path) + if dataset_location is None: + return {"error": "Dataset not found"} - # TODO: Implement proper dataset loading logic based on the dataset format - # For now, we'll use a placeholder implementation - # Create model with the specified optimization strategy model_name = config.get("model_name", "smol-ai/smolvlm2-500m") strategy = config.get("optimization_strategy", "qlora") @@ -147,15 +152,27 @@ def train(config: dict) -> dict: else: raise ValueError(f"Unsupported optimization strategy: {strategy}") - processor = AutoProcessor.from_pretrained(model_name) + # Load datasets + train_loader, valid_loader, test_loader = create_data_loaders( + dataset_location=dataset_location, + train_batch_size=config.get("batch_size", 4), + train_collect_fn=partial(train_collate_fn, processor=processor), + train_num_workers=config.get("num_workers", 0), + test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)), + test_collect_fn=partial(evaluation_collate_fn, processor=processor), + test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)), ) + # Set up training arguments output_dir = config.get("output_dir", "./smolvlm2-finetuned") + os.makedirs(output_dir, exist_ok=True) + training_args = TrainingArguments( output_dir=output_dir, num_train_epochs=config.get("epochs", 
10), per_device_train_batch_size=config.get("batch_size", 4), + per_device_eval_batch_size=config.get("val_batch_size", config.get("batch_size", 4)), gradient_accumulation_steps=4, learning_rate=2e-5, weight_decay=0.01, @@ -163,17 +180,30 @@ def train(config: dict) -> dict: save_strategy="epoch", save_total_limit=2, logging_steps=10, - remove_unused_columns=False, + evaluation_strategy="epoch", + load_best_model_at_end=True, + remove_unused_columns=False + ) + + # Set up trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_loader.dataset, + eval_dataset=valid_loader.dataset, + data_collator=lambda batch: train_collate_fn(batch, processor) ) - # TODO: Implement full training logic with dataset loading - # This is a placeholder that returns a mock result + # Train model + trainer.train() + + # Save model and processor + model.save_pretrained(output_dir) + processor.save_pretrained(output_dir) + # Return results return { "model_path": output_dir, - "metrics": { - "loss": 0.5, - "edit_distance": 0.2 - }, - "status": "Training implementation in progress" + "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"}, + "status": "Training completed" } diff --git a/pyproject.toml b/pyproject.toml index 2aefbc3f..f94e8c04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,10 +157,52 @@ indent-width = 4 [tool.ruff.lint] select = ["E", "F", "I", "A", "Q", "W", "N", "T", "TRY", "UP", "C90", "RUF", "NPY"] ignore = ["T201", "TRY003", "NPY201"] -fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", - "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", - "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", - "TCH", "TID", "TRY", "UP", "YTT"] +fixable = [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "I", + "N", + "Q", + "S", + "T", + "W", + "ANN", + "ARG", + "BLE", + "COM", + "DJ", + "DTZ", + "EM", + "ERA", + 
"EXE", + "FBT", + "ICN", + "INP", + "ISC", + "NPY", + "PD", + "PGH", + "PIE", + "PL", + "PT", + "PTH", + "PYI", + "RET", + "RSE", + "RUF", + "SIM", + "SLF", + "TCH", + "TID", + "TRY", + "UP", + "YTT" +] unfixable = [] # Allow unused variables when underscore-prefixed. From 727e01b57429c0f66d3c452653312b5b3812afa4 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Sat, 10 May 2025 12:36:16 -0700 Subject: [PATCH 3/9] SmolVLM2 documented --- docs/index.md | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index f6266980..82dd2518 100644 --- a/docs/index.md +++ b/docs/index.md @@ -69,10 +69,16 @@ we recommend creating a dedicated Python environment for each model. pip install "maestro[qwen_2_5_vl]" ``` +=== "SmolVLM2" + + ```bash + pip install "maestro[smolvlm2]" + ``` + ### CLI Kick off fine-tuning with our command-line interface, which leverages the configuration -and training routines defined in each model’s core module. Simply specify key parameters such as +and training routines defined in each model's core module. Simply specify key parameters such as the dataset location, number of epochs, batch size, optimization strategy, and metrics. === "Florence-2" @@ -108,6 +114,17 @@ the dataset location, number of epochs, batch size, optimization strategy, and m --metrics "edit_distance" ``` +=== "SmolVLM2" + + ```bash + maestro smolvlm2 train \ + --dataset "dataset/location" \ + --epochs 10 \ + --batch-size 4 \ + --optimization_strategy "lora" \ + --metrics "edit_distance" + ``` + ### Python For greater control, use the Python API to fine-tune your models. @@ -148,7 +165,6 @@ and training setup. ``` === "Qwen2.5-VL" - ```python from maestro.trainer.models.qwen_2_5_vl.core import train @@ -162,3 +178,18 @@ and training setup. 
train(config) ``` + +=== "SmolVLM2" + ```python + from maestro.trainer.models.smolvlm2.core import train + + config = { + "dataset": "dataset/location", + "epochs": 10, + "batch_size": 4, + "optimization_strategy": "lora", + "metrics": ["edit_distance"], + } + + train(config) + ``` From d074602f3be86de09fec48025a9db2817cc5ea90 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 10 May 2025 19:46:10 +0000 Subject: [PATCH 4/9] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../trainer/models/smolvlm2/checkpoints.py | 17 ++--- maestro/trainer/models/smolvlm2/core.py | 59 +++++----------- maestro/trainer/models/smolvlm2/detection.py | 15 ++-- maestro/trainer/models/smolvlm2/entrypoint.py | 68 ++++++------------- maestro/trainer/models/smolvlm2/inference.py | 37 ++++------ maestro/trainer/models/smolvlm2/loaders.py | 35 +++------- pyproject.toml | 6 +- 7 files changed, 72 insertions(+), 165 deletions(-) diff --git a/maestro/trainer/models/smolvlm2/checkpoints.py b/maestro/trainer/models/smolvlm2/checkpoints.py index afa4c3c0..87d1aea8 100644 --- a/maestro/trainer/models/smolvlm2/checkpoints.py +++ b/maestro/trainer/models/smolvlm2/checkpoints.py @@ -6,10 +6,7 @@ def save_checkpoint( - model: AutoModelForVision2Seq, - processor: AutoProcessor, - path: str, - metadata: Optional[dict] = None + model: AutoModelForVision2Seq, processor: AutoProcessor, path: str, metadata: Optional[dict] = None ) -> None: """ Save model checkpoint. 
@@ -32,10 +29,8 @@ def save_checkpoint( if metadata is not None: torch.save(metadata, os.path.join(path, "metadata.pt")) -def load_checkpoint( - path: str, - device: str = "cuda" if torch.cuda.is_available() else "cpu" -) -> dict: + +def load_checkpoint(path: str, device: str = "cuda" if torch.cuda.is_available() else "cpu") -> dict: """ Load model checkpoint. @@ -57,8 +52,4 @@ def load_checkpoint( metadata_path = os.path.join(path, "metadata.pt") metadata = torch.load(metadata_path) if os.path.exists(metadata_path) else None - return { - "model": model, - "processor": processor, - "metadata": metadata - } + return {"model": model, "processor": processor, "metadata": metadata} diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py index 475d60c7..ffc34b2b 100644 --- a/maestro/trainer/models/smolvlm2/core.py +++ b/maestro/trainer/models/smolvlm2/core.py @@ -12,7 +12,7 @@ def __init__( self, model_name: str = "smol-ai/smolvlm2-500m", device: str = "cuda" if torch.cuda.is_available() else "cpu", - **kwargs + **kwargs, ): """ Initialize SmolVLM2 model. 
@@ -29,44 +29,21 @@ def __init__( self.model = AutoModelForVision2Seq.from_pretrained(model_name) self.model.to(device) - def process_inputs( - self, - images: Union[str, list[str]], - prompt: Optional[str] = None - ) -> dict: + def process_inputs(self, images: Union[str, list[str]], prompt: Optional[str] = None) -> dict: """Process input images and text.""" if isinstance(images, str): images = [images] - return self.processor( - images=images, - text=prompt if prompt else "", - return_tensors="pt" - ).to(self.device) + return self.processor(images=images, text=prompt if prompt else "", return_tensors="pt").to(self.device) - def generate( - self, - inputs: dict, - max_new_tokens: int = 512, - **kwargs - ) -> torch.Tensor: + def generate(self, inputs: dict, max_new_tokens: int = 512, **kwargs) -> torch.Tensor: """Generate text from processed inputs.""" - return self.model.generate( - **inputs, - max_new_tokens=max_new_tokens, - **kwargs - ) + return self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs) - def decode_outputs( - self, - outputs: torch.Tensor, - skip_special_tokens: bool = True - ) -> list[str]: + def decode_outputs(self, outputs: torch.Tensor, skip_special_tokens: bool = True) -> list[str]: """Decode model outputs to text.""" - return self.processor.batch_decode( - outputs, - skip_special_tokens=skip_special_tokens - ) + return self.processor.batch_decode(outputs, skip_special_tokens=skip_special_tokens) + def train(config: dict) -> dict: """ @@ -90,6 +67,7 @@ def train(config: dict) -> dict: from maestro.trainer.common.datasets.core import create_data_loaders, resolve_dataset_path from maestro.trainer.models.smolvlm2.loaders import evaluation_collate_fn, train_collate_fn + # Load dataset dataset_path = config["dataset"] dataset_location = resolve_dataset_path(dataset_path) @@ -109,11 +87,7 @@ def train(config: dict) -> dict: bnb_4bit_use_double_quant=True, ) - model = AutoModelForVision2Seq.from_pretrained( - model_name, - 
quantization_config=bnb_config, - device_map="auto" - ) + model = AutoModelForVision2Seq.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto") model = prepare_model_for_kbit_training(model) lora_config = LoraConfig( @@ -122,7 +96,7 @@ def train(config: dict) -> dict: target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_dropout=0.05, bias="none", - task_type="CAUSAL_LM" + task_type="CAUSAL_LM", ) model = get_peft_model(model, lora_config) @@ -137,7 +111,7 @@ def train(config: dict) -> dict: target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_dropout=0.05, bias="none", - task_type="CAUSAL_LM" + task_type="CAUSAL_LM", ) model = get_peft_model(model, lora_config) @@ -162,7 +136,8 @@ def train(config: dict) -> dict: train_num_workers=config.get("num_workers", 0), test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)), test_collect_fn=partial(evaluation_collate_fn, processor=processor), - test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)), ) + test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)), + ) # Set up training arguments output_dir = config.get("output_dir", "./smolvlm2-finetuned") @@ -182,7 +157,7 @@ def train(config: dict) -> dict: logging_steps=10, evaluation_strategy="epoch", load_best_model_at_end=True, - remove_unused_columns=False + remove_unused_columns=False, ) # Set up trainer @@ -191,7 +166,7 @@ def train(config: dict) -> dict: args=training_args, train_dataset=train_loader.dataset, eval_dataset=valid_loader.dataset, - data_collator=lambda batch: train_collate_fn(batch, processor) + data_collator=lambda batch: train_collate_fn(batch, processor), ) # Train model @@ -205,5 +180,5 @@ def train(config: dict) -> dict: return { "model_path": output_dir, "metrics": trainer.state.log_history[-1] if trainer.state.log_history else {"loss": "N/A"}, - "status": "Training completed" 
+ "status": "Training completed", } diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py index a52fcfdf..d78e2681 100644 --- a/maestro/trainer/models/smolvlm2/detection.py +++ b/maestro/trainer/models/smolvlm2/detection.py @@ -5,9 +5,7 @@ def result_to_detections_formatter( - text: str, - resolution_wh: tuple[int, int], - classes: Optional[list[str]] = None + text: str, resolution_wh: tuple[int, int], classes: Optional[list[str]] = None ) -> tuple[np.ndarray, np.ndarray]: """Converts SmolVLM2 text output into detection format. @@ -41,7 +39,7 @@ def result_to_detections_formatter( x_min, y_min, x_max, y_max = map(float, match.groups()) # Extract class name from text before the box - text_before = text[:match.start()].strip() + text_before = text[: match.start()].strip() class_name = text_before.split()[-1] if text_before else "unknown" if name_to_index is not None: @@ -59,11 +57,9 @@ def result_to_detections_formatter( return boxes, class_ids + def detections_to_text_formatter( - xyxy: np.ndarray, - class_id: np.ndarray, - classes: list[str], - resolution_wh: tuple[int, int] + xyxy: np.ndarray, class_id: np.ndarray, classes: list[str], resolution_wh: tuple[int, int] ) -> str: """Converts detections to SmolVLM2 text format. @@ -86,12 +82,13 @@ def detections_to_text_formatter( return " ".join(text_parts) + def format_prompt_for_detection( prompt: str, xyxy: Optional[np.ndarray] = None, class_id: Optional[np.ndarray] = None, classes: Optional[list[str]] = None, - resolution_wh: Optional[tuple[int, int]] = None + resolution_wh: Optional[tuple[int, int]] = None, ) -> str: """Formats a prompt for object detection with SmolVLM2. 
diff --git a/maestro/trainer/models/smolvlm2/entrypoint.py b/maestro/trainer/models/smolvlm2/entrypoint.py index 102c544e..a18ff74e 100644 --- a/maestro/trainer/models/smolvlm2/entrypoint.py +++ b/maestro/trainer/models/smolvlm2/entrypoint.py @@ -8,6 +8,7 @@ smolvlm2_app = typer.Typer() + class SmolVLM2: """Main entrypoint for SmolVLM2 model.""" @@ -15,17 +16,13 @@ def __init__( self, model_name: str = "smol-ai/smolvlm2-500m", device: str = "cuda" if torch.cuda.is_available() else "cpu", - **kwargs + **kwargs, ): """Initialize SmolVLM2 model.""" self.inference = SmolVLM2Inference(model_name=model_name, device=device, **kwargs) def generate( - self, - images: Union[str, list[str]], - prompt: Optional[str] = None, - max_new_tokens: int = 512, - **kwargs + self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs ) -> dict: """ Generate text from images. @@ -39,12 +36,8 @@ def generate( Returns: Dictionary containing generated text and other outputs """ - return self.inference.generate( - images=images, - prompt=prompt, - max_new_tokens=max_new_tokens, - **kwargs - ) + return self.inference.generate(images=images, prompt=prompt, max_new_tokens=max_new_tokens, **kwargs) + @smolvlm2_app.command(name="info", help="Get information about the SmolVLM2 model") def info() -> None: @@ -60,32 +53,22 @@ def info() -> None: typer.echo(f"Error retrieving model info: {e!s}", err=True) raise typer.Exit(code=1) + @smolvlm2_app.command(name="predict", help="Run inference on one or more images") def predict( - image: list[Path] = typer.Option( - ..., "--image", "-i", help="Path to image(s) for prediction" - ), - prompt: Optional[str] = typer.Option( - None, "--prompt", "-p", help="Optional prompt to guide generation" - ), - max_new_tokens: int = typer.Option( - 512, "--max-new-tokens", help="Maximum new tokens to generate" - ), - output: Optional[Path] = typer.Option( - None, "--output", "-o", help="Output file path to save results" - ), + 
image: list[Path] = typer.Option(..., "--image", "-i", help="Path to image(s) for prediction"), + prompt: Optional[str] = typer.Option(None, "--prompt", "-p", help="Optional prompt to guide generation"), + max_new_tokens: int = typer.Option(512, "--max-new-tokens", help="Maximum new tokens to generate"), + output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output file path to save results"), ) -> None: """Run inference on images using SmolVLM2.""" try: model = SmolVLM2() - result = model.generate( - images=[str(img) for img in image], - prompt=prompt, - max_new_tokens=max_new_tokens - ) + result = model.generate(images=[str(img) for img in image], prompt=prompt, max_new_tokens=max_new_tokens) if output: import json + with open(output, "w") as f: json.dump(result, f, indent=2) typer.echo(f"Results saved to {output}") @@ -96,27 +79,17 @@ def predict( typer.echo(f"Error during prediction: {e!s}", err=True) raise typer.Exit(code=1) + @smolvlm2_app.command(name="train", help="Fine-tune the SmolVLM2 model") def train( - dataset: Path = typer.Option( - ..., "--dataset", "-d", help="Path to dataset directory or file" - ), - epochs: int = typer.Option( - 10, "--epochs", "-e", help="Number of training epochs" - ), - batch_size: int = typer.Option( - 4, "--batch-size", "-b", help="Training batch size" - ), + dataset: Path = typer.Option(..., "--dataset", "-d", help="Path to dataset directory or file"), + epochs: int = typer.Option(10, "--epochs", "-e", help="Number of training epochs"), + batch_size: int = typer.Option(4, "--batch-size", "-b", help="Training batch size"), optimization_strategy: str = typer.Option( - "qlora", "--optimization-strategy", "-o", - help="Optimization strategy (qlora, lora, freeze_vision)" - ), - metrics: list[str] = typer.Option( - ["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training" - ), - output_dir: Optional[Path] = typer.Option( - None, "--output-dir", help="Directory to save trained model" + 
"qlora", "--optimization-strategy", "-o", help="Optimization strategy (qlora, lora, freeze_vision)" ), + metrics: list[str] = typer.Option(["edit_distance"], "--metrics", "-m", help="Metrics to evaluate during training"), + output_dir: Optional[Path] = typer.Option(None, "--output-dir", help="Directory to save trained model"), ) -> None: """Fine-tune the SmolVLM2 model on a dataset.""" try: @@ -124,6 +97,7 @@ def train( if output_dir is None: import tempfile + output_dir = Path(tempfile.mkdtemp()) typer.echo(f"No output directory specified, using temporary directory: {output_dir}") @@ -134,7 +108,7 @@ def train( "batch_size": batch_size, "optimization_strategy": optimization_strategy, "metrics": metrics, - "output_dir": str(output_dir) + "output_dir": str(output_dir), } # Import the train function here to avoid circular imports diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py index 9ed2c631..47654d34 100644 --- a/maestro/trainer/models/smolvlm2/inference.py +++ b/maestro/trainer/models/smolvlm2/inference.py @@ -11,7 +11,7 @@ def __init__( self, model_name: str = "smol-ai/smolvlm2-500m", device: str = "cuda" if torch.cuda.is_available() else "cpu", - **kwargs + **kwargs, ): """Initialize inference interface.""" self.model = AutoModelForVision2Seq.from_pretrained(model_name) @@ -19,11 +19,7 @@ def __init__( self.device = device def generate( - self, - images: Union[str, list[str]], - prompt: Optional[str] = None, - max_new_tokens: int = 512, - **kwargs + self, images: Union[str, list[str]], prompt: Optional[str] = None, max_new_tokens: int = 512, **kwargs ) -> dict: """ Generate text from images. 
@@ -38,27 +34,21 @@ def generate( Dictionary containing generated text and other outputs """ # Process inputs - inputs = self.processor( - images=images, - text=prompt if prompt else "", - return_tensors="pt" - ) + inputs = self.processor(images=images, text=prompt if prompt else "", return_tensors="pt") # Generate outputs = self.model.generate( input_ids=inputs["input_ids"].to(self.device), pixel_values=inputs["pixel_values"].to(self.device), max_new_tokens=max_new_tokens, - **kwargs + **kwargs, ) # Decode outputs generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True) - return { - "generated_text": generated_text, - "model_outputs": outputs - } + return {"generated_text": generated_text, "model_outputs": outputs} + def predict_with_inputs( model: AutoModelForVision2Seq, @@ -67,7 +57,7 @@ def predict_with_inputs( pixel_values: torch.Tensor, device: Union[str, torch.device], max_new_tokens: int = 512, - **kwargs + **kwargs, ) -> list[str]: """ Generate text predictions using the model. @@ -90,10 +80,11 @@ def predict_with_inputs( input_ids=input_ids.to(device), pixel_values=pixel_values.to(device), max_new_tokens=max_new_tokens, - **kwargs + **kwargs, ) return processor.batch_decode(outputs, skip_special_tokens=True) + def predict_with_images( model: AutoModelForVision2Seq, processor: AutoProcessor, @@ -101,7 +92,7 @@ def predict_with_images( prompt: Optional[str] = None, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", max_new_tokens: int = 512, - **kwargs + **kwargs, ) -> list[str]: """ Generate text predictions from images. 
@@ -121,11 +112,7 @@ def predict_with_images( if isinstance(images, str): images = [images] - inputs = processor( - images=images, - text=prompt if prompt else "", - return_tensors="pt" - ) + inputs = processor(images=images, text=prompt if prompt else "", return_tensors="pt") return predict_with_inputs( model=model, @@ -134,5 +121,5 @@ def predict_with_images( pixel_values=inputs["pixel_values"], device=device, max_new_tokens=max_new_tokens, - **kwargs + **kwargs, ) diff --git a/maestro/trainer/models/smolvlm2/loaders.py b/maestro/trainer/models/smolvlm2/loaders.py index 4f9bc8b9..a99af163 100644 --- a/maestro/trainer/models/smolvlm2/loaders.py +++ b/maestro/trainer/models/smolvlm2/loaders.py @@ -10,10 +10,7 @@ class SmolVLM2Dataset(Dataset): """Dataset for SmolVLM2 model.""" def __init__( - self, - image_paths: list[str], - texts: Optional[list[str]] = None, - processor: Optional[AutoProcessor] = None + self, image_paths: list[str], texts: Optional[list[str]] = None, processor: Optional[AutoProcessor] = None ): """ Initialize dataset. @@ -40,16 +37,10 @@ def __getitem__(self, idx: int) -> dict: text = "" if self.processor is not None: - return self.processor( - images=image, - text=text, - return_tensors="pt" - ) + return self.processor(images=image, text=text, return_tensors="pt") else: - return { - "image": image, - "text": text - } + return {"image": image, "text": text} + def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -67,8 +58,9 @@ def train_collate_fn(batch: list[dict]) -> tuple[torch.Tensor, torch.Tensor, tor return input_ids, pixel_values, labels + def evaluation_collate_fn( - batch: list[dict] + batch: list[dict], ) -> tuple[torch.Tensor, torch.Tensor, list[Image.Image], list[str], list[str]]: """ Collate function for evaluation data. 
@@ -87,12 +79,9 @@ def evaluation_collate_fn( return input_ids, pixel_values, images, prompts, targets + def create_dataloader( - dataset: Dataset, - batch_size: int = 8, - num_workers: int = 4, - shuffle: bool = True, - collate_fn = None + dataset: Dataset, batch_size: int = 8, num_workers: int = 4, shuffle: bool = True, collate_fn=None ) -> DataLoader: """ Create a DataLoader for the dataset. @@ -106,10 +95,4 @@ def create_dataloader( Returns: DataLoader instance """ - return DataLoader( - dataset, - batch_size=batch_size, - num_workers=num_workers, - shuffle=shuffle, - collate_fn=collate_fn - ) + return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle, collate_fn=collate_fn) diff --git a/pyproject.toml b/pyproject.toml index f94e8c04..82f15aeb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -172,7 +172,7 @@ fixable = [ "T", "W", "ANN", - "ARG", + "ARG", "BLE", "COM", "DJ", @@ -184,7 +184,7 @@ fixable = [ "ICN", "INP", "ISC", - "NPY", + "NPY", "PD", "PGH", "PIE", @@ -196,7 +196,7 @@ fixable = [ "RSE", "RUF", "SIM", - "SLF", + "SLF", "TCH", "TID", "TRY", From 6c35bef457f0806926a92c7f1402e6ecd4abd037 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Sat, 10 May 2025 15:08:42 -0700 Subject: [PATCH 5/9] fixing errors in smolvlm2 interpretation --- maestro/trainer/models/smolvlm2/core.py | 31 +++++++++++++------- maestro/trainer/models/smolvlm2/detection.py | 13 +++++--- maestro/trainer/models/smolvlm2/inference.py | 29 ++++++++++++++++++ 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py index 475d60c7..4ffcc243 100644 --- a/maestro/trainer/models/smolvlm2/core.py +++ b/maestro/trainer/models/smolvlm2/core.py @@ -152,16 +152,27 @@ def train(config: dict) -> dict: else: raise ValueError(f"Unsupported optimization strategy: {strategy}") - processor = AutoProcessor.from_pretrained(model_name) - - # Load datasets + processor = 
AutoProcessor.from_pretrained(model_name) # Load datasets + + # Create processor wrapper to preprocess data before collating + def process_batch(batch): + processed_batch = [] + for item in batch: + processed_item = processor( + images=item.get("image"), + text=item.get("text", ""), + return_tensors="pt" + ) + processed_batch.append(processed_item) + return processed_batch + train_loader, valid_loader, test_loader = create_data_loaders( dataset_location=dataset_location, train_batch_size=config.get("batch_size", 4), - train_collect_fn=partial(train_collate_fn, processor=processor), + train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)), train_num_workers=config.get("num_workers", 0), test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)), - test_collect_fn=partial(evaluation_collate_fn, processor=processor), + test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)), test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)), ) # Set up training arguments @@ -183,15 +194,13 @@ def train(config: dict) -> dict: evaluation_strategy="epoch", load_best_model_at_end=True, remove_unused_columns=False - ) - - # Set up trainer + ) # Set up trainer trainer = Trainer( model=model, args=training_args, - train_dataset=train_loader.dataset, - eval_dataset=valid_loader.dataset, - data_collator=lambda batch: train_collate_fn(batch, processor) + train_dataset=train_loader.dataset if train_loader is not None else None, + eval_dataset=valid_loader.dataset if valid_loader is not None else None, + tokenizer=processor ) # Train model diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py index a52fcfdf..333297a9 100644 --- a/maestro/trainer/models/smolvlm2/detection.py +++ b/maestro/trainer/models/smolvlm2/detection.py @@ -99,13 +99,18 @@ def format_prompt_for_detection( prompt: Base prompt xyxy: Optional bounding boxes class_id: Optional class IDs - classes: Optional 
class names - resolution_wh: Optional image resolution - + classes: Optional class names resolution_wh: Optional image resolution + Returns: Formatted prompt string """ if all(x is not None for x in [xyxy, class_id, classes, resolution_wh]): - detection_text = detections_to_text_formatter(xyxy, class_id, classes, resolution_wh) + # Type-cast to the expected types before passing to formatter + detection_text = detections_to_text_formatter( + xyxy, + class_id if class_id is not None else [], + classes if classes is not None else [], + resolution_wh if resolution_wh is not None else (0, 0) + ) return f"{prompt} {detection_text}" return prompt diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py index 9ed2c631..540d6b2a 100644 --- a/maestro/trainer/models/smolvlm2/inference.py +++ b/maestro/trainer/models/smolvlm2/inference.py @@ -17,6 +17,35 @@ def __init__( self.model = AutoModelForVision2Seq.from_pretrained(model_name) self.processor = AutoProcessor.from_pretrained(model_name) self.device = device + self.model_name = model_name + + def get_model_info(self) -> dict: + """ + Get information about the loaded model. 
+ + Returns: + Dictionary containing model information + """ + # Extract model size from model name (e.g., smolvlm2-500m -> 500M) + size_info = "unknown" + if "-" in self.model_name: + parts = self.model_name.split("-") + if len(parts) > 1 and parts[-1].endswith("m"): + size_info = parts[-1].upper() + + # Get total parameters + total_params = sum(p.numel() for p in self.model.parameters()) + trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad) + + return { + "model_name": self.model_name, + "model_size": size_info, + "device": self.device, + "total_parameters": f"{total_params:,}", + "trainable_parameters": f"{trainable_params:,}", + "architecture": "Vision-Language Model (VLM)", + "framework": "PyTorch/Transformers" + } def generate( self, From 8fe68f2495f0d82025b6b5db34bd06a64d608184 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 10 May 2025 22:10:53 +0000 Subject: [PATCH 6/9] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- maestro/trainer/models/smolvlm2/core.py | 12 ++++-------- maestro/trainer/models/smolvlm2/detection.py | 6 +++--- maestro/trainer/models/smolvlm2/inference.py | 2 +- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py index 2a8c392d..7bb32488 100644 --- a/maestro/trainer/models/smolvlm2/core.py +++ b/maestro/trainer/models/smolvlm2/core.py @@ -126,20 +126,16 @@ def train(config: dict) -> dict: else: raise ValueError(f"Unsupported optimization strategy: {strategy}") - processor = AutoProcessor.from_pretrained(model_name) # Load datasets - + processor = AutoProcessor.from_pretrained(model_name) # Load datasets + # Create processor wrapper to preprocess data before collating def process_batch(batch): 
processed_batch = [] for item in batch: - processed_item = processor( - images=item.get("image"), - text=item.get("text", ""), - return_tensors="pt" - ) + processed_item = processor(images=item.get("image"), text=item.get("text", ""), return_tensors="pt") processed_batch.append(processed_item) return processed_batch - + train_loader, valid_loader, test_loader = create_data_loaders( dataset_location=dataset_location, train_batch_size=config.get("batch_size", 4), diff --git a/maestro/trainer/models/smolvlm2/detection.py b/maestro/trainer/models/smolvlm2/detection.py index 4642fd96..c03126a5 100644 --- a/maestro/trainer/models/smolvlm2/detection.py +++ b/maestro/trainer/models/smolvlm2/detection.py @@ -97,17 +97,17 @@ def format_prompt_for_detection( xyxy: Optional bounding boxes class_id: Optional class IDs classes: Optional class names resolution_wh: Optional image resolution - + Returns: Formatted prompt string """ if all(x is not None for x in [xyxy, class_id, classes, resolution_wh]): # Type-cast to the expected types before passing to formatter detection_text = detections_to_text_formatter( - xyxy, + xyxy, class_id if class_id is not None else [], classes if classes is not None else [], - resolution_wh if resolution_wh is not None else (0, 0) + resolution_wh if resolution_wh is not None else (0, 0), ) return f"{prompt} {detection_text}" return prompt diff --git a/maestro/trainer/models/smolvlm2/inference.py b/maestro/trainer/models/smolvlm2/inference.py index c302640b..31834b13 100644 --- a/maestro/trainer/models/smolvlm2/inference.py +++ b/maestro/trainer/models/smolvlm2/inference.py @@ -44,7 +44,7 @@ def get_model_info(self) -> dict: "total_parameters": f"{total_params:,}", "trainable_parameters": f"{trainable_params:,}", "architecture": "Vision-Language Model (VLM)", - "framework": "PyTorch/Transformers" + "framework": "PyTorch/Transformers", } def generate( From 4ff3d638fa71ebc3e9a1cb92b4fa554470b10e70 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Sat, 10 
May 2025 15:36:37 -0700 Subject: [PATCH 7/9] Fixing more errors with core.py --- maestro/trainer/models/smolvlm2/core.py | 28 +++++++++++++++++-------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py index 7bb32488..31ab6ca0 100644 --- a/maestro/trainer/models/smolvlm2/core.py +++ b/maestro/trainer/models/smolvlm2/core.py @@ -123,11 +123,12 @@ def train(config: dict) -> dict: # Freeze vision encoder parameters for param in model.vision_model.parameters(): param.requires_grad = False - else: raise ValueError(f"Unsupported optimization strategy: {strategy}") - processor = AutoProcessor.from_pretrained(model_name) # Load datasets - + + # Load processor and datasets + processor = AutoProcessor.from_pretrained(model_name) + # Create processor wrapper to preprocess data before collating def process_batch(batch): processed_batch = [] @@ -142,7 +143,7 @@ def process_batch(batch): train_collect_fn=lambda batch: train_collate_fn(process_batch(batch)), train_num_workers=config.get("num_workers", 0), test_batch_size=config.get("val_batch_size", config.get("batch_size", 4)), - test_collect_fn=partial(evaluation_collate_fn, processor=processor), + test_collect_fn=lambda batch: evaluation_collate_fn(process_batch(batch)), test_num_workers=config.get("val_num_workers", config.get("num_workers", 0)), ) @@ -166,14 +167,23 @@ def process_batch(batch): load_best_model_at_end=True, remove_unused_columns=False, ) - - # Set up trainer + + # Safely handle potential None loaders by directly checking train_loader/valid_loader before accessing dataset attribute + train_dataset = None + if train_loader is not None: + train_dataset = train_loader.dataset + + eval_dataset = None + if valid_loader is not None: + eval_dataset = valid_loader.dataset + + # Create data_collator that matches the train_collate_fn signature (doesn't pass processor) trainer = Trainer( model=model, args=training_args, - 
train_dataset=train_loader.dataset, - eval_dataset=valid_loader.dataset, - data_collator=lambda batch: train_collate_fn(batch, processor), + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator=lambda batch: train_collate_fn(process_batch(batch)), ) # Train model From 0e1804eae7f197a42442b957dd52f334aef1cad5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 10 May 2025 22:37:01 +0000 Subject: [PATCH 8/9] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- maestro/trainer/models/smolvlm2/core.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py index 31ab6ca0..14b38bd4 100644 --- a/maestro/trainer/models/smolvlm2/core.py +++ b/maestro/trainer/models/smolvlm2/core.py @@ -60,7 +60,6 @@ def train(config: dict) -> dict: Returns: Dictionary containing training results and metrics """ - from functools import partial from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training from transformers import BitsAndBytesConfig, TrainingArguments @@ -125,10 +124,10 @@ def train(config: dict) -> dict: param.requires_grad = False else: raise ValueError(f"Unsupported optimization strategy: {strategy}") - + # Load processor and datasets processor = AutoProcessor.from_pretrained(model_name) - + # Create processor wrapper to preprocess data before collating def process_batch(batch): processed_batch = [] @@ -167,16 +166,16 @@ def process_batch(batch): load_best_model_at_end=True, remove_unused_columns=False, ) - + # Safely handle potential None loaders by directly checking train_loader/valid_loader before accessing dataset attribute train_dataset = None if train_loader is not None: train_dataset = train_loader.dataset - + eval_dataset = None 
if valid_loader is not None: eval_dataset = valid_loader.dataset - + # Create data_collator that matches the train_collate_fn signature (doesn't pass processor) trainer = Trainer( model=model, From 3ea55447ce1d00fedc167c6cca0674b14304d5f3 Mon Sep 17 00:00:00 2001 From: AshAnand34 Date: Sat, 10 May 2025 15:38:51 -0700 Subject: [PATCH 9/9] Fixed Ruff error with too long line --- maestro/trainer/models/smolvlm2/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/maestro/trainer/models/smolvlm2/core.py b/maestro/trainer/models/smolvlm2/core.py index 14b38bd4..e300982f 100644 --- a/maestro/trainer/models/smolvlm2/core.py +++ b/maestro/trainer/models/smolvlm2/core.py @@ -167,7 +167,8 @@ def process_batch(batch): remove_unused_columns=False, ) - # Safely handle potential None loaders by directly checking train_loader/valid_loader before accessing dataset attribute + # Safely handle potential None loaders by directly checking + # train_loader/valid_loader before accessing dataset attribute train_dataset = None if train_loader is not None: train_dataset = train_loader.dataset