patrickfleith · patrickfleith · Jun 14, 2025 · Jun 13, 2025 · Jun 13, 2025 · Jun 14, 2025
diff --git a/assets/datafast-badge-web.png b/assets/datafast-badge-web.png
diff --git a/assets/datafast-badge.svg b/assets/datafast-badge.svg
diff --git a/datafast/card_utils.py b/datafast/card_utils.py
@@ -0,0 +1,178 @@
+import os
+import re
+from pathlib import Path
+from huggingface_hub import HfApi, DatasetCard, DatasetCardData
+from huggingface_hub.file_download import hf_hub_download
+
+def extract_readme_metadata(repo_id: str, token: str | None = None) -> str:
+    """Return the YAML front matter of the dataset repository's README.md.
+
+    The README.md currently stored in the repository is downloaded from the Hub and the
+    text between the opening `---` markers is returned, without the markers themselves.
+
+    Args:
+        repo_id: The ID of the repository to push to, from the `push_to_hub` method.
+        token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
+
+    Returns:
+        The metadata extracted from the README.md file as a str. An empty string is returned
+        when the file cannot be downloaded or read, or when it has no front-matter block.
+    """
+    try:
+        readme_path = Path(
+            hf_hub_download(repo_id, "README.md", repo_type="dataset", token=token)
+        )
+        # Decode explicitly instead of relying on the platform's locale encoding;
+        # "utf-8-sig" also drops a leading byte-order mark so the anchor below still matches.
+        readme_text = readme_path.read_text(encoding="utf-8-sig")
+
+        # Only the block that opens the file is metadata. Anchoring with \A keeps a pair of
+        # `---` horizontal rules further down the README from being mistaken for front matter.
+        metadata_match = re.search(r"\A\s*---\r?\n(.*?)\r?\n---", readme_text, re.DOTALL)
+
+        if not metadata_match:
+            print("No YAML metadata found in the README.md")
+            return ""
+
+        return metadata_match.group(1)
+
+    except Exception as e:
+        # Best effort: a missing or unreadable README must not abort the caller.
+        print(f"Failed to extract metadata from README.md: {e}")
+        return ""
+
+
+def extract_dataset_info(repo_id: str, token: str | None = None) -> str:
+    """
+    Extract dataset_info section from README metadata.
+
+    The returned text starts at the first `dataset_info:` marker and runs to the end of the
+    metadata block, so any keys that follow the marker are included as well.
+
+    Args:
+        repo_id: The dataset repository ID
+        token: Optional HuggingFace token for authentication
+
+    Returns:
+        The dataset_info section as a string, or empty string if not found
+    """
+    readme_metadata = extract_readme_metadata(repo_id=repo_id, token=token)
+    if not readme_metadata:
+        return ""
+
+    section_prefix = "dataset_info:"
+    start = readme_metadata.find(section_prefix)
+    if start == -1:
+        return ""
+
+    # Slice from the first occurrence rather than splitting on the marker: a split would
+    # discard everything after a second occurrence of the marker text.
+    return readme_metadata[start:]
+
+
+def _generate_and_upload_dataset_card(
+    repo_id: str,
+    token: str | None = None,
+    template_path: str | None = None
+) -> None:
+    """
+    Internal implementation that generates and uploads a dataset card to Hugging Face Hub.
+
+    This is the core implementation function called by the public upload_dataset_card() function.
+
+    The rendered card is built from:
+    1. A pretty name derived from the last segment of the repository ID
+    2. The installed datafast version ("dev" when the package metadata is not found)
+    3. The dataset_info section preserved from the existing card, when there is one
+
+    Failures are reported with print() and are not raised to the caller.
+
+    Args:
+        repo_id: The ID of the dataset repository the card is pushed to
+        token: Optional Hugging Face token; when falsy, the HF_TOKEN environment variable is used
+        template_path: Optional custom template path; defaults to datafast_card_template.md
+            located next to this module
+    """
+
+    try:
+        # Load template
+        if not template_path:
+            # Default to the template shipped in the same directory as this module
+            current_dir = os.path.dirname(__file__)
+            template_path = os.path.join(current_dir, "datafast_card_template.md")
+
+        if not os.path.exists(template_path):
+            print(f"Template file not found: {template_path}")
+            return
+
+        with open(template_path, "r", encoding="utf-8") as f:
+            template_str = f.read()
+
+        # Get HF token
+        if not token:
+            token = os.getenv("HF_TOKEN", None)
+
+        # Extract dataset_info section from existing README if available
+        # (extract_dataset_info always returns a str, empty when nothing was found)
+        config_data = extract_dataset_info(repo_id=repo_id, token=token)
+        print(f"Extracted dataset_info section, length: {len(config_data)} characters")
+
+        # "user/my-data_set" -> "My Data Set"
+        dataset_name = repo_id.split("/")[-1]
+        pretty_name = dataset_name.replace("-", " ").replace("_", " ").title()
+
+        # Create DatasetCardData with our metadata
+        card_data = DatasetCardData(pretty_name=pretty_name)
+
+        # Get datafast version
+        from importlib.metadata import version, PackageNotFoundError
+
+        try:
+            version_str = version("datafast")
+        except PackageNotFoundError:
+            # Fallback for development installs
+            version_str = "dev"
+
+        # Prepare template variables
+        template_vars = {
+            "pretty_name": card_data.pretty_name,
+            "datafast_version": version_str,
+            "config_data": config_data,  # Use the extracted dataset_info section
+        }
+
+        print("Rendering dataset card from template")
+        print(f"Template variables: {list(template_vars.keys())}")
+
+        # Render card with our template and variables
+        card = DatasetCard.from_template(
+            card_data=card_data,
+            template_str=template_str,
+            **template_vars
+        )
+
+        print("Template rendered successfully")
+        print(f"Rendered card content length: {len(str(card))} characters")
+
+        # Push to hub
+        print(f"Pushing dataset card to hub: {repo_id}")
+        card.push_to_hub(repo_id, token=token)
+
+        print(f"Dataset card successfully uploaded to: https://huggingface.co/datasets/{repo_id}")
+
+    except Exception as e:
+        # Best effort: report the failure, including the traceback announced below,
+        # without propagating it to the caller.
+        import traceback
+
+        print(f"Failed to upload dataset card: {e}")
+        print("Full traceback:")
+        print(traceback.format_exc())
+
+
+def upload_dataset_card(repo_id: str, token: str | None = None, template_path: str | None = None) -> None:
+    """
+    Public interface to generate and upload a dataset card to Hugging Face Hub.
+
+    Prints which repository is being targeted, then delegates to the internal
+    _generate_and_upload_dataset_card() implementation. Any exception raised while doing so
+    is caught and printed instead of being propagated to the caller.
+
+    Args:
+        repo_id: The ID of the repository to push to
+        token: The token to authenticate with the Hugging Face Hub
+        template_path: Optional custom template path
+    """
+    # Arguments are forwarded unchanged to the internal implementation.
+    card_args = {
+        "repo_id": repo_id,
+        "token": token,
+        "template_path": template_path,
+    }
+
+    try:
+        print(f"Uploading dataset card to repository: {repo_id}")
+        _generate_and_upload_dataset_card(**card_args)
+    except Exception as err:
+        print(f"Error uploading dataset card: {err}")
diff --git a/datafast/datafast_card_template.md b/datafast/datafast_card_template.md
@@ -0,0 +1,11 @@
+---
+{{ card_data }}
+{{ config_data }}
+---
+[<img src="https://raw.githubusercontent.com/patrickfleith/datafast/main/assets/datafast-badge-web.png"
+     alt="Built with Datafast" />](https://github.com/patrickfleith/datafast)
+
+# {{ pretty_name }}
+
+This dataset was generated using Datafast (v{{ datafast_version }}), an open-source package to generate high-quality and diverse synthetic text datasets for LLMs.
+
diff --git a/datafast/datasets.py b/datafast/datasets.py
@@ -116,12 +116,13 @@ def to_jsonl(self, filepath: str, rows: list[Any] = None, append: bool = False):
     def push_to_hub(
         self,
         repo_id: str,
-        token: Optional[str] = None,
+        token: str | None = None,
         private: bool = False,
-        commit_message: Optional[str] = None,
-        train_size: Optional[float] = None,
-        seed: Optional[int] = None,
-        shuffle: Optional[bool] = True,
+        commit_message: str | None = None,
+        train_size: float | None = None,
+        seed: int | None = None,
+        shuffle: bool | None = True,
+        upload_card: bool = True,
     ) -> str:
         """Push the dataset to Hugging Face Hub.
 
@@ -134,6 +135,7 @@ def push_to_hub(
             (e.g., 0.8 for 80% train)
             seed: Optional random seed for train_test_split
             shuffle: Optional boolean to shuffle the data for train_test_split
+            upload_card: Whether to automatically upload a dataset card after pushing
 
         Returns:
             str: URL of the dataset on the Hub
@@ -204,6 +206,16 @@ def push_to_hub(
                 )
             raise
 
+        # Upload dataset card if requested
+        if upload_card:
+            try:
+                from datafast.card_utils import upload_dataset_card
+                upload_dataset_card(repo_id=repo_id, token=token)
+                print("Dataset card uploaded successfully")
+            except Exception as e:
+                print(f"Warning: Failed to upload dataset card: {e}")
+                # Continue even if card upload fails
+
         return f"https://huggingface.co/datasets/{repo_id}"
 
 

diff --git a/datafast/examples/quickstart_example.py b/datafast/examples/quickstart_example.py
@@ -7,7 +7,7 @@
     classes=[
         {"name": "positive", "description": "Text expressing positive emotions or approval"},
         {"name": "negative", "description": "Text expressing negative emotions or criticism"},
-        {"name": "neutral", "description": "Text with neutral emotions or indifference"}
+        # {"name": "neutral", "description": "Text with neutral emotions or indifference"}
     ],
     num_samples_per_prompt=3,
     output_file="outdoor_activities_sentiments.jsonl",
@@ -34,7 +34,7 @@
 from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider, OllamaProvider
 
 providers = [
-    OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"),
+    OpenAIProvider(model_id="gpt-4.1-nano"),
     # AnthropicProvider(model_id="claude-3-5-haiku-latest"),
     # GeminiProvider(model_id="gemini-2.0-flash"),
     # OllamaProvider(model_id="gemma3:12b")