diff --git a/assets/datafast-badge-web.png b/assets/datafast-badge-web.png
new file mode 100644
index 0000000..dac5f20
Binary files /dev/null and b/assets/datafast-badge-web.png differ
diff --git a/assets/datafast-badge.svg b/assets/datafast-badge.svg
new file mode 100644
index 0000000..8563be5
--- /dev/null
+++ b/assets/datafast-badge.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/datafast/card_utils.py b/datafast/card_utils.py
new file mode 100644
index 0000000..b5df9c5
--- /dev/null
+++ b/datafast/card_utils.py
@@ -0,0 +1,178 @@
+import os
+import re
+from pathlib import Path
+from huggingface_hub import HfApi, DatasetCard, DatasetCardData
+from huggingface_hub.file_download import hf_hub_download
+
+def extract_readme_metadata(repo_id: str, token: str | None = None) -> str:
+    """Return the YAML front matter of the dataset repository's README.md.
+
+    Downloads the README.md currently in the repo and extracts the metadata
+    block delimited by the leading pair of '---' lines.
+    Args:
+        repo_id: The ID of the repository to push to, from the `push_to_hub` method.
+        token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
+    Returns:
+        The metadata text without the '---' markers, or "" when the README
+        cannot be fetched or does not start with a metadata block.
+    """
+    try:
+        readme_path = Path(hf_hub_download(repo_id, "README.md", repo_type="dataset", token=token))
+        # Decode explicitly as UTF-8 rather than with the platform's locale default.
+        readme_text = readme_path.read_text(encoding="utf-8")
+        # Front matter must open the file, so anchor there (a later '---' pair is just a rule); accept CRLF.
+        metadata_match = re.match(r"\s*---\r?\n(.*?)\r?\n---", readme_text, re.DOTALL)
+        if not metadata_match:
+            print("No YAML metadata found in the README.md")
+            return ""
+        return metadata_match.group(1)
+    except Exception as e:  # best-effort: a missing or unreadable README must not abort the caller
+        print(f"Failed to extract metadata from README.md: {e}")
+        return ""
+
+
+def extract_dataset_info(repo_id: str, token: str | None = None) -> str:
+    """
+    Extract the dataset_info section from the README metadata.
+
+    The returned text starts at the top-level `dataset_info:` key and runs to
+    the end of the metadata block, so any keys that follow it are kept too.
+
+    Args:
+        repo_id: The dataset repository ID
+        token: Optional HuggingFace token for authentication
+
+    Returns:
+        The metadata from `dataset_info:` onwards as a string, or an empty
+        string if the README has no metadata or no such top-level key
+    """
+    readme_metadata = extract_readme_metadata(repo_id=repo_id, token=token)
+    if not readme_metadata:
+        return ""
+
+    # Only a key at the start of a line is the real section: a plain substring
+    # test would also hit e.g. `my_dataset_info:` or text inside a value, and
+    # splitting on every occurrence would cut the section at the second one.
+    section_start = re.search(r"^dataset_info:", readme_metadata, re.MULTILINE)
+    if section_start is None:
+        return ""
+    return readme_metadata[section_start.start():]
+
+
+def _generate_and_upload_dataset_card(
+    repo_id: str,
+    token: str | None = None,
+    template_path: str | None = None
+) -> None:
+    """
+    Internal implementation that generates and uploads a dataset card to Hugging Face Hub.
+
+    This is the core implementation function called by the public upload_dataset_card() function.
+    Failures are printed (with a traceback) instead of being raised, so a card
+    problem never aborts the caller.
+
+    The rendered card receives these template variables:
+    1. pretty_name: title-cased dataset name derived from the last part of repo_id
+    2. datafast_version: installed datafast version, or "dev" when the package is not found
+    3. config_data: metadata from `dataset_info:` onwards, taken from the existing README
+
+    Args:
+        repo_id: The ID of the dataset repository to push the card to
+        token: Hugging Face Hub token; falls back to the HF_TOKEN environment variable
+        template_path: Optional custom template path; defaults to
+            datafast_card_template.md located next to this module
+    """
+    try:
+        # Load template
+        if not template_path:
+            # Default to the template shipped alongside this module
+            current_dir = os.path.dirname(__file__)
+            template_path = os.path.join(current_dir, "datafast_card_template.md")
+
+        if not os.path.exists(template_path):
+            print(f"Template file not found: {template_path}")
+            return
+
+        with open(template_path, "r", encoding="utf-8") as f:
+            template_str = f.read()
+
+        # Get HF token
+        if not token:
+            token = os.getenv("HF_TOKEN", None)
+
+        # Extract dataset_info section from existing README if available
+        config_data = extract_dataset_info(repo_id=repo_id, token=token)
+        print(f"Extracted dataset_info section, length: {len(config_data) if config_data else 0} characters")
+
+        dataset_name = repo_id.split("/")[-1]
+        pretty_name = dataset_name.replace("-", " ").replace("_", " ").title()
+
+        # Create DatasetCardData with our metadata
+        card_data = DatasetCardData(pretty_name=pretty_name)
+
+        # Get datafast version
+        from importlib.metadata import version, PackageNotFoundError
+
+        try:
+            version_str = version("datafast")
+        except PackageNotFoundError:
+            # Fallback for development installs
+            version_str = "dev"
+
+        # Prepare template variables
+        template_vars = {
+            "pretty_name": card_data.pretty_name,
+            "datafast_version": version_str,
+            "config_data": config_data,  # Use the extracted dataset_info section
+        }
+
+        print("Rendering dataset card from template")
+        print(f"Template variables: {list(template_vars.keys())}")
+
+        # Render card with our template and variables
+        card = DatasetCard.from_template(
+            card_data=card_data,
+            template_str=template_str,
+            **template_vars
+        )
+
+        print("Template rendered successfully")
+        print(f"Rendered card content length: {len(str(card))} characters")
+
+        # Push to hub
+        print(f"Pushing dataset card to hub: {repo_id}")
+        card.push_to_hub(repo_id, token=token)
+
+        print(f"Dataset card successfully uploaded to: https://huggingface.co/datasets/{repo_id}")
+
+    except Exception as e:
+        import traceback  # local import: only needed on the failure path
+        print(f"Failed to upload dataset card: {e}")
+        print("Full traceback:")
+        print(traceback.format_exc())  # same stream as the messages above
+
+
+def upload_dataset_card(repo_id: str, token: str | None = None, template_path: str | None = None) -> None:
+    """
+    Public entry point for generating and uploading a dataset card to Hugging Face Hub.
+
+    Prints a progress message and hands the work to the internal
+    _generate_and_upload_dataset_card() implementation. Any Exception that reaches
+    this function is printed rather than propagated to the caller.
+
+    Args:
+        repo_id: The ID of the repository to push to
+        token: The token to authenticate with the Hugging Face Hub
+        template_path: Optional custom template path
+    """
+    # Collected up front so the guarded section below only contains the work itself.
+    card_options = {
+        "repo_id": repo_id,
+        "token": token,
+        "template_path": template_path,
+    }
+    try:
+        print(f"Uploading dataset card to repository: {repo_id}")
+        _generate_and_upload_dataset_card(**card_options)
+    except Exception as err:
+        print(f"Error uploading dataset card: {err}")
\ No newline at end of file
diff --git a/datafast/datafast_card_template.md b/datafast/datafast_card_template.md
new file mode 100644
index 0000000..e620d6f
--- /dev/null
+++ b/datafast/datafast_card_template.md
@@ -0,0 +1,11 @@
+---
+{{ card_data }}
+{{ config_data }}
+---
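+[![Built with Datafast](https://raw.githubusercontent.com/patrickfleith/datafast/main/assets/datafast-badge-web.png)
+](https://github.com/patrickfleith/datafast)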
+
+# {{ pretty_name }}
+
+This dataset was generated using Datafast (v{{ datafast_version }}), an open-source package to generate high-quality and diverse synthetic text datasets for LLMs.
+
diff --git a/datafast/datasets.py b/datafast/datasets.py
index 06f04c9..925a245 100644
--- a/datafast/datasets.py
+++ b/datafast/datasets.py
@@ -116,12 +116,13 @@ def to_jsonl(self, filepath: str, rows: list[Any] = None, append: bool = False):
def push_to_hub(
self,
repo_id: str,
- token: Optional[str] = None,
+ token: str | None = None,
private: bool = False,
- commit_message: Optional[str] = None,
- train_size: Optional[float] = None,
- seed: Optional[int] = None,
- shuffle: Optional[bool] = True,
+ commit_message: str | None = None,
+ train_size: float | None = None,
+ seed: int | None = None,
+ shuffle: bool | None = True,
+ upload_card: bool = True,
) -> str:
"""Push the dataset to Hugging Face Hub.
@@ -134,6 +135,7 @@ def push_to_hub(
(e.g., 0.8 for 80% train)
seed: Optional random seed for train_test_split
shuffle: Optional boolean to shuffle the data for train_test_split
+ upload_card: Whether to automatically upload a dataset card after pushing
Returns:
str: URL of the dataset on the Hub
@@ -204,6 +206,16 @@ def push_to_hub(
)
raise
+ # Upload dataset card if requested
+ if upload_card:
+ try:
+ from datafast.card_utils import upload_dataset_card
+ upload_dataset_card(repo_id=repo_id, token=token)
+ print("Dataset card uploaded successfully")
+ except Exception as e:
+ print(f"Warning: Failed to upload dataset card: {e}")
+ # Continue even if card upload fails
+
return f"https://huggingface.co/datasets/{repo_id}"
diff --git a/datafast/examples/quickstart_example.py b/datafast/examples/quickstart_example.py
index 76d5671..6cff340 100644
--- a/datafast/examples/quickstart_example.py
+++ b/datafast/examples/quickstart_example.py
@@ -7,7 +7,7 @@
classes=[
{"name": "positive", "description": "Text expressing positive emotions or approval"},
{"name": "negative", "description": "Text expressing negative emotions or criticism"},
- {"name": "neutral", "description": "Text with neutral emotions or indifference"}
+ # {"name": "neutral", "description": "Text with neutral emotions or indifference"}
],
num_samples_per_prompt=3,
output_file="outdoor_activities_sentiments.jsonl",
@@ -34,7 +34,7 @@
from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider, OllamaProvider
providers = [
- OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"),
+ OpenAIProvider(model_id="gpt-4.1-nano"),
# AnthropicProvider(model_id="claude-3-5-haiku-latest"),
# GeminiProvider(model_id="gemini-2.0-flash"),
# OllamaProvider(model_id="gemma3:12b")