Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added assets/datafast-badge-web.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions assets/datafast-badge.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
178 changes: 178 additions & 0 deletions datafast/card_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import os
import re
from pathlib import Path
from huggingface_hub import HfApi, DatasetCard, DatasetCardData
from huggingface_hub.file_download import hf_hub_download

def extract_readme_metadata(repo_id: str, token: str | None = None) -> str:
"""Extracts the metadata from the README.md file of the dataset repository.
We have to download the previous README.md file in the repo, extract the metadata from it.
Args:
repo_id: The ID of the repository to push to, from the `push_to_hub` method.
token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
Returns:
The metadata extracted from the README.md file of the dataset repository as a str.
"""
try:
readme_path = Path(
hf_hub_download(repo_id, "README.md", repo_type="dataset", token=token)
)
# Extract the content between the '---' markers
metadata_match = re.findall(r"---\n(.*?)\n---", readme_path.read_text(), re.DOTALL)

if not metadata_match:
print("No YAML metadata found in the README.md")
return ""

return metadata_match[0]

except Exception as e:
print(f"Failed to extract metadata from README.md: {e}")
return ""


def extract_dataset_info(repo_id: str, token: str | None = None) -> str:
    """Extract the dataset_info section from the README metadata.

    The README front matter is fetched with `extract_readme_metadata` and
    everything from the top-level ``dataset_info:`` key up to the end of the
    front matter is returned (so any keys that follow it are included).

    Args:
        repo_id: The dataset repository ID
        token: Optional HuggingFace token for authentication

    Returns:
        The metadata text starting at ``dataset_info:``, or an empty string
        when there is no metadata or no such top-level key.
    """
    readme_metadata = extract_readme_metadata(repo_id=repo_id, token=token)
    if not readme_metadata:
        return ""

    # Match the key only at the start of a line: a plain substring test would
    # also hit nested or suffixed keys (e.g. `my_dataset_info:`), and splitting
    # on the prefix would truncate the section if the text occurred again.
    section_start = re.search(r"^dataset_info:", readme_metadata, re.MULTILINE)
    if section_start is None:
        return ""

    return readme_metadata[section_start.start():]


def _generate_and_upload_dataset_card(
    repo_id: str,
    token: str | None = None,
    template_path: str | None = None
) -> None:
    """
    Internal implementation that generates and uploads a dataset card to Hugging Face Hub.

    This is the core implementation function called by the public upload_dataset_card() function.

    The rendered card receives:
    1. A pretty name derived from the last segment of ``repo_id``
    2. The installed datafast version ("dev" when the package is not installed)
    3. The dataset_info section preserved from the repository's existing README

    Any failure is reported on stdout (message plus traceback) and swallowed,
    so this function never raises.

    Args:
        repo_id: The ID of the dataset repository to push the card to
        token: Hub token; when empty, the ``HF_TOKEN`` environment variable is used
        template_path: Optional custom template path; defaults to
            ``datafast_card_template.md`` next to this module
    """
    # Local import: only needed on the failure path of this function.
    import traceback

    try:
        # Load template
        if not template_path:
            # Fall back to the template shipped alongside this module
            current_dir = os.path.dirname(__file__)
            template_path = os.path.join(current_dir, "datafast_card_template.md")

        if not os.path.exists(template_path):
            print(f"Template file not found: {template_path}")
            return

        with open(template_path, "r", encoding="utf-8") as f:
            template_str = f.read()

        # Get HF token
        if not token:
            token = os.getenv("HF_TOKEN", None)

        # Extract dataset_info section from existing README if available
        config_data = extract_dataset_info(repo_id=repo_id, token=token)
        print(f"Extracted dataset_info section, length: {len(config_data)} characters")

        # "user/my-data_set" -> "My Data Set"
        dataset_name = repo_id.split("/")[-1]
        pretty_name = dataset_name.replace("-", " ").replace("_", " ").title()

        # Create DatasetCardData with our metadata
        card_data = DatasetCardData(pretty_name=pretty_name)

        # Get datafast version
        from importlib.metadata import version, PackageNotFoundError

        try:
            version_str = version("datafast")
        except PackageNotFoundError:
            # Fallback for development installs
            version_str = "dev"

        # Prepare template variables
        template_vars = {
            "pretty_name": card_data.pretty_name,
            "datafast_version": version_str,
            "config_data": config_data,  # Use the extracted dataset_info section
        }

        print("Rendering dataset card from template")
        print(f"Template variables: {list(template_vars.keys())}")

        # Render card with our template and variables
        card = DatasetCard.from_template(
            card_data=card_data,
            template_str=template_str,
            **template_vars
        )

        print("Template rendered successfully")
        print(f"Rendered card content length: {len(str(card))} characters")

        # Push to hub
        print(f"Pushing dataset card to hub: {repo_id}")
        card.push_to_hub(repo_id, token=token)

        print(f"Dataset card successfully uploaded to: https://huggingface.co/datasets/{repo_id}")

    except Exception as e:
        # Best effort: card upload must not abort the caller's dataset push.
        print(f"Failed to upload dataset card: {e}")
        print("Full traceback:")
        # Emit the traceback that the line above announces.
        print(traceback.format_exc())


def upload_dataset_card(repo_id: str, token: str | None = None, template_path: str | None = None) -> None:
"""
Public interface to generate and upload a dataset card to Hugging Face Hub.

This function performs configuration checks (like offline mode)
and then delegates to the internal _generate_and_upload_dataset_card() implementation.
It should be called at the end of the pipeline when all subsets are available.

Args:
repo_id: The ID of the repository to push to
token: The token to authenticate with the Hugging Face Hub
template_path: Optional custom template path
"""
try:

print(f"Uploading dataset card to repository: {repo_id}")
_generate_and_upload_dataset_card(
repo_id=repo_id,
token=token,
template_path=template_path
)

except Exception as e:
print(f"Error uploading dataset card: {e}")
11 changes: 11 additions & 0 deletions datafast/datafast_card_template.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
{{ card_data }}
{{ config_data }}
---
[<img src="https://raw.githubusercontent.com/patrickfleith/datafast/main/assets/datafast-badge-web.png"
alt="Built with Datafast" />](https://github.com/patrickfleith/datafast)

# {{ pretty_name }}

This dataset was generated using Datafast (v{{ datafast_version }}), an open-source package to generate high-quality and diverse synthetic text datasets for LLMs.

22 changes: 17 additions & 5 deletions datafast/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,13 @@ def to_jsonl(self, filepath: str, rows: list[Any] = None, append: bool = False):
def push_to_hub(
self,
repo_id: str,
token: Optional[str] = None,
token: str | None = None,
private: bool = False,
commit_message: Optional[str] = None,
train_size: Optional[float] = None,
seed: Optional[int] = None,
shuffle: Optional[bool] = True,
commit_message: str | None = None,
train_size: float | None = None,
seed: int | None = None,
shuffle: bool | None = True,
upload_card: bool = True,
) -> str:
"""Push the dataset to Hugging Face Hub.

Expand All @@ -134,6 +135,7 @@ def push_to_hub(
(e.g., 0.8 for 80% train)
seed: Optional random seed for train_test_split
shuffle: Optional boolean to shuffle the data for train_test_split
upload_card: Whether to automatically upload a dataset card after pushing

Returns:
str: URL of the dataset on the Hub
Expand Down Expand Up @@ -204,6 +206,16 @@ def push_to_hub(
)
raise

# Upload dataset card if requested
if upload_card:
try:
from datafast.card_utils import upload_dataset_card
upload_dataset_card(repo_id=repo_id, token=token)
print("Dataset card uploaded successfully")
except Exception as e:
print(f"Warning: Failed to upload dataset card: {e}")
# Continue even if card upload fails

return f"https://huggingface.co/datasets/{repo_id}"


Expand Down
4 changes: 2 additions & 2 deletions datafast/examples/quickstart_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
classes=[
{"name": "positive", "description": "Text expressing positive emotions or approval"},
{"name": "negative", "description": "Text expressing negative emotions or criticism"},
{"name": "neutral", "description": "Text with neutral emotions or indifference"}
# {"name": "neutral", "description": "Text with neutral emotions or indifference"}
],
num_samples_per_prompt=3,
output_file="outdoor_activities_sentiments.jsonl",
Expand All @@ -34,7 +34,7 @@
from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider, OllamaProvider

providers = [
OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"),
OpenAIProvider(model_id="gpt-4.1-nano"),
# AnthropicProvider(model_id="claude-3-5-haiku-latest"),
# GeminiProvider(model_id="gemini-2.0-flash"),
# OllamaProvider(model_id="gemma3:12b")
Expand Down
Loading