diff --git a/assets/datafast-badge-web.png b/assets/datafast-badge-web.png
new file mode 100644
index 0000000..dac5f20
Binary files /dev/null and b/assets/datafast-badge-web.png differ
diff --git a/assets/datafast-badge.svg b/assets/datafast-badge.svg
new file mode 100644
index 0000000..8563be5
--- /dev/null
+++ b/assets/datafast-badge.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/datafast/card_utils.py b/datafast/card_utils.py
new file mode 100644
index 0000000..b5df9c5
--- /dev/null
+++ b/datafast/card_utils.py
@@ -0,0 +1,193 @@
+import os
+import re
+import traceback
+from pathlib import Path
+from huggingface_hub import HfApi, DatasetCard, DatasetCardData
+from huggingface_hub.file_download import hf_hub_download
+
+# YAML front matter: an opening `---` at the top of the file and a closing `---` line.
+_FRONT_MATTER_RE = re.compile(r"\A\s*---\r?\n(.*?)\r?\n---\s*$", re.DOTALL | re.MULTILINE)
+
+# The `dataset_info:` key, matched only at the start of a line.
+_DATASET_INFO_RE = re.compile(r"^dataset_info:", re.MULTILINE)
+
+
+def extract_readme_metadata(repo_id: str, token: str | None = None) -> str:
+    """Extract the YAML metadata from the README.md of a dataset repository.
+
+    The current README.md is downloaded from the Hub and the text between the
+    `---` markers at the top of the file is returned.
+
+    Args:
+        repo_id: The ID of the repository to push to, from the `push_to_hub` method.
+        token: The token to authenticate with the Hugging Face Hub, from the `push_to_hub` method.
+
+    Returns:
+        The metadata as a str without the `---` markers, or an empty string
+        if the README cannot be read or contains no metadata.
+    """
+    try:
+        readme_path = Path(
+            hf_hub_download(repo_id, "README.md", repo_type="dataset", token=token)
+        )
+        # Decode explicitly instead of relying on the platform default encoding
+        readme_text = readme_path.read_text(encoding="utf-8")
+
+        # Only accept metadata at the top of the file, so `---` horizontal
+        # rules further down the card are never mistaken for it
+        metadata_match = _FRONT_MATTER_RE.search(readme_text)
+        if not metadata_match:
+            print("No YAML metadata found in the README.md")
+            return ""
+
+        return metadata_match.group(1)
+
+    except Exception as e:
+        # Best effort: callers treat an empty string as "no metadata"
+        print(f"Failed to extract metadata from README.md: {e}")
+        return ""
+
+
+def extract_dataset_info(repo_id: str, token: str | None = None) -> str:
+    """Extract the dataset_info section from the README metadata.
+
+    Args:
+        repo_id: The dataset repository ID
+        token: Optional HuggingFace token for authentication
+
+    Returns:
+        The metadata from the `dataset_info:` key to the end of the metadata
+        block, or an empty string if not found
+    """
+    readme_metadata = extract_readme_metadata(repo_id=repo_id, token=token)
+    if not readme_metadata:
+        return ""
+
+    section_start = _DATASET_INFO_RE.search(readme_metadata)
+    if section_start is None:
+        return ""
+
+    # Slice from the key onward; unlike split(), this keeps the full remainder
+    # even if the same text occurs again later in the metadata
+    return readme_metadata[section_start.start():]
+
+
+def _generate_and_upload_dataset_card(
+    repo_id: str,
+    token: str | None = None,
+    template_path: str | None = None
+) -> None:
+    """
+    Internal implementation that generates and uploads a dataset card to Hugging Face Hub.
+
+    This is the core implementation function called by the public upload_dataset_card() function.
+
+    The dataset card includes:
+    1. A pretty name derived from the repository name
+    2. The Datafast version
+    3. Preserved dataset_info from the existing card for proper configuration display
+
+    Any failure is printed together with its traceback and is not re-raised.
+
+    Args:
+        repo_id: The ID of the repository to push to
+        token: The token to authenticate with the Hugging Face Hub; the HF_TOKEN
+            environment variable is used when it is not provided
+        template_path: Optional custom template path; defaults to the
+            datafast_card_template.md file located next to this module
+    """
+
+    try:
+        # Load template
+        if not template_path:
+            # Default to the template shipped next to this module
+            current_dir = os.path.dirname(__file__)
+            template_path = os.path.join(current_dir, "datafast_card_template.md")
+
+        if not os.path.exists(template_path):
+            print(f"Template file not found: {template_path}")
+            return
+
+        with open(template_path, "r", encoding="utf-8") as f:
+            template_str = f.read()
+
+        # Get HF token
+        if not token:
+            token = os.getenv("HF_TOKEN", None)
+
+        # Extract dataset_info section from existing README if available
+        config_data = extract_dataset_info(repo_id=repo_id, token=token)
+        print(f"Extracted dataset_info section, length: {len(config_data)} characters")
+
+        dataset_name = repo_id.split("/")[-1]
+        pretty_name = dataset_name.replace("-", " ").replace("_", " ").title()
+
+        # Create DatasetCardData with our metadata
+        card_data = DatasetCardData(pretty_name=pretty_name)
+
+        # Get datafast version
+        from importlib.metadata import version, PackageNotFoundError
+
+        try:
+            version_str = version("datafast")
+        except PackageNotFoundError:
+            # Fallback for development installs
+            version_str = "dev"
+
+        # Prepare template variables
+        template_vars = {
+            "pretty_name": card_data.pretty_name,
+            "datafast_version": version_str,
+            "config_data": config_data,  # Use the extracted dataset_info section
+        }
+
+        print("Rendering dataset card from template")
+        print(f"Template variables: {list(template_vars.keys())}")
+
+        # Render card with our template and variables
+        card = DatasetCard.from_template(
+            card_data=card_data,
+            template_str=template_str,
+            **template_vars
+        )
+
+        print("Template rendered successfully")
+        print(f"Rendered card content length: {len(str(card))} characters")
+
+        # Push to hub
+        print(f"Pushing dataset card to hub: {repo_id}")
+        card.push_to_hub(repo_id, token=token)
+
+        print(f"Dataset card successfully uploaded to: https://huggingface.co/datasets/{repo_id}")
+
+    except Exception as e:
+        print(f"Failed to upload dataset card: {e}")
+        print("Full traceback:")
+        # Print the traceback announced by the line above
+        print(traceback.format_exc())
+
+
+def upload_dataset_card(repo_id: str, token: str | None = None, template_path: str | None = None) -> None:
+    """
+    Public interface to generate and upload a dataset card to Hugging Face Hub.
+
+    This function logs the target repository and then delegates to the internal
+    _generate_and_upload_dataset_card() implementation. Errors are printed and
+    not re-raised.
+    It should be called at the end of the pipeline when all subsets are available.
+
+    Args:
+        repo_id: The ID of the repository to push to
+        token: The token to authenticate with the Hugging Face Hub
+        template_path: Optional custom template path
+    """
+    try:
+        print(f"Uploading dataset card to repository: {repo_id}")
+        _generate_and_upload_dataset_card(
+            repo_id=repo_id,
+            token=token,
+            template_path=template_path
+        )
+
+    except Exception as e:
+        print(f"Error uploading dataset card: {e}")
\ No newline at end of file
diff --git a/datafast/datafast_card_template.md b/datafast/datafast_card_template.md
new file mode 100644
index 0000000..e620d6f
--- /dev/null
+++ b/datafast/datafast_card_template.md
@@ -0,0 +1,11 @@
+---
+{{ card_data }}
+{{ config_data }}
+---
+[Built with Datafast](https://github.com/patrickfleith/datafast)
+
+# {{ pretty_name }}
+
+This dataset was generated using Datafast (v{{ datafast_version }}), an open-source package to generate high-quality and diverse synthetic text datasets for LLMs.
+ diff --git a/datafast/datasets.py b/datafast/datasets.py index 06f04c9..925a245 100644 --- a/datafast/datasets.py +++ b/datafast/datasets.py @@ -116,12 +116,13 @@ def to_jsonl(self, filepath: str, rows: list[Any] = None, append: bool = False): def push_to_hub( self, repo_id: str, - token: Optional[str] = None, + token: str | None = None, private: bool = False, - commit_message: Optional[str] = None, - train_size: Optional[float] = None, - seed: Optional[int] = None, - shuffle: Optional[bool] = True, + commit_message: str | None = None, + train_size: float | None = None, + seed: int | None = None, + shuffle: bool | None = True, + upload_card: bool = True, ) -> str: """Push the dataset to Hugging Face Hub. @@ -134,6 +135,7 @@ def push_to_hub( (e.g., 0.8 for 80% train) seed: Optional random seed for train_test_split shuffle: Optional boolean to shuffle the data for train_test_split + upload_card: Whether to automatically upload a dataset card after pushing Returns: str: URL of the dataset on the Hub @@ -204,6 +206,16 @@ def push_to_hub( ) raise + # Upload dataset card if requested + if upload_card: + try: + from datafast.card_utils import upload_dataset_card + upload_dataset_card(repo_id=repo_id, token=token) + print("Dataset card uploaded successfully") + except Exception as e: + print(f"Warning: Failed to upload dataset card: {e}") + # Continue even if card upload fails + return f"https://huggingface.co/datasets/{repo_id}" diff --git a/datafast/examples/quickstart_example.py b/datafast/examples/quickstart_example.py index 76d5671..6cff340 100644 --- a/datafast/examples/quickstart_example.py +++ b/datafast/examples/quickstart_example.py @@ -7,7 +7,7 @@ classes=[ {"name": "positive", "description": "Text expressing positive emotions or approval"}, {"name": "negative", "description": "Text expressing negative emotions or criticism"}, - {"name": "neutral", "description": "Text with neutral emotions or indifference"} + # {"name": "neutral", "description": "Text with 
neutral emotions or indifference"} ], num_samples_per_prompt=3, output_file="outdoor_activities_sentiments.jsonl", @@ -34,7 +34,7 @@ from datafast.llms import OpenAIProvider, AnthropicProvider, GeminiProvider, OllamaProvider providers = [ - OpenAIProvider(model_id="gpt-4.1-mini-2025-04-14"), + OpenAIProvider(model_id="gpt-4.1-nano"), # AnthropicProvider(model_id="claude-3-5-haiku-latest"), # GeminiProvider(model_id="gemini-2.0-flash"), # OllamaProvider(model_id="gemma3:12b")