From 91734745d28d33a82bbab26318ab6612315f27d9 Mon Sep 17 00:00:00 2001 From: Weronika Date: Sat, 10 Jan 2026 12:50:48 +0100 Subject: [PATCH 01/10] Issue fixes - mostly refactoring code and documentation --- README.md | 237 +++----- ...example_1.livemd => usage_examples.livemd} | 80 +-- lib/elixir_datasets.ex | 568 +++--------------- lib/elixir_datasets/filter.ex | 104 ++++ lib/elixir_datasets/info.ex | 174 ++++++ lib/elixir_datasets/loader.ex | 211 +++++++ lib/elixir_datasets/repository.ex | 203 +++++++ lib/elixir_datasets/streaming.ex | 190 ++++++ lib/huggingface/hub.ex | 336 ----------- test/elixir_datasets/utils/filter_test.exs | 118 ++++ test/elixir_datasets/utils/info_test.exs | 92 +++ test/elixir_datasets/utils/loader_test.exs | 212 +++++++ .../elixir_datasets/utils/repository_test.exs | 64 ++ test/elixir_datasets/utils/streaming_test.exs | 139 +++++ test/elixir_datasets_test.exs | 423 +------------ test/huggingface/hub_test.exs | 477 --------------- 16 files changed, 1704 insertions(+), 1924 deletions(-) rename examples/{example_1.livemd => usage_examples.livemd} (95%) create mode 100644 lib/elixir_datasets/filter.ex create mode 100644 lib/elixir_datasets/info.ex create mode 100644 lib/elixir_datasets/loader.ex create mode 100644 lib/elixir_datasets/repository.ex create mode 100644 lib/elixir_datasets/streaming.ex delete mode 100644 lib/huggingface/hub.ex create mode 100644 test/elixir_datasets/utils/filter_test.exs create mode 100644 test/elixir_datasets/utils/info_test.exs create mode 100644 test/elixir_datasets/utils/loader_test.exs create mode 100644 test/elixir_datasets/utils/repository_test.exs create mode 100644 test/elixir_datasets/utils/streaming_test.exs delete mode 100644 test/huggingface/hub_test.exs diff --git a/README.md b/README.md index b6a8057..24c657e 100644 --- a/README.md +++ b/README.md @@ -32,122 +32,82 @@ end ## πŸš€ Quick Start -### Load a Dataset from Hugging Face - ```elixir -{:ok, dataset} = ElixirDatasets.load_dataset({:hf, 
"imdb"}) - -{:ok, train_data} = ElixirDatasets.load_dataset( - {:hf, "imdb"}, - split: "train" -) - -{:ok, dataset} = ElixirDatasets.load_dataset( - {:hf, "glue"}, - name: "sst2", +# Load a dataset from Hugging Face +{:ok, [train_df]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train" ) -``` -### Stream Large Datasets +# Load from local directory +{:ok, datasets} = ElixirDatasets.load_dataset({:local, "./data"}) -```elixir +# Stream large datasets without loading into memory {:ok, stream} = ElixirDatasets.load_dataset( - {:hf, "c4"}, + {:hf, "stanfordnlp/imdb", subdir: "plain_text"}, split: "train", streaming: true ) -stream -|> Enum.take(1000) -|> Enum.each(&process_row/1) -``` - -### Parallel Loading for Performance - -```elixir -{:ok, dataset} = ElixirDatasets.load_dataset( - {:hf, "multi-file-dataset"}, - num_proc: System.schedulers_online() -) -``` - -### Upload Your Own Dataset - -```elixir -df = Explorer.DataFrame.new(%{ - id: [1, 2, 3], - text: ["Hello", "World", "!"], - label: [0, 1, 0] -}) - -{:ok, _response} = ElixirDatasets.upload_dataset( - df, - "username/my-dataset", - file_extension: "parquet", - commit_message: "Initial upload", - auth_token: System.get_env("HF_TOKEN") -) -``` - -### Work with Local Files - -```elixir -{:ok, dataset} = ElixirDatasets.load_dataset( - {:local, "./data"}, - split: "train" -) +stream |> Enum.take(100) |> Enum.each(&process_row/1) ``` ## πŸ“š Examples -### Example 1: Text Classification with GLUE +### Text Classification with Sentiment Analysis ```elixir -{:ok, train} = ElixirDatasets.load_dataset( - {:hf, "glue"}, - name: "sst2", +# Load training data +{:ok, [train_df]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train" ) -IO.inspect(Explorer.DataFrame.head(train, 5)) +# Explore the data with Explorer +require Explorer.DataFrame, as: DF -positive = Explorer.DataFrame.filter(train, label == 1) +train_df +|> DF.head(5) +|> 
IO.inspect() -stats = Explorer.DataFrame.summarise(train, - total: count(label), - positive: sum(label) +# Get dataset metadata +{:ok, splits} = ElixirDatasets.get_dataset_split_names( + "cornell-movie-review-data/rotten_tomatoes" ) +IO.inspect(splits) # ["train", "validation", "test"] ``` -### Example 2: Streaming Large Dataset +### Streaming Large Datasets ```elixir +# Stream data without loading everything into memory {:ok, stream} = ElixirDatasets.load_dataset( - {:hf, "wikipedia"}, - name: "20220301.en", + {:hf, "stanfordnlp/imdb", subdir: "plain_text"}, split: "train", streaming: true ) +# Process data progressively stream -|> Stream.chunk_every(100) -|> Stream.each(fn batch -> - batch |> Enum.each(&analyze_text/1) -end) -|> Stream.run() +|> Stream.filter(fn row -> String.length(row["text"]) > 100 end) +|> Stream.take(1000) +|> Enum.each(&process_review/1) ``` -### Example 3: Offline Mode +### Working Offline ```elixir -{:ok, _} = ElixirDatasets.load_dataset({:hf, "imdb"}) - -System.put_env("ELIXIR_DATASETS_OFFLINE", "1") +# Download once +{:ok, _} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "train" +) -{:ok, dataset} = ElixirDatasets.load_dataset( - {:hf, "imdb"}, - download_mode: :reuse_dataset_if_exists +# Use cached version offline +{:ok, [data]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "train", + offline: true ) ``` @@ -155,118 +115,63 @@ System.put_env("ELIXIR_DATASETS_OFFLINE", "1") ### Environment Variables -- `ELIXIR_DATASETS_CACHE_DIR` - Custom cache directory (default: system cache) +- `ELIXIR_DATASETS_CACHE_DIR` - Custom cache directory - `ELIXIR_DATASETS_OFFLINE` - Enable offline mode (`"1"` or `"true"`) -- `HUGGING_FACE_HUB_TOKEN` - Authentication token for private datasets +- `HF_TOKEN` - Authentication token for private datasets -### Cache Management +### Common Options ```elixir -cache_dir = ElixirDatasets.cache_dir() +# Load specific split 
+ElixirDatasets.load_dataset({:hf, "dataset"}, split: "train") -{:ok, dataset} = ElixirDatasets.load_dataset( - {:hf, "dataset_name"}, - download_mode: :force_redownload -) +# Stream large datasets +ElixirDatasets.load_dataset({:hf, "dataset"}, streaming: true) -{:ok, dataset} = ElixirDatasets.load_dataset( - {:hf, "dataset_name"}, - verification_mode: :no_checks -) +ElixirDatasets.load_dataset({:hf, "dataset"}, num_proc: 4) + +ElixirDatasets.load_dataset({:hf, "dataset"}, offline: true) + +ElixirDatasets.load_dataset({:hf, "dataset"}, download_mode: :force_redownload) ``` -## πŸ†š Comparison with Python `datasets` - -| Feature | ElixirDatasets | Python `datasets` | -|---------|----------------|-------------------| -| Load from Hugging Face Hub | βœ… | βœ… | -| Streaming | βœ… | βœ… | -| Caching | βœ… | βœ… | -| Parallel Processing | βœ… | βœ… | -| Upload to Hub | βœ… | βœ… | -| Multiple Formats (CSV, Parquet, JSONL) | βœ… | βœ… | -| Offline Mode | βœ… | βœ… | -| Private Datasets | βœ… | βœ… | -| DataFrame Integration | βœ… (Explorer) | βœ… (Pandas/Polars) | -| Map/Filter Operations | ⚠️ (via Explorer) | βœ… | -| Custom Dataset Scripts | ❌ | βœ… | -| Audio/Image Processing | ❌ | βœ… | -| Metrics | ❌ | βœ… | - -**Legend:** βœ… Fully Supported | ⚠️ Partial Support | ❌ Not Supported - -### What's Supported - -ElixirDatasets focuses on core dataset loading and management features: -- βœ… Loading datasets from Hugging Face Hub -- βœ… Streaming for large datasets -- βœ… Parallel processing with `num_proc` -- βœ… Smart caching and offline mode -- βœ… Upload and manage datasets -- βœ… CSV, Parquet, and JSONL formats -- βœ… Integration with Explorer DataFrames - -### What's Different - -- **DataFrame Library**: Uses Explorer instead of Pandas -- **Data Processing**: Leverage Explorer's powerful API for transformations -- **Concurrency**: Built on Elixir's process model for true parallelism -- **Simplicity**: Focused API without custom dataset scripts +See the [full 
documentation](https://hexdocs.pm/elixir_datasets) for all available options. ## πŸ”— Integration with Elixir ML Ecosystem -### Axon (Neural Networks) +Works seamlessly with Explorer, Nx, Axon, and Bumblebee: ```elixir -{:ok, train} = ElixirDatasets.load_dataset({:hf, "mnist"}) - -train_tensors = train -|> Explorer.DataFrame.to_rows() -|> Enum.map(fn row -> - {Nx.tensor(row["image"]), Nx.tensor(row["label"])} -end) - -model = Axon.input("input", shape: {nil, 784}) -|> Axon.dense(128, activation: :relu) -|> Axon.dense(10, activation: :softmax) -``` +{:ok, [train_df]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "train" +) -### Bumblebee (Transformers) +require Explorer.DataFrame, as: DF +train_df |> DF.filter(label == 1) |> DF.head(10) -```elixir -{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train") +texts = DF.pull(train_df, "text") +labels = DF.pull(train_df, "label") |> Nx.tensor() -{:ok, model_info} = Bumblebee.load_model({:hf, "bert-base-uncased"}) {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"}) - -texts = Explorer.DataFrame.pull(dataset, "text") inputs = Bumblebee.apply_tokenizer(tokenizer, texts) ``` -### Nx (Numerical Computing) - -```elixir -{:ok, dataset} = ElixirDatasets.load_dataset({:hf, "california_housing"}) - -features = dataset -|> Explorer.DataFrame.select(["feature1", "feature2", "feature3"]) -|> Explorer.DataFrame.to_columns() -|> Map.values() -|> Enum.map(&Nx.tensor/1) -|> Nx.stack() -``` - ## πŸ“– Documentation Full documentation is available at [HexDocs](https://hexdocs.pm/elixir_datasets). 
-### Key Modules +## πŸ““ Interactive Examples + +Explore interactive examples in Livebook: `examples/usage_examples.livemd` + +```bash +mix escript.install hex livebook + +livebook server examples/usage_examples.livemd +``` -- `ElixirDatasets` - Main API for loading and managing datasets -- `ElixirDatasets.DatasetInfo` - Dataset metadata management -- `ElixirDatasets.Utils.Loader` - File loading utilities -- `ElixirDatasets.Utils.Uploader` - Upload functionality -- `ElixirDatasets.HuggingFace.Hub` - Hugging Face Hub integration +The notebook includes examples for loading, streaming, parallel processing, and uploading datasets. ## πŸ§ͺ Testing diff --git a/examples/example_1.livemd b/examples/usage_examples.livemd similarity index 95% rename from examples/example_1.livemd rename to examples/usage_examples.livemd index 953331a..fc8fdb5 100644 --- a/examples/example_1.livemd +++ b/examples/usage_examples.livemd @@ -1,6 +1,6 @@ -# Example_1 +# Usage examples ```elixir ##### Target version @@ -382,52 +382,25 @@ Streaming from HuggingFace... #### Parallel processing with num_proc -Use `num_proc` to load multiple files in parallel: +Use `num_proc` to load multiple files in parallel. Parallel processing is most effective with large files or when downloading fresh data. 
```elixir -IO.puts("Loading with num_proc: 1 (sequential)...") -{time_seq, {:ok, datasets_seq}} = :timer.tc(fn -> - ElixirDatasets.load_dataset( - {:hf, "aaaaa32r/elixirDatasets"}, - num_proc: 1 - ) -end) +# Compare sequential vs parallel loading +# Note: Using force_redownload to demonstrate real performance difference +dataset_repo = {:hf, "stanfordnlp/imdb", subdir: "plain_text"} -IO.puts("Loading with num_proc: 4 (parallel)...") -{time_par, {:ok, datasets_par}} = :timer.tc(fn -> - ElixirDatasets.load_dataset( - {:hf, "aaaaa32r/elixirDatasets"}, - num_proc: 4 - ) +{time_seq, {:ok, _}} = :timer.tc(fn -> + ElixirDatasets.load_dataset(dataset_repo, num_proc: 1, download_mode: :force_redownload) end) -time_seq_sec = time_seq / 1_000_000 -time_par_sec = time_par / 1_000_000 -speedup = time_seq / time_par - -IO.puts(" Performance Comparison:") -IO.puts(" Sequential (num_proc: 1): #{Float.round(time_seq_sec, 3)}s") -IO.puts(" Parallel (num_proc: 4): #{Float.round(time_par_sec, 3)}s") -IO.puts(" Speedup: #{Float.round(speedup, 2)}x") -IO.puts(" Datasets loaded: #{length(datasets_par)}") -``` - - - -``` -Loading with num_proc: 1 (sequential)... -Loading with num_proc: 4 (parallel)... 
- Performance Comparison: - Sequential (num_proc: 1): 0.282s - Parallel (num_proc: 4): 0.284s - Speedup: 0.99x - Datasets loaded: 4 -``` - - +{time_par, {:ok, datasets}} = :timer.tc(fn -> + ElixirDatasets.load_dataset(dataset_repo, num_proc: 4, download_mode: :force_redownload) +end) -``` -:ok +IO.puts("Sequential: #{Float.round(time_seq / 1_000_000, 2)}s") +IO.puts("Parallel: #{Float.round(time_par / 1_000_000, 2)}s") +IO.puts("Speedup: #{Float.round(time_seq / time_par, 2)}x") +IO.puts("Loaded #{length(datasets)} datasets") ``` #### Filter datasets by name pattern @@ -581,7 +554,7 @@ Available `verification_mode` options: Combine data loading options with Hub options for maximum control: ```elixir -{:ok, file_paths} = ElixirDatasets.load_dataset( +{:ok, stream} = ElixirDatasets.load_dataset( {:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "test", streaming: true, @@ -589,21 +562,16 @@ Combine data loading options with Hub options for maximum control: verification_mode: :no_checks ) -IO.puts("Got #{length(file_paths)} file path(s) in streaming mode") -Enum.each(file_paths, fn {path, ext} -> - IO.puts(" - #{Path.basename(path)} (#{ext})") -end) -``` - - - -``` -** (ArgumentError) errors were found at the given arguments: - - * 1st argument: not a list +IO.puts("βœ“ Created stream in streaming mode") +IO.puts(" Stream type: #{inspect(is_function(stream, 2))}") - (erts 15.2.7) :erlang.length(#Function<53.117496853/2 in Stream.resource/3>) - #cell:o36uvndfgwkvdreh:9: (file) +IO.puts("\nFetching first 3 rows...") +sample_rows = stream |> Enum.take(3) +IO.puts("βœ“ Fetched #{length(sample_rows)} rows") +sample_rows |> Enum.with_index(1) |> Enum.each(fn {row, idx} -> + keys = Map.keys(row) |> Enum.join(", ") + IO.puts(" Row #{idx}: [#{keys}]") +end) ``` #### Using custom cache directory diff --git a/lib/elixir_datasets.ex b/lib/elixir_datasets.ex index 6314fe1..be6f052 100644 --- a/lib/elixir_datasets.ex +++ b/lib/elixir_datasets.ex @@ -4,12 +4,41 @@ defmodule 
ElixirDatasets do @moduledoc """ - Todo: Add documentation for ElixirDatasets. + ElixirDatasets is a comprehensive library for accessing and managing datasets from Hugging Face Hub in Elixir. + + This module provides the main public API for loading datasets, fetching metadata, + and uploading datasets to Hugging Face Hub. + + ## Main Functions + + * `load_dataset/2` - Load datasets from Hugging Face or local files + * `load_dataset!/2` - Same as `load_dataset/2` but raises on error + * `get_dataset_info/2` - Fetch dataset metadata + * `get_dataset_infos/2` - Fetch all dataset configurations + * `get_dataset_split_names/2` - Get available splits (train/test/validation) + * `get_dataset_config_names/2` - Get available configurations + * `upload_dataset/3` - Upload a dataset to Hugging Face Hub + * `cache_dir/0` - Get the cache directory path + + ## Examples + + # Load a dataset from Hugging Face + {:ok, datasets} = ElixirDatasets.load_dataset({:hf, "imdb"}) + + # Load with specific split + {:ok, train_data} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train") + + # Stream large datasets + {:ok, stream} = ElixirDatasets.load_dataset({:hf, "c4"}, streaming: true) + stream |> Enum.take(100) + + # Get dataset information + {:ok, info} = ElixirDatasets.get_dataset_info("imdb") + """ @compile if Mix.env() == :test, do: :export_all - alias ElixirDatasets.HuggingFace - alias ElixirDatasets.DatasetInfo - @valid_extensions_list ["jsonl", "csv", "parquet"] + + alias ElixirDatasets.{Info, Loader, Repository} @typedoc """ A location to fetch dataset files from. 
@@ -22,64 +51,15 @@ defmodule ElixirDatasets do * `{:local, path}` - a local directory or file path containing the datasets """ - @type t_repository :: {:hf, String.t()} | {:hf, String.t(), keyword()} | {:local, Path.t()} - - defp do_load_spec(repository, repo_files, num_proc) do - files_to_download = - Enum.filter(repo_files, fn {file_name, _etag} -> - extension = file_name |> Path.extname() |> String.trim_leading(".") - extension in @valid_extensions_list - end) - - if num_proc > 1 do - files_to_download - |> Task.async_stream( - fn {file_name, etag} -> - extension = file_name |> Path.extname() |> String.trim_leading(".") - - case download(repository, file_name, etag) do - {:ok, path} -> {:ok, {path, extension}} - {:error, reason} -> {:error, "failed to download #{file_name}: #{reason}"} - end - end, - max_concurrency: num_proc, - ordered: true - ) - |> Enum.reduce_while({:ok, []}, fn - {:ok, {:ok, path_ext}}, {:ok, acc} -> - {:cont, {:ok, [path_ext | acc]}} - - {:ok, {:error, reason}}, _acc -> - {:halt, {:error, reason}} - - {:exit, reason}, _acc -> - {:halt, {:error, "task failed: #{inspect(reason)}"}} - end) - |> case do - {:ok, paths} -> {:ok, Enum.reverse(paths)} - error -> error - end - else - Enum.reduce_while(files_to_download, [], fn {file_name, etag}, acc -> - extension = file_name |> Path.extname() |> String.trim_leading(".") - - case download(repository, file_name, etag) do - {:ok, path} -> - {:cont, [{path, extension} | acc]} - - {:error, reason} -> - {:halt, - {:error, "failed to download #{file_name} from #{inspect(repository)}: #{reason}"}} - end - end) - |> case do - {:error, _} = error -> error - paths -> {:ok, Enum.reverse(paths)} - end - end + @type t_repository :: Repository.t_repository() + + # Delegated to Loader module for backward compatibility with tests + def do_load_spec(repository, repo_files, num_proc) do + Loader.load_spec(repository, repo_files, num_proc) end - defp decode_config(path) do + # Delegated to Repository module for 
backward compatibility with tests + def decode_config(path) do path |> File.read!() |> Jason.decode() @@ -96,6 +76,8 @@ defmodule ElixirDatasets do @doc """ Fetches dataset information from the Hugging Face API. + Delegates to `ElixirDatasets.Info.get_dataset_info/2`. + ## Parameters * `repository_id` - the Hugging Face dataset repository ID (e.g., "aaaaa32r/elixirDatasets") @@ -108,24 +90,13 @@ defmodule ElixirDatasets do or `{:error, reason}` if the request fails. """ @spec get_dataset_info(String.t(), keyword()) :: {:ok, map()} | {:error, String.t()} - def get_dataset_info(repository_id, opts \\ []) when is_binary(repository_id) do - url = HuggingFace.Hub.dataset_info_url(repository_id) - - headers = - case HuggingFace.Hub.get_auth_token(opts) do - {:ok, auth_token} -> [{"Authorization", "Bearer #{auth_token}"}] - {:error, _} -> [] - end - - with {:ok, response} <- ElixirDatasets.Utils.HTTP.request(:get, url, headers: headers), - {:ok, data} <- Jason.decode(response.body) do - {:ok, data} - end - end + defdelegate get_dataset_info(repository_id, opts \\ []), to: Info @doc """ Fetches dataset information from the Hugging Face API and returns a list of DatasetInfo structs. + Delegates to `ElixirDatasets.Info.get_dataset_infos/2`. + This function retrieves all available dataset configurations for a given repository. 
## Parameters @@ -146,21 +117,14 @@ defmodule ElixirDatasets do ["csv", "default"] """ @spec get_dataset_infos(String.t(), keyword()) :: - {:ok, [DatasetInfo.t()]} | {:error, String.t()} - def get_dataset_infos(repository_id, opts \\ []) when is_binary(repository_id) do - case get_dataset_info(repository_id, opts) do - {:ok, info} -> - dataset_infos = parse_dataset_infos(info) - {:ok, dataset_infos} - - {:error, reason} -> - {:error, reason} - end - end + {:ok, [ElixirDatasets.DatasetInfo.t()]} | {:error, String.t()} + defdelegate get_dataset_infos(repository_id, opts \\ []), to: Info @doc """ Parses raw dataset info map into a list of DatasetInfo structs. + Delegates to `ElixirDatasets.Info.parse_dataset_infos/1`. + Extracts the dataset_info array from the HuggingFace API response's cardData field and converts each entry into a DatasetInfo struct. @@ -172,20 +136,14 @@ defmodule ElixirDatasets do A list of DatasetInfo structs. """ - @spec parse_dataset_infos(map()) :: [DatasetInfo.t()] - def parse_dataset_infos(data) when is_map(data) do - data - |> Map.get("cardData", %{}) - |> Map.get("dataset_info", []) - |> case do - list when is_list(list) -> Enum.map(list, &DatasetInfo.from_map/1) - single -> [DatasetInfo.from_map(single)] - end - end + @spec parse_dataset_infos(map()) :: [ElixirDatasets.DatasetInfo.t()] + defdelegate parse_dataset_infos(data), to: Info @doc """ Gets the split names (e.g., 'train', 'test', 'validation') for a dataset. + Delegates to `ElixirDatasets.Info.get_dataset_split_names/2`. 
+ ## Parameters * `repository_id` - the Hugging Face dataset repository ID (e.g., "cornell-movie-review-data/rotten_tomatoes") @@ -205,29 +163,13 @@ defmodule ElixirDatasets do """ @spec get_dataset_split_names(String.t(), keyword()) :: {:ok, [String.t()]} | {:error, String.t()} - def get_dataset_split_names(repository_id, opts \\ []) when is_binary(repository_id) do - case get_dataset_infos(repository_id, opts) do - {:ok, infos} -> - split_names = - infos - |> Enum.flat_map(fn info -> - case info.splits do - nil -> [] - splits -> Enum.map(splits, fn split -> split["name"] end) - end - end) - |> Enum.uniq() - - {:ok, split_names} - - {:error, reason} -> - {:error, reason} - end - end + defdelegate get_dataset_split_names(repository_id, opts \\ []), to: Info @doc """ Gets the configuration names available for a dataset. + Delegates to `ElixirDatasets.Info.get_dataset_config_names/2`. + ## Parameters * `repository_id` - the Hugging Face dataset repository ID (e.g., "glue") @@ -247,20 +189,13 @@ defmodule ElixirDatasets do """ @spec get_dataset_config_names(String.t(), keyword()) :: {:ok, [String.t()]} | {:error, String.t()} - def get_dataset_config_names(repository_id, opts \\ []) when is_binary(repository_id) do - case get_dataset_infos(repository_id, opts) do - {:ok, infos} -> - config_names = Enum.map(infos, fn info -> info.config_name end) - {:ok, config_names} - - {:error, reason} -> - {:error, reason} - end - end + defdelegate get_dataset_config_names(repository_id, opts \\ []), to: Info @doc """ Loads a dataset from the given repository. + Delegates to `ElixirDatasets.Loader.load_dataset/2`. + The repository can be either a local directory or a Hugging Face repository. 
## Options @@ -315,51 +250,30 @@ defmodule ElixirDatasets do ## Examples - # Load only the training split ElixirDatasets.load_dataset({:hf, "dataset_name"}, split: "train") - # Load a specific configuration ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2") - # Load a specific split of a specific configuration ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2", split: "train") - # Stream data progressively without downloading {:ok, stream} = ElixirDatasets.load_dataset( {:hf, "large_dataset"}, split: "train", streaming: true ) - # Process first 100 rows without downloading entire dataset stream |> Stream.take(100) |> Enum.each(&process_row/1) """ @spec load_dataset(t_repository(), keyword()) :: {:ok, [Explorer.DataFrame.t()] | Enumerable.t()} | {:error, Exception.t()} - def load_dataset(repository, opts \\ []) do - repository = normalize_repository!(repository) - split = opts[:split] - name = opts[:name] - streaming = opts[:streaming] || false - num_proc = opts[:num_proc] || 1 - - with {:ok, repo_files} <- get_repo_files(repository), - {:ok, filtered_files} <- filter_files_by_config_and_split(repo_files, name, split) do - if streaming do - {:ok, build_streaming_dataset(repository, filtered_files, opts)} - else - with {:ok, paths_with_extensions} <- - maybe_load_model_spec(opts, repository, filtered_files) do - ElixirDatasets.Utils.Loader.load_datasets_from_paths(paths_with_extensions, num_proc) - end - end - end - end + defdelegate load_dataset(repository, opts \\ []), to: Loader @doc """ Similar to `load_dataset/2` but raises an error if loading fails. + Delegates to `ElixirDatasets.Loader.load_dataset!/2`. 
+ Accepts the same options as `load_dataset/2`: * `:split` - which split to load (e.g., "train", "test", "validation") * `:name` - dataset configuration name @@ -372,195 +286,33 @@ defmodule ElixirDatasets do ## Examples - # Load only training data datasets = ElixirDatasets.load_dataset!({:hf, "dataset_name"}, split: "train") - # Stream data progressively stream = ElixirDatasets.load_dataset!({:hf, "dataset"}, streaming: true) stream |> Enum.take(10) """ @spec load_dataset!(t_repository(), keyword()) :: [Explorer.DataFrame.t()] | Enumerable.t() - def load_dataset!(repository, opts \\ []) do - case load_dataset(repository, opts) do - {:ok, datasets} -> datasets - {:error, reason} -> raise reason - end - end - - @spec upload_dataset(Explorer.DataFrame.t(), String.t(), keyword()) :: - {:error, String.t()} | {:ok, binary()} - def upload_dataset(df, repository, file_extension) do - ElixirDatasets.Utils.Uploader.upload_dataset(df, repository, file_extension) - end - - defp filter_files_by_config_and_split(repo_files, name, split) do - filtered = - repo_files - |> filter_by_config_name(name) - |> filter_by_split(split) - - {:ok, filtered} - end - - defp filter_by_config_name(repo_files, nil), do: repo_files - - defp filter_by_config_name(repo_files, config_name) do - filtered = - Enum.filter(repo_files, fn {file_name, _etag} -> - String.contains?(file_name, config_name) - end) - - if is_map(repo_files) do - Map.new(filtered) - else - filtered - end - end - - defp filter_by_split(repo_files, nil), do: repo_files - - defp filter_by_split(repo_files, split) when is_binary(split) do - filtered = - Enum.filter(repo_files, fn {file_name, _etag} -> - base_name = Path.basename(file_name, Path.extname(file_name)) - String.contains?(base_name, split) - end) - - if is_map(repo_files) do - Map.new(filtered) - else - filtered - end - end + defdelegate load_dataset!(repository, opts \\ []), to: Loader - defp maybe_load_model_spec(opts, repository, repo_files) do - num_proc = 
opts[:num_proc] || 1 - - with {:ok, spec} <- do_load_spec(repository, repo_files, num_proc) do - {:ok, spec} - end - end - - defp get_repo_files({:local, dir}) do - case File.ls(dir) do - {:ok, filenames} -> - repo_files = - for filename <- filenames, - path = Path.join(dir, filename), - File.regular?(path), - into: %{}, - do: {filename, nil} - - {:ok, repo_files} - - {:error, reason} -> - {:error, "could not read #{dir}, reason: #{:file.format_error(reason)}"} - end - end - - defp get_repo_files({:hf, repository_id, opts}) do - subdir = opts[:subdir] - url = HuggingFace.Hub.file_listing_url(repository_id, subdir, opts[:revision]) - cache_scope = repository_id_to_cache_scope(repository_id) - - passthrough_opts = [ - :cache_dir, - :offline, - :auth_token, - :etag, - :download_mode, - :verification_mode - ] - - result = - HuggingFace.Hub.cached_download( - url, - [cache_scope: cache_scope] ++ Keyword.take(opts, passthrough_opts) - ) - - with {:ok, path} <- result, - {:ok, data} <- decode_config(path) do - repo_files = - for entry <- data, entry["type"] == "file", into: %{} do - path = entry["path"] - - name = - if subdir do - String.replace_leading(path, subdir <> "/", "") - else - path - end - - etag_content = entry["lfs"]["oid"] || entry["oid"] - etag = <> - {name, etag} - end - - {:ok, repo_files} - end - end - - defp download({:local, dir}, filename, _etag) do - path = Path.join(dir, filename) - - if File.exists?(path) do - {:ok, path} - else - {:error, "local file #{inspect(path)} does not exist"} - end - end - - defp download({:hf, repository_id, opts}, filename, etag) do - filename = - if subdir = opts[:subdir] do - subdir <> "/" <> filename - else - filename - end - - url = HuggingFace.Hub.file_url(repository_id, filename, opts[:revision]) - cache_scope = repository_id_to_cache_scope(repository_id) - - passthrough_opts = [ - :cache_dir, - :offline, - :auth_token, - :download_mode, - :verification_mode - ] - - HuggingFace.Hub.cached_download( - url, - [etag: 
etag, cache_scope: cache_scope] ++ - Keyword.take(opts, passthrough_opts) - ) - end - - defp repository_id_to_cache_scope(repository_id) do - repository_id - |> String.replace("/", "--") - |> String.replace(~r/[^\w-]/, "") - end + @doc """ + Uploads a dataset to Hugging Face Hub. - defp normalize_repository!({:hf, repository_id}) when is_binary(repository_id) do - {:hf, repository_id, []} - end + ## Parameters - defp normalize_repository!({:hf, repository_id, opts}) when is_binary(repository_id) do - opts = Keyword.validate!(opts, [:revision, :cache_dir, :offline, :auth_token, :subdir]) - {:hf, repository_id, opts} - end + * `df` - Explorer.DataFrame to upload + * `repository` - repository ID (e.g., "username/dataset-name") + * `file_extension` - keyword list with file extension option - defp normalize_repository!({:local, dir}) when is_binary(dir) do - {:local, dir} - end + ## Returns - defp normalize_repository!(other) do - raise ArgumentError, - "expected repository to be either {:hf, repository_id}, {:hf, repository_id, options}" <> - " or {:local, directory}, got: #{inspect(other)}" + `{:ok, response}` on success, or `{:error, reason}` on failure. + """ + @spec upload_dataset(Explorer.DataFrame.t(), String.t(), keyword()) :: + {:error, String.t()} | {:ok, binary()} + def upload_dataset(df, repository, file_extension) do + ElixirDatasets.Utils.Uploader.upload_dataset(df, repository, file_extension) end @doc """ @@ -568,6 +320,15 @@ defmodule ElixirDatasets do Defaults to the standard cache location for the given operating system. Can be configured with the `ELIXIR_DATASETS_CACHE_DIR` environment variable. 
+ + ## Examples + + iex> is_binary(ElixirDatasets.cache_dir()) + true + + iex> String.ends_with?(ElixirDatasets.cache_dir(), "elixir_datasets") + true + """ @spec cache_dir() :: String.t() def cache_dir() do @@ -577,163 +338,4 @@ defmodule ElixirDatasets do :filename.basedir(:user_cache, "elixir_datasets") end end - - defp build_streaming_dataset(repository, filtered_files, opts) do - batch_size = opts[:batch_size] || 1000 - - urls = build_streaming_urls(repository, filtered_files, opts) - - Stream.resource( - fn -> init_streaming_state(urls, batch_size) end, - &fetch_next_streaming_batch/1, - &cleanup_streaming/1 - ) - end - - defp build_streaming_urls({:hf, repository_id, repo_opts}, filtered_files, load_opts) do - auth_token = load_opts[:auth_token] - - Enum.map(filtered_files, fn {file_name, _etag} -> - filename = - if subdir = repo_opts[:subdir] do - subdir <> "/" <> file_name - else - file_name - end - - extension = file_name |> Path.extname() |> String.trim_leading(".") - url = HuggingFace.Hub.file_url(repository_id, filename, repo_opts[:revision]) - - {url, extension, auth_token} - end) - end - - defp build_streaming_urls({:local, dir}, filtered_files, _opts) do - Enum.map(filtered_files, fn {file_name, _etag} -> - path = Path.join(dir, file_name) - extension = file_name |> Path.extname() |> String.trim_leading(".") - {path, extension, nil} - end) - end - - defp init_streaming_state(urls, batch_size) do - %{ - urls: urls, - current_url_index: 0, - current_lazy_df: nil, - current_offset: 0, - batch_size: batch_size, - total_urls: length(urls) - } - end - - defp fetch_next_streaming_batch(%{current_url_index: idx, total_urls: total} = state) - when idx >= total do - {:halt, state} - end - - defp fetch_next_streaming_batch(state) do - case ensure_lazy_df_loaded(state) do - {:ok, state_with_df} -> - fetch_batch_from_lazy_df(state_with_df) - - {:error, _reason} -> - new_state = %{state | current_url_index: state.current_url_index + 1, current_offset: 0} - 
fetch_next_streaming_batch(new_state) - end - end - - defp ensure_lazy_df_loaded(%{current_lazy_df: nil} = state) do - {url, extension, auth_token} = Enum.at(state.urls, state.current_url_index) - - case load_lazy_dataframe_from_url(url, extension, auth_token) do - {:ok, lazy_df} -> - {:ok, %{state | current_lazy_df: lazy_df}} - - {:error, reason} -> - {:error, reason} - end - end - - defp ensure_lazy_df_loaded(state), do: {:ok, state} - - defp load_lazy_dataframe_from_url(url_or_path, extension, _auth_token) do - is_url = - String.starts_with?(url_or_path, "http://") or String.starts_with?(url_or_path, "https://") - - case {extension, is_url} do - {"parquet", true} -> - Explorer.DataFrame.from_parquet(url_or_path, lazy: true) - - {"parquet", false} -> - Explorer.DataFrame.from_parquet(url_or_path, lazy: true) - - {"csv", false} -> - Explorer.DataFrame.from_csv(url_or_path, lazy: true) - - {"jsonl", false} -> - Explorer.DataFrame.from_ndjson(url_or_path, lazy: true) - - {"csv", true} -> - case Explorer.DataFrame.from_csv(url_or_path) do - {:ok, df} -> {:ok, df} - error -> error - end - - {"jsonl", true} -> - case Explorer.DataFrame.from_ndjson(url_or_path) do - {:ok, df} -> {:ok, df} - error -> error - end - - _ -> - {:error, "Unsupported format for streaming: #{extension}"} - end - end - - defp fetch_batch_from_lazy_df(state) do - %{current_lazy_df: df, current_offset: offset, batch_size: batch_size} = state - - batch_df = - df - |> Explorer.DataFrame.slice(offset, batch_size) - |> then(fn sliced -> - if Explorer.DataFrame.lazy?(sliced) do - Explorer.DataFrame.collect(sliced) - else - sliced - end - end) - - batch_rows = Explorer.DataFrame.to_rows(batch_df) - num_rows = length(batch_rows) - - cond do - num_rows == 0 -> - new_state = %{ - state - | current_url_index: state.current_url_index + 1, - current_lazy_df: nil, - current_offset: 0 - } - - fetch_next_streaming_batch(new_state) - - num_rows < batch_size -> - new_state = %{ - state - | current_url_index: 
state.current_url_index + 1, - current_lazy_df: nil, - current_offset: 0 - } - - {batch_rows, new_state} - - true -> - new_state = %{state | current_offset: offset + batch_size} - {batch_rows, new_state} - end - end - - defp cleanup_streaming(_state), do: :ok end diff --git a/lib/elixir_datasets/filter.ex b/lib/elixir_datasets/filter.ex new file mode 100644 index 0000000..98294d0 --- /dev/null +++ b/lib/elixir_datasets/filter.ex @@ -0,0 +1,104 @@ +defmodule ElixirDatasets.Filter do + @moduledoc """ + Functions for filtering dataset files by configuration and split. + """ + + @doc """ + Filters repository files by configuration name and split. + + ## Parameters + + * `repo_files` - map of files from repository (%{filename => etag}) + * `name` - optional configuration name to filter by + * `split` - optional split name to filter by (e.g., "train", "test") + + ## Returns + + `{:ok, filtered_files}` where `filtered_files` is a map of matching files. + + ## Examples + + iex> files = %{"train.csv" => nil, "test.csv" => nil} + iex> ElixirDatasets.Filter.by_config_and_split(files, nil, "train") + {:ok, %{"train.csv" => nil}} + """ + @spec by_config_and_split(map(), String.t() | nil, String.t() | nil) :: {:ok, map()} + def by_config_and_split(repo_files, name, split) do + filtered = + repo_files + |> by_config_name(name) + |> by_split(split) + + {:ok, filtered} + end + + @doc """ + Filters files by configuration name. + + If `config_name` is nil, returns all files unchanged. + Otherwise, returns only files whose path contains the config name. + + ## Parameters + + * `repo_files` - map or list of files + * `config_name` - optional configuration name to filter by + + ## Returns + + Filtered files in the same format as input (map or list). 
+ """ + @spec by_config_name(map() | list(), String.t() | nil) :: map() | list() + def by_config_name(repo_files, nil), do: repo_files + + def by_config_name(repo_files, config_name) do + filtered = + Enum.filter(repo_files, fn {file_name, _etag} -> + String.contains?(file_name, config_name) + end) + + if is_map(repo_files) do + Map.new(filtered) + else + filtered + end + end + + @doc """ + Filters files by split name. + + If `split` is nil, returns all files unchanged. + Otherwise, returns only files whose basename (without extension) contains the split name. + + ## Parameters + + * `repo_files` - map or list of files + * `split` - optional split name to filter by (e.g., "train", "test", "validation") + + ## Returns + + Filtered files in the same format as input (map or list). + + ## Examples + + iex> files = %{"train.csv" => nil, "test.csv" => nil, "validation.csv" => nil} + iex> ElixirDatasets.Filter.by_split(files, "train") + %{"train.csv" => nil} + """ + @spec by_split(map() | list(), String.t() | nil) :: map() | list() + def by_split(repo_files, nil), do: repo_files + + def by_split(repo_files, split) when is_binary(split) do + filtered = + Enum.filter(repo_files, fn {file_name, _etag} -> + base_name = Path.basename(file_name, Path.extname(file_name)) + String.contains?(base_name, split) + end) + + if is_map(repo_files) do + Map.new(filtered) + else + filtered + end + end +end + diff --git a/lib/elixir_datasets/info.ex b/lib/elixir_datasets/info.ex new file mode 100644 index 0000000..1150297 --- /dev/null +++ b/lib/elixir_datasets/info.ex @@ -0,0 +1,174 @@ +defmodule ElixirDatasets.Info do + @moduledoc """ + Functions for fetching and parsing dataset metadata from Hugging Face Hub. + """ + + alias ElixirDatasets.HuggingFace + alias ElixirDatasets.DatasetInfo + + @doc """ + Fetches dataset information from the Hugging Face API. 
+ + ## Parameters + + * `repository_id` - the Hugging Face dataset repository ID (e.g., "aaaaa32r/elixirDatasets") + * `opts` - optional keyword list with the following options: + * `:auth_token` - the token to use as HTTP bearer authorization + + ## Returns + + Returns `{:ok, dataset_info}` where `dataset_info` is a map containing the dataset metadata, + or `{:error, reason}` if the request fails. + """ + @spec get_dataset_info(String.t(), keyword()) :: {:ok, map()} | {:error, String.t()} + def get_dataset_info(repository_id, opts \\ []) when is_binary(repository_id) do + url = HuggingFace.Hub.dataset_info_url(repository_id) + + headers = + case HuggingFace.Hub.get_auth_token(opts) do + {:ok, auth_token} -> [{"Authorization", "Bearer #{auth_token}"}] + {:error, _} -> [] + end + + with {:ok, response} <- ElixirDatasets.Utils.HTTP.request(:get, url, headers: headers), + {:ok, data} <- Jason.decode(response.body) do + {:ok, data} + end + end + + @doc """ + Fetches dataset information from the Hugging Face API and returns a list of DatasetInfo structs. + + This function retrieves all available dataset configurations for a given repository. + + ## Parameters + + * `repository_id` - the Hugging Face dataset repository ID (e.g., "aaaaa32r/elixirDatasets") + * `opts` - optional keyword list with the following options: + * `:auth_token` - the token to use as HTTP bearer authorization + + ## Returns + + Returns `{:ok, dataset_infos}` where `dataset_infos` is a list of DatasetInfo structs, + or `{:error, reason}` if the request fails. 
+ + ## Examples + + iex> {:ok, infos} = ElixirDatasets.Info.get_dataset_infos("aaaaa32r/elixirDatasets") + iex> Enum.map(infos, & &1.config_name) + ["csv", "default"] + """ + @spec get_dataset_infos(String.t(), keyword()) :: + {:ok, [DatasetInfo.t()]} | {:error, String.t()} + def get_dataset_infos(repository_id, opts \\ []) when is_binary(repository_id) do + case get_dataset_info(repository_id, opts) do + {:ok, info} -> + dataset_infos = parse_dataset_infos(info) + {:ok, dataset_infos} + + {:error, reason} -> + {:error, reason} + end + end + + @doc """ + Parses raw dataset info map into a list of DatasetInfo structs. + + Extracts the dataset_info array from the HuggingFace API response's cardData field + and converts each entry into a DatasetInfo struct. + + ## Parameters + + * `data` - the raw response map from the HuggingFace API + + ## Returns + + A list of DatasetInfo structs. + """ + @spec parse_dataset_infos(map()) :: [DatasetInfo.t()] + def parse_dataset_infos(data) when is_map(data) do + data + |> Map.get("cardData", %{}) + |> Map.get("dataset_info", []) + |> case do + list when is_list(list) -> Enum.map(list, &DatasetInfo.from_map/1) + single -> [DatasetInfo.from_map(single)] + end + end + + @doc """ + Gets the split names (e.g., 'train', 'test', 'validation') for a dataset. + + ## Parameters + + * `repository_id` - the Hugging Face dataset repository ID (e.g., "cornell-movie-review-data/rotten_tomatoes") + * `opts` - optional keyword list with the following options: + * `:auth_token` - the token to use as HTTP bearer authorization + + ## Returns + + Returns `{:ok, split_names}` where `split_names` is a list of strings representing + the available splits, or `{:error, reason}` if the request fails. 
+ + ## Examples + + iex> {:ok, splits} = ElixirDatasets.Info.get_dataset_split_names("cornell-movie-review-data/rotten_tomatoes") + iex> splits + ["train", "validation", "test"] + """ + @spec get_dataset_split_names(String.t(), keyword()) :: + {:ok, [String.t()]} | {:error, String.t()} + def get_dataset_split_names(repository_id, opts \\ []) when is_binary(repository_id) do + case get_dataset_infos(repository_id, opts) do + {:ok, infos} -> + split_names = + infos + |> Enum.flat_map(fn info -> + case info.splits do + nil -> [] + splits -> Enum.map(splits, fn split -> split["name"] end) + end + end) + |> Enum.uniq() + + {:ok, split_names} + + {:error, reason} -> + {:error, reason} + end + end + + @doc """ + Gets the configuration names available for a dataset. + + ## Parameters + + * `repository_id` - the Hugging Face dataset repository ID (e.g., "glue") + * `opts` - optional keyword list with the following options: + * `:auth_token` - the token to use as HTTP bearer authorization + + ## Returns + + Returns `{:ok, config_names}` where `config_names` is a list of configuration names, + or `{:error, reason}` if the request fails. + + ## Examples + + iex> {:ok, configs} = ElixirDatasets.Info.get_dataset_config_names("glue") + iex> Enum.member?(configs, "cola") + true + """ + @spec get_dataset_config_names(String.t(), keyword()) :: + {:ok, [String.t()]} | {:error, String.t()} + def get_dataset_config_names(repository_id, opts \\ []) when is_binary(repository_id) do + case get_dataset_infos(repository_id, opts) do + {:ok, infos} -> + config_names = Enum.map(infos, fn info -> info.config_name end) + {:ok, config_names} + + {:error, reason} -> + {:error, reason} + end + end +end + diff --git a/lib/elixir_datasets/loader.ex b/lib/elixir_datasets/loader.ex new file mode 100644 index 0000000..f888a0f --- /dev/null +++ b/lib/elixir_datasets/loader.ex @@ -0,0 +1,211 @@ +defmodule ElixirDatasets.Loader do + @moduledoc """ + Functions for loading datasets from repositories. 
+ """ + + alias ElixirDatasets.Repository + alias ElixirDatasets.Filter + alias ElixirDatasets.Streaming + + @valid_extensions_list ["jsonl", "csv", "parquet"] + + @doc """ + Loads a dataset from the given repository. + + The repository can be either a local directory or a Hugging Face repository. + + ## Options + + ### Data Loading Options + + * `:split` - which split of the data to load (e.g., "train", "test", "validation"). + If not specified, all splits are loaded. Files are matched by name patterns + (e.g., "train.csv", "test-00000.parquet", "validation.jsonl"). + + * `:name` - the name of the dataset configuration to load. For datasets with + multiple configurations, this specifies which one to use. Files are matched + by looking for the config name in the file path (e.g., "sst2/train.parquet"). + + * `:streaming` - if `true`, returns an enumerable that progressively yields + data rows (maps) without loading the entire dataset into memory. Data is + fetched on-demand as you iterate. Useful for large datasets. Default is `false`. + + ### HuggingFace Hub Options + + * `:auth_token` - the token to use as HTTP bearer authorization + for remote files. If not provided, the token from the + `ELIXIR_DATASETS_HF_TOKEN` environment variable is used. + + * `:cache_dir` - the directory to store downloaded files in. + Defaults to the standard cache location for the operating system. + + * `:offline` - if `true`, only cached files are used and no network + requests are made. Returns an error if the file is not cached. + + * `:etag` - if provided, skips the HEAD request to fetch the latest + ETag value and uses this value instead. + + * `:download_mode` - controls download/cache behavior. Can be: + - `:reuse_dataset_if_exists` (default) - reuse cached data if available + - `:force_redownload` - always download, even if cached + + * `:verification_mode` - controls verification checks. 
Can be: + - `:basic_checks` (default) - basic validation + - `:no_checks` - skip all validation + + * `:num_proc` - number of processes to use for parallel dataset processing. + Default is `1` (no parallelization). Set to a higher number to speed up + dataset downloading and loading. For example, `num_proc: 4` will use 4 + parallel processes. + + ## Returns + + - When `streaming: false` (default): `{:ok, datasets}` where `datasets` is a list of Explorer.DataFrame.t() + - When `streaming: true`: `{:ok, stream}` where `stream` is an Enumerable that yields rows progressively + - On error: `{:error, reason}` + + ## Examples + + ElixirDatasets.Loader.load_dataset({:hf, "dataset_name"}, split: "train") + + ElixirDatasets.Loader.load_dataset({:hf, "glue"}, name: "sst2") + + {:ok, stream} = ElixirDatasets.Loader.load_dataset( + {:hf, "large_dataset"}, + split: "train", + streaming: true + ) + + stream |> Stream.take(100) |> Enum.each(&process_row/1) + + """ + @spec load_dataset(Repository.t_repository(), keyword()) :: + {:ok, [Explorer.DataFrame.t()] | Enumerable.t()} | {:error, Exception.t()} + def load_dataset(repository, opts \\ []) do + repository = Repository.normalize!(repository) + split = opts[:split] + name = opts[:name] + streaming = opts[:streaming] || false + num_proc = opts[:num_proc] || 1 + + with {:ok, repo_files} <- Repository.get_files(repository), + {:ok, filtered_files} <- Filter.by_config_and_split(repo_files, name, split) do + if streaming do + {:ok, Streaming.build(repository, filtered_files, opts)} + else + with {:ok, paths_with_extensions} <- load_spec(repository, filtered_files, num_proc) do + ElixirDatasets.Utils.Loader.load_datasets_from_paths(paths_with_extensions, num_proc) + end + end + end + end + + @doc """ + Similar to `load_dataset/2` but raises an error if loading fails. + + Accepts the same options as `load_dataset/2`. 
+ + ## Returns + + * a list of loaded datasets (or a Stream if streaming is enabled) + * raises an error if loading fails + + ## Examples + + datasets = ElixirDatasets.Loader.load_dataset!({:hf, "dataset_name"}, split: "train") + + stream = ElixirDatasets.Loader.load_dataset!({:hf, "dataset"}, streaming: true) + stream |> Enum.take(10) + + """ + @spec load_dataset!(Repository.t_repository(), keyword()) :: + [Explorer.DataFrame.t()] | Enumerable.t() + def load_dataset!(repository, opts \\ []) do + case load_dataset(repository, opts) do + {:ok, datasets} -> datasets + {:error, reason} -> raise reason + end + end + + @doc """ + Loads the specification of files to download from a repository. + + Filters files by valid extensions and downloads them in parallel if num_proc > 1. + + ## Parameters + + * `repository` - normalized repository tuple + * `repo_files` - map of files from repository + * `num_proc` - number of parallel processes to use + + ## Returns + + `{:ok, paths_with_extensions}` where each element is `{path, extension}`, + or `{:error, reason}` if download fails. 
+ """ + @spec load_spec(tuple(), map(), pos_integer()) :: + {:ok, list({String.t(), String.t()})} | {:error, String.t()} + def load_spec(repository, repo_files, num_proc) do + files_to_download = + Enum.filter(repo_files, fn {file_name, _etag} -> + extension = file_name |> Path.extname() |> String.trim_leading(".") + extension in @valid_extensions_list + end) + + if num_proc > 1 do + download_parallel(repository, files_to_download, num_proc) + else + download_sequential(repository, files_to_download) + end + end + + defp download_parallel(repository, files_to_download, num_proc) do + files_to_download + |> Task.async_stream( + fn {file_name, etag} -> + extension = file_name |> Path.extname() |> String.trim_leading(".") + + case Repository.download(repository, file_name, etag) do + {:ok, path} -> {:ok, {path, extension}} + {:error, reason} -> {:error, "failed to download #{file_name}: #{reason}"} + end + end, + max_concurrency: num_proc, + ordered: true + ) + |> Enum.reduce_while({:ok, []}, fn + {:ok, {:ok, path_ext}}, {:ok, acc} -> + {:cont, {:ok, [path_ext | acc]}} + + {:ok, {:error, reason}}, _acc -> + {:halt, {:error, reason}} + + {:exit, reason}, _acc -> + {:halt, {:error, "task failed: #{inspect(reason)}"}} + end) + |> case do + {:ok, paths} -> {:ok, Enum.reverse(paths)} + error -> error + end + end + + defp download_sequential(repository, files_to_download) do + Enum.reduce_while(files_to_download, [], fn {file_name, etag}, acc -> + extension = file_name |> Path.extname() |> String.trim_leading(".") + + case Repository.download(repository, file_name, etag) do + {:ok, path} -> + {:cont, [{path, extension} | acc]} + + {:error, reason} -> + {:halt, + {:error, "failed to download #{file_name} from #{inspect(repository)}: #{reason}"}} + end + end) + |> case do + {:error, _} = error -> error + paths -> {:ok, Enum.reverse(paths)} + end + end +end + diff --git a/lib/elixir_datasets/repository.ex b/lib/elixir_datasets/repository.ex new file mode 100644 index 
0000000..cfa6d2c --- /dev/null +++ b/lib/elixir_datasets/repository.ex @@ -0,0 +1,203 @@ +defmodule ElixirDatasets.Repository do + @moduledoc """ + Functions for managing dataset repositories (local and Hugging Face). + """ + + alias ElixirDatasets.HuggingFace + + @typedoc """ + A location to fetch dataset files from. + Can be either a Hugging Face repository or a local resource: + + * `{:hf, repository_id}` - the Hugging Face repository ID + + * `{:hf, repository_id, options}` - the Hugging Face repository ID + with additional options + + * `{:local, path}` - a local directory or file path containing the datasets + """ + @type t_repository :: {:hf, String.t()} | {:hf, String.t(), keyword()} | {:local, Path.t()} + + @doc """ + Normalizes repository specification to a consistent format. + + ## Examples + + iex> ElixirDatasets.Repository.normalize!({:hf, "repo/name"}) + {:hf, "repo/name", []} + + iex> ElixirDatasets.Repository.normalize!({:local, "/path/to/data"}) + {:local, "/path/to/data"} + """ + @spec normalize!(t_repository()) :: t_repository() + def normalize!({:hf, repository_id}) when is_binary(repository_id) do + {:hf, repository_id, []} + end + + def normalize!({:hf, repository_id, opts}) when is_binary(repository_id) do + opts = Keyword.validate!(opts, [:revision, :cache_dir, :offline, :auth_token, :subdir]) + {:hf, repository_id, opts} + end + + def normalize!({:local, dir}) when is_binary(dir) do + {:local, dir} + end + + def normalize!(other) do + raise ArgumentError, + "expected repository to be either {:hf, repository_id}, {:hf, repository_id, options}" <> + " or {:local, directory}, got: #{inspect(other)}" + end + + @doc """ + Gets the list of files in a repository. + + For local repositories, lists files in the directory. + For Hugging Face repositories, fetches the file listing from the API. + + ## Returns + + `{:ok, repo_files}` where `repo_files` is a map of `%{filename => etag}`, + or `{:error, reason}` if the operation fails. 
+ """ + @spec get_files(t_repository()) :: {:ok, map()} | {:error, String.t()} + def get_files({:local, dir}) do + case File.ls(dir) do + {:ok, filenames} -> + repo_files = + for filename <- filenames, + path = Path.join(dir, filename), + File.regular?(path), + into: %{}, + do: {filename, nil} + + {:ok, repo_files} + + {:error, reason} -> + {:error, "could not read #{dir}, reason: #{:file.format_error(reason)}"} + end + end + + def get_files({:hf, repository_id, opts}) do + subdir = opts[:subdir] + url = HuggingFace.Hub.file_listing_url(repository_id, subdir, opts[:revision]) + cache_scope = repository_id_to_cache_scope(repository_id) + + passthrough_opts = [ + :cache_dir, + :offline, + :auth_token, + :etag, + :download_mode, + :verification_mode + ] + + result = + HuggingFace.Hub.cached_download( + url, + [cache_scope: cache_scope] ++ Keyword.take(opts, passthrough_opts) + ) + + with {:ok, path} <- result, + {:ok, data} <- decode_config(path) do + repo_files = + for entry <- data, entry["type"] == "file", into: %{} do + path = entry["path"] + + name = + if subdir do + String.replace_leading(path, subdir <> "/", "") + else + path + end + + etag_content = entry["lfs"]["oid"] || entry["oid"] + etag = <<?", etag_content::binary, ?">> + {name, etag} + end + + {:ok, repo_files} + end + end + + @doc """ + Downloads a file from a repository. + + For local repositories, verifies the file exists. + For Hugging Face repositories, downloads the file using the Hub API. + + ## Returns + + `{:ok, path}` where `path` is the local file path, + or `{:error, reason}` if the download fails. 
+ """ + @spec download(t_repository(), String.t(), String.t() | nil) :: + {:ok, String.t()} | {:error, String.t()} + def download({:local, dir}, filename, _etag) do + path = Path.join(dir, filename) + + if File.exists?(path) do + {:ok, path} + else + {:error, "local file #{inspect(path)} does not exist"} + end + end + + def download({:hf, repository_id, opts}, filename, etag) do + filename = + if subdir = opts[:subdir] do + subdir <> "/" <> filename + else + filename + end + + url = HuggingFace.Hub.file_url(repository_id, filename, opts[:revision]) + cache_scope = repository_id_to_cache_scope(repository_id) + + passthrough_opts = [ + :cache_dir, + :offline, + :auth_token, + :download_mode, + :verification_mode + ] + + HuggingFace.Hub.cached_download( + url, + [etag: etag, cache_scope: cache_scope] ++ + Keyword.take(opts, passthrough_opts) + ) + end + + @doc """ + Converts a repository ID to a cache scope string. + + Replaces slashes with double dashes and removes every character that is not a word character or a dash. + + ## Examples + + iex> ElixirDatasets.Repository.repository_id_to_cache_scope("user/repo-name") + "user--repo-name" + """ + @spec repository_id_to_cache_scope(String.t()) :: String.t() + def repository_id_to_cache_scope(repository_id) do + repository_id + |> String.replace("/", "--") + |> String.replace(~r/[^\w-]/, "") + end + + defp decode_config(path) do + path + |> File.read!() + |> Jason.decode() + |> case do + {:ok, data} -> + {:ok, data} + + {:error, reason} -> + {:error, + "failed to parse the config file, it is not a valid JSON. Reason: #{inspect(reason)}"} + end + end +end + diff --git a/lib/elixir_datasets/streaming.ex b/lib/elixir_datasets/streaming.ex new file mode 100644 index 0000000..7c1f60a --- /dev/null +++ b/lib/elixir_datasets/streaming.ex @@ -0,0 +1,190 @@ +defmodule ElixirDatasets.Streaming do + @moduledoc """ + Functions for streaming datasets progressively without loading everything into memory. 
+ """ + + alias ElixirDatasets.HuggingFace + + @doc """ + Builds a streaming dataset that yields rows progressively. + + ## Parameters + + * `repository` - normalized repository tuple + * `filtered_files` - map of files to stream from + * `opts` - options including: + * `:batch_size` - number of rows to fetch per batch (default: 1000) + * `:auth_token` - authentication token for Hugging Face + + ## Returns + + A Stream that yields rows as maps. + """ + @spec build(tuple(), map(), keyword()) :: Enumerable.t() + def build(repository, filtered_files, opts) do + batch_size = opts[:batch_size] || 1000 + + urls = build_urls(repository, filtered_files, opts) + + Stream.resource( + fn -> init_state(urls, batch_size) end, + &fetch_next_batch/1, + &cleanup/1 + ) + end + + @doc """ + Builds URLs for streaming from repository files. + + For Hugging Face repositories, creates HTTP URLs. + For local repositories, uses file paths. + """ + @spec build_urls(tuple(), map(), keyword()) :: list() + def build_urls({:hf, repository_id, repo_opts}, filtered_files, load_opts) do + auth_token = load_opts[:auth_token] + + Enum.map(filtered_files, fn {file_name, _etag} -> + filename = + if subdir = repo_opts[:subdir] do + subdir <> "/" <> file_name + else + file_name + end + + extension = file_name |> Path.extname() |> String.trim_leading(".") + url = HuggingFace.Hub.file_url(repository_id, filename, repo_opts[:revision]) + + {url, extension, auth_token} + end) + end + + def build_urls({:local, dir}, filtered_files, _opts) do + Enum.map(filtered_files, fn {file_name, _etag} -> + path = Path.join(dir, file_name) + extension = file_name |> Path.extname() |> String.trim_leading(".") + {path, extension, nil} + end) + end + + defp init_state(urls, batch_size) do + %{ + urls: urls, + current_url_index: 0, + current_lazy_df: nil, + current_offset: 0, + batch_size: batch_size, + total_urls: length(urls) + } + end + + defp fetch_next_batch(%{current_url_index: idx, total_urls: total} = state) + when 
idx >= total do + {:halt, state} + end + + defp fetch_next_batch(state) do + case ensure_lazy_df_loaded(state) do + {:ok, state_with_df} -> + fetch_batch_from_lazy_df(state_with_df) + + {:error, _reason} -> + new_state = %{state | current_url_index: state.current_url_index + 1, current_offset: 0} + fetch_next_batch(new_state) + end + end + + defp ensure_lazy_df_loaded(%{current_lazy_df: nil} = state) do + {url, extension, auth_token} = Enum.at(state.urls, state.current_url_index) + + case load_lazy_dataframe(url, extension, auth_token) do + {:ok, lazy_df} -> + {:ok, %{state | current_lazy_df: lazy_df}} + + {:error, reason} -> + {:error, reason} + end + end + + defp ensure_lazy_df_loaded(state), do: {:ok, state} + + defp load_lazy_dataframe(url_or_path, extension, _auth_token) do + is_url = + String.starts_with?(url_or_path, "http://") or String.starts_with?(url_or_path, "https://") + + case {extension, is_url} do + {"parquet", true} -> + Explorer.DataFrame.from_parquet(url_or_path, lazy: true) + + {"parquet", false} -> + Explorer.DataFrame.from_parquet(url_or_path, lazy: true) + + {"csv", false} -> + Explorer.DataFrame.from_csv(url_or_path, lazy: true) + + {"jsonl", false} -> + Explorer.DataFrame.from_ndjson(url_or_path, lazy: true) + + {"csv", true} -> + case Explorer.DataFrame.from_csv(url_or_path) do + {:ok, df} -> {:ok, df} + error -> error + end + + {"jsonl", true} -> + case Explorer.DataFrame.from_ndjson(url_or_path) do + {:ok, df} -> {:ok, df} + error -> error + end + + _ -> + {:error, "Unsupported format for streaming: #{extension}"} + end + end + + defp fetch_batch_from_lazy_df(state) do + %{current_lazy_df: df, current_offset: offset, batch_size: batch_size} = state + + batch_df = + df + |> Explorer.DataFrame.slice(offset, batch_size) + |> then(fn sliced -> + if Explorer.DataFrame.lazy?(sliced) do + Explorer.DataFrame.collect(sliced) + else + sliced + end + end) + + batch_rows = Explorer.DataFrame.to_rows(batch_df) + num_rows = length(batch_rows) + + cond 
do + num_rows == 0 -> + new_state = %{ + state + | current_url_index: state.current_url_index + 1, + current_lazy_df: nil, + current_offset: 0 + } + + fetch_next_batch(new_state) + + num_rows < batch_size -> + new_state = %{ + state + | current_url_index: state.current_url_index + 1, + current_lazy_df: nil, + current_offset: 0 + } + + {batch_rows, new_state} + + true -> + new_state = %{state | current_offset: offset + batch_size} + {batch_rows, new_state} + end + end + + defp cleanup(_state), do: :ok +end + diff --git a/lib/huggingface/hub.ex b/lib/huggingface/hub.ex deleted file mode 100644 index ff84e75..0000000 --- a/lib/huggingface/hub.ex +++ /dev/null @@ -1,336 +0,0 @@ -# This file, part of the ElixirDatasets project, has been adapted from code originally under Apache License 2.0. -# The original code can be found at: -# https://github.com/elixir-nx/bumblebee/blob/710a645222948f80208c348d3a2589cbd3ab8e7d/lib/bumblebee/huggingface/hub.ex - -defmodule ElixirDatasets.HuggingFace.Hub do - @moduledoc false - @compile if Mix.env() == :test, do: :export_all - alias ElixirDatasets.Utils.HTTP - - @huggingface_endpoint "https://huggingface.co" - - @doc """ - Returns a URL pointing to the given file in a Hugging Face repository. - """ - @spec file_url(String.t(), String.t(), String.t() | nil) :: String.t() - def file_url(repository_id, filename, revision) do - revision = revision || "main" - @huggingface_endpoint <> "/datasets/#{repository_id}/resolve/#{revision}/#{filename}" - end - - @doc """ - Returns a URL to list the contents of a Hugging Face repository. - """ - @spec file_listing_url(String.t(), String.t() | nil, String.t() | nil) :: String.t() - def file_listing_url(repository_id, subdir, revision) do - revision = revision || "main" - path = if(subdir, do: "/" <> subdir) - @huggingface_endpoint <> "/api/datasets/#{repository_id}/tree/#{revision}#{path}" - end - - @doc """ - Returns a URL to fetch dataset information from the Hugging Face API. 
- """ - @spec dataset_info_url(String.t()) :: String.t() - def dataset_info_url(repository_id) do - @huggingface_endpoint <> "/api/datasets/#{repository_id}" - end - - @doc """ - Downloads file from the given URL and returns a path to the file. - - The file is cached based on the received ETag. Subsequent requests - for the same URL validate the ETag and return a file from the cache - if there is a match. - - ## Options - - * `:cache_dir` - the directory to store the downloaded files in. - Defaults to the standard cache location for the given operating - system - - * `:offline` - if `true`, cached path is returned if exists and - and error otherwise - - * `:auth_token` - the token to use as HTTP bearer authorization - for remote files - - * `:etag` - by default a HEAD request is made to fetch the latest - ETag value, however if the value is already known, it can be - passed as an option instead (to skip the extra request) - - * `:cache_scope` - a namespace to put the cached files under in - the cache directory - - * `:download_mode` - controls download/cache behavior. Can be: - - `:reuse_dataset_if_exists` (default) - reuse cached data if available - - `:force_redownload` - always download, even if cached - - * `:verification_mode` - controls whether basic verification checks - are applied. Can be: - - `:basic_checks` (default) - perform basic validation - - `:no_checks` - skip validation (for example, file existence checks) - Note: Currently, `:verification_mode` only distinguishes between - performing the default basic checks and skipping them via `:no_checks`. 
- - """ - @spec cached_download(String.t(), keyword()) :: {:ok, String.t()} | {:error, String.t()} - def cached_download(url, opts \\ []) do - cache_dir = opts[:cache_dir] || ElixirDatasets.cache_dir() - offline = Keyword.get(opts, :offline, elixir_datasets_offline?()) - auth_token = opts[:auth_token] - download_mode = opts[:download_mode] || :reuse_dataset_if_exists - verification_mode = opts[:verification_mode] || :basic_checks - - dir = Path.join(cache_dir, "huggingface") - - dir = - if cache_scope = opts[:cache_scope] do - Path.join(dir, cache_scope) - else - dir - end - - File.mkdir_p!(dir) - - headers = - if auth_token do - [{"Authorization", "Bearer " <> auth_token}] - else - [] - end - - metadata_path = Path.join(dir, metadata_filename(url)) - - # Handle force_redownload mode - delete cached files - if download_mode == :force_redownload do - File.rm(metadata_path) - end - - cond do - offline -> - case load_json(metadata_path) do - {:ok, %{"etag" => etag}} -> - entry_path = Path.join(dir, entry_filename(url, etag)) - - cond do - File.exists?(entry_path) -> - {:ok, entry_path} - - verification_mode == :no_checks -> - IO.warn( - "ElixirDatasets.HuggingFace.Hub.cached_download/2: " <> - "returning path to non-existent cached file in offline mode with " <> - ":no_checks verification_mode: #{entry_path}" - ) - - {:ok, entry_path} - - true -> - {:error, "cached file not found: #{entry_path}"} - end - - _ -> - {:error, - "could not find file in local cache and outgoing traffic is disabled, url: #{url}"} - end - - entry_path = opts[:etag] && cached_path_for_etag(dir, url, opts[:etag]) -> - {:ok, entry_path} - - true -> - with {:ok, etag, download_url, redirect?} <- head_download(url, headers) do - cached_entry = - if download_mode != :force_redownload, do: cached_path_for_etag(dir, url, etag) - - if cached_entry do - {:ok, cached_entry} - else - entry_path = Path.join(dir, entry_filename(url, etag)) - - headers = - if redirect? 
do - List.keydelete(headers, "Authorization", 0) - else - headers - end - - download_url - |> HTTP.download(entry_path, headers: headers) - |> finish_request(download_url) - |> case do - :ok -> - :ok = store_json(metadata_path, %{"etag" => etag, "url" => url}) - {:ok, entry_path} - - error -> - File.rm_rf!(metadata_path) - File.rm_rf!(entry_path) - error - end - end - end - end - end - - defp cached_path_for_etag(dir, url, etag) do - metadata_path = Path.join(dir, metadata_filename(url)) - - case load_json(metadata_path) do - {:ok, %{"etag" => ^etag}} -> - path = Path.join(dir, entry_filename(url, etag)) - - # Make sure the file exists, in case someone manually removed it - if File.exists?(path) do - path - end - - _ -> - nil - end - end - - defp head_download(url, headers) do - with {:ok, response} <- - HTTP.request(:head, url, follow_redirects: false, headers: headers) - |> finish_request(url) do - if response.status in 300..399 do - location = HTTP.get_header(response, "location") - - # Follow relative redirects - if URI.parse(location).host == nil do - url = - url - |> URI.parse() - |> Map.replace!(:path, location) - |> URI.to_string() - - head_download(url, headers) - else - with {:ok, etag} <- fetch_etag(response), do: {:ok, etag, location, true} - end - else - with {:ok, etag} <- fetch_etag(response), do: {:ok, etag, url, false} - end - end - end - - defp finish_request(:ok, _url), do: :ok - - defp finish_request({:ok, response}, _url) when response.status in 100..399, do: {:ok, response} - - defp finish_request({:ok, response}, url) do - case HTTP.get_header(response, "x-error-code") do - code when code == "RepoNotFound" or response.status == 401 -> - {:error, - "repository not found, url: #{url}. Please make sure you specified" <> - " the correct repository id. 
If you are trying to access a private" <> - " or gated repository, use an authentication token"} - - "EntryNotFound" -> - {:error, "file not found, url: #{url}"} - - "RevisionNotFound" -> - {:error, "revision not found, url: #{url}"} - - "GatedRepo" -> - {:error, - "cannot access gated repository, url: #{url}. Make sure to request access" <> - " for the repository and use an authentication token"} - - _ -> - {:error, "HTTP request failed with status #{response.status}, url: #{url}"} - end - end - - defp finish_request({:error, reason}, _url) do - {:error, "failed to make an HTTP request, reason: #{inspect(reason)}"} - end - - defp fetch_etag(response) do - if etag = HTTP.get_header(response, "x-linked-etag") || HTTP.get_header(response, "etag") do - {:ok, etag} - else - {:error, "no ETag found on the resource"} - end - end - - @doc """ - Gets the HuggingFace authentication token. Requires that it starts with "hf_". - - Looks for the token in the following order: - 1. From options (`:auth_token` key) - 2. From system environment variable (`HF_TOKEN`) - 3. 
Returns error if not found - - ## Parameters - - * `opts` - keyword list with optional `:auth_token` key - - ## Returns - - * `{:ok, String.t()}` - the authentication token - * `{:error, String.t()}` - if no token is found or invalid - - ## Examples - - iex> ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: "hf_my_token") - {:ok, "hf_my_token"} - - iex> ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: "my_invalid_token") - {:error, "The provided Hugging Face authentication token does not start with 'hf_'."} - - # iex> ElixirDatasets.HuggingFace.Hub.get_auth_token([]) - # the value of HF_TOKEN environment variable if valid else error - """ - @spec get_auth_token(keyword()) :: {:ok, String.t()} | {:error, String.t()} - def get_auth_token(opts \\ []) do - token = opts[:auth_token] || System.get_env("HF_TOKEN") - validate_auth_token(token) - end - - @spec validate_auth_token(String.t() | nil) :: {:ok, String.t()} | {:error, String.t()} - defp validate_auth_token(token) when is_binary(token) do - cond do - String.starts_with?(token, "hf_") -> - {:ok, token} - - true -> - {:error, "The provided Hugging Face authentication token does not start with 'hf_'."} - end - end - - defp validate_auth_token(_), do: {:error, "No Hugging Face authentication token provided."} - - defp metadata_filename(url) do - encode_url(url) <> ".json" - end - - defp entry_filename(url, etag) do - encode_url(url) <> "." 
<> encode_etag(etag) - end - - defp encode_url(url) do - url |> :erlang.md5() |> Base.encode32(case: :lower, padding: false) - end - - defp encode_etag(etag) do - Base.encode32(etag, case: :lower, padding: false) - end - - defp load_json(path) do - case File.read(path) do - {:ok, content} -> {:ok, Jason.decode!(content)} - _error -> :error - end - end - - defp store_json(path, data) do - json = Jason.encode!(data) - File.write(path, json) - end - - defp elixir_datasets_offline?() do - System.get_env("ELIXIR_DATASETS_OFFLINE") in ~w(1 true) - end -end diff --git a/test/elixir_datasets/utils/filter_test.exs b/test/elixir_datasets/utils/filter_test.exs new file mode 100644 index 0000000..99cfe75 --- /dev/null +++ b/test/elixir_datasets/utils/filter_test.exs @@ -0,0 +1,118 @@ +defmodule ElixirDatasets.FilterTest do + use ExUnit.Case, async: true + doctest ElixirDatasets.Filter + + alias ElixirDatasets.Filter + + describe "by_config_and_split/3" do + @sample_files %{ + "train.csv" => "etag1", + "test.csv" => "etag2", + "validation.csv" => "etag3", + "sst2/train.parquet" => "etag4", + "sst2/test.parquet" => "etag5", + "cola/train.parquet" => "etag6" + } + + test "returns all files when no filters applied" do + assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, nil, nil) + assert filtered == @sample_files + end + + test "filters by split name" do + assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, nil, "train") + assert map_size(filtered) == 3 + assert Map.has_key?(filtered, "train.csv") + assert Map.has_key?(filtered, "sst2/train.parquet") + assert Map.has_key?(filtered, "cola/train.parquet") + end + + test "filters by config name" do + assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, "sst2", nil) + assert map_size(filtered) == 2 + assert Map.has_key?(filtered, "sst2/train.parquet") + assert Map.has_key?(filtered, "sst2/test.parquet") + end + + test "filters by both config and split" do + assert {:ok, filtered} = 
Filter.by_config_and_split(@sample_files, "sst2", "train") + assert map_size(filtered) == 1 + assert Map.has_key?(filtered, "sst2/train.parquet") + end + + test "returns empty map when no matches" do + assert {:ok, filtered} = Filter.by_config_and_split(@sample_files, "nonexistent", nil) + assert map_size(filtered) == 0 + end + end + + describe "by_config_name/2" do + @sample_files %{ + "train.csv" => "etag1", + "sst2/train.parquet" => "etag2", + "cola/train.parquet" => "etag3" + } + + test "returns all files when config is nil" do + filtered = Filter.by_config_name(@sample_files, nil) + assert filtered == @sample_files + end + + test "filters files by config name" do + filtered = Filter.by_config_name(@sample_files, "sst2") + assert is_map(filtered) + assert map_size(filtered) == 1 + assert Map.has_key?(filtered, "sst2/train.parquet") + end + + test "works with list input" do + files_list = [{"train.csv", "etag1"}, {"sst2/train.parquet", "etag2"}] + filtered = Filter.by_config_name(files_list, "sst2") + assert is_list(filtered) + assert length(filtered) == 1 + assert {"sst2/train.parquet", "etag2"} in filtered + end + end + + describe "by_split/2" do + @sample_files %{ + "train.csv" => "etag1", + "test.csv" => "etag2", + "validation.csv" => "etag3" + } + + test "returns all files when split is nil" do + filtered = Filter.by_split(@sample_files, nil) + assert filtered == @sample_files + end + + test "filters files by split name" do + filtered = Filter.by_split(@sample_files, "train") + assert is_map(filtered) + assert map_size(filtered) == 1 + assert Map.has_key?(filtered, "train.csv") + end + + test "filters files with split in basename" do + files = %{ + "train-00000.parquet" => "etag1", + "test-00000.parquet" => "etag2", + "train-00001.parquet" => "etag3" + } + + filtered = Filter.by_split(files, "train") + assert map_size(filtered) == 2 + assert Map.has_key?(filtered, "train-00000.parquet") + assert Map.has_key?(filtered, "train-00001.parquet") + end + + test 
"works with list input" do + files_list = [{"train.csv", "etag1"}, {"test.csv", "etag2"}] + filtered = Filter.by_split(files_list, "train") + assert is_list(filtered) + assert length(filtered) == 1 + assert {"train.csv", "etag1"} in filtered + end + end +end + diff --git a/test/elixir_datasets/utils/info_test.exs b/test/elixir_datasets/utils/info_test.exs new file mode 100644 index 0000000..3ff9e56 --- /dev/null +++ b/test/elixir_datasets/utils/info_test.exs @@ -0,0 +1,92 @@ +defmodule ElixirDatasets.InfoTest do + use ExUnit.Case, async: true + doctest ElixirDatasets.Info + + alias ElixirDatasets.Info + + describe "get_dataset_info/2" do + test "fetches dataset info from Hugging Face API" do + assert {:ok, info} = Info.get_dataset_info("aaaaa32r/elixirDatasets") + assert is_map(info) + assert info["id"] == "aaaaa32r/elixirDatasets" + + assert is_map(info["cardData"]) + dataset_info = info["cardData"]["dataset_info"] + assert is_list(dataset_info) + + first_config = Enum.at(dataset_info, 0) + assert first_config["config_name"] == "csv" + assert is_list(first_config["features"]) + assert is_list(first_config["splits"]) + + first_split = Enum.at(first_config["splits"], 0) + assert first_split["num_examples"] == 10 + end + end + + describe "get_dataset_infos/2" do + test "fetches dataset infos as DatasetInfo structs" do + assert {:ok, infos} = Info.get_dataset_infos("aaaaa32r/elixirDatasets") + assert is_list(infos) + assert Enum.count(infos) > 0 + + first_info = Enum.at(infos, 0) + assert %ElixirDatasets.DatasetInfo{} = first_info + assert first_info.config_name == "csv" + assert is_list(first_info.features) + assert is_list(first_info.splits) + end + end + + describe "parse_dataset_infos/1" do + test "parses raw dataset info map into DatasetInfo structs" do + data = %{ + "cardData" => %{ + "dataset_info" => [ + %{ + "config_name" => "csv", + "features" => [%{"name" => "id", "dtype" => "int64"}], + "splits" => [%{"name" => "train", "num_examples" => 10}] + } + ] + } + 
} + + infos = Info.parse_dataset_infos(data) + assert is_list(infos) + assert Enum.count(infos) == 1 + + first_info = Enum.at(infos, 0) + assert %ElixirDatasets.DatasetInfo{} = first_info + assert first_info.config_name == "csv" + assert first_info.features == [%{"name" => "id", "dtype" => "int64"}] + assert first_info.splits == [%{"name" => "train", "num_examples" => 10}] + end + + test "handles missing dataset_info gracefully" do + data = %{"cardData" => %{}} + infos = Info.parse_dataset_infos(data) + assert infos == [] + end + end + + describe "get_dataset_split_names/2" do + test "fetches split names from dataset" do + assert {:ok, splits} = Info.get_dataset_split_names("aaaaa32r/elixirDatasets") + assert is_list(splits) + assert Enum.count(splits) > 0 + assert Enum.all?(splits, &is_binary/1) + end + end + + describe "get_dataset_config_names/2" do + test "fetches config names from dataset" do + assert {:ok, configs} = Info.get_dataset_config_names("aaaaa32r/elixirDatasets") + assert is_list(configs) + assert Enum.count(configs) > 0 + assert Enum.all?(configs, &is_binary/1) + assert Enum.member?(configs, "csv") + end + end +end + diff --git a/test/elixir_datasets/utils/loader_test.exs b/test/elixir_datasets/utils/loader_test.exs new file mode 100644 index 0000000..30c1edc --- /dev/null +++ b/test/elixir_datasets/utils/loader_test.exs @@ -0,0 +1,212 @@ +defmodule ElixirDatasets.LoaderTest do + use ExUnit.Case, async: false + doctest ElixirDatasets.Loader + + alias ElixirDatasets.Loader + + describe "load_spec/3" do + @cache_dir "test_cache_load_spec" + @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]} + @valid_repo_files %{ + "resources/csv-test.csv" => "\"2dccc814f47c01b5344abbb72367a5b322656b0b\"" + } + @invalid_repo_files %{"invalid.csv" => "\"1234567890asdfgh\""} + + test "loads valid files" do + assert {:ok, _paths} = Loader.load_spec(@repository, @valid_repo_files, 1) + File.rm_rf!(@cache_dir) + end + + test "returns error for invalid 
files" do + assert {:error, _reason} = Loader.load_spec(@repository, @invalid_repo_files, 1) + File.rm_rf!(@cache_dir) + end + + test "loads files with num_proc > 1" do + assert {:ok, paths} = Loader.load_spec(@repository, @valid_repo_files, 4) + assert is_list(paths) + File.rm_rf!(@cache_dir) + end + end + + describe "load_dataset/2" do + @cache_dir "test_cache_load_dataset" + @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]} + + setup do + on_exit(fn -> + File.rm_rf!(@cache_dir) + + File.rm_rf!( + :filename.basedir( + :user_cache, + "elixir_datasets" <> "/huggingface/aaaaa32r--elixirDatasets" + ) + ) + end) + end + + test "loads a dataset from Hugging Face" do + assert {:ok, datasets} = Loader.load_dataset(@repository) + assert is_list(datasets) + end + + test "loads a dataset from Hugging Face without opts" do + repository_short = {:hf, "aaaaa32r/elixirDatasets"} + assert {:ok, datasets} = Loader.load_dataset(repository_short) + assert is_list(datasets) + end + + test "loads a dataset from local directory" do + repository = {:local, "resources"} + assert {:ok, datasets} = Loader.load_dataset(repository) + assert is_list(datasets) + end + + test "raises error when invalid local directory" do + repository = {:local, "invalid/path"} + assert {:error, _reason} = Loader.load_dataset(repository) + end + + test "loads dataset offline" do + repository = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]} + assert {:ok, datasets} = Loader.load_dataset(repository) + assert is_list(datasets) + + repository_offline = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, offline: true]} + assert {:ok, datasets} = Loader.load_dataset(repository_offline) + assert is_list(datasets) + + repository_offline_invalid = {:hf, "not/exists", [cache_dir: @cache_dir, offline: true]} + assert {:error, _reason} = Loader.load_dataset(repository_offline_invalid) + end + + test "loads a dataset from Hugging Face with subdirectory" do + repository_subdir = + {:hf, 
"aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, subdir: "resources"]} + + assert {:ok, datasets} = Loader.load_dataset(repository_subdir) + assert is_list(datasets) + end + + test "returns error for non-existent dataset" do + repository = {:test, "nonexistent/repo", []} + + assert_raise ArgumentError, fn -> + Loader.load_dataset(repository) + end + end + + test "loads dataset with split parameter from local directory" do + repository = {:local, "resources"} + assert {:ok, datasets} = Loader.load_dataset(repository, split: "train") + assert is_list(datasets) + end + + test "loads dataset with name parameter filters files" do + repository = {:local, "resources"} + assert {:ok, datasets} = Loader.load_dataset(repository, name: "csv") + assert is_list(datasets) + end + + test "loads dataset with split and name parameters combined" do + repository = {:local, "resources"} + + assert {:ok, datasets} = + Loader.load_dataset(repository, split: "train", name: "csv") + + assert is_list(datasets) + end + + test "loads dataset with download_mode option" do + repository = {:local, "resources"} + + assert {:ok, datasets} = + Loader.load_dataset(repository, download_mode: :reuse_dataset_if_exists) + + assert is_list(datasets) + end + + test "loads dataset with verification_mode option" do + repository = {:local, "resources"} + + assert {:ok, datasets} = + Loader.load_dataset(repository, verification_mode: :no_checks) + + assert is_list(datasets) + end + + test "loads dataset with num_proc for parallel processing" do + repository = {:local, "resources"} + assert {:ok, datasets} = Loader.load_dataset(repository, num_proc: 2) + assert is_list(datasets) + assert length(datasets) > 0 + end + + test "loads dataset with num_proc=1 (sequential)" do + repository = {:local, "resources"} + assert {:ok, datasets} = Loader.load_dataset(repository, num_proc: 1) + assert is_list(datasets) + end + + test "num_proc=4 is faster than num_proc=1 for parallel loading" do + repository = @repository 
+ + {time_sequential, {:ok, datasets_seq}} = + :timer.tc(fn -> + Loader.load_dataset(repository, num_proc: 1) + end) + + {time_parallel, {:ok, datasets_par}} = + :timer.tc(fn -> + Loader.load_dataset(repository, num_proc: 4) + end) + + assert length(datasets_seq) == length(datasets_par) + + total_rows_seq = + Enum.reduce(datasets_seq, 0, fn df, acc -> + acc + Explorer.DataFrame.n_rows(df) + end) + + total_rows_par = + Enum.reduce(datasets_par, 0, fn df, acc -> + acc + Explorer.DataFrame.n_rows(df) + end) + + assert total_rows_seq == total_rows_par + + assert time_parallel <= time_sequential * 1.5, + "Parallel processing overhead should be reasonable for this dataset size" + end + + test "num_proc produces same results as sequential" do + repository = {:local, "resources"} + + {:ok, datasets_seq} = Loader.load_dataset(repository, num_proc: 1) + {:ok, datasets_par} = Loader.load_dataset(repository, num_proc: 4) + + assert length(datasets_seq) == length(datasets_par) + seq_row_counts = Enum.map(datasets_seq, &Explorer.DataFrame.n_rows/1) |> Enum.sort() + par_row_counts = Enum.map(datasets_par, &Explorer.DataFrame.n_rows/1) |> Enum.sort() + + assert seq_row_counts == par_row_counts + end + end + + describe "load_dataset!/2" do + test "loads dataset successfully" do + repository = {:local, "resources"} + datasets = Loader.load_dataset!(repository) + assert is_list(datasets) + end + + test "raises error on failure" do + repository = {:local, "invalid/path"} + + assert_raise RuntimeError, fn -> + Loader.load_dataset!(repository) + end + end + end +end diff --git a/test/elixir_datasets/utils/repository_test.exs b/test/elixir_datasets/utils/repository_test.exs new file mode 100644 index 0000000..5573c43 --- /dev/null +++ b/test/elixir_datasets/utils/repository_test.exs @@ -0,0 +1,64 @@ +defmodule ElixirDatasets.RepositoryTest do + use ExUnit.Case, async: true + doctest ElixirDatasets.Repository + + alias ElixirDatasets.Repository + + describe "normalize!/1" do + test 
"normalizes {:hf, repository_id} format" do + assert {:hf, "user/repo", []} = Repository.normalize!({:hf, "user/repo"}) + end + + test "normalizes {:hf, repository_id, opts} format" do + opts = [revision: "main", cache_dir: "/tmp"] + {:hf, "user/repo", normalized_opts} = Repository.normalize!({:hf, "user/repo", opts}) + assert Keyword.get(normalized_opts, :revision) == "main" + assert Keyword.get(normalized_opts, :cache_dir) == "/tmp" + end + + test "normalizes {:local, dir} format" do + assert {:local, "/path/to/dir"} = Repository.normalize!({:local, "/path/to/dir"}) + end + + test "raises error for invalid format" do + assert_raise ArgumentError, fn -> + Repository.normalize!({:invalid, "repo"}) + end + end + + test "raises error for invalid options" do + assert_raise ArgumentError, fn -> + Repository.normalize!({:hf, "user/repo", [invalid_opt: true]}) + end + end + end + + describe "get_files/1" do + test "gets files from local directory" do + repository = {:local, "resources"} + assert {:ok, files} = Repository.get_files(repository) + assert is_map(files) + assert map_size(files) > 0 + end + + test "returns error for non-existent local directory" do + repository = {:local, "non_existent_dir"} + assert {:error, _reason} = Repository.get_files(repository) + end + end + + describe "repository_id_to_cache_scope/1" do + test "converts repository ID to cache scope" do + assert "user--repo" = Repository.repository_id_to_cache_scope("user/repo") + end + + test "removes special characters" do + assert "user--repo-name" = Repository.repository_id_to_cache_scope("user/repo-name") + end + + test "handles underscores" do + assert "user--repo_name" = Repository.repository_id_to_cache_scope("user/repo_name") + end + end +end + diff --git a/test/elixir_datasets/utils/streaming_test.exs b/test/elixir_datasets/utils/streaming_test.exs new file mode 100644 index 0000000..506cf7c --- /dev/null +++ b/test/elixir_datasets/utils/streaming_test.exs @@ -0,0 +1,139 @@ +defmodule 
ElixirDatasets.StreamingTest do + use ExUnit.Case, async: false + doctest ElixirDatasets.Streaming + + alias ElixirDatasets.Loader + + describe "streaming mode" do + @cache_dir "test_cache_streaming" + @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]} + + setup do + on_exit(fn -> + File.rm_rf!(@cache_dir) + + File.rm_rf!( + :filename.basedir( + :user_cache, + "elixir_datasets" <> "/huggingface/aaaaa32r--elixirDatasets" + ) + ) + end) + end + + test "loads dataset with streaming parameter returns Stream" do + repository = {:local, "resources"} + assert {:ok, stream} = Loader.load_dataset(repository, streaming: true) + + assert is_function(stream, 2), "Expected a Stream (function/2)" + + rows = stream |> Enum.take(5) + assert is_list(rows) + assert Enum.all?(rows, &is_map/1), "Each row should be a map" + end + + test "streaming mode fetches data progressively" do + repository = {:local, "resources"} + assert {:ok, stream} = Loader.load_dataset(repository, streaming: true) + + rows = stream |> Enum.take(3) + assert length(rows) <= 3 + assert Enum.all?(rows, &is_map/1) + end + + test "streaming with custom batch_size" do + repository = {:local, "resources"} + + assert {:ok, stream} = + Loader.load_dataset( + repository, + streaming: true, + batch_size: 2 + ) + + rows = stream |> Enum.take(5) + assert is_list(rows) + end + + test "streaming is lazy - data fetched on demand, not upfront" do + repository = {:local, "resources"} + + {:ok, stream} = Loader.load_dataset(repository, streaming: true) + + rows1 = stream |> Enum.take(3) + assert length(rows1) == 3 + + Process.sleep(2000) + + rows2 = stream |> Enum.take(5) + assert length(rows2) == 5 + + fetch_count = :counters.new(1, [:atomics]) + + counted_stream = + stream + |> Stream.map(fn row -> + :counters.add(fetch_count, 1, 1) + row + end) + + _small_batch = counted_stream |> Enum.take(2) + count_after_2 = :counters.get(fetch_count, 1) + + :counters.put(fetch_count, 1, 0) + + _large_batch = 
counted_stream |> Enum.take(10) + count_after_10 = :counters.get(fetch_count, 1) + + assert count_after_2 <= 5, "Should fetch minimal rows for small take" + assert count_after_10 >= 8, "Should fetch more rows for larger take" + end + + test "streaming from HuggingFace demonstrates progressive fetching" do + repository = @repository + + {:ok, stream} = Loader.load_dataset(repository, streaming: true, batch_size: 5) + + rows1 = stream |> Enum.take(3) + assert length(rows1) == 3 + + Process.sleep(1000) + + rows2 = stream |> Enum.take(8) + assert length(rows2) == 8 + + result = + stream + |> Stream.filter(fn row -> Map.has_key?(row, "id") end) + |> Stream.take(5) + |> Enum.to_list() + + assert length(result) <= 5 + end + + test "verification_mode works with streaming" do + repository = @repository + + {:ok, stream1} = + Loader.load_dataset( + repository, + streaming: true, + verification_mode: :basic_checks + ) + + rows1 = stream1 |> Enum.take(2) + assert length(rows1) == 2 + + {:ok, stream2} = + Loader.load_dataset( + repository, + streaming: true, + verification_mode: :no_checks + ) + + rows2 = stream2 |> Enum.take(2) + assert length(rows2) == 2 + end + end +end + diff --git a/test/elixir_datasets_test.exs b/test/elixir_datasets_test.exs index 4399d9e..ee7b547 100644 --- a/test/elixir_datasets_test.exs +++ b/test/elixir_datasets_test.exs @@ -10,42 +10,9 @@ defmodule ElixirDatasetsTest do assert Code.ensure_loaded?(ElixirDatasets) end - describe "do_load_spec/2" do - @cache_dir "test_cache_do_load_spec" - @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]} - @valid_repo_files %{ - "resources/csv-test.csv" => "\"2dccc814f47c01b5344abbb72367a5b322656b0b\"" - } - @invalid_repo_files %{"invalid.csv" => "\"1234567890asdfgh\""} - - test "Loads valid files" do - assert {:ok, _paths} = ElixirDatasets.do_load_spec(@repository, @valid_repo_files, 1) - File.rm_rf!(@cache_dir) - end - - test "Return error for invalid files" do - assert {:error, _reason} = 
ElixirDatasets.do_load_spec(@repository, @invalid_repo_files, 1) - - File.rm_rf!(@cache_dir) - end - end - - describe "decode_config/1" do - test "Decodes a valid JSON file" do - File.write!("valid.json", ~s({"key": "value"})) - assert {:ok, %{"key" => "value"}} = ElixirDatasets.decode_config("valid.json") - File.rm!("valid.json") - end - - test "Fails to decode JSON file" do - File.write!("invalid.json", "{invalid_json}") - assert {:error, _} = ElixirDatasets.decode_config("invalid.json") - File.rm!("invalid.json") - end - end - - describe "load_dataset/2" do - @cache_dir "test_cache_load_dataset" + # Integration tests for public API + describe "load_dataset/2 - Public API Integration Tests" do + @cache_dir "test_cache_integration" @repository {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]} setup do @@ -66,332 +33,27 @@ defmodule ElixirDatasetsTest do assert is_list(datasets) end - test "loads a dataset from Hugging Face without opts" do - repository_short = {:hf, "aaaaa32r/elixirDatasets"} - assert {:ok, datasets} = ElixirDatasets.load_dataset(repository_short) - assert is_list(datasets) - end - test "loads a dataset from local directory" do repository = {:local, "resources"} assert {:ok, datasets} = ElixirDatasets.load_dataset(repository) assert is_list(datasets) end + end - test "raise error when invalid local directory" do - repository = {:local, "invalid/path"} - assert {:error, _reason} = ElixirDatasets.load_dataset(repository) - end - - test "loads dataset offline" do - repository = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir]} - assert {:ok, datasets} = ElixirDatasets.load_dataset(repository) - assert is_list(datasets) - repositoryOffline = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, offline: true]} - assert {:ok, datasets} = ElixirDatasets.load_dataset(repositoryOffline) - assert is_list(datasets) - repositoryOfflineInvalid = {:hf, "not/exists", [cache_dir: @cache_dir, offline: true]} - assert {:error, _reason} = 
ElixirDatasets.load_dataset(repositoryOfflineInvalid) - end - - test "loads a dataset from Hugging Face with subdirectory" do - repositorySubdir = - {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, subdir: "resources"]} - - assert {:ok, datasets} = ElixirDatasets.load_dataset(repositorySubdir) - assert is_list(datasets) - end - - test "returns error for non-existent dataset" do - repository = {:test, "nonexistent/repo", []} - - assert_raise ArgumentError, fn -> - ElixirDatasets.load_dataset(repository) - end - end - - test "loads dataset with split parameter from local directory" do - repository = {:local, "resources"} - assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, split: "train") - assert is_list(datasets) - end - - test "loads dataset with name parameter filters files" do - repository = {:local, "resources"} - assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, name: "csv") - assert is_list(datasets) - end - - test "loads dataset with streaming parameter returns Stream" do - repository = {:local, "resources"} - assert {:ok, stream} = ElixirDatasets.load_dataset(repository, streaming: true) - - assert is_function(stream, 2), "Expected a Stream (function/2)" - - rows = stream |> Enum.take(5) - assert is_list(rows) - assert Enum.all?(rows, &is_map/1), "Each row should be a map" - end - - test "streaming mode fetches data progressively" do - repository = {:local, "resources"} - assert {:ok, stream} = ElixirDatasets.load_dataset(repository, streaming: true) - - rows = stream |> Enum.take(3) - assert length(rows) <= 3 - assert Enum.all?(rows, &is_map/1) - end - - test "streaming with custom batch_size" do - repository = {:local, "resources"} - - assert {:ok, stream} = - ElixirDatasets.load_dataset( - repository, - streaming: true, - batch_size: 2 - ) - - rows = stream |> Enum.take(5) - assert is_list(rows) - end - - test "streaming is lazy - data fetched on demand, not upfront" do - repository = {:local, "resources"} - - {:ok, 
stream} = ElixirDatasets.load_dataset(repository, streaming: true) - - IO.puts("\n πŸ” Testing lazy streaming behavior:") - - IO.puts(" 1. Fetching first 3 rows...") - - {time1, rows1} = - :timer.tc(fn -> - stream |> Enum.take(3) - end) - - IO.puts(" βœ“ Got #{length(rows1)} rows in #{time1 / 1000}ms") - assert length(rows1) == 3 - - IO.puts(" 2. Waiting 2 seconds...") - Process.sleep(2000) - - IO.puts(" 3. Fetching 5 rows from same stream...") - - {time2, rows2} = - :timer.tc(fn -> - stream |> Enum.take(5) - end) - - IO.puts(" βœ“ Got #{length(rows2)} rows in #{time2 / 1000}ms") - assert length(rows2) == 5 - - IO.puts(" 4. Key insight: Stream is reusable, each Enum.take starts fresh") - - IO.puts(" 5. Demonstrating progressive fetching...") - - fetch_count = :counters.new(1, [:atomics]) - - counted_stream = - stream - |> Stream.map(fn row -> - :counters.add(fetch_count, 1, 1) - row - end) - - IO.puts(" Taking 2 rows...") - _small_batch = counted_stream |> Enum.take(2) - count_after_2 = :counters.get(fetch_count, 1) - IO.puts(" βœ“ Fetched #{count_after_2} rows (should be ~2)") - - :counters.put(fetch_count, 1, 0) - - IO.puts(" Taking 10 rows...") - _large_batch = counted_stream |> Enum.take(10) - count_after_10 = :counters.get(fetch_count, 1) - IO.puts(" βœ“ Fetched #{count_after_10} rows (should be ~10)") - - assert count_after_2 <= 5, "Should fetch minimal rows for small take" - assert count_after_10 >= 8, "Should fetch more rows for larger take" - - IO.puts(" βœ… Streaming is truly lazy - fetches only what's needed!") - end - - test "streaming from HuggingFace demonstrates progressive fetching" do - repository = @repository - - IO.puts("\n 🌐 Testing HuggingFace streaming:") - - {:ok, stream} = ElixirDatasets.load_dataset(repository, streaming: true, batch_size: 5) - IO.puts(" βœ“ Created stream (no data downloaded yet)") - - IO.puts(" 1. 
Fetching only 3 rows...") - - {time1, rows1} = - :timer.tc(fn -> - stream |> Enum.take(3) - end) - - IO.puts(" βœ“ Got #{length(rows1)} rows in #{Float.round(time1 / 1000, 2)}ms") - assert length(rows1) == 3 - - IO.puts(" 2. Waiting 1 second...") - Process.sleep(1000) - - IO.puts(" 3. Fetching 8 rows from same stream...") - - {time2, rows2} = - :timer.tc(fn -> - stream |> Enum.take(8) - end) - - IO.puts(" βœ“ Got #{length(rows2)} rows in #{Float.round(time2 / 1000, 2)}ms") - assert length(rows2) == 8 - - IO.puts(" 4. Processing with Stream operations (lazy)...") - - result = - stream - |> Stream.filter(fn row -> Map.has_key?(row, "id") end) - |> Stream.take(5) - |> Enum.to_list() - - IO.puts(" βœ“ Processed and got #{length(result)} filtered rows") - assert length(result) <= 5 - - IO.puts(" βœ… HuggingFace streaming works progressively!") - end - - test "verification_mode works with streaming" do - repository = @repository - - IO.puts("\n πŸ” Testing verification_mode with streaming:") - - IO.puts(" 1. With verification_mode: :basic_checks (default)...") - - {:ok, stream1} = - ElixirDatasets.load_dataset( - repository, - streaming: true, - verification_mode: :basic_checks - ) - - rows1 = stream1 |> Enum.take(2) - IO.puts(" βœ“ Got #{length(rows1)} rows") - assert length(rows1) == 2 - - IO.puts(" 2. 
With verification_mode: :no_checks...") - - {:ok, stream2} = - ElixirDatasets.load_dataset( - repository, - streaming: true, - verification_mode: :no_checks - ) - - rows2 = stream2 |> Enum.take(2) - IO.puts(" βœ“ Got #{length(rows2)} rows") - assert length(rows2) == 2 - - IO.puts(" ℹ️ Note: verification_mode applies to metadata fetching,") - IO.puts(" not to the streaming data itself (which comes from URLs)") - IO.puts(" βœ… verification_mode works with streaming!") - end - - test "loads dataset with split and name parameters combined" do - repository = {:local, "resources"} - - assert {:ok, datasets} = - ElixirDatasets.load_dataset(repository, split: "train", name: "csv") - - assert is_list(datasets) - end - - test "loads dataset with download_mode option" do - repository = {:local, "resources"} - - assert {:ok, datasets} = - ElixirDatasets.load_dataset(repository, download_mode: :reuse_dataset_if_exists) - - assert is_list(datasets) - end - - test "loads dataset with verification_mode option" do - repository = {:local, "resources"} - - assert {:ok, datasets} = - ElixirDatasets.load_dataset(repository, verification_mode: :no_checks) - - assert is_list(datasets) - end - - test "loads dataset with num_proc for parallel processing" do - repository = {:local, "resources"} - assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, num_proc: 2) - assert is_list(datasets) - assert length(datasets) > 0 - end - - test "loads dataset with num_proc=1 (sequential)" do + describe "load_dataset!/2 - Public API" do + test "loads dataset successfully" do repository = {:local, "resources"} - assert {:ok, datasets} = ElixirDatasets.load_dataset(repository, num_proc: 1) + datasets = ElixirDatasets.load_dataset!(repository) assert is_list(datasets) end - test "num_proc=4 is faster than num_proc=1 for parallel loading" do - repository = @repository - - {time_sequential, {:ok, datasets_seq}} = - :timer.tc(fn -> - ElixirDatasets.load_dataset(repository, num_proc: 1) - end) - - 
{time_parallel, {:ok, datasets_par}} = - :timer.tc(fn -> - ElixirDatasets.load_dataset(repository, num_proc: 4) - end) - - assert length(datasets_seq) == length(datasets_par) - - total_rows_seq = - Enum.reduce(datasets_seq, 0, fn df, acc -> - acc + Explorer.DataFrame.n_rows(df) - end) - - total_rows_par = - Enum.reduce(datasets_par, 0, fn df, acc -> - acc + Explorer.DataFrame.n_rows(df) - end) - - assert total_rows_seq == total_rows_par - - time_seq_sec = time_sequential / 1_000_000 - time_par_sec = time_parallel / 1_000_000 - speedup = time_sequential / time_parallel - - IO.puts("\n ⏱️ Performance Comparison:") - IO.puts(" Sequential (num_proc: 1): #{Float.round(time_seq_sec, 3)}s") - IO.puts(" Parallel (num_proc: 4): #{Float.round(time_par_sec, 3)}s") - IO.puts(" Speedup: #{Float.round(speedup, 2)}x") - - assert time_parallel <= time_sequential * 1.5, - "Parallel processing overhead should be reasonable for this dataset size (no more than 1.5x slower than sequential)" - end - - test "num_proc produces same results as sequential" do - repository = {:local, "resources"} - - {:ok, datasets_seq} = ElixirDatasets.load_dataset(repository, num_proc: 1) - {:ok, datasets_par} = ElixirDatasets.load_dataset(repository, num_proc: 4) - - assert length(datasets_seq) == length(datasets_par) - seq_row_counts = Enum.map(datasets_seq, &Explorer.DataFrame.n_rows/1) |> Enum.sort() - par_row_counts = Enum.map(datasets_par, &Explorer.DataFrame.n_rows/1) |> Enum.sort() + test "raises error on failure" do + repository = {:local, "invalid/path"} - assert seq_row_counts == par_row_counts + assert_raise RuntimeError, fn -> + ElixirDatasets.load_dataset!(repository) + end end - - # todo more tests for load_dataset/2 end describe "cache_dir/0" do @@ -421,87 +83,36 @@ defmodule ElixirDatasetsTest do end end - describe "get_dataset_info/2" do + # Public API tests for dataset info functions + describe "get_dataset_info/2 - Public API" do test "fetches dataset info from Hugging Face API" do 
assert {:ok, info} = ElixirDatasets.get_dataset_info("aaaaa32r/elixirDatasets") assert is_map(info) assert info["id"] == "aaaaa32r/elixirDatasets" - - assert is_map(info["cardData"]) - dataset_info = info["cardData"]["dataset_info"] - assert is_list(dataset_info) - - first_config = Enum.at(dataset_info, 0) - assert first_config["config_name"] == "csv" - assert is_list(first_config["features"]) - assert is_list(first_config["splits"]) - - first_split = Enum.at(first_config["splits"], 0) - assert first_split["num_examples"] == 10 end end - describe "get_dataset_infos/2" do + describe "get_dataset_infos/2 - Public API" do test "fetches dataset infos as DatasetInfo structs" do assert {:ok, infos} = ElixirDatasets.get_dataset_infos("aaaaa32r/elixirDatasets") assert is_list(infos) assert Enum.count(infos) > 0 - - first_info = Enum.at(infos, 0) - assert %ElixirDatasets.DatasetInfo{} = first_info - assert first_info.config_name == "csv" - assert is_list(first_info.features) - assert is_list(first_info.splits) - end - end - - describe "parse_dataset_infos/1" do - test "parses raw dataset info map into DatasetInfo structs" do - data = %{ - "cardData" => %{ - "dataset_info" => [ - %{ - "config_name" => "csv", - "features" => [%{"name" => "id", "dtype" => "int64"}], - "splits" => [%{"name" => "train", "num_examples" => 10}] - } - ] - } - } - - infos = ElixirDatasets.parse_dataset_infos(data) - assert is_list(infos) - assert Enum.count(infos) == 1 - - first_info = Enum.at(infos, 0) - assert %ElixirDatasets.DatasetInfo{} = first_info - assert first_info.config_name == "csv" - assert first_info.features == [%{"name" => "id", "dtype" => "int64"}] - assert first_info.splits == [%{"name" => "train", "num_examples" => 10}] - end - - test "handles missing dataset_info gracefully" do - data = %{"cardData" => %{}} - infos = ElixirDatasets.parse_dataset_infos(data) - assert infos == [] end end - describe "get_dataset_split_names/2" do + describe "get_dataset_split_names/2 - Public API" 
do test "fetches split names from dataset" do assert {:ok, splits} = ElixirDatasets.get_dataset_split_names("aaaaa32r/elixirDatasets") assert is_list(splits) assert Enum.count(splits) > 0 - assert Enum.all?(splits, &is_binary/1) end end - describe "get_dataset_config_names/2" do + describe "get_dataset_config_names/2 - Public API" do test "fetches config names from dataset" do assert {:ok, configs} = ElixirDatasets.get_dataset_config_names("aaaaa32r/elixirDatasets") assert is_list(configs) assert Enum.count(configs) > 0 - assert Enum.all?(configs, &is_binary/1) assert Enum.member?(configs, "csv") end end diff --git a/test/huggingface/hub_test.exs b/test/huggingface/hub_test.exs deleted file mode 100644 index 0e86eff..0000000 --- a/test/huggingface/hub_test.exs +++ /dev/null @@ -1,477 +0,0 @@ -defmodule ElixirDatasets.HuggingFace.HubTest do - use ExUnit.Case, async: true - - doctest ElixirDatasets.HuggingFace.Hub - - describe "file_url/3" do - @repository_id "test-user/test-repo" - @filename "test-file.txt" - - test "returns correct URL with no revision" do - revision = nil - - expected_url = - "https://huggingface.co/datasets/test-user/test-repo/resolve/main/test-file.txt" - - assert ElixirDatasets.HuggingFace.Hub.file_url(@repository_id, @filename, revision) == - expected_url - end - - test "returns correct URL with revision" do - revision = "test-revision" - - expected_url = - "https://huggingface.co/datasets/test-user/test-repo/resolve/test-revision/test-file.txt" - - assert ElixirDatasets.HuggingFace.Hub.file_url(@repository_id, @filename, revision) == - expected_url - end - end - - describe "file_listing_url/3" do - @repository_id "test-user/test-repo" - test "returns correct URL with no subdir and no revision" do - subdir = nil - revision = nil - - expected_url = - "https://huggingface.co/api/datasets/test-user/test-repo/tree/main" - - assert ElixirDatasets.HuggingFace.Hub.file_listing_url(@repository_id, subdir, revision) == - expected_url - end - - test 
"returns correct URL with subdir and revision" do - subdir = "test-subdir/test-subdir2" - revision = "test-revision" - - expected_url = - "https://huggingface.co/api/datasets/test-user/test-repo/tree/test-revision/test-subdir/test-subdir2" - - assert ElixirDatasets.HuggingFace.Hub.file_listing_url(@repository_id, subdir, revision) == - expected_url - end - end - - describe "dataset_info_url/1" do - @repository_id "test-user/test-repo" - - test "returns correct URL for dataset info API" do - expected_url = "https://huggingface.co/api/datasets/test-user/test-repo" - - assert ElixirDatasets.HuggingFace.Hub.dataset_info_url(@repository_id) == expected_url - end - end - - describe "cached_download/2" do - @url "https://huggingface.co/datasets/aaaaa32r/elixirDatasets" - @url_redirect "https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT/resolve/main/medical_o1_sft_Chinese.json" - @cache_dir "test_cache_dir_cached_download" - @cache_scope "test_cache_scope" - @opts [cache_dir: @cache_dir, cache_scope: @cache_scope] - - test "No cache_scope" do - assert {:ok, _path} = - ElixirDatasets.HuggingFace.Hub.cached_download(@url, cache_dir: @cache_dir) - - # Clean up - File.rm_rf!(@cache_dir) - end - - test "With cache_scope" do - File.mkdir_p!(@cache_dir) - - assert {:ok, _path} = ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) - - # Clean up - File.rm_rf!(@cache_dir) - end - - test "with cache_scope, redirect" do - File.mkdir_p!(@cache_dir) - - assert {:ok, _path} = ElixirDatasets.HuggingFace.Hub.cached_download(@url_redirect, @opts) - - # Clean up - File.rm_rf!(@cache_dir) - end - - test "with download_mode: :force_redownload" do - File.mkdir_p!(@cache_dir) - - assert {:ok, path1} = ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) - assert File.exists?(path1) - - assert {:ok, path2} = - ElixirDatasets.HuggingFace.Hub.cached_download( - @url, - @opts ++ [download_mode: :force_redownload] - ) - - assert File.exists?(path2) - assert 
String.contains?(path1, @cache_dir) - assert String.contains?(path2, @cache_dir) - - File.rm_rf!(@cache_dir) - end - - test "with verification_mode: :no_checks" do - File.mkdir_p!(@cache_dir) - - assert {:ok, _path} = - ElixirDatasets.HuggingFace.Hub.cached_download( - @url, - @opts ++ [verification_mode: :no_checks] - ) - - File.rm_rf!(@cache_dir) - end - - test "verification_mode: :no_checks skips file existence check in offline mode" do - File.mkdir_p!(@cache_dir) - - assert {:ok, cached_path} = - ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) - - assert File.exists?(cached_path) - - File.rm!(cached_path) - refute File.exists?(cached_path) - - assert {:error, error_msg} = - ElixirDatasets.HuggingFace.Hub.cached_download( - @url, - @opts ++ [offline: true, verification_mode: :basic_checks] - ) - - assert error_msg =~ "cached file not found" - - assert {:ok, returned_path} = - ElixirDatasets.HuggingFace.Hub.cached_download( - @url, - @opts ++ [offline: true, verification_mode: :no_checks] - ) - - assert returned_path == cached_path - refute File.exists?(returned_path) - - File.rm_rf!(@cache_dir) - end - - test "verification_mode: :basic_checks fails when cached file is missing" do - File.mkdir_p!(@cache_dir) - - assert {:ok, cached_path} = - ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) - - assert File.exists?(cached_path) - - File.rm!(cached_path) - - assert {:error, error_msg} = - ElixirDatasets.HuggingFace.Hub.cached_download( - @url, - @opts ++ [offline: true, verification_mode: :basic_checks] - ) - - assert error_msg =~ "cached file not found" - - File.rm_rf!(@cache_dir) - end - - test "verification_mode comparison: :basic_checks vs :no_checks" do - File.mkdir_p!(@cache_dir) - - {:ok, cached_path} = ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) - File.rm!(cached_path) - - IO.puts("\n πŸ” Testing verification_mode behavior:") - IO.puts(" Cache file deleted: #{cached_path}") - - IO.puts("\n 1. 
With verification_mode: :basic_checks (offline)") - - result_basic = - ElixirDatasets.HuggingFace.Hub.cached_download( - @url, - @opts ++ [offline: true, verification_mode: :basic_checks] - ) - - case result_basic do - {:error, msg} -> - IO.puts(" βœ“ Failed as expected: #{msg}") - assert msg =~ "cached file not found" - - {:ok, _} -> - IO.puts(" βœ— Should have failed!") - flunk("Expected :basic_checks to fail with missing file") - end - - IO.puts("\n 2. With verification_mode: :no_checks (offline)") - - result_no_checks = - ElixirDatasets.HuggingFace.Hub.cached_download( - @url, - @opts ++ [offline: true, verification_mode: :no_checks] - ) - - case result_no_checks do - {:ok, path} -> - IO.puts(" βœ“ Succeeded (returns path without checking)") - IO.puts(" βœ“ Returned path: #{path}") - IO.puts(" βœ“ File exists? #{File.exists?(path)}") - assert path == cached_path - refute File.exists?(path) - - {:error, msg} -> - IO.puts(" βœ— Should have succeeded!") - flunk("Expected :no_checks to succeed, got error: #{msg}") - end - - IO.puts("\n βœ… verification_mode works correctly!") - IO.puts(" :basic_checks = validates file exists") - IO.puts(" :no_checks = skips validation (faster but risky)") - - File.rm_rf!(@cache_dir) - end - end - - describe "cached_path_for_etag/3" do - @dir "test_cache_dir_cached_path_for_etag" - @url "https://example.com/test-file.txt" - @etag "1234567890abcdef" - @fileContent "jrdifprgyy26hfylusnlbth2ie.gezdgnbvgy3tqojqmfrggzdfmy" - @fileJson "jrdifprgyy26hfylusnlbth2ie.json" - - test "returns cached path for known etag" do - File.mkdir_p!(@dir) - File.write!(Path.join(@dir, @fileContent), "test content") - File.write!(Path.join(@dir, @fileJson), Jason.encode!(%{"etag" => @etag})) - expected_path = @dir <> "/" <> @fileContent - - assert ElixirDatasets.HuggingFace.Hub.cached_path_for_etag(@dir, @url, @etag) == - expected_path - - # Clean up - File.rm!(Path.join(@dir, @fileContent)) - File.rm!(Path.join(@dir, @fileJson)) - File.rmdir!(@dir) - end 
- - test "returns nil for invalid etag" do - File.mkdir_p!(@dir) - File.write!(Path.join(@dir, @fileJson), Jason.encode!(%{"etag" => "invalid-etag"})) - - assert ElixirDatasets.HuggingFace.Hub.cached_path_for_etag(@dir, @url, @etag) == nil - - # Clean up - File.rm!(Path.join(@dir, @fileJson)) - File.rmdir!(@dir) - end - end - - describe "head_download/2" do - @url "https://huggingface.co/datasets/aaaaa32r/elixirDatasets" - @url_redirect "https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT/resolve/main/medical_o1_sft_Chinese.json" - # @urlNilHost "http://localhost:32123/sessions/7xre6dqd37a6olsi4dmdddndzz6te5cdimmshjblbbsot2cg" # This URL is not valid for testing, as it does not exist outside of my local environment - @headers [{"Content-Type", "application/json"}] - - test "returns :ok with valid response, without redirection" do - assert {:ok, _etag, @url, false} = - ElixirDatasets.HuggingFace.Hub.head_download(@url, @headers) - end - - test "returns :ok with valid response, with redirection" do - assert {:ok, _etag, _url_redirect, true} = - ElixirDatasets.HuggingFace.Hub.head_download(@url_redirect, @headers) - end - - # test "returns :error, when host location is nil" do # todo - # assert {:error, _reason} = - # ElixirDatasets.HuggingFace.Hub.head_download(@urlNilHost, @headers) - # end - end - - describe "finish_request" do - test "response is :ok" do - assert ElixirDatasets.HuggingFace.Hub.finish_request(:ok, @url) == :ok - end - - test "response is :ok, status in 100..399" do - response = {:ok, %{status: 200}} - assert ElixirDatasets.HuggingFace.Hub.finish_request(response, @url) == response - end - - test "response is :ok, status is out 100..399" do - responses = [ - {:ok, %{status: 404, headers: [{"x-error-code", "RepoNotFound"}]}}, - {:ok, %{status: 404, headers: [{"x-error-code", "GatedRepo"}]}}, - {:ok, %{status: 404, headers: [{"x-error-code", "OtherError"}]}}, - {:ok, %{status: 500, headers: [{"x-error-code", 
"EntryNotFound"}]}}, - {:ok, %{status: 500, headers: [{"x-error-code", "RevisionNotFound"}]}} - ] - - Enum.each(responses, fn response -> - assert {:error, _} = ElixirDatasets.HuggingFace.Hub.finish_request(response, @url) - end) - end - - test "response is error" do - response = {:error, "test-error"} - - assert ElixirDatasets.HuggingFace.Hub.finish_request(response, @url) == - {:error, "failed to make an HTTP request, reason: \"test-error\""} - end - end - - describe "fetch_etag/1" do - test "when etag is present" do - response = %{ - status: 200, - headers: [{"Content-Type", "application/json"}, {"etag", "1234567890abcdef"}], - body: "{}" - } - - assert ElixirDatasets.HuggingFace.Hub.fetch_etag(response) == - {:ok, "1234567890abcdef"} - end - - test "when etag is not present" do - response = %{ - status: 200, - headers: [{"Content-Type", "application/json"}], - body: "{}" - } - - assert ElixirDatasets.HuggingFace.Hub.fetch_etag(response) == - {:error, "no ETag found on the resource"} - end - end - - describe "metadata_filename/1" do - @url "https://example.com/test-file.txt" - - test "generates correct metadata filename from URL" do - expected_filename = "jrdifprgyy26hfylusnlbth2ie.json" - - assert ElixirDatasets.HuggingFace.Hub.metadata_filename(@url) == expected_filename - end - end - - describe "entry_filename/2, encode_url/1, encode_etag/1" do - test "generates correct filenames based on URL and ETag" do - etag = "1234567890abcdef" - - expected_entry_filename = "jrdifprgyy26hfylusnlbth2ie.gezdgnbvgy3tqojqmfrggzdfmy" - - assert ElixirDatasets.HuggingFace.Hub.entry_filename(@url, etag) == - expected_entry_filename - end - end - - describe "store_json/2, load_json/1" do - @data %{"key" => "value"} - test "stores JSON data to a file and loads it back" do - path = "test_data.json" - - assert ElixirDatasets.HuggingFace.Hub.store_json(path, @data) == :ok - assert File.exists?(path) - - assert ElixirDatasets.HuggingFace.Hub.load_json(path) == {:ok, @data} - - # 
Clean up - File.rm!(path) - end - - test "returns error when unable to write to file and returns error when trying to load" do - path = "/invalid/path/test_data.json" - - assert ElixirDatasets.HuggingFace.Hub.store_json(path, @data) == - {:error, :enoent} - - assert ElixirDatasets.HuggingFace.Hub.load_json(path) == :error - end - end - - describe "elixirDatasets_offline?/0" do - test "returns true when ELIXIR_DATASETS_OFFLINE is set to '1'" do - System.put_env("ELIXIR_DATASETS_OFFLINE", "1") - assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == true - System.delete_env("ELIXIR_DATASETS_OFFLINE") - end - - test "returns true when ELIXIR_DATASETS_OFFLINE is set to 'true'" do - System.put_env("ELIXIR_DATASETS_OFFLINE", "true") - assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == true - System.delete_env("ELIXIR_DATASETS_OFFLINE") - end - - test "returns false when ELIXIR_DATASETS_OFFLINE is not set" do - assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == false - end - - test "returns false when ELIXIR_DATASETS_OFFLINE is set to '0'" do - System.put_env("ELIXIR_DATASETS_OFFLINE", "0") - assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == false - System.delete_env("ELIXIR_DATASETS_OFFLINE") - end - - test "returns false when ELIXIR_DATASETS_OFFLINE is set to 'false'" do - System.put_env("ELIXIR_DATASETS_OFFLINE", "false") - assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == false - System.delete_env("ELIXIR_DATASETS_OFFLINE") - end - end - - describe "get_auth_token/1" do - setup do - # Save the current HF_TOKEN env var if it exists - original_token = System.get_env("HF_TOKEN") - System.delete_env("HF_TOKEN") - - on_exit(fn -> - if original_token do - System.put_env("HF_TOKEN", original_token) - else - System.delete_env("HF_TOKEN") - end - end) - - :ok - end - - test "returns token from options when provided" do - System.delete_env("HF_TOKEN") - token = "hf_test_token_from_options" - 
assert ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: token) == {:ok, token} - end - - test "returns token from environment variable when not in options" do - env_token = "hf_test_token_from_env" - System.put_env("HF_TOKEN", env_token) - assert ElixirDatasets.HuggingFace.Hub.get_auth_token([]) == {:ok, env_token} - end - - test "prioritizes options token over environment variable" do - options_token = "hf_token_from_options" - env_token = "hf_token_from_env" - System.put_env("HF_TOKEN", env_token) - - assert ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: options_token) == - {:ok, options_token} - end - - test "returns error when no token provided and env var not set" do - System.delete_env("HF_TOKEN") - - assert ElixirDatasets.HuggingFace.Hub.get_auth_token([]) == - {:error, "No Hugging Face authentication token provided."} - end - - test "returns error when empty options and env var not set" do - System.delete_env("HF_TOKEN") - - assert ElixirDatasets.HuggingFace.Hub.get_auth_token() == - {:error, "No Hugging Face authentication token provided."} - end - end -end From a0e9db184ca739423bc92cb4005f9ae676f5f6f6 Mon Sep 17 00:00:00 2001 From: Weronika Date: Sat, 10 Jan 2026 12:59:09 +0100 Subject: [PATCH 02/10] Readme update --- README.md | 58 +------------------------------------------------------ 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/README.md b/README.md index 24c657e..d17e249 100644 --- a/README.md +++ b/README.md @@ -33,16 +33,13 @@ end ## 🚀 Quick Start ```elixir -# Load a dataset from Hugging Face {:ok, [train_df]} = ElixirDatasets.load_dataset( {:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train" ) -# Load from local directory {:ok, datasets} = ElixirDatasets.load_dataset({:local, "./data"}) -# Stream large datasets without loading into memory {:ok, stream} = ElixirDatasets.load_dataset( {:hf, "stanfordnlp/imdb", subdir: "plain_text"}, split: "train", @@ -57,37 +54,32 @@ stream |> Enum.take(100) |> 
Enum.each(&process_row/1) ### Text Classification with Sentiment Analysis ```elixir -# Load training data {:ok, [train_df]} = ElixirDatasets.load_dataset( {:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train" ) -# Explore the data with Explorer require Explorer.DataFrame, as: DF train_df |> DF.head(5) |> IO.inspect() -# Get dataset metadata {:ok, splits} = ElixirDatasets.get_dataset_split_names( "cornell-movie-review-data/rotten_tomatoes" ) -IO.inspect(splits) # ["train", "validation", "test"] +IO.inspect(splits) ``` ### Streaming Large Datasets ```elixir -# Stream data without loading everything into memory {:ok, stream} = ElixirDatasets.load_dataset( {:hf, "stanfordnlp/imdb", subdir: "plain_text"}, split: "train", streaming: true ) -# Process data progressively stream |> Stream.filter(fn row -> String.length(row["text"]) > 100 end) |> Stream.take(1000) @@ -97,13 +89,11 @@ stream ### Working Offline ```elixir -# Download once {:ok, _} = ElixirDatasets.load_dataset( {:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train" ) -# Use cached version offline {:ok, [data]} = ElixirDatasets.load_dataset( {:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train", @@ -119,44 +109,8 @@ stream - `ELIXIR_DATASETS_OFFLINE` - Enable offline mode (`"1"` or `"true"`) - `HF_TOKEN` - Authentication token for private datasets -### Common Options - -```elixir -# Load specific split -ElixirDatasets.load_dataset({:hf, "dataset"}, split: "train") - -# Stream large datasets -ElixirDatasets.load_dataset({:hf, "dataset"}, streaming: true) - -ElixirDatasets.load_dataset({:hf, "dataset"}, num_proc: 4) - -ElixirDatasets.load_dataset({:hf, "dataset"}, offline: true) - -ElixirDatasets.load_dataset({:hf, "dataset"}, download_mode: :force_redownload) -``` - See the [full documentation](https://hexdocs.pm/elixir_datasets) for all available options. 
-## 🔗 Integration with Elixir ML Ecosystem - -Works seamlessly with Explorer, Nx, Axon, and Bumblebee: - -```elixir -{:ok, [train_df]} = ElixirDatasets.load_dataset( - {:hf, "cornell-movie-review-data/rotten_tomatoes"}, - split: "train" -) - -require Explorer.DataFrame, as: DF -train_df |> DF.filter(label == 1) |> DF.head(10) - -texts = DF.pull(train_df, "text") -labels = DF.pull(train_df, "label") |> Nx.tensor() - -{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "bert-base-uncased"}) -inputs = Bumblebee.apply_tokenizer(tokenizer, texts) -``` - ## 📖 Documentation Full documentation is available at [HexDocs](https://hexdocs.pm/elixir_datasets). @@ -183,16 +137,6 @@ mix coveralls mix test test/elixir_datasets_test.exs ``` -## 🤝 Contributing - -Contributions are welcome! Please feel free to submit a Pull Request. - -1. Fork the repository -2. Create your feature branch (`git checkout -b feature/amazing-feature`) -3. Commit your changes (`git commit -m 'Add amazing feature'`) -4. Push to the branch (`git push origin feature/amazing-feature`) -5. Open a Pull Request - ## 📄 License This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. From c8c695c3125e2e8c4d4644855b00558c10add076 Mon Sep 17 00:00:00 2001 From: Weronika Date: Sat, 10 Jan 2026 14:56:08 +0100 Subject: [PATCH 03/10] Moved hub --- lib/elixir_datasets/huggingface/hub.ex | 336 +++++++++++++ test/elixir_datasets/huggingface/hub_test.exs | 460 ++++++++++++++++++ 2 files changed, 796 insertions(+) create mode 100644 lib/elixir_datasets/huggingface/hub.ex create mode 100644 test/elixir_datasets/huggingface/hub_test.exs diff --git a/lib/elixir_datasets/huggingface/hub.ex b/lib/elixir_datasets/huggingface/hub.ex new file mode 100644 index 0000000..ff84e75 --- /dev/null +++ b/lib/elixir_datasets/huggingface/hub.ex @@ -0,0 +1,336 @@ +# This file, part of the ElixirDatasets project, has been adapted from code originally under Apache License 2.0. 
+# The original code can be found at: +# https://github.com/elixir-nx/bumblebee/blob/710a645222948f80208c348d3a2589cbd3ab8e7d/lib/bumblebee/huggingface/hub.ex + +defmodule ElixirDatasets.HuggingFace.Hub do + @moduledoc false + @compile if Mix.env() == :test, do: :export_all + alias ElixirDatasets.Utils.HTTP + + @huggingface_endpoint "https://huggingface.co" + + @doc """ + Returns a URL pointing to the given file in a Hugging Face repository. + """ + @spec file_url(String.t(), String.t(), String.t() | nil) :: String.t() + def file_url(repository_id, filename, revision) do + revision = revision || "main" + @huggingface_endpoint <> "/datasets/#{repository_id}/resolve/#{revision}/#{filename}" + end + + @doc """ + Returns a URL to list the contents of a Hugging Face repository. + """ + @spec file_listing_url(String.t(), String.t() | nil, String.t() | nil) :: String.t() + def file_listing_url(repository_id, subdir, revision) do + revision = revision || "main" + path = if(subdir, do: "/" <> subdir) + @huggingface_endpoint <> "/api/datasets/#{repository_id}/tree/#{revision}#{path}" + end + + @doc """ + Returns a URL to fetch dataset information from the Hugging Face API. + """ + @spec dataset_info_url(String.t()) :: String.t() + def dataset_info_url(repository_id) do + @huggingface_endpoint <> "/api/datasets/#{repository_id}" + end + + @doc """ + Downloads file from the given URL and returns a path to the file. + + The file is cached based on the received ETag. Subsequent requests + for the same URL validate the ETag and return a file from the cache + if there is a match. + + ## Options + + * `:cache_dir` - the directory to store the downloaded files in. 
+ Defaults to the standard cache location for the given operating + system + + * `:offline` - if `true`, the cached path is returned if it exists, + and an error otherwise + + * `:auth_token` - the token to use as HTTP bearer authorization + for remote files + + * `:etag` - by default a HEAD request is made to fetch the latest + ETag value, however if the value is already known, it can be + passed as an option instead (to skip the extra request) + + * `:cache_scope` - a namespace to put the cached files under in + the cache directory + + * `:download_mode` - controls download/cache behavior. Can be: + - `:reuse_dataset_if_exists` (default) - reuse cached data if available + - `:force_redownload` - always download, even if cached + + * `:verification_mode` - controls whether basic verification checks + are applied. Can be: + - `:basic_checks` (default) - perform basic validation + - `:no_checks` - skip validation (for example, file existence checks) + Note: Currently, `:verification_mode` only distinguishes between + performing the default basic checks and skipping them via `:no_checks`. 
+ + """ + @spec cached_download(String.t(), keyword()) :: {:ok, String.t()} | {:error, String.t()} + def cached_download(url, opts \\ []) do + cache_dir = opts[:cache_dir] || ElixirDatasets.cache_dir() + offline = Keyword.get(opts, :offline, elixir_datasets_offline?()) + auth_token = opts[:auth_token] + download_mode = opts[:download_mode] || :reuse_dataset_if_exists + verification_mode = opts[:verification_mode] || :basic_checks + + dir = Path.join(cache_dir, "huggingface") + + dir = + if cache_scope = opts[:cache_scope] do + Path.join(dir, cache_scope) + else + dir + end + + File.mkdir_p!(dir) + + headers = + if auth_token do + [{"Authorization", "Bearer " <> auth_token}] + else + [] + end + + metadata_path = Path.join(dir, metadata_filename(url)) + + # Handle force_redownload mode - delete cached files + if download_mode == :force_redownload do + File.rm(metadata_path) + end + + cond do + offline -> + case load_json(metadata_path) do + {:ok, %{"etag" => etag}} -> + entry_path = Path.join(dir, entry_filename(url, etag)) + + cond do + File.exists?(entry_path) -> + {:ok, entry_path} + + verification_mode == :no_checks -> + IO.warn( + "ElixirDatasets.HuggingFace.Hub.cached_download/2: " <> + "returning path to non-existent cached file in offline mode with " <> + ":no_checks verification_mode: #{entry_path}" + ) + + {:ok, entry_path} + + true -> + {:error, "cached file not found: #{entry_path}"} + end + + _ -> + {:error, + "could not find file in local cache and outgoing traffic is disabled, url: #{url}"} + end + + entry_path = opts[:etag] && cached_path_for_etag(dir, url, opts[:etag]) -> + {:ok, entry_path} + + true -> + with {:ok, etag, download_url, redirect?} <- head_download(url, headers) do + cached_entry = + if download_mode != :force_redownload, do: cached_path_for_etag(dir, url, etag) + + if cached_entry do + {:ok, cached_entry} + else + entry_path = Path.join(dir, entry_filename(url, etag)) + + headers = + if redirect? 
do + List.keydelete(headers, "Authorization", 0) + else + headers + end + + download_url + |> HTTP.download(entry_path, headers: headers) + |> finish_request(download_url) + |> case do + :ok -> + :ok = store_json(metadata_path, %{"etag" => etag, "url" => url}) + {:ok, entry_path} + + error -> + File.rm_rf!(metadata_path) + File.rm_rf!(entry_path) + error + end + end + end + end + end + + defp cached_path_for_etag(dir, url, etag) do + metadata_path = Path.join(dir, metadata_filename(url)) + + case load_json(metadata_path) do + {:ok, %{"etag" => ^etag}} -> + path = Path.join(dir, entry_filename(url, etag)) + + # Make sure the file exists, in case someone manually removed it + if File.exists?(path) do + path + end + + _ -> + nil + end + end + + defp head_download(url, headers) do + with {:ok, response} <- + HTTP.request(:head, url, follow_redirects: false, headers: headers) + |> finish_request(url) do + if response.status in 300..399 do + location = HTTP.get_header(response, "location") + + # Follow relative redirects + if URI.parse(location).host == nil do + url = + url + |> URI.parse() + |> Map.replace!(:path, location) + |> URI.to_string() + + head_download(url, headers) + else + with {:ok, etag} <- fetch_etag(response), do: {:ok, etag, location, true} + end + else + with {:ok, etag} <- fetch_etag(response), do: {:ok, etag, url, false} + end + end + end + + defp finish_request(:ok, _url), do: :ok + + defp finish_request({:ok, response}, _url) when response.status in 100..399, do: {:ok, response} + + defp finish_request({:ok, response}, url) do + case HTTP.get_header(response, "x-error-code") do + code when code == "RepoNotFound" or response.status == 401 -> + {:error, + "repository not found, url: #{url}. Please make sure you specified" <> + " the correct repository id. 
If you are trying to access a private" <> + " or gated repository, use an authentication token"} + + "EntryNotFound" -> + {:error, "file not found, url: #{url}"} + + "RevisionNotFound" -> + {:error, "revision not found, url: #{url}"} + + "GatedRepo" -> + {:error, + "cannot access gated repository, url: #{url}. Make sure to request access" <> + " for the repository and use an authentication token"} + + _ -> + {:error, "HTTP request failed with status #{response.status}, url: #{url}"} + end + end + + defp finish_request({:error, reason}, _url) do + {:error, "failed to make an HTTP request, reason: #{inspect(reason)}"} + end + + defp fetch_etag(response) do + if etag = HTTP.get_header(response, "x-linked-etag") || HTTP.get_header(response, "etag") do + {:ok, etag} + else + {:error, "no ETag found on the resource"} + end + end + + @doc """ + Gets the HuggingFace authentication token. Requires that it starts with "hf_". + + Looks for the token in the following order: + 1. From options (`:auth_token` key) + 2. From system environment variable (`HF_TOKEN`) + 3. 
Returns error if not found + + ## Parameters + + * `opts` - keyword list with optional `:auth_token` key + + ## Returns + + * `{:ok, String.t()}` - the authentication token + * `{:error, String.t()}` - if no token is found or invalid + + ## Examples + + iex> ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: "hf_my_token") + {:ok, "hf_my_token"} + + iex> ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: "my_invalid_token") + {:error, "The provided Hugging Face authentication token does not start with 'hf_'."} + + # iex> ElixirDatasets.HuggingFace.Hub.get_auth_token([]) + # the value of HF_TOKEN environment variable if valid else error + """ + @spec get_auth_token(keyword()) :: {:ok, String.t()} | {:error, String.t()} + def get_auth_token(opts \\ []) do + token = opts[:auth_token] || System.get_env("HF_TOKEN") + validate_auth_token(token) + end + + @spec validate_auth_token(String.t() | nil) :: {:ok, String.t()} | {:error, String.t()} + defp validate_auth_token(token) when is_binary(token) do + cond do + String.starts_with?(token, "hf_") -> + {:ok, token} + + true -> + {:error, "The provided Hugging Face authentication token does not start with 'hf_'."} + end + end + + defp validate_auth_token(_), do: {:error, "No Hugging Face authentication token provided."} + + defp metadata_filename(url) do + encode_url(url) <> ".json" + end + + defp entry_filename(url, etag) do + encode_url(url) <> "." 
<> encode_etag(etag) + end + + defp encode_url(url) do + url |> :erlang.md5() |> Base.encode32(case: :lower, padding: false) + end + + defp encode_etag(etag) do + Base.encode32(etag, case: :lower, padding: false) + end + + defp load_json(path) do + case File.read(path) do + {:ok, content} -> {:ok, Jason.decode!(content)} + _error -> :error + end + end + + defp store_json(path, data) do + json = Jason.encode!(data) + File.write(path, json) + end + + defp elixir_datasets_offline?() do + System.get_env("ELIXIR_DATASETS_OFFLINE") in ~w(1 true) + end +end diff --git a/test/elixir_datasets/huggingface/hub_test.exs b/test/elixir_datasets/huggingface/hub_test.exs new file mode 100644 index 0000000..d56e90f --- /dev/null +++ b/test/elixir_datasets/huggingface/hub_test.exs @@ -0,0 +1,460 @@ +defmodule ElixirDatasets.HuggingFace.HubTest do + use ExUnit.Case, async: true + + doctest ElixirDatasets.HuggingFace.Hub + + describe "file_url/3" do + @repository_id "test-user/test-repo" + @filename "test-file.txt" + + test "returns correct URL with no revision" do + revision = nil + + expected_url = + "https://huggingface.co/datasets/test-user/test-repo/resolve/main/test-file.txt" + + assert ElixirDatasets.HuggingFace.Hub.file_url(@repository_id, @filename, revision) == + expected_url + end + + test "returns correct URL with revision" do + revision = "test-revision" + + expected_url = + "https://huggingface.co/datasets/test-user/test-repo/resolve/test-revision/test-file.txt" + + assert ElixirDatasets.HuggingFace.Hub.file_url(@repository_id, @filename, revision) == + expected_url + end + end + + describe "file_listing_url/3" do + @repository_id "test-user/test-repo" + test "returns correct URL with no subdir and no revision" do + subdir = nil + revision = nil + + expected_url = + "https://huggingface.co/api/datasets/test-user/test-repo/tree/main" + + assert ElixirDatasets.HuggingFace.Hub.file_listing_url(@repository_id, subdir, revision) == + expected_url + end + + test "returns 
correct URL with subdir and revision" do + subdir = "test-subdir/test-subdir2" + revision = "test-revision" + + expected_url = + "https://huggingface.co/api/datasets/test-user/test-repo/tree/test-revision/test-subdir/test-subdir2" + + assert ElixirDatasets.HuggingFace.Hub.file_listing_url(@repository_id, subdir, revision) == + expected_url + end + end + + describe "dataset_info_url/1" do + @repository_id "test-user/test-repo" + + test "returns correct URL for dataset info API" do + expected_url = "https://huggingface.co/api/datasets/test-user/test-repo" + + assert ElixirDatasets.HuggingFace.Hub.dataset_info_url(@repository_id) == expected_url + end + end + + describe "cached_download/2" do + @url "https://huggingface.co/datasets/aaaaa32r/elixirDatasets" + @url_redirect "https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT/resolve/main/medical_o1_sft_Chinese.json" + @cache_dir "test_cache_dir_cached_download" + @cache_scope "test_cache_scope" + @opts [cache_dir: @cache_dir, cache_scope: @cache_scope] + + test "No cache_scope" do + assert {:ok, _path} = + ElixirDatasets.HuggingFace.Hub.cached_download(@url, cache_dir: @cache_dir) + + # Clean up + File.rm_rf!(@cache_dir) + end + + test "With cache_scope" do + File.mkdir_p!(@cache_dir) + + assert {:ok, _path} = ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) + + # Clean up + File.rm_rf!(@cache_dir) + end + + test "with cache_scope, redirect" do + File.mkdir_p!(@cache_dir) + + assert {:ok, _path} = ElixirDatasets.HuggingFace.Hub.cached_download(@url_redirect, @opts) + + # Clean up + File.rm_rf!(@cache_dir) + end + + test "with download_mode: :force_redownload" do + File.mkdir_p!(@cache_dir) + + assert {:ok, path1} = ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) + assert File.exists?(path1) + + assert {:ok, path2} = + ElixirDatasets.HuggingFace.Hub.cached_download( + @url, + @opts ++ [download_mode: :force_redownload] + ) + + assert File.exists?(path2) + assert 
String.contains?(path1, @cache_dir) + assert String.contains?(path2, @cache_dir) + + File.rm_rf!(@cache_dir) + end + + test "with verification_mode: :no_checks" do + File.mkdir_p!(@cache_dir) + + assert {:ok, _path} = + ElixirDatasets.HuggingFace.Hub.cached_download( + @url, + @opts ++ [verification_mode: :no_checks] + ) + + File.rm_rf!(@cache_dir) + end + + test "verification_mode: :no_checks skips file existence check in offline mode" do + File.mkdir_p!(@cache_dir) + + assert {:ok, cached_path} = + ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) + + assert File.exists?(cached_path) + + File.rm!(cached_path) + refute File.exists?(cached_path) + + assert {:error, error_msg} = + ElixirDatasets.HuggingFace.Hub.cached_download( + @url, + @opts ++ [offline: true, verification_mode: :basic_checks] + ) + + assert error_msg =~ "cached file not found" + + assert {:ok, returned_path} = + ElixirDatasets.HuggingFace.Hub.cached_download( + @url, + @opts ++ [offline: true, verification_mode: :no_checks] + ) + + assert returned_path == cached_path + refute File.exists?(returned_path) + + File.rm_rf!(@cache_dir) + end + + test "verification_mode: :basic_checks fails when cached file is missing" do + File.mkdir_p!(@cache_dir) + + assert {:ok, cached_path} = + ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) + + assert File.exists?(cached_path) + + File.rm!(cached_path) + + assert {:error, error_msg} = + ElixirDatasets.HuggingFace.Hub.cached_download( + @url, + @opts ++ [offline: true, verification_mode: :basic_checks] + ) + + assert error_msg =~ "cached file not found" + + File.rm_rf!(@cache_dir) + end + + test "verification_mode comparison: :basic_checks vs :no_checks" do + File.mkdir_p!(@cache_dir) + + {:ok, cached_path} = ElixirDatasets.HuggingFace.Hub.cached_download(@url, @opts) + File.rm!(cached_path) + + result_basic = + ElixirDatasets.HuggingFace.Hub.cached_download( + @url, + @opts ++ [offline: true, verification_mode: :basic_checks] + ) + + case 
result_basic do + {:error, msg} -> + assert msg =~ "cached file not found" + + {:ok, _} -> + flunk("Expected :basic_checks to fail with missing file") + end + + result_no_checks = + ElixirDatasets.HuggingFace.Hub.cached_download( + @url, + @opts ++ [offline: true, verification_mode: :no_checks] + ) + + case result_no_checks do + {:ok, path} -> + assert path == cached_path + refute File.exists?(path) + + {:error, msg} -> + flunk("Expected :no_checks to succeed, got error: #{msg}") + end + + File.rm_rf!(@cache_dir) + end + end + + describe "cached_path_for_etag/3" do + @dir "test_cache_dir_cached_path_for_etag" + @url "https://example.com/test-file.txt" + @etag "1234567890abcdef" + @fileContent "jrdifprgyy26hfylusnlbth2ie.gezdgnbvgy3tqojqmfrggzdfmy" + @fileJson "jrdifprgyy26hfylusnlbth2ie.json" + + test "returns cached path for known etag" do + File.mkdir_p!(@dir) + File.write!(Path.join(@dir, @fileContent), "test content") + File.write!(Path.join(@dir, @fileJson), Jason.encode!(%{"etag" => @etag})) + expected_path = @dir <> "/" <> @fileContent + + assert ElixirDatasets.HuggingFace.Hub.cached_path_for_etag(@dir, @url, @etag) == + expected_path + + # Clean up + File.rm!(Path.join(@dir, @fileContent)) + File.rm!(Path.join(@dir, @fileJson)) + File.rmdir!(@dir) + end + + test "returns nil for invalid etag" do + File.mkdir_p!(@dir) + File.write!(Path.join(@dir, @fileJson), Jason.encode!(%{"etag" => "invalid-etag"})) + + assert ElixirDatasets.HuggingFace.Hub.cached_path_for_etag(@dir, @url, @etag) == nil + + # Clean up + File.rm!(Path.join(@dir, @fileJson)) + File.rmdir!(@dir) + end + end + + describe "head_download/2" do + @url "https://huggingface.co/datasets/aaaaa32r/elixirDatasets" + @url_redirect "https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT/resolve/main/medical_o1_sft_Chinese.json" + # @urlNilHost "http://localhost:32123/sessions/7xre6dqd37a6olsi4dmdddndzz6te5cdimmshjblbbsot2cg" # This URL is not valid for testing, as it does not exist 
outside of my local environment + @headers [{"Content-Type", "application/json"}] + + test "returns :ok with valid response, without redirection" do + assert {:ok, _etag, @url, false} = + ElixirDatasets.HuggingFace.Hub.head_download(@url, @headers) + end + + test "returns :ok with valid response, with redirection" do + assert {:ok, _etag, _url_redirect, true} = + ElixirDatasets.HuggingFace.Hub.head_download(@url_redirect, @headers) + end + + # test "returns :error, when host location is nil" do # todo + # assert {:error, _reason} = + # ElixirDatasets.HuggingFace.Hub.head_download(@urlNilHost, @headers) + # end + end + + describe "finish_request" do + test "response is :ok" do + assert ElixirDatasets.HuggingFace.Hub.finish_request(:ok, @url) == :ok + end + + test "response is :ok, status in 100..399" do + response = {:ok, %{status: 200}} + assert ElixirDatasets.HuggingFace.Hub.finish_request(response, @url) == response + end + + test "response is :ok, status is out 100..399" do + responses = [ + {:ok, %{status: 404, headers: [{"x-error-code", "RepoNotFound"}]}}, + {:ok, %{status: 404, headers: [{"x-error-code", "GatedRepo"}]}}, + {:ok, %{status: 404, headers: [{"x-error-code", "OtherError"}]}}, + {:ok, %{status: 500, headers: [{"x-error-code", "EntryNotFound"}]}}, + {:ok, %{status: 500, headers: [{"x-error-code", "RevisionNotFound"}]}} + ] + + Enum.each(responses, fn response -> + assert {:error, _} = ElixirDatasets.HuggingFace.Hub.finish_request(response, @url) + end) + end + + test "response is error" do + response = {:error, "test-error"} + + assert ElixirDatasets.HuggingFace.Hub.finish_request(response, @url) == + {:error, "failed to make an HTTP request, reason: \"test-error\""} + end + end + + describe "fetch_etag/1" do + test "when etag is present" do + response = %{ + status: 200, + headers: [{"Content-Type", "application/json"}, {"etag", "1234567890abcdef"}], + body: "{}" + } + + assert ElixirDatasets.HuggingFace.Hub.fetch_etag(response) == + {:ok, 
"1234567890abcdef"} + end + + test "when etag is not present" do + response = %{ + status: 200, + headers: [{"Content-Type", "application/json"}], + body: "{}" + } + + assert ElixirDatasets.HuggingFace.Hub.fetch_etag(response) == + {:error, "no ETag found on the resource"} + end + end + + describe "metadata_filename/1" do + @url "https://example.com/test-file.txt" + + test "generates correct metadata filename from URL" do + expected_filename = "jrdifprgyy26hfylusnlbth2ie.json" + + assert ElixirDatasets.HuggingFace.Hub.metadata_filename(@url) == expected_filename + end + end + + describe "entry_filename/2, encode_url/1, encode_etag/1" do + test "generates correct filenames based on URL and ETag" do + etag = "1234567890abcdef" + + expected_entry_filename = "jrdifprgyy26hfylusnlbth2ie.gezdgnbvgy3tqojqmfrggzdfmy" + + assert ElixirDatasets.HuggingFace.Hub.entry_filename(@url, etag) == + expected_entry_filename + end + end + + describe "store_json/2, load_json/1" do + @data %{"key" => "value"} + test "stores JSON data to a file and loads it back" do + path = "test_data.json" + + assert ElixirDatasets.HuggingFace.Hub.store_json(path, @data) == :ok + assert File.exists?(path) + + assert ElixirDatasets.HuggingFace.Hub.load_json(path) == {:ok, @data} + + # Clean up + File.rm!(path) + end + + test "returns error when unable to write to file and returns error when trying to load" do + path = "/invalid/path/test_data.json" + + assert ElixirDatasets.HuggingFace.Hub.store_json(path, @data) == + {:error, :enoent} + + assert ElixirDatasets.HuggingFace.Hub.load_json(path) == :error + end + end + + describe "elixirDatasets_offline?/0" do + test "returns true when ELIXIR_DATASETS_OFFLINE is set to '1'" do + System.put_env("ELIXIR_DATASETS_OFFLINE", "1") + assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == true + System.delete_env("ELIXIR_DATASETS_OFFLINE") + end + + test "returns true when ELIXIR_DATASETS_OFFLINE is set to 'true'" do + 
System.put_env("ELIXIR_DATASETS_OFFLINE", "true") + assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == true + System.delete_env("ELIXIR_DATASETS_OFFLINE") + end + + test "returns false when ELIXIR_DATASETS_OFFLINE is not set" do + assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == false + end + + test "returns false when ELIXIR_DATASETS_OFFLINE is set to '0'" do + System.put_env("ELIXIR_DATASETS_OFFLINE", "0") + assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == false + System.delete_env("ELIXIR_DATASETS_OFFLINE") + end + + test "returns false when ELIXIR_DATASETS_OFFLINE is set to 'false'" do + System.put_env("ELIXIR_DATASETS_OFFLINE", "false") + assert ElixirDatasets.HuggingFace.Hub.elixir_datasets_offline?() == false + System.delete_env("ELIXIR_DATASETS_OFFLINE") + end + end + + describe "get_auth_token/1" do + setup do + # Save the current HF_TOKEN env var if it exists + original_token = System.get_env("HF_TOKEN") + System.delete_env("HF_TOKEN") + + on_exit(fn -> + if original_token do + System.put_env("HF_TOKEN", original_token) + else + System.delete_env("HF_TOKEN") + end + end) + + :ok + end + + test "returns token from options when provided" do + System.delete_env("HF_TOKEN") + token = "hf_test_token_from_options" + assert ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: token) == {:ok, token} + end + + test "returns token from environment variable when not in options" do + env_token = "hf_test_token_from_env" + System.put_env("HF_TOKEN", env_token) + assert ElixirDatasets.HuggingFace.Hub.get_auth_token([]) == {:ok, env_token} + end + + test "prioritizes options token over environment variable" do + options_token = "hf_token_from_options" + env_token = "hf_token_from_env" + System.put_env("HF_TOKEN", env_token) + + assert ElixirDatasets.HuggingFace.Hub.get_auth_token(auth_token: options_token) == + {:ok, options_token} + end + + test "returns error when no token provided and env var not set" do + 
System.delete_env("HF_TOKEN") + + assert ElixirDatasets.HuggingFace.Hub.get_auth_token([]) == + {:error, "No Hugging Face authentication token provided."} + end + + test "returns error when empty options and env var not set" do + System.delete_env("HF_TOKEN") + + assert ElixirDatasets.HuggingFace.Hub.get_auth_token() == + {:error, "No Hugging Face authentication token provided."} + end + end +end From 5ee3ac67935187fc39c5e1a094babbb6f3baec9a Mon Sep 17 00:00:00 2001 From: Weronika Date: Sat, 10 Jan 2026 15:25:07 +0100 Subject: [PATCH 04/10] Moved files --- .../{loader.ex => dataset_loader.ex} | 13 ++++++++++++- lib/elixir_datasets/{info.ex => info_getter.ex} | 0 .../utils/{loader.ex => file_loader.ex} | 0 .../loader_test.exs => dataset_loader_test.exs} | 0 test/elixir_datasets/{utils => }/filter_test.exs | 0 .../{utils/info_test.exs => info_getter_test.exs} | 0 .../elixir_datasets/{utils => }/repository_test.exs | 0 test/elixir_datasets/{utils => }/streaming_test.exs | 0 8 files changed, 12 insertions(+), 1 deletion(-) rename lib/elixir_datasets/{loader.ex => dataset_loader.ex} (94%) rename lib/elixir_datasets/{info.ex => info_getter.ex} (100%) rename lib/elixir_datasets/utils/{loader.ex => file_loader.ex} (100%) rename test/elixir_datasets/{utils/loader_test.exs => dataset_loader_test.exs} (100%) rename test/elixir_datasets/{utils => }/filter_test.exs (100%) rename test/elixir_datasets/{utils/info_test.exs => info_getter_test.exs} (100%) rename test/elixir_datasets/{utils => }/repository_test.exs (100%) rename test/elixir_datasets/{utils => }/streaming_test.exs (100%) diff --git a/lib/elixir_datasets/loader.ex b/lib/elixir_datasets/dataset_loader.ex similarity index 94% rename from lib/elixir_datasets/loader.ex rename to lib/elixir_datasets/dataset_loader.ex index f888a0f..65f2f88 100644 --- a/lib/elixir_datasets/loader.ex +++ b/lib/elixir_datasets/dataset_loader.ex @@ -88,6 +88,8 @@ defmodule ElixirDatasets.Loader do streaming = opts[:streaming] || false num_proc 
= opts[:num_proc] || 1 + repository = merge_download_opts(repository, opts) + with {:ok, repo_files} <- Repository.get_files(repository), {:ok, filtered_files} <- Filter.by_config_and_split(repo_files, name, split) do if streaming do @@ -171,7 +173,8 @@ defmodule ElixirDatasets.Loader do end end, max_concurrency: num_proc, - ordered: true + ordered: true, + timeout: :infinity ) |> Enum.reduce_while({:ok, []}, fn {:ok, {:ok, path_ext}}, {:ok, acc} -> @@ -207,5 +210,13 @@ defmodule ElixirDatasets.Loader do paths -> {:ok, Enum.reverse(paths)} end end + + defp merge_download_opts({:hf, repository_id, repo_opts}, load_opts) do + download_opts = [:download_mode, :verification_mode] + merged_opts = Keyword.merge(repo_opts, Keyword.take(load_opts, download_opts)) + {:hf, repository_id, merged_opts} + end + + defp merge_download_opts(repository, _load_opts), do: repository end diff --git a/lib/elixir_datasets/info.ex b/lib/elixir_datasets/info_getter.ex similarity index 100% rename from lib/elixir_datasets/info.ex rename to lib/elixir_datasets/info_getter.ex diff --git a/lib/elixir_datasets/utils/loader.ex b/lib/elixir_datasets/utils/file_loader.ex similarity index 100% rename from lib/elixir_datasets/utils/loader.ex rename to lib/elixir_datasets/utils/file_loader.ex diff --git a/test/elixir_datasets/utils/loader_test.exs b/test/elixir_datasets/dataset_loader_test.exs similarity index 100% rename from test/elixir_datasets/utils/loader_test.exs rename to test/elixir_datasets/dataset_loader_test.exs diff --git a/test/elixir_datasets/utils/filter_test.exs b/test/elixir_datasets/filter_test.exs similarity index 100% rename from test/elixir_datasets/utils/filter_test.exs rename to test/elixir_datasets/filter_test.exs diff --git a/test/elixir_datasets/utils/info_test.exs b/test/elixir_datasets/info_getter_test.exs similarity index 100% rename from test/elixir_datasets/utils/info_test.exs rename to test/elixir_datasets/info_getter_test.exs diff --git 
a/test/elixir_datasets/utils/repository_test.exs b/test/elixir_datasets/repository_test.exs similarity index 100% rename from test/elixir_datasets/utils/repository_test.exs rename to test/elixir_datasets/repository_test.exs diff --git a/test/elixir_datasets/utils/streaming_test.exs b/test/elixir_datasets/streaming_test.exs similarity index 100% rename from test/elixir_datasets/utils/streaming_test.exs rename to test/elixir_datasets/streaming_test.exs From 27f1ee56f97380b31c2fcc833d4da5bfdf62c6a7 Mon Sep 17 00:00:00 2001 From: Weronika Date: Sat, 10 Jan 2026 16:13:55 +0100 Subject: [PATCH 05/10] Updated examples --- examples/usage_examples.livemd | 356 +++++++++++++++++---------------- 1 file changed, 182 insertions(+), 174 deletions(-) diff --git a/examples/usage_examples.livemd b/examples/usage_examples.livemd index fc8fdb5..0697935 100644 --- a/examples/usage_examples.livemd +++ b/examples/usage_examples.livemd @@ -38,83 +38,57 @@ This section demonstrates all the ways to load datasets using `ElixirDatasets.lo #### Load dataset from Huggingface +Load a dataset directly from the Hugging Face Hub. + ```elixir -ElixirDatasets.load_dataset({:hf, "fka/awesome-chatgpt-prompts"}) +{:ok, datasets} = ElixirDatasets.load_dataset({:hf, "fka/awesome-chatgpt-prompts"}) +IO.puts("Loaded #{length(datasets)} dataset(s)") +datasets ``` ``` -{:ok, - [ - #Explorer.DataFrame< - Polars[948 x 5] - act string ["Ethereum Developer", "Linux Terminal", "English Translator and Improver", - "Job Interviewer", "JavaScript Console", ...] - prompt string ["Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. 
Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.", - "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is pwd", - "I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. I want you to only reply the correction, the improvements and nothing else, do not write explanations. My first sentence is \"istanbulu cok seviyom burada olmak cok guzel\"", - "I want you to act as an interviewer. I will be the candidate and you will ask me the interview questions for the ${Position:Software Developer} position. I want you to only reply as the interviewer. Do not write all the conversation at once. I want you to only do the interview with me. Ask me the questions and wait for my answers. Do not write explanations. Ask me the questions one by one like an interviewer does and wait for my answers.\n\nMy first sentence is \"Hi\"", - "I want you to act as a javascript console. I will type commands and you will reply with what the javascript console should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. 
do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is console.log(\"Hello World\");", - ...] - for_devs boolean [true, true, false, false, true, ...] - type string ["TEXT", "TEXT", "TEXT", "TEXT", "TEXT", ...] - contributor string ["ameya-2003", "f", "f", "f,iltekin", "omerimzali", ...] - > - ]} +Loaded 1 dataset(s) +``` + + + +``` +[ + #Explorer.DataFrame< + Polars[974 x 5] + act string ["Ethereum Developer", "Linux Terminal", "English Translator and Improver", + "Job Interviewer", "JavaScript Console", ...] + prompt string ["Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.", + "I want you to act as a linux terminal. I will type commands and you will reply with what the terminal should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is pwd", + "I want you to act as an English translator, spelling corrector and improver. I will speak to you in any language and you will detect the language, translate it and answer in the corrected and improved version of my text, in English. 
I want you to replace my simplified A0-level words and sentences with more beautiful and elegant, upper level English words and sentences. Keep the meaning same, but make them more literary. I want you to only reply the correction, the improvements and nothing else, do not write explanations. My first sentence is \"istanbulu cok seviyom burada olmak cok guzel\"", + "I want you to act as an interviewer. I will be the candidate and you will ask me the interview questions for the ${Position:Software Developer} position. I want you to only reply as the interviewer. Do not write all the conversation at once. I want you to only do the interview with me. Ask me the questions and wait for my answers. Do not write explanations. Ask me the questions one by one like an interviewer does and wait for my answers.\n\nMy first sentence is \"Hi\"", + "I want you to act as a javascript console. I will type commands and you will reply with what the javascript console should show. I want you to only reply with the terminal output inside one unique code block, and nothing else. do not write explanations. do not type commands unless I instruct you to do so. when i need to tell you something in english, i will do so by putting text inside curly brackets {like this}. my first command is console.log(\"Hello World\");", + ...] + for_devs boolean [true, true, false, false, true, ...] + type string ["TEXT", "TEXT", "TEXT", "TEXT", "TEXT", ...] + contributor string ["ameya-2003", "f", "f", "f,iltekin", "omerimzali", ...] + > +] ``` #### Load dataset from Huggingface from given subdir +Some datasets have multiple configurations in subdirectories. 
+ ```elixir -ElixirDatasets.load_dataset( - {:hf, "stanfordnlp/imdb", subdir: "plain_text"}) +{:ok, datasets} = ElixirDatasets.load_dataset( + {:hf, "stanfordnlp/imdb", subdir: "plain_text"} +) +IO.puts("Loaded #{length(datasets)} dataset(s) from 'plain_text' configuration") +datasets ``` ``` -{:ok, - [ - #Explorer.DataFrame< - Polars[25000 x 2] - text string ["I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", - "Worth the entertainment value of a rental, especially if you like action movies.
This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before.

The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. All passably acted but again nothing special.

I thought the main villains were pretty well done and fairly well acted. By the end of the movie you certainly knew who the good guys were and weren't. There was an emotional lift as the really bad ones got their just deserts. Very simplistic, but then you weren't expecting Hamlet, right? The only thing I found really annoying was the constant cuts to VDs daughter during the last fight scene.

Not bad. Not good. Passable 4.", - "its a totally average film with a few semi-alright action sequences that make the plot seem a little better and remind the viewer of the classic van dam films. parts of the plot don't make sense and seem to be added in to use up time. the end plot is that of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the beginning. the end scene with the flask backs don't make sense as they are added in and seem to have little relevance to the history of van dam's character. not really worth watching again, bit disappointed in the end production, even though it is apparent it was shot on a low budget certain shots and sections in the film are of poor directed quality", - "STAR RATING: ***** Saturday Night **** Friday Night *** Friday Morning ** Sunday Night * Monday Morning

Former New Orleans homicide cop Jack Robideaux (Jean Claude Van Damme) is re-assigned to Columbus, a small but violent town in Mexico to help the police there with their efforts to stop a major heroin smuggling operation into their town. The culprits turn out to be ex-military, lead by former commander Benjamin Meyers (Stephen Lord, otherwise known as Jase from East Enders) who is using a special method he learned in Afghanistan to fight off his opponents. But Jack has a more personal reason for taking him down, that draws the two men into an explosive final showdown where only one will walk away alive.

After Until Death, Van Damme appeared to be on a high, showing he could make the best straight to video films in the action market. While that was a far more drama oriented film, with The Shepherd he has returned to the high-kicking, no brainer action that first made him famous and has sadly produced his worst film since Derailed. It's nowhere near as bad as that film, but what I said still stands.

A dull, predictable film, with very little in the way of any exciting action. What little there is mainly consists of some limp fight scenes, trying to look cool and trendy with some cheap slo-mo/sped up effects added to them that sadly instead make them look more desperate. Being a Mexican set film, director Isaac Florentine has tried to give the film a Robert Rodriguez/Desperado sort of feel, but this only adds to the desperation.

VD gives a particularly uninspired performance and given he's never been a Robert De Niro sort of actor, that can't be good. As the villain, Lord shouldn't expect to leave the beeb anytime soon. He gets little dialogue at the beginning as he struggles to muster an American accent but gets mysteriously better towards the end. All the supporting cast are equally bland, and do nothing to raise the films spirits at all.

This is one shepherd that's strayed right from the flock. *", - "First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!", - ...] - label s64 [0, 0, 0, 0, 0, ...] - >, - #Explorer.DataFrame< - Polars[25000 x 2] - text string ["I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered \"controversial\" I really had to see this for myself.

The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.

What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.

I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn't have much of a plot.", - "\"I Am Curious: Yellow\" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) \"double-standard\" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same cannot be said for a man. In fact, you generally won't see female genitals in an American film in anything short of porn or explicit erotica. This alleged double-standard is less a double standard than an admittedly depressing ability to come to terms culturally with the insides of women's bodies.", - "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.

One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).

One might better spend one's time staring out a window at a tree growing.

", - "This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.

The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.

A movie of its time, and place. 2/10.", - "Oh, brother...after hearing about this ridiculous film for umpteen years all I can think of is that old Peggy Lee song..

\"Is that all there is??\" ...I was just an early teen when this smoked fish hit the U.S. I was too young to get in the theater (although I did manage to sneak into \"Goodbye Columbus\"). Then a screening at a local film museum beckoned - Finally I could see this film, except now I was as old as my parents were when they schlepped to see it!!

The ONLY reason this film was not condemned to the anonymous sands of time was because of the obscenity case sparked by its U.S. release. MILLIONS of people flocked to this stinker, thinking they were going to see a sex film...Instead, they got lots of closeups of gnarly, repulsive Swedes, on-street interviews in bland shopping malls, asinie political pretension...and feeble who-cares simulated sex scenes with saggy, pale actors.

Cultural icon, holy grail, historic artifact..whatever this thing was, shred it, burn it, then stuff the ashes in a lead box!

Elite esthetes still scrape to find value in its boring pseudo revolutionary political spewings..But if it weren't for the censorship scandal, it would have been ignored, then forgotten.

Instead, the \"I Am Blank, Blank\" rhythymed title was repeated endlessly for years as a titilation for porno films (I am Curious, Lavender - for gay films, I Am Curious, Black - for blaxploitation films, etc..) and every ten years or so the thing rises from the dead, to be viewed by a new generation of suckers who want to see that \"naughty sex film\" that \"revolutionized the film industry\"...

Yeesh, avoid like the plague..Or if you MUST see it - rent the video and fast forward to the \"dirty\" parts, just to get it over with.

", - ...] - label s64 [0, 0, 0, 0, 0, ...] - >, - #Explorer.DataFrame< - Polars[50000 x 2] - text string ["This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie \"Leon\" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please \"Frankie Starlight\", she's speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the \"Point of no return\" and especially the \"La femme Nikita\" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which \"translate\" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you'll regret later :)", - "When I say this is my favourite film of all time, that comment is not to be taken lightly. I probably watch far too many films than is healthy for me, and have loved quite a few of them. I first saw \"La Femme Nikita\" nearly ten years ago, and it still manages to be my absolute favourite. Why?

This is more than an incredibly stylish and sexy thriller. Luc Besson's great flair for impeccable direction, fashion, and appropriate usage of music makes this a very watchable film. But it is Anne Parillaud's perfect rendering of a complex character who transforms from a heartless killer into a compassionate, vibrant young woman that makes this film beautiful. I can't keep my eyes off of her when she is on screen.

I have seen several of Luc Besson's films including \"Subway\", \"The Professional\", and the irritating \"Fifth Element\", and \"Nikita\" is without a doubt, far superior to any of these. Although this film has tragic elements, it is ultimately extremely hopeful. It is the story of a person who is cruel and merciless, who ultimately comes to realize her own humanity and her own personal power. That, to me is extremely inspiring. If there is hope for Nikita, there is hope for all of us.", - "I saw this movie because I am a huge fan of the TV series of the same name starring Roy Dupuis and Pet Wilson. The movie was really good and I saw how the TV show is based on the movie. A few episodes of the TV series came directly from the movie and their similarity was amazing. To keep things short, any fan of the movie has to watch the series and any fan of the series must see the original Nikita.", - "Being that the only foreign films I usually like star a Japanese person in a rubber suit who crushes little tiny buildings and tanks, I had high hopes for this movie. I thought that this was a movie that wouldn't put me to sleep. WRONG! Starts off with a bang, okay, now she's in training, alright, she's an assassin, I'm still with you, oh, now she's having this moral dilemma and she can't decide if she loves her boyfriend or her controller, zzzzz.... Oh well, back to Gamera!", - "After seeing Point of No Return (a great movie) and being told that the original was better, I was certainly thrilled to see that one of the indie film channels was running La Femme Nikita. Then I saw the movie. Ouch! This was a major let-down.

Nikita herself reminds me of Jar Jar Binks more than any other character I've seen recently. She comes across entirely as comic relief. The movie simply has nothing to recommend it besides the core concept of an evil, inhuman character paradoxically learning to be human while training as an assassin, and that concept failed miserably in Nikita due to the poor writing of the title role.", - ...] - label s64 [-1, -1, -1, -1, -1, ...] - > - ]} -``` - -### Load dataset from Huggingface with auth token as option - -```elixir -ElixirDatasets.load_dataset!( - {:hf, "cornell-movie-review-data/rotten_tomatoes"}, - %{auth_token: auth_token}) +Loaded 3 dataset(s) from 'plain_text' configuration ``` @@ -122,63 +96,111 @@ ElixirDatasets.load_dataset!( ``` [ #Explorer.DataFrame< - Polars[1066 x 2] - text string ["lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .", - "consistently clever and suspenseful .", - "it's like a \" big chill \" reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .", - "the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with tremendous skill .", - "red dragon \" never cuts corners .", ...] - label s64 [1, 1, 1, 1, 1, ...] + Polars[25000 x 2] + text string ["I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichΓ©d and uninspiring.) 
While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.", + "Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled but there is nothing that really blows you away if you've seen your share before.

The plot is made interesting by the inclusion of a rabbit, which is clever but hardly profound. Many of the characters are heavily stereotyped -- the angry veterans, the terrified illegal aliens, the crooked cops, the indifferent feds, the bitchy tough lady station head, the crooked politician, the fat federale who looks like he was typecast as the Mexican in a Hollywood movie from the 1940s. All passably acted but again nothing special.

I thought the main villains were pretty well done and fairly well acted. By the end of the movie you certainly knew who the good guys were and weren't. There was an emotional lift as the really bad ones got their just deserts. Very simplistic, but then you weren't expecting Hamlet, right? The only thing I found really annoying was the constant cuts to VDs daughter during the last fight scene.

Not bad. Not good. Passable 4.", + "its a totally average film with a few semi-alright action sequences that make the plot seem a little better and remind the viewer of the classic van dam films. parts of the plot don't make sense and seem to be added in to use up time. the end plot is that of a very basic type that doesn't leave the viewer guessing and any twists are obvious from the beginning. the end scene with the flask backs don't make sense as they are added in and seem to have little relevance to the history of van dam's character. not really worth watching again, bit disappointed in the end production, even though it is apparent it was shot on a low budget certain shots and sections in the film are of poor directed quality", + "STAR RATING: ***** Saturday Night **** Friday Night *** Friday Morning ** Sunday Night * Monday Morning

Former New Orleans homicide cop Jack Robideaux (Jean Claude Van Damme) is re-assigned to Columbus, a small but violent town in Mexico to help the police there with their efforts to stop a major heroin smuggling operation into their town. The culprits turn out to be ex-military, lead by former commander Benjamin Meyers (Stephen Lord, otherwise known as Jase from East Enders) who is using a special method he learned in Afghanistan to fight off his opponents. But Jack has a more personal reason for taking him down, that draws the two men into an explosive final showdown where only one will walk away alive.

After Until Death, Van Damme appeared to be on a high, showing he could make the best straight to video films in the action market. While that was a far more drama oriented film, with The Shepherd he has returned to the high-kicking, no brainer action that first made him famous and has sadly produced his worst film since Derailed. It's nowhere near as bad as that film, but what I said still stands.

A dull, predictable film, with very little in the way of any exciting action. What little there is mainly consists of some limp fight scenes, trying to look cool and trendy with some cheap slo-mo/sped up effects added to them that sadly instead make them look more desperate. Being a Mexican set film, director Isaac Florentine has tried to give the film a Robert Rodriguez/Desperado sort of feel, but this only adds to the desperation.

VD gives a particularly uninspired performance and given he's never been a Robert De Niro sort of actor, that can't be good. As the villain, Lord shouldn't expect to leave the beeb anytime soon. He gets little dialogue at the beginning as he struggles to muster an American accent but gets mysteriously better towards the end. All the supporting cast are equally bland, and do nothing to raise the films spirits at all.

This is one shepherd that's strayed right from the flock. *", + "First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!", + ...] + label s64 [0, 0, 0, 0, 0, ...] >, #Explorer.DataFrame< - Polars[8530 x 2] - text string ["the rock is destined to be the 21st century's new \" conan \" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .", - "the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .", - "effective but too-tepid biopic", - "if you sometimes like to go to the movies to have fun , wasabi is a good place to start .", - "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .", + Polars[25000 x 2] + text string ["I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered \"controversial\" I really had to see this for myself.

The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.

What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.

I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn't have much of a plot.", + "\"I Am Curious: Yellow\" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) \"double-standard\" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same cannot be said for a man. In fact, you generally won't see female genitals in an American film in anything short of porn or explicit erotica. This alleged double-standard is less a double standard than an admittedly depressing ability to come to terms culturally with the insides of women's bodies.", + "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.

One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).

One might better spend one's time staring out a window at a tree growing.

", + "This film was probably inspired by Godard's Masculin, féminin and I urge you to see that film instead.

The film has two strong elements and those are, (1) the realistic acting (2) the impressive, undeservedly good, photo. Apart from that, what strikes me most is the endless stream of silliness. Lena Nyman has to be most annoying actress in the world. She acts so stupid and with all the nudity in this film,...it's unattractive. Comparing to Godard's film, intellectuality has been replaced with stupidity. Without going too far on this subject, I would say that follows from the difference in ideals between the French and the Swedish society.

A movie of its time, and place. 2/10.", + "Oh, brother...after hearing about this ridiculous film for umpteen years all I can think of is that old Peggy Lee song..

\"Is that all there is??\" ...I was just an early teen when this smoked fish hit the U.S. I was too young to get in the theater (although I did manage to sneak into \"Goodbye Columbus\"). Then a screening at a local film museum beckoned - Finally I could see this film, except now I was as old as my parents were when they schlepped to see it!!

The ONLY reason this film was not condemned to the anonymous sands of time was because of the obscenity case sparked by its U.S. release. MILLIONS of people flocked to this stinker, thinking they were going to see a sex film...Instead, they got lots of closeups of gnarly, repulsive Swedes, on-street interviews in bland shopping malls, asinie political pretension...and feeble who-cares simulated sex scenes with saggy, pale actors.

Cultural icon, holy grail, historic artifact..whatever this thing was, shred it, burn it, then stuff the ashes in a lead box!

Elite esthetes still scrape to find value in its boring pseudo revolutionary political spewings..But if it weren't for the censorship scandal, it would have been ignored, then forgotten.

Instead, the \"I Am Blank, Blank\" rhythymed title was repeated endlessly for years as a titilation for porno films (I am Curious, Lavender - for gay films, I Am Curious, Black - for blaxploitation films, etc..) and every ten years or so the thing rises from the dead, to be viewed by a new generation of suckers who want to see that \"naughty sex film\" that \"revolutionized the film industry\"...

Yeesh, avoid like the plague..Or if you MUST see it - rent the video and fast forward to the \"dirty\" parts, just to get it over with.

", ...] - label s64 [1, 1, 1, 1, 1, ...] + label s64 [0, 0, 0, 0, 0, ...] >, #Explorer.DataFrame< - Polars[1066 x 2] - text string ["compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .", - "the soundtrack alone is worth the price of admission .", - "rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .", - "beneath the film's obvious determination to shock at any cost lies considerable skill and determination , backed by sheer nerve .", - "bielinsky is a filmmaker of impressive talent .", ...] - label s64 [1, 1, 1, 1, 1, ...] + Polars[50000 x 2] + text string ["This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie \"Leon\" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please \"Frankie Starlight\", she's speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the \"Point of no return\" and especially the \"La femme Nikita\" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which \"translate\" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you'll regret later :)", + "When I say this is my favourite film of all time, that comment is not to be taken lightly. I probably watch far too many films than is healthy for me, and have loved quite a few of them. I first saw \"La Femme Nikita\" nearly ten years ago, and it still manages to be my absolute favourite. Why?

This is more than an incredibly stylish and sexy thriller. Luc Besson's great flair for impeccable direction, fashion, and appropriate usage of music makes this a very watchable film. But it is Anne Parillaud's perfect rendering of a complex character who transforms from a heartless killer into a compassionate, vibrant young woman that makes this film beautiful. I can't keep my eyes off of her when she is on screen.

I have seen several of Luc Besson's films including \"Subway\", \"The Professional\", and the irritating \"Fifth Element\", and \"Nikita\" is without a doubt, far superior to any of these. Although this film has tragic elements, it is ultimately extremely hopeful. It is the story of a person who is cruel and merciless, who ultimately comes to realize her own humanity and her own personal power. That, to me is extremely inspiring. If there is hope for Nikita, there is hope for all of us.", + "I saw this movie because I am a huge fan of the TV series of the same name starring Roy Dupuis and Pet Wilson. The movie was really good and I saw how the TV show is based on the movie. A few episodes of the TV series came directly from the movie and their similarity was amazing. To keep things short, any fan of the movie has to watch the series and any fan of the series must see the original Nikita.", + "Being that the only foreign films I usually like star a Japanese person in a rubber suit who crushes little tiny buildings and tanks, I had high hopes for this movie. I thought that this was a movie that wouldn't put me to sleep. WRONG! Starts off with a bang, okay, now she's in training, alright, she's an assassin, I'm still with you, oh, now she's having this moral dilemma and she can't decide if she loves her boyfriend or her controller, zzzzz.... Oh well, back to Gamera!", + "After seeing Point of No Return (a great movie) and being told that the original was better, I was certainly thrilled to see that one of the indie film channels was running La Femme Nikita. Then I saw the movie. Ouch! This was a major let-down.

Nikita herself reminds me of Jar Jar Binks more than any other character I've seen recently. She comes across entirely as comic relief. The movie simply has nothing to recommend it besides the core concept of an evil, inhuman character paradoxically learning to be human while training as an assassin, and that concept failed miserably in Nikita due to the poor writing of the title role.", + ...] + label s64 [-1, -1, -1, -1, -1, ...] > ] ``` +### Load dataset from Huggingface with auth token as option + +Use an authentication token to access private datasets. + +```elixir +datasets = ElixirDatasets.load_dataset!( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + %{auth_token: auth_token} +) +IO.puts("Loaded #{length(datasets)} dataset(s) with authentication") +datasets +``` + + + +``` +** (FunctionClauseError) no function clause matching in Keyword.take/2 + + The following arguments were given to Keyword.take/2: + + # 1 + %{auth_token: nil} + + # 2 + [:download_mode, :verification_mode] + + Attempted function clauses (showing 1 out of 1): + + def take(keywords, keys) when is_list(keywords) and is_list(keys) + + (elixir 1.18.3) lib/keyword.ex:1279: Keyword.take/2 + (elixir_datasets 0.0.1) lib/elixir_datasets/dataset_loader.ex:216: ElixirDatasets.Loader.merge_download_opts/2 + (elixir_datasets 0.0.1) lib/elixir_datasets/dataset_loader.ex:91: ElixirDatasets.Loader.load_dataset/2 + (elixir_datasets 0.0.1) lib/elixir_datasets/dataset_loader.ex:126: ElixirDatasets.Loader.load_dataset!/2 + #cell:3fjksspnhlhcid4k:1: (file) +``` + ### Load dataset from local resources +Load datasets from local files (CSV, Parquet, or JSONL). + ```elixir -ElixirDatasets.load_dataset({:local, "#{__DIR__}/../resources"}) +{:ok, datasets} = ElixirDatasets.load_dataset({:local, "#{__DIR__}/../resources"}) +IO.puts("Loaded #{length(datasets)} dataset(s) from local directory") +datasets ``` ``` -{:ok, - [ - #Explorer.DataFrame< - Polars[11 x 2] - id s64 [0, 1, 2, 3, 4, ...] 
- number string ["csv", "one", "two", "three", "four", ...] - >, - #Explorer.DataFrame< - Polars[11 x 2] - id s64 [0, 1, 2, 3, 4, ...] - number string ["jsonl", "one", "two", "three", "four", ...] - >, - #Explorer.DataFrame< - Polars[11 x 2] - id s64 [0, 1, 2, 3, 4, ...] - number string ["parquet", "one", "two", "three", "four", ...] - > - ]} +Loaded 3 dataset(s) from local directory +``` + + + +``` +[ + #Explorer.DataFrame< + Polars[11 x 2] + id s64 [0, 1, 2, 3, 4, ...] + number string ["csv", "one", "two", "three", "four", ...] + >, + #Explorer.DataFrame< + Polars[11 x 2] + id s64 [0, 1, 2, 3, 4, ...] + number string ["jsonl", "one", "two", "three", "four", ...] + >, + #Explorer.DataFrame< + Polars[11 x 2] + id s64 [0, 1, 2, 3, 4, ...] + number string ["parquet", "one", "two", "three", "four", ...] + > +] ``` ### Advanced Loading Options @@ -382,71 +404,21 @@ Streaming from HuggingFace... #### Parallel processing with num_proc -Use `num_proc` to load multiple files in parallel. Parallel processing is most effective with large files or when downloading fresh data. +Use `num_proc` to load multiple files in parallel for faster processing. 
**This is most efficient for datasets with many files.** ```elixir -# Compare sequential vs parallel loading -# Note: Using force_redownload to demonstrate real performance difference -dataset_repo = {:hf, "stanfordnlp/imdb", subdir: "plain_text"} - -{time_seq, {:ok, _}} = :timer.tc(fn -> - ElixirDatasets.load_dataset(dataset_repo, num_proc: 1, download_mode: :force_redownload) -end) - -{time_par, {:ok, datasets}} = :timer.tc(fn -> - ElixirDatasets.load_dataset(dataset_repo, num_proc: 4, download_mode: :force_redownload) -end) - -IO.puts("Sequential: #{Float.round(time_seq / 1_000_000, 2)}s") -IO.puts("Parallel: #{Float.round(time_par / 1_000_000, 2)}s") -IO.puts("Speedup: #{Float.round(time_seq / time_par, 2)}x") -IO.puts("Loaded #{length(datasets)} datasets") -``` - -#### Filter datasets by name pattern - -The `name` parameter filters files by matching the name in the file path: - -```elixir -# Load only files containing "csv" in their filename -{:ok, csv_only} = ElixirDatasets.load_dataset( - {:local, "#{__DIR__}/../resources"}, - name: "csv" +{:ok, data} = ElixirDatasets.load_dataset( + {:hf, "glue", subdir: "mrpc"}, + num_proc: 4 ) -IO.puts("Loaded #{length(csv_only)} dataset(s) matching 'csv'") -[csv_df] = csv_only -IO.inspect(csv_df) +IO.puts("Loaded #{length(data)} splits") ``` ``` -Loaded 1 dataset(s) matching 'csv' -#Explorer.DataFrame< - Polars[11 x 2] - id s64 [0, 1, 2, 3, 4, ...] - number string ["csv", "one", "two", "three", "four", ...] -> -``` - - - -``` -#Explorer.DataFrame< - Polars[11 x 2] - id s64 [0, 1, 2, 3, 4, ...] - number string ["csv", "one", "two", "three", "four", ...] 
-> -``` - - - -``` -Dataset: aaaaa32r/elixirDatasets -Config: csv -Features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}] -Training examples: 10 +Loaded 3 splits ``` @@ -489,7 +461,7 @@ Validation examples: 1066 #### Force redownload with download_mode -Use `download_mode` to control caching behavior: +Force a fresh download even if the dataset is already cached. ```elixir {:ok, [fresh_data]} = ElixirDatasets.load_dataset( @@ -504,6 +476,7 @@ IO.puts("Freshly downloaded dataset has #{Explorer.DataFrame.n_rows(fresh_data)} ``` +|=============================================================| 100% (698.84 KB) Freshly downloaded dataset has 8530 rows ``` @@ -520,7 +493,7 @@ Available `download_mode` options: #### Skip verification with verification_mode -Use `verification_mode` to control validation checks: +Skip validation checks for faster loading when you trust the data source. ```elixir {:ok, [quick_data]} = ElixirDatasets.load_dataset( @@ -636,11 +609,14 @@ end ## Upload dataset +Upload your own datasets to Hugging Face Hub. + ### Prepare datasets to upload ```elixir -[ df_head | df_tail ] = ElixirDatasets.load_dataset!({:local, "#{__DIR__}/../resources"}) -nil +[df_head | _df_tail] = ElixirDatasets.load_dataset!({:local, "#{__DIR__}/../resources"}) +IO.puts("Prepared dataset with #{Explorer.DataFrame.n_rows(df_head)} rows for upload") +:ok ``` @@ -651,12 +627,16 @@ nil ### Upload dataset to huggingface hub +Upload a DataFrame as a dataset file (CSV, Parquet, or JSONL). + ```elixir -# Commented out to avoid cluttering the repository +# Uncomment to upload (requires HF_TOKEN) # ElixirDatasets.upload_dataset( # df_head, -# "aaaaa32r/elixirDatasets", -# [file_extension: "csv"]) +# "username/dataset-name", +# file_extension: "csv" +# ) +IO.puts("Upload example (commented out)") ``` @@ -667,11 +647,15 @@ nil ### Delete dataset file from huggingface hub +Remove a specific file from your dataset repository. 
+ ```elixir -# Commented out to avoid cluttering the repository +# Uncomment to delete a file (requires HF_TOKEN) # ElixirDatasets.Utils.Uploader.delete_file_from_dataset( -# "aaaaa32r/elixirDatasets", -# "briefly-576460442698708888-7FDZDhwtp6dOsH5dAT") +# "username/dataset-name", +# "file-id-to-delete" +# ) +IO.puts("Delete example (commented out)") ``` @@ -682,11 +666,15 @@ nil ### Upload dataset to huggingface hub via lfs +Upload large files using Git LFS for better performance. + ```elixir -# Commented out to avoid cluttering the repository +# Uncomment to upload via LFS (requires HF_TOKEN and git-lfs) # ElixirDatasets.Utils.Uploader.upload_file_via_lfs( -# "/Users/radoslawrolka/Downloads/companies-2023-q4-sm.csv.zip", -# "aaaaa32r/elixirDatasets") +# "/path/to/large-file.csv.zip", +# "username/dataset-name" +# ) +IO.puts("LFS upload example (commented out)") ``` @@ -697,10 +685,16 @@ nil ## Other loading methods +Query dataset metadata without downloading the full dataset. + ### Get dataset infos +Retrieve detailed information about a dataset including splits and features. + ```elixir -ElixirDatasets.get_dataset_infos("cornell-movie-review-data/rotten_tomatoes") +{:ok, infos} = ElixirDatasets.get_dataset_infos("cornell-movie-review-data/rotten_tomatoes") +IO.puts("Retrieved #{length(infos)} dataset configuration(s)") +infos ``` @@ -732,8 +726,12 @@ ElixirDatasets.get_dataset_infos("cornell-movie-review-data/rotten_tomatoes") ### Get dataset split names +Get available splits (train, validation, test) for a dataset. 
+ ```elixir -ElixirDatasets.get_dataset_split_names("cornell-movie-review-data/rotten_tomatoes") +{:ok, splits} = ElixirDatasets.get_dataset_split_names("cornell-movie-review-data/rotten_tomatoes") +IO.puts("Available splits: #{Enum.join(splits, ", ")}") +splits ``` @@ -744,8 +742,12 @@ ElixirDatasets.get_dataset_split_names("cornell-movie-review-data/rotten_tomatoe ### Get dataset config names +Get available configurations for datasets with multiple configs. + ```elixir -ElixirDatasets.get_dataset_config_names("aaaaa32r/elixirDatasets") +{:ok, configs} = ElixirDatasets.get_dataset_config_names("aaaaa32r/elixirDatasets") +IO.puts("Available configs: #{Enum.join(configs, ", ")}") +configs ``` @@ -756,10 +758,16 @@ ElixirDatasets.get_dataset_config_names("aaaaa32r/elixirDatasets") ### Write-to-file & read-from-file datasetInfo +Save and load dataset metadata to/from disk for offline use. + ```elixir {:ok, dataset_info} = ElixirDatasets.get_dataset_infos("aaaaa32r/elixirDatasets") ElixirDatasets.DatasetInfo.write_to_directory(dataset_info, "my-dir") -ElixirDatasets.DatasetInfo.from_directory("my-dir") +IO.puts("Saved dataset info to 'my-dir'") + +{:ok, loaded_info} = ElixirDatasets.DatasetInfo.from_directory("my-dir") +IO.puts("Loaded #{length(loaded_info)} dataset info(s) from disk") +loaded_info ``` From 5c035e62c396993d1abf7f78a0a87f6c9167c5d0 Mon Sep 17 00:00:00 2001 From: Weronika Date: Sat, 10 Jan 2026 16:22:54 +0100 Subject: [PATCH 06/10] Updated examples --- examples/usage_examples.livemd | 225 ++++++++++++++++++++++----------- 1 file changed, 150 insertions(+), 75 deletions(-) diff --git a/examples/usage_examples.livemd b/examples/usage_examples.livemd index 0697935..4a6d9a8 100644 --- a/examples/usage_examples.livemd +++ b/examples/usage_examples.livemd @@ -135,7 +135,7 @@ Use an authentication token to access private datasets. 
```elixir datasets = ElixirDatasets.load_dataset!( {:hf, "cornell-movie-review-data/rotten_tomatoes"}, - %{auth_token: auth_token} + auth_token: auth_token ) IO.puts("Loaded #{length(datasets)} dataset(s) with authentication") datasets @@ -144,25 +144,42 @@ datasets ``` -** (FunctionClauseError) no function clause matching in Keyword.take/2 - - The following arguments were given to Keyword.take/2: - - # 1 - %{auth_token: nil} - - # 2 - [:download_mode, :verification_mode] - - Attempted function clauses (showing 1 out of 1): - - def take(keywords, keys) when is_list(keywords) and is_list(keys) - - (elixir 1.18.3) lib/keyword.ex:1279: Keyword.take/2 - (elixir_datasets 0.0.1) lib/elixir_datasets/dataset_loader.ex:216: ElixirDatasets.Loader.merge_download_opts/2 - (elixir_datasets 0.0.1) lib/elixir_datasets/dataset_loader.ex:91: ElixirDatasets.Loader.load_dataset/2 - (elixir_datasets 0.0.1) lib/elixir_datasets/dataset_loader.ex:126: ElixirDatasets.Loader.load_dataset!/2 - #cell:3fjksspnhlhcid4k:1: (file) +Loaded 3 dataset(s) with authentication +``` + + + +``` +[ + #Explorer.DataFrame< + Polars[1066 x 2] + text string ["lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .", + "consistently clever and suspenseful .", + "it's like a \" big chill \" reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .", + "the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with tremendous skill .", + "red dragon \" never cuts corners .", ...] + label s64 [1, 1, 1, 1, 1, ...] 
+ >, + #Explorer.DataFrame< + Polars[8530 x 2] + text string ["the rock is destined to be the 21st century's new \" conan \" and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .", + "the gorgeously elaborate continuation of \" the lord of the rings \" trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth .", + "effective but too-tepid biopic", + "if you sometimes like to go to the movies to have fun , wasabi is a good place to start .", + "emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one .", + ...] + label s64 [1, 1, 1, 1, 1, ...] + >, + #Explorer.DataFrame< + Polars[1066 x 2] + text string ["compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .", + "the soundtrack alone is worth the price of admission .", + "rodriguez does a splendid job of racial profiling hollywood style--casting excellent latin actors of all ages--a trend long overdue .", + "beneath the film's obvious determination to shock at any cost lies considerable skill and determination , backed by sheer nerve .", + "bielinsky is a filmmaker of impressive talent .", ...] + label s64 [1, 1, 1, 1, 1, ...] + > +] ``` ### Load dataset from local resources @@ -547,6 +564,25 @@ sample_rows |> Enum.with_index(1) |> Enum.each(fn {row, idx} -> end) ``` + + +``` +βœ“ Created stream in streaming mode + Stream type: true + +Fetching first 3 rows... 
+✓ Fetched 3 rows + Row 1: [label, text] + Row 2: [label, text] + Row 3: [label, text] +``` + + + +``` +:ok +``` + #### Using custom cache directory Control where downloaded files are stored: @@ -622,7 +658,13 @@ IO.puts("Prepared dataset with #{Explorer.DataFrame.n_rows(df_head)} rows for up ``` -nil +Prepared dataset with 11 rows for upload +``` + + + +``` +:ok ``` ### Upload dataset to huggingface hub @@ -642,7 +684,13 @@ IO.puts("Upload example (commented out)") ``` -nil +Upload example (commented out) +``` + + + +``` +:ok ``` ### Delete dataset file from huggingface hub @@ -661,7 +709,13 @@ IO.puts("Delete example (commented out)") ``` -nil +Delete example (commented out) +``` + + + +``` +:ok ``` ### Upload dataset to huggingface hub via lfs @@ -680,16 +734,14 @@ IO.puts("LFS upload example (commented out)") ``` -nil +LFS upload example (commented out) ``` -## Other loading methods - -Query dataset metadata without downloading the full dataset. - -### Get dataset infos + -Retrieve detailed information about a dataset including splits and features. 
+``` +:ok +``` ```elixir {:ok, infos} = ElixirDatasets.get_dataset_infos("cornell-movie-review-data/rotten_tomatoes") @@ -700,28 +752,33 @@ infos ``` -{:ok, - [ - %ElixirDatasets.DatasetInfo{ - config_name: nil, - features: [ - %{"dtype" => "string", "name" => "text"}, - %{ - "dtype" => %{"class_label" => %{"names" => %{"0" => "neg", "1" => "pos"}}}, - "name" => "label" - } - ], - splits: [ - %{"name" => "train", "num_bytes" => 1074810, "num_examples" => 8530}, - %{"name" => "validation", "num_bytes" => 134679, "num_examples" => 1066}, - %{"name" => "test", "num_bytes" => 135972, "num_examples" => 1066} - ], - description: nil, - homepage: nil, - license: nil, - citation: nil - } - ]} +Retrieved 1 dataset configuration(s) +``` + + + +``` +[ + %ElixirDatasets.DatasetInfo{ + config_name: nil, + features: [ + %{"dtype" => "string", "name" => "text"}, + %{ + "dtype" => %{"class_label" => %{"names" => %{"0" => "neg", "1" => "pos"}}}, + "name" => "label" + } + ], + splits: [ + %{"name" => "train", "num_bytes" => 1074810, "num_examples" => 8530}, + %{"name" => "validation", "num_bytes" => 134679, "num_examples" => 1066}, + %{"name" => "test", "num_bytes" => 135972, "num_examples" => 1066} + ], + description: nil, + homepage: nil, + license: nil, + citation: nil + } +] ``` ### Get dataset split names @@ -737,7 +794,13 @@ splits ``` -{:ok, ["train", "validation", "test"]} +Available splits: train, validation, test +``` + + + +``` +["train", "validation", "test"] ``` ### Get dataset config names @@ -753,7 +816,13 @@ configs ``` -{:ok, ["csv", "default"]} +Available configs: csv, default +``` + + + +``` +["csv", "default"] ``` ### Write-to-file & read-from-file datasetInfo @@ -773,25 +842,31 @@ loaded_info ``` -{:ok, - [ - %ElixirDatasets.DatasetInfo{ - config_name: "csv", - features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}], - splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}], - description: nil, - homepage: 
nil, - license: nil, - citation: nil - }, - %ElixirDatasets.DatasetInfo{ - config_name: "default", - features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}], - splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}], - description: nil, - homepage: nil, - license: nil, - citation: nil - } - ]} +Saved dataset info to 'my-dir' +Loaded 2 dataset info(s) from disk +``` + + + +``` +[ + %ElixirDatasets.DatasetInfo{ + config_name: "csv", + features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}], + splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}], + description: nil, + homepage: nil, + license: nil, + citation: nil + }, + %ElixirDatasets.DatasetInfo{ + config_name: "default", + features: [%{"dtype" => "int64", "name" => "id"}, %{"dtype" => "string", "name" => "number"}], + splits: [%{"name" => "train", "num_bytes" => 160, "num_examples" => 10}], + description: nil, + homepage: nil, + license: nil, + citation: nil + } +] ``` From ea23ed0ba1f09c8ca3eb950e16d117f9e6e417b3 Mon Sep 17 00:00:00 2001 From: Weronika Date: Sat, 10 Jan 2026 16:24:51 +0100 Subject: [PATCH 07/10] Format --- lib/elixir_datasets/dataset_loader.ex | 1 - lib/elixir_datasets/filter.ex | 1 - lib/elixir_datasets/info_getter.ex | 1 - lib/elixir_datasets/repository.ex | 1 - lib/elixir_datasets/streaming.ex | 1 - test/elixir_datasets/dataset_loader_test.exs | 4 +++- test/elixir_datasets/filter_test.exs | 1 - test/elixir_datasets/info_getter_test.exs | 1 - test/elixir_datasets/repository_test.exs | 1 - test/elixir_datasets/streaming_test.exs | 1 - 10 files changed, 3 insertions(+), 10 deletions(-) diff --git a/lib/elixir_datasets/dataset_loader.ex b/lib/elixir_datasets/dataset_loader.ex index 65f2f88..6e6b550 100644 --- a/lib/elixir_datasets/dataset_loader.ex +++ b/lib/elixir_datasets/dataset_loader.ex @@ -219,4 +219,3 @@ defmodule ElixirDatasets.Loader do defp merge_download_opts(repository, 
_load_opts), do: repository end - diff --git a/lib/elixir_datasets/filter.ex b/lib/elixir_datasets/filter.ex index 98294d0..e34af09 100644 --- a/lib/elixir_datasets/filter.ex +++ b/lib/elixir_datasets/filter.ex @@ -101,4 +101,3 @@ defmodule ElixirDatasets.Filter do end end end - diff --git a/lib/elixir_datasets/info_getter.ex b/lib/elixir_datasets/info_getter.ex index 1150297..57e0f9a 100644 --- a/lib/elixir_datasets/info_getter.ex +++ b/lib/elixir_datasets/info_getter.ex @@ -171,4 +171,3 @@ defmodule ElixirDatasets.Info do end end end - diff --git a/lib/elixir_datasets/repository.ex b/lib/elixir_datasets/repository.ex index cfa6d2c..c9fa212 100644 --- a/lib/elixir_datasets/repository.ex +++ b/lib/elixir_datasets/repository.ex @@ -200,4 +200,3 @@ defmodule ElixirDatasets.Repository do end end end - diff --git a/lib/elixir_datasets/streaming.ex b/lib/elixir_datasets/streaming.ex index 7c1f60a..e70f0a3 100644 --- a/lib/elixir_datasets/streaming.ex +++ b/lib/elixir_datasets/streaming.ex @@ -187,4 +187,3 @@ defmodule ElixirDatasets.Streaming do defp cleanup(_state), do: :ok end - diff --git a/test/elixir_datasets/dataset_loader_test.exs b/test/elixir_datasets/dataset_loader_test.exs index 30c1edc..bdf97e3 100644 --- a/test/elixir_datasets/dataset_loader_test.exs +++ b/test/elixir_datasets/dataset_loader_test.exs @@ -73,7 +73,9 @@ defmodule ElixirDatasets.LoaderTest do assert {:ok, datasets} = Loader.load_dataset(repository) assert is_list(datasets) - repository_offline = {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, offline: true]} + repository_offline = + {:hf, "aaaaa32r/elixirDatasets", [cache_dir: @cache_dir, offline: true]} + assert {:ok, datasets} = Loader.load_dataset(repository_offline) assert is_list(datasets) diff --git a/test/elixir_datasets/filter_test.exs b/test/elixir_datasets/filter_test.exs index 99cfe75..84788f4 100644 --- a/test/elixir_datasets/filter_test.exs +++ b/test/elixir_datasets/filter_test.exs @@ -115,4 +115,3 @@ defmodule 
ElixirDatasets.FilterTest do end end end - diff --git a/test/elixir_datasets/info_getter_test.exs b/test/elixir_datasets/info_getter_test.exs index 3ff9e56..0b9e611 100644 --- a/test/elixir_datasets/info_getter_test.exs +++ b/test/elixir_datasets/info_getter_test.exs @@ -89,4 +89,3 @@ defmodule ElixirDatasets.InfoTest do end end end - diff --git a/test/elixir_datasets/repository_test.exs b/test/elixir_datasets/repository_test.exs index 5573c43..e218353 100644 --- a/test/elixir_datasets/repository_test.exs +++ b/test/elixir_datasets/repository_test.exs @@ -61,4 +61,3 @@ defmodule ElixirDatasets.RepositoryTest do end end end - diff --git a/test/elixir_datasets/streaming_test.exs b/test/elixir_datasets/streaming_test.exs index 506cf7c..f0d61ea 100644 --- a/test/elixir_datasets/streaming_test.exs +++ b/test/elixir_datasets/streaming_test.exs @@ -136,4 +136,3 @@ defmodule ElixirDatasets.StreamingTest do end end end - From 22a6ec22f94b24e9d45c309f8f6aa885c7099723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Rolka?= Date: Sun, 11 Jan 2026 11:18:32 +0100 Subject: [PATCH 08/10] Integration example (#50) Co-authored-by: Weronika --- examples/integration_examples.livemd | 213 +++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 examples/integration_examples.livemd diff --git a/examples/integration_examples.livemd b/examples/integration_examples.livemd new file mode 100644 index 0000000..f87bf3b --- /dev/null +++ b/examples/integration_examples.livemd @@ -0,0 +1,213 @@ +# Integration Examples + +```elixir +Mix.install([ + {:elixir_datasets, path: Path.join(__DIR__, "..")}, + {:nx, "~> 0.7"}, + {:axon, "~> 0.6"}, + {:bumblebee, "~> 0.5"}, + {:kino, "~> 0.12"} +]) +``` + +## Setup + +```elixir +auth_token = System.get_env("HF_TOKEN") +:ok +``` + +## Integration with Nx + +### Convert DataFrame to Nx Tensors + +Load a dataset and convert it to Nx tensors for numerical computing: + +```elixir +{:ok, [train_df]} = ElixirDatasets.load_dataset( 
+ {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "train" +) + +labels = + train_df + |> Explorer.DataFrame.pull("label") + |> Explorer.Series.to_list() + |> Nx.tensor() + +IO.puts("Labels tensor shape: #{inspect(Nx.shape(labels))}") +IO.puts("Labels tensor type: #{inspect(Nx.type(labels))}") +IO.inspect(labels[0..9], label: "First 10 labels") +``` + +### Prepare Data for Training + +```elixir +{:ok, [train_df]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "train" +) + +{:ok, [val_df]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "validation" +) + +train_labels = train_df |> Explorer.DataFrame.pull("label") |> Explorer.Series.to_list() |> Nx.tensor() +val_labels = val_df |> Explorer.DataFrame.pull("label") |> Explorer.Series.to_list() |> Nx.tensor() + +IO.puts("Training samples: #{Nx.size(train_labels)}") +IO.puts("Validation samples: #{Nx.size(val_labels)}") + +positive_count = train_labels |> Nx.sum() |> Nx.to_number() +total_count = Nx.size(train_labels) +IO.puts("Positive class ratio: #{Float.round(positive_count / total_count, 3)}") +``` + +## Integration with Bumblebee + +### Fill-Mask with DistilBERT (Quick Demo) + +Demonstrate Bumblebee integration with ElixirDatasets: + +```elixir +{:ok, model_info} = Bumblebee.load_model({:hf, "distilbert/distilbert-base-uncased"}) +{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "distilbert/distilbert-base-uncased"}) + +serving = Bumblebee.Text.fill_mask(model_info, tokenizer) + +IO.puts("βœ“ Model loaded successfully!") +``` + +Use the model with data from ElixirDatasets: + +```elixir +{:ok, [test_df]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "validation" +) + +sample_text = + test_df + |> Explorer.DataFrame.slice(0, 1) + |> Explorer.DataFrame.pull("text") + |> Explorer.Series.first() + +IO.puts("\n=== Dataset Sample ===") +IO.puts("Dataset: 
rotten_tomatoes") +IO.puts("Sample: #{String.slice(sample_text, 0, 80)}...") + +masked_text = "This movie is [MASK]." + +IO.puts("\nRunning inference (first run compiles ~1-2 min)...") + +result = Nx.Serving.run(serving, masked_text) +top = result.predictions |> List.first() + +IO.puts("\n=== Fill-Mask Result ===") +IO.puts("Input: #{masked_text}") +IO.puts("Predicted: '#{top.token}' (score: #{Float.round(top.score, 3)})") +``` + +## Integration with Axon + +### Build a Simple Neural Network + +Create a text classification model using Axon with data from ElixirDatasets: + +```elixir +{:ok, [train_df]} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "train" +) + +model = + Axon.input("input", shape: {nil, 100}) + |> Axon.dense(64, activation: :relu) + |> Axon.dropout(rate: 0.5) + |> Axon.dense(32, activation: :relu) + |> Axon.dense(2, activation: :softmax) + +Axon.Display.as_graph(model, Nx.template({1, 100}, :f32)) +``` + +### Streaming Data for Training + +Use streaming to efficiently process large datasets: + +```elixir +{:ok, stream} = ElixirDatasets.load_dataset( + {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + split: "train", + streaming: true +) + +batch_size = 32 + +batched_stream = + stream + |> Stream.chunk_every(batch_size) + |> Stream.take(5) + +IO.puts("Processing batches of #{batch_size} samples:\n") +for {batch, idx} <- Enum.with_index(batched_stream, 1) do + labels = batch |> Enum.map(& &1["label"]) |> Nx.tensor() + IO.puts("Batch #{idx}: #{length(batch)} samples, labels shape: #{inspect(Nx.shape(labels))}") +end +``` + +## Advanced: Custom Data Pipeline + +### Combine ElixirDatasets with Nx and Axon for End-to-End Training + +```elixir +defmodule DataPipeline do + @doc """ + Creates a data pipeline that loads, preprocesses, and batches data + """ + def create_pipeline(dataset_name, split, batch_size) do + {:ok, stream} = ElixirDatasets.load_dataset( + {:hf, dataset_name}, + split: split, + 
streaming: true + ) + + stream + |> Stream.chunk_every(batch_size) + |> Stream.map(&prepare_batch/1) + end + + defp prepare_batch(batch) do + labels = + batch + |> Enum.map(& &1["label"]) + |> Nx.tensor() + + texts = Enum.map(batch, & &1["text"]) + + {texts, labels} + end +end + +pipeline = DataPipeline.create_pipeline( + "cornell-movie-review-data/rotten_tomatoes", + "train", + 16 +) + +{texts, labels} = Enum.at(pipeline, 0) +IO.puts("Batch size: #{length(texts)}") +IO.puts("Labels shape: #{inspect(Nx.shape(labels))}") +IO.puts("Sample text: #{List.first(texts) |> String.slice(0..100)}...") +``` + +## Summary + +This notebook demonstrates how to integrate ElixirDatasets with: + +* **Nx**: Convert DataFrames to tensors for numerical computing +* **Bumblebee**: Use pre-trained models with loaded datasets +* **Axon**: Build and train neural networks with dataset streams +* **Custom Pipelines**: Create efficient data processing workflows + +These integrations enable you to build complete machine learning pipelines in Elixir! 
From 836f5507bc468f9feef8b58b7014b8add777bf64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Rolka?= Date: Sun, 11 Jan 2026 17:03:15 +0100 Subject: [PATCH 09/10] PR fixes --- README.md | 91 +++--------------------- examples/integration_examples.livemd | 2 +- examples/usage_examples.livemd | 23 ++++-- lib/elixir_datasets.ex | 35 +++++---- lib/elixir_datasets/dataset_loader.ex | 33 +++++---- lib/elixir_datasets/huggingface/hub.ex | 2 +- lib/elixir_datasets/utils/file_loader.ex | 6 +- 7 files changed, 67 insertions(+), 125 deletions(-) diff --git a/README.md b/README.md index d17e249..868ebbd 100644 --- a/README.md +++ b/README.md @@ -4,18 +4,16 @@ [![Documentation](https://img.shields.io/badge/docs-hexdocs-blue.svg)](https://hexdocs.pm/elixir_datasets) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -**ElixirDatasets** is a comprehensive library for accessing and managing datasets from Hugging Face Hub in Elixir. Inspired by the Python `datasets` library, it brings powerful dataset management capabilities to the Elixir ecosystem with seamless integration with Explorer DataFrames. +**ElixirDatasets** is a comprehensive library for accessing and managing datasets from Hugging Face Hub in Elixir. Inspired by the [Python `datasets` library](https://github.com/huggingface/datasets), it brings powerful dataset management capabilities to the Elixir ecosystem with seamless integration with Explorer DataFrames. 
## ✨ Features - 🚀 **Easy Access to Hugging Face Hub** - Load thousands of datasets with a single function call - 📊 **Explorer Integration** - Automatic conversion to Explorer DataFrames for data manipulation -- ⚡ **High Performance** - Parallel processing support for loading multiple files - 💾 **Smart Caching** - Intelligent local caching to avoid redundant downloads - 🌊 **Streaming Support** - Process large datasets without loading everything into memory - 📤 **Upload Datasets** - Publish your own datasets to Hugging Face Hub - 🔒 **Private Repositories** - Full support for authentication and private datasets -- 🔌 **Offline Mode** - Work with cached datasets without internet connection - 🎯 **Multiple Formats** - Support for CSV, Parquet, and JSONL files ## 📦 Installation @@ -46,60 +44,14 @@ end streaming: true ) -stream |> Enum.take(100) |> Enum.each(&process_row/1) +stream |> Enum.take(100) |> IO.inspect() ``` ## 📚 Examples -### Text Classification with Sentiment Analysis - -```elixir -{:ok, [train_df]} = ElixirDatasets.load_dataset( - {:hf, "cornell-movie-review-data/rotten_tomatoes"}, - split: "train" -) - -require Explorer.DataFrame, as: DF - -train_df -|> DF.head(5) -|> IO.inspect() - -{:ok, splits} = ElixirDatasets.get_dataset_split_names( - "cornell-movie-review-data/rotten_tomatoes" -) -IO.inspect(splits) -``` - -### Streaming Large Datasets - -```elixir -{:ok, stream} = ElixirDatasets.load_dataset( - {:hf, "stanfordnlp/imdb", subdir: "plain_text"}, - split: "train", - streaming: true -) - -stream -|> Stream.filter(fn row -> String.length(row["text"]) > 100 end) -|> Stream.take(1000) -|> Enum.each(&process_review/1) -``` - -### Working Offline - -```elixir -{:ok, _} = ElixirDatasets.load_dataset( - {:hf, "cornell-movie-review-data/rotten_tomatoes"}, - split: "train" -) - -{:ok, [data]} = ElixirDatasets.load_dataset( - {:hf, "cornell-movie-review-data/rotten_tomatoes"}, - split: "train", - offline: true -) -``` +All examples can be 
found in the [examples](examples) directory. +- `examples/usage_examples.livemd` - Comprehensive usage examples of the elixir_datasets api +- `examples/integration_examples.livemd` - Examples demonstrating integration with other Elixir libraries like [Nx](https://github.com/elixir-nx/nx), [Axon](https://github.com/elixir-nx/axon), and [Bumblebee](https://github.com/elixir-nx/bumblebee) ## 🔧 Configuration @@ -108,33 +60,20 @@ stream - `ELIXIR_DATASETS_CACHE_DIR` - Custom cache directory - `ELIXIR_DATASETS_OFFLINE` - Enable offline mode (`"1"` or `"true"`) - `HF_TOKEN` - Authentication token for private datasets - -See the [full documentation](https://hexdocs.pm/elixir_datasets) for all available options. +- [🚧 In-progress] `HF_DEBUG` - Enable debug logging (`"1"` or `"true"`) ## 📖 Documentation -Full documentation is available at [HexDocs](https://hexdocs.pm/elixir_datasets). - -## 📓 Interactive Examples - -Explore interactive examples in Livebook: `examples/usage_examples.livemd` +Full documentation is available at [HexDocs](https://hexdocs.pm/elixir_datasets) and hosted on [GitHub Pages](https://radoslawrolka.github.io/ElixirDatasets/api-reference.html) for current status of under-development features. Documentation can be generated locally using: ```bash -mix escript.install hex livebook - -livebook server examples/usage_examples.livemd +mix docs ``` -The notebook includes examples for loading, streaming, parallel processing, and uploading datasets. 
- ## 🧪 Testing ```bash -mix test - -mix coveralls - -mix test test/elixir_datasets_test.exs +MIX_ENV=test mix test ``` ## 📄 License @@ -143,16 +82,4 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file Copyright (c) 2025 Radosław Rolka, Weronika Wojtas -## 🙏 Acknowledgments - -- Inspired by [Hugging Face Datasets](https://github.com/huggingface/datasets) -- Built with [Explorer](https://github.com/elixir-nx/explorer) for DataFrame operations -- Uses [Req](https://github.com/wojtekmach/req) for HTTP requests - -## 📞 Support - -- 📚 [Documentation](https://hexdocs.pm/elixir_datasets) -- 🐛 [Issue Tracker](https://github.com/yourusername/elixir_datasets/issues) -- 💬 [Discussions](https://github.com/yourusername/elixir_datasets/discussions) - --- diff --git a/examples/integration_examples.livemd b/examples/integration_examples.livemd index f87bf3b..8eb98e8 100644 --- a/examples/integration_examples.livemd +++ b/examples/integration_examples.livemd @@ -2,7 +2,7 @@ ```elixir Mix.install([ - {:elixir_datasets, path: Path.join(__DIR__, "..")}, + {:elixir_datasets, "~> 0.1.0"}, {:nx, "~> 0.7"}, {:axon, "~> 0.6"}, {:bumblebee, "~> 0.5"}, diff --git a/examples/usage_examples.livemd b/examples/usage_examples.livemd index 4a6d9a8..adeacd2 100644 --- a/examples/usage_examples.livemd +++ b/examples/usage_examples.livemd @@ -3,14 +3,10 @@ # Usage examples ```elixir -##### Target version # Install dependencies -# Mix.install([ -# {:elixir_datasets, "0.0.1"} -# ]) -##### Local dev-testing version Mix.install([ - {:elixir_datasets, path: "#{__DIR__}/.."} + {:elixir_datasets, "0.1.0"} +# {:elixir_datasets, path: "#{__DIR__}/.."} # Local dev-testing version ]) # get auth_token explicitly for downloading from HuggingFace @@ -49,6 +45,7 @@ datasets ``` +|===============================================================| 100% (1.51 MB) Loaded 1 dataset(s) ``` @@ -57,7 +54,7 @@ Loaded 1 dataset(s) ``` [ #Explorer.DataFrame< - Polars[974 x 5] 
+ Polars[983 x 5] act string ["Ethereum Developer", "Linux Terminal", "English Translator and Improver", "Job Interviewer", "JavaScript Console", ...] prompt string ["Imagine you are an experienced Ethereum developer tasked with creating a smart contract for a blockchain messenger. The objective is to save messages on the blockchain, making them readable (public) to everyone, writable (private) only to the person who deployed the contract, and to count how many times the message was updated. Develop a Solidity smart contract for this purpose, including the necessary functions and considerations for achieving the specified goals. Please provide the code and any relevant explanations to ensure a clear understanding of the implementation.", @@ -88,6 +85,9 @@ datasets ``` +|==============================================================| 100% (20.47 MB) +|==============================================================| 100% (20.97 MB) +|==============================================================| 100% (41.99 MB) Loaded 3 dataset(s) from 'plain_text' configuration ``` @@ -144,6 +144,9 @@ datasets ``` +|==============================================================| 100% (92.20 KB) +|=============================================================| 100% (698.84 KB) +|==============================================================| 100% (90.00 KB) Loaded 3 dataset(s) with authentication ``` @@ -278,6 +281,9 @@ IO.puts("Loaded #{length(sst2_data)} dataset(s) from 'sst2' configuration") ``` +|=============================================================| 100% (147.79 KB) +|===============================================================| 100% (3.11 MB) +|==============================================================| 100% (72.81 KB) Loaded 3 dataset(s) from 'sst2' configuration ``` @@ -435,6 +441,9 @@ IO.puts("Loaded #{length(data)} splits") ``` +|=============================================================| 100% (308.44 KB) 
+|=============================================================| 100% (649.28 KB) +|==============================================================| 100% (75.67 KB) Loaded 3 splits ``` diff --git a/lib/elixir_datasets.ex b/lib/elixir_datasets.ex index be6f052..e673a33 100644 --- a/lib/elixir_datasets.ex +++ b/lib/elixir_datasets.ex @@ -23,17 +23,17 @@ defmodule ElixirDatasets do ## Examples # Load a dataset from Hugging Face - {:ok, datasets} = ElixirDatasets.load_dataset({:hf, "imdb"}) + iex> {:ok, datasets} = ElixirDatasets.load_dataset({:hf, "imdb"}) # Load with specific split - {:ok, train_data} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train") + iex> {:ok, train_data} = ElixirDatasets.load_dataset({:hf, "imdb"}, split: "train") # Stream large datasets - {:ok, stream} = ElixirDatasets.load_dataset({:hf, "c4"}, streaming: true) - stream |> Enum.take(100) + iex> {:ok, stream} = ElixirDatasets.load_dataset({:hf, "c4"}, streaming: true) + iex> stream |> Enum.take(100) # Get dataset information - {:ok, info} = ElixirDatasets.get_dataset_info("imdb") + iex> {:ok, info} = ElixirDatasets.get_dataset_info("imdb") """ @compile if Mix.env() == :test, do: :export_all @@ -250,19 +250,18 @@ defmodule ElixirDatasets do ## Examples - ElixirDatasets.load_dataset({:hf, "dataset_name"}, split: "train") + iex> ElixirDatasets.load_dataset({:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train") - ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2") + iex> ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2") - ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2", split: "train") + iex> ElixirDatasets.load_dataset({:hf, "glue"}, name: "sst2", split: "train") + iex> {:ok, stream} = ElixirDatasets.load_dataset( + ...> {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + ...> split: "train", + ...> streaming: true + ...> ) - {:ok, stream} = ElixirDatasets.load_dataset( - {:hf, "large_dataset"}, - split: "train", - streaming: true - ) - - stream |> 
Stream.take(100) |> Enum.each(&process_row/1) + ...> stream |> Stream.take(3) |> IO.inspect() """ @spec load_dataset(t_repository(), keyword()) :: @@ -286,10 +285,10 @@ defmodule ElixirDatasets do ## Examples - datasets = ElixirDatasets.load_dataset!({:hf, "dataset_name"}, split: "train") + iex> datasets = ElixirDatasets.load_dataset!({:hf, "cornell-movie-review-data/rotten_tomatoes"}, split: "train") - stream = ElixirDatasets.load_dataset!({:hf, "dataset"}, streaming: true) - stream |> Enum.take(10) + iex> stream = ElixirDatasets.load_dataset!({:hf, "cornell-movie-review-data/rotten_tomatoes"}, streaming: true) + iex> stream |> Enum.take(10) """ @spec load_dataset!(t_repository(), keyword()) :: diff --git a/lib/elixir_datasets/dataset_loader.ex b/lib/elixir_datasets/dataset_loader.ex index 6e6b550..934c33a 100644 --- a/lib/elixir_datasets/dataset_loader.ex +++ b/lib/elixir_datasets/dataset_loader.ex @@ -66,17 +66,16 @@ defmodule ElixirDatasets.Loader do ## Examples - ElixirDatasets.Loader.load_dataset({:hf, "dataset_name"}, split: "train") + iex> ElixirDatasets.Loader.load_dataset({:hf, "dataset_name"}, split: "train") - ElixirDatasets.Loader.load_dataset({:hf, "glue"}, name: "sst2") + iex> ElixirDatasets.Loader.load_dataset({:hf, "glue"}, name: "sst2") - {:ok, stream} = ElixirDatasets.Loader.load_dataset( - {:hf, "large_dataset"}, - split: "train", - streaming: true - ) - - stream |> Stream.take(100) |> Enum.each(&process_row/1) + iex> {:ok, stream} = ElixirDatasets.Loader.load_dataset( + ...> {:hf, "cornell-movie-review-data/rotten_tomatoes"}, + ...> split: "train", + ...> streaming: true + ...> ) + ...> stream |> Stream.take(100) |> IO.inspect() """ @spec load_dataset(Repository.t_repository(), keyword()) :: @@ -114,10 +113,10 @@ defmodule ElixirDatasets.Loader do ## Examples - datasets = ElixirDatasets.Loader.load_dataset!({:hf, "dataset_name"}, split: "train") + iex> datasets = ElixirDatasets.Loader.load_dataset!({:hf, 
"cornell-movie-review-data/rotten_tomatoes"}, split: "train") - stream = ElixirDatasets.Loader.load_dataset!({:hf, "dataset"}, streaming: true) - stream |> Enum.take(10) + iex> stream = ElixirDatasets.Loader.load_dataset!({:hf, "cornell-movie-review-data/rotten_tomatoes"}, streaming: true) + iex> stream |> Enum.take(10) """ @spec load_dataset!(Repository.t_repository(), keyword()) :: @@ -212,7 +211,15 @@ defmodule ElixirDatasets.Loader do end defp merge_download_opts({:hf, repository_id, repo_opts}, load_opts) do - download_opts = [:download_mode, :verification_mode] + download_opts = [ + :download_mode, + :verification_mode, + :cache_dir, + :auth_token, + :offline, + :revision + ] + merged_opts = Keyword.merge(repo_opts, Keyword.take(load_opts, download_opts)) {:hf, repository_id, merged_opts} end diff --git a/lib/elixir_datasets/huggingface/hub.ex b/lib/elixir_datasets/huggingface/hub.ex index ff84e75..ad119f8 100644 --- a/lib/elixir_datasets/huggingface/hub.ex +++ b/lib/elixir_datasets/huggingface/hub.ex @@ -76,7 +76,7 @@ defmodule ElixirDatasets.HuggingFace.Hub do """ @spec cached_download(String.t(), keyword()) :: {:ok, String.t()} | {:error, String.t()} def cached_download(url, opts \\ []) do - cache_dir = opts[:cache_dir] || ElixirDatasets.cache_dir() + cache_dir = opts[:cache_dir] || ElixirDatasets.cache_dir() |> Path.expand() offline = Keyword.get(opts, :offline, elixir_datasets_offline?()) auth_token = opts[:auth_token] download_mode = opts[:download_mode] || :reuse_dataset_if_exists diff --git a/lib/elixir_datasets/utils/file_loader.ex b/lib/elixir_datasets/utils/file_loader.ex index fcd8715..fbeaeeb 100644 --- a/lib/elixir_datasets/utils/file_loader.ex +++ b/lib/elixir_datasets/utils/file_loader.ex @@ -96,11 +96,11 @@ defmodule ElixirDatasets.Utils.Loader do ## Examples # Sequential loading - paths = [{"data1.csv", "csv"}, {"data2.parquet", "parquet"}] - datasets = load_datasets_from_paths!(paths) + iex> paths = [{"data1.csv", "csv"}, {"data2.parquet", 
"parquet"}] + iex> datasets = load_datasets_from_paths!(paths) # Parallel loading with 4 processes - datasets = load_datasets_from_paths!(paths, 4) + iex> datasets = load_datasets_from_paths!(paths, 4) """ @spec load_datasets_from_paths!([{Path.t(), String.t()}], pos_integer()) :: [ From 6820e1f76f01c258c76ab8f901c6a0b04caf8fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Rolka?= Date: Sun, 11 Jan 2026 17:16:47 +0100 Subject: [PATCH 10/10] Update lib/elixir_datasets/repository.ex Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- lib/elixir_datasets/repository.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/elixir_datasets/repository.ex b/lib/elixir_datasets/repository.ex index c9fa212..4fbdc50 100644 --- a/lib/elixir_datasets/repository.ex +++ b/lib/elixir_datasets/repository.ex @@ -7,7 +7,7 @@ defmodule ElixirDatasets.Repository do @typedoc """ A location to fetch dataset files from. - Can be either a Hugging Face repository or a local resources: + Can be either a Hugging Face repository or a local resource: * `{:hf, repository_id}` - the Hugging Face repository ID