diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 6781d2e..9d56552 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -103,6 +103,7 @@ Each provider module generates a specific type of fake data: | company.rs | company, job, catch_phrase | Business data | | network.rs | url, domain_name, ipv4, ipv6, mac_address | Network identifiers | | finance.rs | credit_card, iban | Financial identifiers with valid checksums | +| packages.rs | commit_sha, semver, calver, spdx_license, git_username, pypi/npm/cargo/gem/maven package names, version constraints, maven_coordinate, pypi_requirement | Package-registry data for PyPI, npm, Maven, Cargo, RubyGems | | records.rs | records | Structured data from schema DSL (Rust-only, not yet exposed to Python) | All providers follow the same pattern: @@ -125,6 +126,8 @@ Static data organized by locale, embedded at compile time as `&'static [&str]`: - `countries.rs`: ~200 countries - `companies.rs`: Company name components - `tlds.rs`: ~20 top-level domains + - `spdx_licenses.rs`: 50 common SPDX license identifiers + - `packages.rs`: package-name keywords, modifiers, Maven/npm scope components, pre-release tags, Maven qualifiers Each data file includes tests for uniqueness and non-empty values. diff --git a/CHANGELOG.md b/CHANGELOG.md index cbbac1c..0d40830 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.0] - 2026-04-17 + ### Added +- **Package Registry Providers**: Cross-ecosystem fake data for seeding PyPI, + npm, Maven, Cargo, and RubyGems test databases. 22 method pairs + (44 Python-visible methods). + - Cross-ecosystem primitives: `commit_sha()` / `short_commit_sha()`, + `semver()` / `semver_prerelease()`, `calver()`, `spdx_license()` (50 + common IDs), `git_username()` (enforces GitHub's rules: alphanumerics + and single hyphens, no leading/trailing hyphen, no consecutive hyphens, + ≤ 39 chars). 
+ - Ecosystem-specific versions: `pypi_version()` (PEP 440 — includes + pre/post/dev releases), `maven_version()` (with qualifiers like + `-SNAPSHOT`, `.RELEASE`, `.Final`, `-RC1`). + - Version constraints: `pypi_version_specifier()` (PEP 440), + `npm_version_range()`, `cargo_version_req()`, `maven_version_range()`, + `gem_version_requirement()`. + - Package identity: `pypi_package_name()` (PEP 503 normalised: lowercase + `[a-z0-9-]`, hyphen as the sole separator), + `npm_package_name()` (plain or `@scope/pkg`), `cargo_package_name()`, + `gem_name()`, `maven_group_id()` (reverse domain), + `maven_artifact_id()`, `maven_coordinate()` (GAV form + `group:artifact:version`). + - Full requirement line: `pypi_requirement()` (e.g., + `requests>=2.0.0,<3.0.0`). + - All batch methods support parallel generation via `set_parallel()`. - **Parallel Generation**: Opt-in multi-threaded batch generation via Rayon - `set_parallel(enabled, num_threads=None)`: Enable/disable parallel mode - `get_parallel()` / `get_num_threads()`: Query current parallel settings @@ -18,6 +43,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - ~3.3x speedup at 100K+ items (names: 83ms -> 25ms for 1M items) - `unique=True` always uses sequential path (requires shared state) - Criterion benchmarks for parallel vs sequential comparison +- **Streaming file writer**: `records_to_file(path, n, schema, ...)` generates + records in chunks and writes each chunk to disk, keeping peak memory bounded + by `chunk_size` regardless of `n`. Supports CSV, NDJSON, SQL, and Parquet + with auto-detection from the file extension. Includes an optional progress + callback and an `estimate_memory()` utility. 
+- **Serialized output formats** for `records()` — serialised directly in Rust, + avoiding the cost of materialising Python objects before serialising: + - `records_csv()` — RFC 4180 CSV with header row + - `records_json()` — JSON array with proper scalar types + - `records_ndjson()` — newline-delimited JSON + - `records_parquet()` — Parquet bytes via the Arrow path + - `records_sql()` — ANSI SQL `INSERT`s, batched at 1000 rows, + with identifier quoting ## [0.3.0] - 2026-03-17 @@ -160,6 +198,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SonarCloud integration for code quality - CodeQL static analysis -[Unreleased]: https://github.com/williajm/forgery/compare/v0.3.0...HEAD +[Unreleased]: https://github.com/williajm/forgery/compare/v0.4.0...HEAD +[0.4.0]: https://github.com/williajm/forgery/compare/v0.3.0...v0.4.0 [0.3.0]: https://github.com/williajm/forgery/compare/v0.2.0...v0.3.0 [0.1.0]: https://github.com/williajm/forgery/releases/tag/v0.1.0 diff --git a/Cargo.lock b/Cargo.lock index 5e532d3..a8fa33a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -503,7 +503,7 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "forgery" -version = "0.3.0" +version = "0.4.0" dependencies = [ "arrow-array", "arrow-buffer", diff --git a/Cargo.toml b/Cargo.toml index 804e8a2..4a140fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "forgery" -version = "0.3.0" +version = "0.4.0" edition = "2021" description = "Fake data at the speed of Rust" license = "MIT" diff --git a/README.md b/README.md index caeb661..6df91ec 100644 --- a/README.md +++ b/README.md @@ -412,6 +412,87 @@ License plate formats by locale: | `it_IT` | AB 123 CD | `"FG 482 HJ"` | | `ja_JP` | 300 12-34 | `"500 38-47"` | +### Package Registry Data + +For seeding test databases of package registries (PyPI, npm, Maven, Cargo, RubyGems). 
+Cross-ecosystem primitives share one API; ecosystem-specific shapes have their own +methods. + +**Cross-ecosystem primitives** + +| Batch | Single | Description | +|-------|--------|-------------| +| `commit_shas(n)` | `commit_sha()` | 40-hex-char git commit SHA | +| `short_commit_shas(n)` | `short_commit_sha()` | 7-hex-char short SHA | +| `semvers(n)` | `semver()` | SemVer `MAJOR.MINOR.PATCH` | +| `semver_prereleases(n)` | `semver_prerelease()` | Pre-release (e.g. `1.2.3-alpha.1+build.5`) | +| `calvers(n)` | `calver()` | CalVer in mixed schemes (`YYYY.MM.DD`, `YY.MM`, ...) | +| `spdx_licenses(n)` | `spdx_license()` | SPDX identifier (50 common IDs) | +| `git_usernames(n)` | `git_username()` | GitHub/GitLab/Bitbucket-compatible username | + +**Ecosystem-specific versions** (where SemVer alone doesn't cover the format) + +| Batch | Single | Description | +|-------|--------|-------------| +| `pypi_versions(n)` | `pypi_version()` | PEP 440 (pre/post/dev releases) | +| `maven_versions(n)` | `maven_version()` | Maven version with qualifiers (`-SNAPSHOT`, `.RELEASE`, ...) | + +**Version constraints** + +| Batch | Single | Description | +|-------|--------|-------------| +| `pypi_version_specifiers(n)` | `pypi_version_specifier()` | PEP 440 (e.g. `>=1.2,<2.0`, `~=1.0`) | +| `npm_version_ranges(n)` | `npm_version_range()` | npm (e.g. `^1.2.3`, `~1.2.3`, `1.x`) | +| `cargo_version_reqs(n)` | `cargo_version_req()` | Cargo (e.g. `^1.0`, `~1.2`) | +| `maven_version_ranges(n)` | `maven_version_range()` | Maven (e.g. `[1.0,2.0)`) | +| `gem_version_requirements(n)` | `gem_version_requirement()` | RubyGems (e.g. 
`~> 1.2`) | + +**Package identity** + +| Batch | Single | Description | +|-------|--------|-------------| +| `pypi_package_names(n)` | `pypi_package_name()` | PEP 503 normalised (lowercase `[a-z0-9-]`) | +| `npm_package_names(n)` | `npm_package_name()` | Plain or `@scope/pkg` (~30% scoped) | +| `cargo_package_names(n)` | `cargo_package_name()` | Rust-ident flavour | +| `gem_names(n)` | `gem_name()` | RubyGems gem name | +| `maven_group_ids(n)` | `maven_group_id()` | Reverse domain (e.g. `com.example.tools`) | +| `maven_artifact_ids(n)` | `maven_artifact_id()` | Lowercase with hyphens | +| `maven_coordinates(n)` | `maven_coordinate()` | GAV (`group:artifact:version`) | + +**Full requirement lines** + +| Batch | Single | Description | +|-------|--------|-------------| +| `pypi_requirements(n)` | `pypi_requirement()` | e.g. `requests>=2.0.0,<3.0.0` | + +```python +from forgery import Faker + +fake = Faker() +fake.seed(42) +fake.pypi_requirement() # 'requests>=2.0.0,<3.0.0' +fake.maven_coordinate() # 'com.example.tools:widget-core:1.2.3-SNAPSHOT' +fake.npm_package_name() # '@types/fast-parser' +fake.spdx_license() # 'Apache-2.0' +fake.git_username() # 'tiny-logger42' +fake.commit_sha() # 'a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2' +``` + +The nine batch methods below accept `unique=True` for no-duplicate output, +matching the `names(n, unique=True)` pattern — useful when seeding registry +tables that have a unique-name constraint. Exhausting the combinatorial pool +raises `ValueError`: + +```python +fake.pypi_package_names(100, unique=True) # 100 distinct package names +fake.maven_coordinates(500, unique=True) # 500 distinct GAVs +fake.spdx_licenses(60, unique=True) # ValueError: only 50 SPDX IDs available +``` + +Methods with `unique` support: `pypi_package_names`, `npm_package_names`, +`cargo_package_names`, `gem_names`, `maven_group_ids`, `maven_artifact_ids`, +`maven_coordinates`, `git_usernames`, `spdx_licenses`. 
+ ### Profile | Batch | Single | Description | diff --git a/pyproject.toml b/pyproject.toml index 070ee78..5e57319 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "forgery" -version = "0.3.0" +version = "0.4.0" description = "Fake data at the speed of Rust" readme = "README.md" license = { text = "MIT" } diff --git a/python/forgery/__init__.py b/python/forgery/__init__.py index 2085a79..1d8cdd0 100644 --- a/python/forgery/__init__.py +++ b/python/forgery/__init__.py @@ -81,6 +81,12 @@ "booleans", "bothify", "bothify_batch", + "calver", + "calvers", + "cargo_package_name", + "cargo_package_names", + "cargo_version_req", + "cargo_version_reqs", "catch_phrase", "catch_phrases", "chrome", @@ -89,6 +95,8 @@ "city", "color", "colors", + "commit_sha", + "commit_shas", "companies", "company", "content_type_header", @@ -145,8 +153,14 @@ "floats", "free_email", "free_emails", + "gem_name", + "gem_names", + "gem_version_requirement", + "gem_version_requirements", "generate", "generate_batch", + "git_username", + "git_usernames", "has_provider", "hex_color", "hex_colors", @@ -187,6 +201,16 @@ "longitudes", "mac_address", "mac_addresses", + "maven_artifact_id", + "maven_artifact_ids", + "maven_coordinate", + "maven_coordinates", + "maven_group_id", + "maven_group_ids", + "maven_version", + "maven_version_range", + "maven_version_ranges", + "maven_versions", "md5", "md5s", "meta_description", @@ -195,6 +219,10 @@ "mime_types", "name", "names", + "npm_package_name", + "npm_package_names", + "npm_version_range", + "npm_version_ranges", "numerify", "numerify_batch", "og_tags", @@ -215,6 +243,14 @@ "product_names", "profile", "profiles", + "pypi_package_name", + "pypi_package_names", + "pypi_requirement", + "pypi_requirements", + "pypi_version", + "pypi_version_specifier", + "pypi_version_specifiers", + "pypi_versions", "query_string", "query_strings", "records", @@ -239,12 +275,20 @@ "safe_email", "safe_emails", "seed", + 
"semver", + "semver_prerelease", + "semver_prereleases", + "semvers", "sentence", "sentences", "sha256", "sha256s", + "short_commit_sha", + "short_commit_shas", "sort_code", "sort_codes", + "spdx_license", + "spdx_licenses", "ssn", "ssns", "state", @@ -285,7 +329,7 @@ "zip_codes", ] -__version__ = "0.3.0" +__version__ = "0.4.0" # Default Faker instance for convenient access. # WARNING: Not thread-safe. For multi-threaded use, create separate Faker instances. @@ -2494,3 +2538,242 @@ def website(pages: int = 10, domain: str = _DEFAULT_DOMAIN) -> dict[str, str]: ValueError: If pages exceeds 10,000. """ return fake.website(pages, domain) + + +# === Package Registry Data === + + +def commit_sha() -> str: + """Generate a single full-length git commit SHA (40 lowercase hex chars).""" + return fake.commit_sha() + + +def commit_shas(n: int) -> list[str]: + """Generate a batch of full-length git commit SHAs.""" + return fake.commit_shas(n) + + +def short_commit_sha() -> str: + """Generate a single short git commit SHA (7 lowercase hex chars).""" + return fake.short_commit_sha() + + +def short_commit_shas(n: int) -> list[str]: + """Generate a batch of short git commit SHAs.""" + return fake.short_commit_shas(n) + + +def semver() -> str: + """Generate a single SemVer ``MAJOR.MINOR.PATCH`` string.""" + return fake.semver() + + +def semvers(n: int) -> list[str]: + """Generate a batch of SemVer version strings.""" + return fake.semvers(n) + + +def semver_prerelease() -> str: + """Generate a single SemVer pre-release string (e.g. 
``1.2.3-alpha.1``).""" + return fake.semver_prerelease() + + +def semver_prereleases(n: int) -> list[str]: + """Generate a batch of SemVer pre-release strings.""" + return fake.semver_prereleases(n) + + +def calver() -> str: + """Generate a single calendar-versioning (CalVer) string.""" + return fake.calver() + + +def calvers(n: int) -> list[str]: + """Generate a batch of CalVer strings.""" + return fake.calvers(n) + + +def spdx_license() -> str: + """Generate a single SPDX license identifier (e.g. ``MIT``, ``Apache-2.0``).""" + return fake.spdx_license() + + +def spdx_licenses(n: int, unique: bool = False) -> list[str]: + """Generate a batch of SPDX license identifiers. + + With ``unique=True`` at most 50 distinct values can be returned + (the size of the built-in list). + """ + return fake.spdx_licenses(n, unique) + + +def git_username() -> str: + """Generate a single GitHub/GitLab/Bitbucket-compatible username. + + Guarantees GitHub's rules: alphanumerics + single hyphens, no leading + or trailing hyphen, no consecutive hyphens, at most 39 characters. + """ + return fake.git_username() + + +def git_usernames(n: int, unique: bool = False) -> list[str]: + """Generate a batch of Git-platform-compatible usernames.""" + return fake.git_usernames(n, unique) + + +def pypi_version() -> str: + """Generate a single PEP 440 version string (PyPI).""" + return fake.pypi_version() + + +def pypi_versions(n: int) -> list[str]: + """Generate a batch of PEP 440 version strings.""" + return fake.pypi_versions(n) + + +def maven_version() -> str: + """Generate a single Maven-style version string (may include a qualifier).""" + return fake.maven_version() + + +def maven_versions(n: int) -> list[str]: + """Generate a batch of Maven-style version strings.""" + return fake.maven_versions(n) + + +def pypi_version_specifier() -> str: + """Generate a single PEP 440 version specifier (e.g. 
``>=1.2.3``, ``~=1.2``).""" + return fake.pypi_version_specifier() + + +def pypi_version_specifiers(n: int) -> list[str]: + """Generate a batch of PEP 440 version specifiers.""" + return fake.pypi_version_specifiers(n) + + +def npm_version_range() -> str: + """Generate a single npm-style version range (e.g. ``^1.2.3``, ``~1.2.3``).""" + return fake.npm_version_range() + + +def npm_version_ranges(n: int) -> list[str]: + """Generate a batch of npm-style version ranges.""" + return fake.npm_version_ranges(n) + + +def cargo_version_req() -> str: + """Generate a single Cargo version requirement (e.g. ``^1.2.3``, ``~1.2``).""" + return fake.cargo_version_req() + + +def cargo_version_reqs(n: int) -> list[str]: + """Generate a batch of Cargo version requirements.""" + return fake.cargo_version_reqs(n) + + +def maven_version_range() -> str: + """Generate a single Maven version range (e.g. ``[1.0.0,2.0.0)``).""" + return fake.maven_version_range() + + +def maven_version_ranges(n: int) -> list[str]: + """Generate a batch of Maven version ranges.""" + return fake.maven_version_ranges(n) + + +def gem_version_requirement() -> str: + """Generate a single RubyGems version requirement (e.g. ``~> 1.2``).""" + return fake.gem_version_requirement() + + +def gem_version_requirements(n: int) -> list[str]: + """Generate a batch of RubyGems version requirements.""" + return fake.gem_version_requirements(n) + + +def pypi_package_name() -> str: + """Generate a single PEP 503-normalized PyPI package name. + + Output is lowercase ASCII with hyphen as the sole separator (no + underscores or dots), matching the canonical form PyPI's simple + index serves. 
+ """ + return fake.pypi_package_name() + + +def pypi_package_names(n: int, unique: bool = False) -> list[str]: + """Generate a batch of PyPI package names.""" + return fake.pypi_package_names(n, unique) + + +def npm_package_name() -> str: + """Generate a single npm package name (plain or ``@scope/pkg``).""" + return fake.npm_package_name() + + +def npm_package_names(n: int, unique: bool = False) -> list[str]: + """Generate a batch of npm package names.""" + return fake.npm_package_names(n, unique) + + +def cargo_package_name() -> str: + """Generate a single Cargo-compatible crate name.""" + return fake.cargo_package_name() + + +def cargo_package_names(n: int, unique: bool = False) -> list[str]: + """Generate a batch of Cargo crate names.""" + return fake.cargo_package_names(n, unique) + + +def gem_name() -> str: + """Generate a single RubyGems gem name.""" + return fake.gem_name() + + +def gem_names(n: int, unique: bool = False) -> list[str]: + """Generate a batch of RubyGems gem names.""" + return fake.gem_names(n, unique) + + +def maven_group_id() -> str: + """Generate a single Maven reverse-domain group ID (e.g. 
``com.example.tools``).""" + return fake.maven_group_id() + + +def maven_group_ids(n: int, unique: bool = False) -> list[str]: + """Generate a batch of Maven group IDs.""" + return fake.maven_group_ids(n, unique) + + +def maven_artifact_id() -> str: + """Generate a single Maven artifact ID.""" + return fake.maven_artifact_id() + + +def maven_artifact_ids(n: int, unique: bool = False) -> list[str]: + """Generate a batch of Maven artifact IDs.""" + return fake.maven_artifact_ids(n, unique) + + +def maven_coordinate() -> str: + """Generate a single Maven GAV coordinate (``group:artifact:version``).""" + return fake.maven_coordinate() + + +def maven_coordinates(n: int, unique: bool = False) -> list[str]: + """Generate a batch of Maven GAV coordinates.""" + return fake.maven_coordinates(n, unique) + + +def pypi_requirement() -> str: + """Generate a single pip-install-compatible requirement line. + + Example: ``requests>=2.0.0,<3.0.0``. + """ + return fake.pypi_requirement() + + +def pypi_requirements(n: int) -> list[str]: + """Generate a batch of pip-install requirement lines.""" + return fake.pypi_requirements(n) diff --git a/python/forgery/__init__.pyi b/python/forgery/__init__.pyi index 6014273..b891db0 100644 --- a/python/forgery/__init__.pyi +++ b/python/forgery/__init__.pyi @@ -1002,3 +1002,49 @@ def generate_batch(name: str, n: int) -> list[str]: ValueError: If provider doesn't exist or n exceeds batch limit. """ ... + +# Package registry data +def commit_sha() -> str: ... +def commit_shas(n: int) -> list[str]: ... +def short_commit_sha() -> str: ... +def short_commit_shas(n: int) -> list[str]: ... +def semver() -> str: ... +def semvers(n: int) -> list[str]: ... +def semver_prerelease() -> str: ... +def semver_prereleases(n: int) -> list[str]: ... +def calver() -> str: ... +def calvers(n: int) -> list[str]: ... +def spdx_license() -> str: ... +def spdx_licenses(n: int, unique: bool = False) -> list[str]: ... +def git_username() -> str: ... 
+def git_usernames(n: int, unique: bool = False) -> list[str]: ... +def pypi_version() -> str: ... +def pypi_versions(n: int) -> list[str]: ... +def maven_version() -> str: ... +def maven_versions(n: int) -> list[str]: ... +def pypi_version_specifier() -> str: ... +def pypi_version_specifiers(n: int) -> list[str]: ... +def npm_version_range() -> str: ... +def npm_version_ranges(n: int) -> list[str]: ... +def cargo_version_req() -> str: ... +def cargo_version_reqs(n: int) -> list[str]: ... +def maven_version_range() -> str: ... +def maven_version_ranges(n: int) -> list[str]: ... +def gem_version_requirement() -> str: ... +def gem_version_requirements(n: int) -> list[str]: ... +def pypi_package_name() -> str: ... +def pypi_package_names(n: int, unique: bool = False) -> list[str]: ... +def npm_package_name() -> str: ... +def npm_package_names(n: int, unique: bool = False) -> list[str]: ... +def cargo_package_name() -> str: ... +def cargo_package_names(n: int, unique: bool = False) -> list[str]: ... +def gem_name() -> str: ... +def gem_names(n: int, unique: bool = False) -> list[str]: ... +def maven_group_id() -> str: ... +def maven_group_ids(n: int, unique: bool = False) -> list[str]: ... +def maven_artifact_id() -> str: ... +def maven_artifact_ids(n: int, unique: bool = False) -> list[str]: ... +def maven_coordinate() -> str: ... +def maven_coordinates(n: int, unique: bool = False) -> list[str]: ... +def pypi_requirement() -> str: ... +def pypi_requirements(n: int) -> list[str]: ... diff --git a/python/forgery/_forgery.pyi b/python/forgery/_forgery.pyi index 06dec33..b34190c 100644 --- a/python/forgery/_forgery.pyi +++ b/python/forgery/_forgery.pyi @@ -1539,3 +1539,183 @@ class Faker: ValueError: If provider doesn't exist or n exceeds batch limit """ ... + + # Package registry generators + def commit_sha(self) -> str: + """Generate a single git commit SHA (40 lowercase hex chars).""" + ... 
+ + def commit_shas(self, n: int) -> list[str]: + """Generate a batch of git commit SHAs.""" + ... + + def short_commit_sha(self) -> str: + """Generate a single short git commit SHA (7 lowercase hex chars).""" + ... + + def short_commit_shas(self, n: int) -> list[str]: + """Generate a batch of short git commit SHAs.""" + ... + + def semver(self) -> str: + """Generate a single SemVer ``MAJOR.MINOR.PATCH`` string.""" + ... + + def semvers(self, n: int) -> list[str]: + """Generate a batch of SemVer version strings.""" + ... + + def semver_prerelease(self) -> str: + """Generate a single SemVer pre-release (e.g. ``1.2.3-alpha.1``).""" + ... + + def semver_prereleases(self, n: int) -> list[str]: + """Generate a batch of SemVer pre-release strings.""" + ... + + def calver(self) -> str: + """Generate a single CalVer string.""" + ... + + def calvers(self, n: int) -> list[str]: + """Generate a batch of CalVer strings.""" + ... + + def spdx_license(self) -> str: + """Generate a single SPDX license identifier.""" + ... + + def spdx_licenses(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of SPDX license identifiers. + + With ``unique=True`` at most 50 distinct values are available. + """ + ... + + def git_username(self) -> str: + """Generate a single GitHub/GitLab/Bitbucket-compatible username.""" + ... + + def git_usernames(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of Git-platform-compatible usernames.""" + ... + + def pypi_version(self) -> str: + """Generate a single PEP 440 version string.""" + ... + + def pypi_versions(self, n: int) -> list[str]: + """Generate a batch of PEP 440 version strings.""" + ... + + def maven_version(self) -> str: + """Generate a single Maven-style version string.""" + ... + + def maven_versions(self, n: int) -> list[str]: + """Generate a batch of Maven-style version strings.""" + ... + + def pypi_version_specifier(self) -> str: + """Generate a single PEP 440 version specifier.""" + ... 
+ + def pypi_version_specifiers(self, n: int) -> list[str]: + """Generate a batch of PEP 440 version specifiers.""" + ... + + def npm_version_range(self) -> str: + """Generate a single npm-style version range.""" + ... + + def npm_version_ranges(self, n: int) -> list[str]: + """Generate a batch of npm-style version ranges.""" + ... + + def cargo_version_req(self) -> str: + """Generate a single Cargo version requirement.""" + ... + + def cargo_version_reqs(self, n: int) -> list[str]: + """Generate a batch of Cargo version requirements.""" + ... + + def maven_version_range(self) -> str: + """Generate a single Maven version range.""" + ... + + def maven_version_ranges(self, n: int) -> list[str]: + """Generate a batch of Maven version ranges.""" + ... + + def gem_version_requirement(self) -> str: + """Generate a single RubyGems version requirement.""" + ... + + def gem_version_requirements(self, n: int) -> list[str]: + """Generate a batch of RubyGems version requirements.""" + ... + + def pypi_package_name(self) -> str: + """Generate a single PEP 503-normalized PyPI package name.""" + ... + + def pypi_package_names(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of PyPI package names.""" + ... + + def npm_package_name(self) -> str: + """Generate a single npm package name (plain or ``@scope/pkg``).""" + ... + + def npm_package_names(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of npm package names.""" + ... + + def cargo_package_name(self) -> str: + """Generate a single Cargo-compatible crate name.""" + ... + + def cargo_package_names(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of Cargo crate names.""" + ... + + def gem_name(self) -> str: + """Generate a single RubyGems gem name.""" + ... + + def gem_names(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of RubyGems gem names.""" + ... 
+ + def maven_group_id(self) -> str: + """Generate a single Maven reverse-domain group ID.""" + ... + + def maven_group_ids(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of Maven group IDs.""" + ... + + def maven_artifact_id(self) -> str: + """Generate a single Maven artifact ID.""" + ... + + def maven_artifact_ids(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of Maven artifact IDs.""" + ... + + def maven_coordinate(self) -> str: + """Generate a single Maven GAV coordinate (``group:artifact:version``).""" + ... + + def maven_coordinates(self, n: int, unique: bool = False) -> list[str]: + """Generate a batch of Maven GAV coordinates.""" + ... + + def pypi_requirement(self) -> str: + """Generate a single pip-install requirement line.""" + ... + + def pypi_requirements(self, n: int) -> list[str]: + """Generate a batch of pip-install requirement lines.""" + ... diff --git a/src/data/en_us/mod.rs b/src/data/en_us/mod.rs index 00eda33..0730f06 100644 --- a/src/data/en_us/mod.rs +++ b/src/data/en_us/mod.rs @@ -10,6 +10,8 @@ mod countries; mod first_names; mod last_names; mod lorem; +mod packages; +mod spdx_licenses; mod states; mod streets; mod tlds; @@ -25,6 +27,11 @@ pub use countries::COUNTRIES; pub use first_names::FIRST_NAMES; pub use last_names::LAST_NAMES; pub use lorem::LOREM_WORDS; +pub use packages::{ + MAVEN_ORG_COMPONENTS, MAVEN_QUALIFIERS, MAVEN_TLDS, NPM_SCOPE_PREFIXES, PACKAGE_KEYWORDS, + PACKAGE_MODIFIERS, PRERELEASE_TAGS, +}; +pub use spdx_licenses::SPDX_LICENSES; pub use states::{STATES, STATE_ABBRS}; pub use streets::{STREET_NAMES, STREET_SUFFIXES}; pub use tlds::{FREE_EMAIL_DOMAINS, SAFE_EMAIL_DOMAINS, TLDS}; diff --git a/src/data/en_us/packages.rs b/src/data/en_us/packages.rs new file mode 100644 index 0000000..647ca65 --- /dev/null +++ b/src/data/en_us/packages.rs @@ -0,0 +1,355 @@ +//! Word lists used to assemble realistic-looking package names, +//! 
Maven coordinates, version qualifiers, and usernames. +//! +//! All entries are lowercase ASCII (except `MAVEN_QUALIFIERS`, which preserves +//! the mixed-case convention seen in real artifacts, e.g., `RELEASE`, `Final`). + +/// Tech-flavoured nouns used as the stem of package names and usernames. +/// +/// Sized so that combinatorial patterns (e.g. `{primary}-{secondary}`) yield +/// tens of thousands of distinct names, which gives `unique=True` headroom +/// for seeding sizeable test registries. +pub const PACKAGE_KEYWORDS: &[&str] = &[ + "adapter", + "agent", + "api", + "archive", + "assembly", + "async", + "audit", + "auth", + "backup", + "bench", + "binding", + "blob", + "bot", + "bridge", + "broker", + "buffer", + "builder", + "bundle", + "byte", + "cache", + "canvas", + "cert", + "chain", + "channel", + "chart", + "cipher", + "cli", + "client", + "cluster", + "codec", + "collector", + "command", + "compiler", + "config", + "connector", + "console", + "container", + "context", + "cookie", + "core", + "counter", + "crypto", + "cursor", + "daemon", + "data", + "database", + "db", + "debugger", + "decoder", + "delta", + "dispatch", + "doc", + "driver", + "editor", + "emitter", + "encoder", + "engine", + "entry", + "env", + "event", + "executor", + "export", + "factory", + "feed", + "fetch", + "field", + "filter", + "finder", + "firewall", + "flag", + "flow", + "format", + "forwarder", + "frame", + "gateway", + "graph", + "handler", + "hash", + "helper", + "hook", + "http", + "image", + "importer", + "index", + "inspector", + "installer", + "io", + "iterator", + "jobs", + "json", + "key", + "keyring", + "kit", + "layer", + "layout", + "lexer", + "lib", + "lint", + "list", + "loader", + "lock", + "log", + "logger", + "loop", + "macro", + "manager", + "map", + "marshaller", + "matcher", + "matrix", + "memo", + "menu", + "merger", + "meta", + "meter", + "metric", + "middleware", + "migrator", + "mock", + "model", + "module", + "monitor", + "namespace", + "net", + 
"node", + "normalizer", + "notifier", + "observer", + "orchestrator", + "orm", + "output", + "packager", + "page", + "panel", + "parser", + "patch", + "path", + "pipe", + "pipeline", + "planner", + "plugin", + "policy", + "pool", + "process", + "profile", + "promise", + "protocol", + "provider", + "proxy", + "pubsub", + "publisher", + "queue", + "query", + "reactor", + "reader", + "reducer", + "registry", + "release", + "render", + "repo", + "reporter", + "request", + "resolver", + "resource", + "response", + "rest", + "retry", + "route", + "router", + "rule", + "runtime", + "sampler", + "scanner", + "scheduler", + "schema", + "scope", + "script", + "sdk", + "search", + "secret", + "selector", + "semaphore", + "serializer", + "server", + "service", + "session", + "shader", + "shard", + "shim", + "signal", + "slice", + "slot", + "socket", + "spec", + "splitter", + "stack", + "stage", + "state", + "store", + "stream", + "string", + "subscriber", + "supervisor", + "switch", + "sync", + "table", + "tag", + "task", + "template", + "tensor", + "theme", + "thread", + "timer", + "token", + "tool", + "tracer", + "tracker", + "transform", + "translator", + "tree", + "trigger", + "tuple", + "types", + "ui", + "upload", + "uri", + "url", + "user", + "util", + "utils", + "validator", + "vault", + "vector", + "version", + "view", + "visitor", + "walker", + "watcher", + "web", + "websocket", + "worker", + "wrapper", + "xml", + "yaml", + "zone", +]; + +/// Adjective-style modifiers used as package name prefixes. +/// +/// Kept tech-adjacent so combined names still read like plausible packages. 
+pub const PACKAGE_MODIFIERS: &[&str] = &[ + "async", + "atomic", + "auto", + "basic", + "batch", + "better", + "bulk", + "clean", + "compact", + "concurrent", + "deep", + "distributed", + "easy", + "eager", + "embedded", + "fast", + "flex", + "global", + "hyper", + "immutable", + "inline", + "instant", + "isolated", + "lazy", + "light", + "live", + "local", + "managed", + "mega", + "micro", + "mini", + "minimal", + "modern", + "modular", + "nano", + "native", + "neo", + "nested", + "parallel", + "persistent", + "plain", + "portable", + "pretty", + "pure", + "quick", + "rapid", + "realtime", + "remote", + "resilient", + "robust", + "safe", + "scalable", + "shared", + "simple", + "smart", + "smol", + "sparse", + "static", + "stateless", + "strict", + "super", + "swift", + "tiny", + "typed", + "ultra", + "virtual", + "zero", +]; + +/// Top-level domain components used to build Maven-style reverse-domain group IDs. +pub const MAVEN_TLDS: &[&str] = &["com", "org", "io", "net", "dev", "co", "tech", "app", "xyz"]; + +/// Organisation-style components used between the TLD and artifact in +/// Maven group IDs (e.g., `com.<org>.project`). +pub const MAVEN_ORG_COMPONENTS: &[&str] = &[ + "acme", "example", "widgets", "foo", "bar", "baz", "quux", "corp", "labs", "tools", "systems", + "soft", "works", "studio", "group", "craft", "forge", "hub", "chain", "hq", +]; + +/// Common npm scope prefixes used when generating scoped package names. +pub const NPM_SCOPE_PREFIXES: &[&str] = &[ + "types", "babel", "angular", "nrwl", "nx", "aws-sdk", "azure", "stripe", "nestjs", "vue", + "rollup", "svelte", "prisma", "google", "sentry", "myorg", "acme", "example", +]; + +/// Pre-release tags for SemVer-style versions (e.g., `1.0.0-alpha.1`). +pub const PRERELEASE_TAGS: &[&str] = &["alpha", "beta", "rc", "pre", "dev"]; + +/// Maven version qualifiers, preserving their canonical casing. 
+pub const MAVEN_QUALIFIERS: &[&str] = &[ + "SNAPSHOT", "RELEASE", "Final", "RC1", "RC2", "RC3", "M1", "M2", "alpha", "beta", "CR1", +]; diff --git a/src/data/en_us/spdx_licenses.rs b/src/data/en_us/spdx_licenses.rs new file mode 100644 index 0000000..ab70203 --- /dev/null +++ b/src/data/en_us/spdx_licenses.rs @@ -0,0 +1,59 @@ +//! Common SPDX license identifiers. +//! +//! A curated subset of the full SPDX License List focused on identifiers +//! seen most often in public package registries. Values are valid +//! identifiers as of SPDX License List v3.23. + +/// Commonly used SPDX license identifiers. +pub const SPDX_LICENSES: &[&str] = &[ + "MIT", + "Apache-2.0", + "BSD-2-Clause", + "BSD-3-Clause", + "BSD-4-Clause", + "0BSD", + "GPL-2.0-only", + "GPL-2.0-or-later", + "GPL-3.0-only", + "GPL-3.0-or-later", + "LGPL-2.0-only", + "LGPL-2.0-or-later", + "LGPL-2.1-only", + "LGPL-2.1-or-later", + "LGPL-3.0-only", + "LGPL-3.0-or-later", + "AGPL-3.0-only", + "AGPL-3.0-or-later", + "MPL-2.0", + "ISC", + "Unlicense", + "CC0-1.0", + "CC-BY-4.0", + "CC-BY-SA-4.0", + "CC-BY-NC-4.0", + "CC-BY-NC-SA-4.0", + "CC-BY-ND-4.0", + "CC-BY-NC-ND-4.0", + "Zlib", + "Artistic-2.0", + "EPL-1.0", + "EPL-2.0", + "EUPL-1.1", + "EUPL-1.2", + "BSL-1.0", + "WTFPL", + "CDDL-1.0", + "CDDL-1.1", + "BlueOak-1.0.0", + "PostgreSQL", + "Python-2.0", + "OFL-1.1", + "Ruby", + "MS-PL", + "MS-RL", + "AFL-3.0", + "NCSA", + "OSL-3.0", + "UPL-1.0", + "X11", +]; diff --git a/src/lib.rs b/src/lib.rs index 12cc100..bc90ead 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -162,6 +162,22 @@ macro_rules! batch_simple { }}; } +/// Batch method without locale, with optional `unique=true` support +/// (Pattern C with unique). +/// +/// The single-value generator takes only `&mut ForgeryRng`; we wrap it in +/// a closure that ignores the locale argument `generate_unique` passes. +macro_rules! 
batch_simple_unique { + ($self:ident, $n:ident, $unique:ident, $single_fn:path, $batch_fn:path) => {{ + validate_batch_size($n)?; + if $unique { + $self.generate_unique($n, |rng, _locale| $single_fn(rng)) + } else { + Ok($self.maybe_parallel($n, |rng, chunk_n| $batch_fn(rng, chunk_n))) + } + }}; +} + /// Batch method without locale, provider returns Result (Pattern C validated). macro_rules! batch_validated { ($self:ident, $n:ident, $batch_fn:path) => {{ @@ -2550,6 +2566,314 @@ impl Faker { domain, )) } + + // === Package Registry Data === + + /// Generate a batch of full-length (40 lowercase hex chars) git commit SHAs. + pub fn commit_shas(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_commit_shas) + } + + /// Generate a single git commit SHA (40 lowercase hex chars). + pub fn commit_sha(&mut self) -> String { + providers::packages::generate_commit_sha(&mut self.rng) + } + + /// Generate a batch of short (7 lowercase hex chars) git commit SHAs. + pub fn short_commit_shas(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_short_commit_shas) + } + + /// Generate a single short git commit SHA (7 lowercase hex chars). + pub fn short_commit_sha(&mut self) -> String { + providers::packages::generate_short_commit_sha(&mut self.rng) + } + + /// Generate a batch of SemVer `MAJOR.MINOR.PATCH` strings. + pub fn semvers(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_semvers) + } + + /// Generate a single SemVer string. + pub fn semver(&mut self) -> String { + providers::packages::generate_semver(&mut self.rng) + } + + /// Generate a batch of SemVer pre-release strings (e.g., `1.2.3-alpha.1`). + pub fn semver_prereleases(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_semver_prereleases) + } + + /// Generate a single SemVer pre-release string.
+ pub fn semver_prerelease(&mut self) -> String { + providers::packages::generate_semver_prerelease(&mut self.rng) + } + + /// Generate a batch of calendar-versioning (CalVer) strings. + pub fn calvers(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_calvers) + } + + /// Generate a single CalVer string. + pub fn calver(&mut self) -> String { + providers::packages::generate_calver(&mut self.rng) + } + + /// Generate a batch of SPDX license identifiers. + /// + /// With `unique=true`, at most 50 distinct values can be returned + /// (the size of the built-in SPDX list); requesting more raises + /// `UniqueExhaustedError`. + pub fn spdx_licenses(&mut self, n: usize, unique: bool) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_spdx_license, + providers::packages::generate_spdx_licenses + ) + } + + /// Generate a single SPDX license identifier. + pub fn spdx_license(&mut self) -> String { + providers::packages::generate_spdx_license(&mut self.rng) + } + + /// Generate a batch of GitHub/GitLab/Bitbucket-compatible usernames. + pub fn git_usernames(&mut self, n: usize, unique: bool) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_git_username, + providers::packages::generate_git_usernames + ) + } + + /// Generate a single Git-platform-compatible username. + pub fn git_username(&mut self) -> String { + providers::packages::generate_git_username(&mut self.rng) + } + + /// Generate a batch of PEP 440 version strings. + pub fn pypi_versions(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_pypi_versions) + } + + /// Generate a single PEP 440 version string. + pub fn pypi_version(&mut self) -> String { + providers::packages::generate_pypi_version(&mut self.rng) + } + + /// Generate a batch of Maven-style version strings.
+ pub fn maven_versions(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_maven_versions) + } + + /// Generate a single Maven-style version string. + pub fn maven_version(&mut self) -> String { + providers::packages::generate_maven_version(&mut self.rng) + } + + /// Generate a batch of PEP 440 version specifiers. + pub fn pypi_version_specifiers(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!( + self, + n, + providers::packages::generate_pypi_version_specifiers + ) + } + + /// Generate a single PEP 440 version specifier. + pub fn pypi_version_specifier(&mut self) -> String { + providers::packages::generate_pypi_version_specifier(&mut self.rng) + } + + /// Generate a batch of npm-style version ranges. + pub fn npm_version_ranges(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_npm_version_ranges) + } + + /// Generate a single npm-style version range. + pub fn npm_version_range(&mut self) -> String { + providers::packages::generate_npm_version_range(&mut self.rng) + } + + /// Generate a batch of Cargo version requirements. + pub fn cargo_version_reqs(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_cargo_version_reqs) + } + + /// Generate a single Cargo version requirement. + pub fn cargo_version_req(&mut self) -> String { + providers::packages::generate_cargo_version_req(&mut self.rng) + } + + /// Generate a batch of Maven version ranges. + pub fn maven_version_ranges(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_maven_version_ranges) + } + + /// Generate a single Maven version range. + pub fn maven_version_range(&mut self) -> String { + providers::packages::generate_maven_version_range(&mut self.rng) + } + + /// Generate a batch of RubyGems version requirements.
+ pub fn gem_version_requirements(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!( + self, + n, + providers::packages::generate_gem_version_requirements + ) + } + + /// Generate a single RubyGems version requirement. + pub fn gem_version_requirement(&mut self) -> String { + providers::packages::generate_gem_version_requirement(&mut self.rng) + } + + /// Generate a batch of PEP 503-normalized PyPI package names. + pub fn pypi_package_names( + &mut self, + n: usize, + unique: bool, + ) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_pypi_package_name, + providers::packages::generate_pypi_package_names + ) + } + + /// Generate a single PyPI package name. + pub fn pypi_package_name(&mut self) -> String { + providers::packages::generate_pypi_package_name(&mut self.rng) + } + + /// Generate a batch of npm package names. + pub fn npm_package_names( + &mut self, + n: usize, + unique: bool, + ) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_npm_package_name, + providers::packages::generate_npm_package_names + ) + } + + /// Generate a single npm package name (plain or scoped). + pub fn npm_package_name(&mut self) -> String { + providers::packages::generate_npm_package_name(&mut self.rng) + } + + /// Generate a batch of Cargo-compatible crate names. + pub fn cargo_package_names( + &mut self, + n: usize, + unique: bool, + ) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_cargo_package_name, + providers::packages::generate_cargo_package_names + ) + } + + /// Generate a single Cargo-compatible crate name. + pub fn cargo_package_name(&mut self) -> String { + providers::packages::generate_cargo_package_name(&mut self.rng) + } + + /// Generate a batch of RubyGems gem names.
+ pub fn gem_names(&mut self, n: usize, unique: bool) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_gem_name, + providers::packages::generate_gem_names + ) + } + + /// Generate a single RubyGems gem name. + pub fn gem_name(&mut self) -> String { + providers::packages::generate_gem_name(&mut self.rng) + } + + /// Generate a batch of Maven reverse-domain group IDs. + pub fn maven_group_ids(&mut self, n: usize, unique: bool) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_maven_group_id, + providers::packages::generate_maven_group_ids + ) + } + + /// Generate a single Maven group ID. + pub fn maven_group_id(&mut self) -> String { + providers::packages::generate_maven_group_id(&mut self.rng) + } + + /// Generate a batch of Maven artifact IDs. + pub fn maven_artifact_ids( + &mut self, + n: usize, + unique: bool, + ) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_maven_artifact_id, + providers::packages::generate_maven_artifact_ids + ) + } + + /// Generate a single Maven artifact ID. + pub fn maven_artifact_id(&mut self) -> String { + providers::packages::generate_maven_artifact_id(&mut self.rng) + } + + /// Generate a batch of Maven GAV coordinates (`group:artifact:version`). + pub fn maven_coordinates( + &mut self, + n: usize, + unique: bool, + ) -> Result<Vec<String>, ForgeryError> { + batch_simple_unique!( + self, + n, + unique, + providers::packages::generate_maven_coordinate, + providers::packages::generate_maven_coordinates + ) + } + + /// Generate a single Maven GAV coordinate. + pub fn maven_coordinate(&mut self) -> String { + providers::packages::generate_maven_coordinate(&mut self.rng) + } + + /// Generate a batch of full pip-install requirement lines.
+ pub fn pypi_requirements(&mut self, n: usize) -> Result<Vec<String>, BatchSizeError> { + batch_simple!(self, n, providers::packages::generate_pypi_requirements) + } + + /// Generate a single pip-install requirement line. + pub fn pypi_requirement(&mut self) -> String { + providers::packages::generate_pypi_requirement(&mut self.rng) + } } // Python API - these methods are exposed to Python via PyO3 @@ -4918,6 +5242,294 @@ impl Faker { } dict.into_py_any(py) } + + // === Package Registry Data === + + /// Generate a batch of git commit SHAs (40 lowercase hex chars). + #[pyo3(name = "commit_shas")] + fn py_commit_shas(&mut self, n: usize) -> PyResult<Vec<String>> { + self.commit_shas(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single git commit SHA (40 lowercase hex chars). + #[pyo3(name = "commit_sha")] + fn py_commit_sha(&mut self) -> String { + self.commit_sha() + } + + /// Generate a batch of short git commit SHAs (7 lowercase hex chars). + #[pyo3(name = "short_commit_shas")] + fn py_short_commit_shas(&mut self, n: usize) -> PyResult<Vec<String>> { + self.short_commit_shas(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single short git commit SHA (7 lowercase hex chars). + #[pyo3(name = "short_commit_sha")] + fn py_short_commit_sha(&mut self) -> String { + self.short_commit_sha() + } + + /// Generate a batch of SemVer version strings. + #[pyo3(name = "semvers")] + fn py_semvers(&mut self, n: usize) -> PyResult<Vec<String>> { + self.semvers(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single SemVer version string. + #[pyo3(name = "semver")] + fn py_semver(&mut self) -> String { + self.semver() + } + + /// Generate a batch of SemVer pre-release strings. + #[pyo3(name = "semver_prereleases")] + fn py_semver_prereleases(&mut self, n: usize) -> PyResult<Vec<String>> { + self.semver_prereleases(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single SemVer pre-release string.
+ #[pyo3(name = "semver_prerelease")] + fn py_semver_prerelease(&mut self) -> String { + self.semver_prerelease() + } + + /// Generate a batch of CalVer strings. + #[pyo3(name = "calvers")] + fn py_calvers(&mut self, n: usize) -> PyResult<Vec<String>> { + self.calvers(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single CalVer string. + #[pyo3(name = "calver")] + fn py_calver(&mut self) -> String { + self.calver() + } + + /// Generate a batch of SPDX license identifiers. + #[pyo3(name = "spdx_licenses", signature = (n, unique = false))] + fn py_spdx_licenses(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.spdx_licenses(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single SPDX license identifier. + #[pyo3(name = "spdx_license")] + fn py_spdx_license(&mut self) -> String { + self.spdx_license() + } + + /// Generate a batch of Git-platform-compatible usernames. + #[pyo3(name = "git_usernames", signature = (n, unique = false))] + fn py_git_usernames(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.git_usernames(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Git-platform-compatible username. + #[pyo3(name = "git_username")] + fn py_git_username(&mut self) -> String { + self.git_username() + } + + /// Generate a batch of PEP 440 version strings. + #[pyo3(name = "pypi_versions")] + fn py_pypi_versions(&mut self, n: usize) -> PyResult<Vec<String>> { + self.pypi_versions(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single PEP 440 version string. + #[pyo3(name = "pypi_version")] + fn py_pypi_version(&mut self) -> String { + self.pypi_version() + } + + /// Generate a batch of Maven version strings. + #[pyo3(name = "maven_versions")] + fn py_maven_versions(&mut self, n: usize) -> PyResult<Vec<String>> { + self.maven_versions(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Maven version string.
+ #[pyo3(name = "maven_version")] + fn py_maven_version(&mut self) -> String { + self.maven_version() + } + + /// Generate a batch of PEP 440 version specifiers. + #[pyo3(name = "pypi_version_specifiers")] + fn py_pypi_version_specifiers(&mut self, n: usize) -> PyResult<Vec<String>> { + self.pypi_version_specifiers(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single PEP 440 version specifier. + #[pyo3(name = "pypi_version_specifier")] + fn py_pypi_version_specifier(&mut self) -> String { + self.pypi_version_specifier() + } + + /// Generate a batch of npm version ranges. + #[pyo3(name = "npm_version_ranges")] + fn py_npm_version_ranges(&mut self, n: usize) -> PyResult<Vec<String>> { + self.npm_version_ranges(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single npm version range. + #[pyo3(name = "npm_version_range")] + fn py_npm_version_range(&mut self) -> String { + self.npm_version_range() + } + + /// Generate a batch of Cargo version requirements. + #[pyo3(name = "cargo_version_reqs")] + fn py_cargo_version_reqs(&mut self, n: usize) -> PyResult<Vec<String>> { + self.cargo_version_reqs(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Cargo version requirement. + #[pyo3(name = "cargo_version_req")] + fn py_cargo_version_req(&mut self) -> String { + self.cargo_version_req() + } + + /// Generate a batch of Maven version ranges. + #[pyo3(name = "maven_version_ranges")] + fn py_maven_version_ranges(&mut self, n: usize) -> PyResult<Vec<String>> { + self.maven_version_ranges(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Maven version range. + #[pyo3(name = "maven_version_range")] + fn py_maven_version_range(&mut self) -> String { + self.maven_version_range() + } + + /// Generate a batch of RubyGems version requirements.
+ #[pyo3(name = "gem_version_requirements")] + fn py_gem_version_requirements(&mut self, n: usize) -> PyResult<Vec<String>> { + self.gem_version_requirements(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single RubyGems version requirement. + #[pyo3(name = "gem_version_requirement")] + fn py_gem_version_requirement(&mut self) -> String { + self.gem_version_requirement() + } + + /// Generate a batch of PyPI package names. + #[pyo3(name = "pypi_package_names", signature = (n, unique = false))] + fn py_pypi_package_names(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.pypi_package_names(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single PyPI package name. + #[pyo3(name = "pypi_package_name")] + fn py_pypi_package_name(&mut self) -> String { + self.pypi_package_name() + } + + /// Generate a batch of npm package names. + #[pyo3(name = "npm_package_names", signature = (n, unique = false))] + fn py_npm_package_names(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.npm_package_names(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single npm package name. + #[pyo3(name = "npm_package_name")] + fn py_npm_package_name(&mut self) -> String { + self.npm_package_name() + } + + /// Generate a batch of Cargo crate names. + #[pyo3(name = "cargo_package_names", signature = (n, unique = false))] + fn py_cargo_package_names(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.cargo_package_names(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Cargo crate name. + #[pyo3(name = "cargo_package_name")] + fn py_cargo_package_name(&mut self) -> String { + self.cargo_package_name() + } + + /// Generate a batch of RubyGems gem names.
+ #[pyo3(name = "gem_names", signature = (n, unique = false))] + fn py_gem_names(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.gem_names(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single RubyGems gem name. + #[pyo3(name = "gem_name")] + fn py_gem_name(&mut self) -> String { + self.gem_name() + } + + /// Generate a batch of Maven group IDs. + #[pyo3(name = "maven_group_ids", signature = (n, unique = false))] + fn py_maven_group_ids(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.maven_group_ids(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Maven group ID. + #[pyo3(name = "maven_group_id")] + fn py_maven_group_id(&mut self) -> String { + self.maven_group_id() + } + + /// Generate a batch of Maven artifact IDs. + #[pyo3(name = "maven_artifact_ids", signature = (n, unique = false))] + fn py_maven_artifact_ids(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.maven_artifact_ids(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Maven artifact ID. + #[pyo3(name = "maven_artifact_id")] + fn py_maven_artifact_id(&mut self) -> String { + self.maven_artifact_id() + } + + /// Generate a batch of Maven GAV coordinates. + #[pyo3(name = "maven_coordinates", signature = (n, unique = false))] + fn py_maven_coordinates(&mut self, n: usize, unique: bool) -> PyResult<Vec<String>> { + self.maven_coordinates(n, unique) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single Maven GAV coordinate. + #[pyo3(name = "maven_coordinate")] + fn py_maven_coordinate(&mut self) -> String { + self.maven_coordinate() + } + + /// Generate a batch of pip-install requirement lines. + #[pyo3(name = "pypi_requirements")] + fn py_pypi_requirements(&mut self, n: usize) -> PyResult<Vec<String>> { + self.pypi_requirements(n) + .map_err(|e| PyValueError::new_err(e.to_string())) + } + + /// Generate a single pip-install requirement line.
+ #[pyo3(name = "pypi_requirement")] + fn py_pypi_requirement(&mut self) -> String { + self.pypi_requirement() + } } /// Prepared state for async record generation operations. diff --git a/src/providers/mod.rs b/src/providers/mod.rs index 80163e5..88df125 100644 --- a/src/providers/mod.rs +++ b/src/providers/mod.rs @@ -24,6 +24,7 @@ pub mod isbn; pub mod names; pub mod network; pub mod numbers; +pub mod packages; pub mod password; pub mod pattern; pub mod phone; diff --git a/src/providers/packages.rs b/src/providers/packages.rs new file mode 100644 index 0000000..22abc7c --- /dev/null +++ b/src/providers/packages.rs @@ -0,0 +1,1005 @@ +//! Package registry data generation. +//! +//! Generators for seeding test databases of package registries such as PyPI, +//! npm, Maven, Cargo, and RubyGems. +//! +//! The provider deliberately splits primitives that look the same everywhere +//! (`commit_sha`, `semver`, `spdx_license`, `git_username`, `calver`) from +//! ecosystem-specific shapes that genuinely differ (package names, version +//! strings, constraint syntaxes, Maven coordinates). +//! +//! # Note on identifiers +//! +//! SHAs, versions, and package names are produced from our seeded RNG. They +//! are not cryptographic hashes of any input and they are not guaranteed to +//! avoid colliding with real-world names. + +use crate::data::en_us::{ + MAVEN_ORG_COMPONENTS, MAVEN_QUALIFIERS, MAVEN_TLDS, NPM_SCOPE_PREFIXES, PACKAGE_KEYWORDS, + PACKAGE_MODIFIERS, PRERELEASE_TAGS, SPDX_LICENSES, +}; +use crate::rng::ForgeryRng; + +/// Lowercase hex digits for byte-to-hex encoding. +const HEX_CHARS: &[u8; 16] = b"0123456789abcdef"; + +/// Encode a byte slice as a lowercase hex string. +fn bytes_to_hex(bytes: &[u8]) -> String { + let mut s = String::with_capacity(bytes.len() * 2); + for b in bytes { + s.push(HEX_CHARS[(b >> 4) as usize] as char); + s.push(HEX_CHARS[(b & 0x0f) as usize] as char); + } + s +} + +/// Internal helper to generate a `(major, minor, patch)` triple.
+/// +/// The major-version distribution is skewed toward lower values so the output +/// looks like a realistic registry snapshot: most packages sit at 0.x–2.x. +fn semver_parts(rng: &mut ForgeryRng) -> (u32, u32, u32) { + let bucket: u8 = rng.gen_range(0, 99); + let major: u32 = match bucket { + 0..=30 => 0, + 31..=60 => 1, + 61..=75 => 2, + 76..=85 => 3, + 86..=90 => rng.gen_range(4, 5), + 91..=95 => rng.gen_range(6, 10), + _ => rng.gen_range(11, 25), + }; + let minor: u32 = rng.gen_range(0, 20); + let patch: u32 = rng.gen_range(0, 50); + (major, minor, patch) +} + +/// Generate a batch by repeatedly invoking the single-value generator. +macro_rules! batch_of { + ($name:ident, $single:ident, $doc:literal) => { + #[doc = $doc] + pub fn $name(rng: &mut ForgeryRng, n: usize) -> Vec<String> { + let mut out = Vec::with_capacity(n); + for _ in 0..n { + out.push($single(rng)); + } + out + } + }; +} + +// === Git commit SHAs === + +/// Generate a single full-length (40-hex-char) git commit SHA. +#[inline] +pub fn generate_commit_sha(rng: &mut ForgeryRng) -> String { + let mut bytes = [0u8; 20]; + rng.fill_bytes(&mut bytes); + bytes_to_hex(&bytes) +} + +batch_of!( + generate_commit_shas, + generate_commit_sha, + "Generate a batch of full-length (40-hex-char) git commit SHAs." +); + +/// Generate a single short (7-hex-char) git commit SHA. +#[inline] +pub fn generate_short_commit_sha(rng: &mut ForgeryRng) -> String { + let mut bytes = [0u8; 4]; + rng.fill_bytes(&mut bytes); + let mut s = bytes_to_hex(&bytes); + s.truncate(7); + s +} + +batch_of!( + generate_short_commit_shas, + generate_short_commit_sha, + "Generate a batch of short (7-hex-char) git commit SHAs." +); + +// === SemVer === + +/// Generate a single SemVer `MAJOR.MINOR.PATCH` string.
+#[inline] +pub fn generate_semver(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + format!("{}.{}.{}", major, minor, patch) +} + +batch_of!( + generate_semvers, + generate_semver, + "Generate a batch of SemVer `MAJOR.MINOR.PATCH` strings." +); + +/// Generate a SemVer pre-release string (e.g., `1.2.3-alpha.1` or +/// `1.2.3-rc.2+build.42`). +#[inline] +pub fn generate_semver_prerelease(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + let pre = rng.choose(PRERELEASE_TAGS); + let pre_num: u32 = rng.gen_range(1, 9); + // Roughly one in three gets a build metadata suffix. + if rng.gen_range(0u8, 9) < 3 { + let build: u32 = rng.gen_range(1, 999); + format!( + "{}.{}.{}-{}.{}+build.{}", + major, minor, patch, pre, pre_num, build + ) + } else { + format!("{}.{}.{}-{}.{}", major, minor, patch, pre, pre_num) + } +} + +batch_of!( + generate_semver_prereleases, + generate_semver_prerelease, + "Generate a batch of SemVer pre-release strings." +); + +// === CalVer === + +/// Generate a calendar-versioning string in one of several common schemes: +/// `YYYY.MM.DD`, `YYYY.MM`, `YYYY.MINOR`, `YY.MM`, or `YYYY.MM.MICRO`. +#[inline] +pub fn generate_calver(rng: &mut ForgeryRng) -> String { + let year: u32 = rng.gen_range(2015, 2030); + let month: u32 = rng.gen_range(1, 12); + match rng.gen_range(0u8, 4) { + 0 => { + let day: u32 = rng.gen_range(1, 28); + format!("{}.{:02}.{:02}", year, month, day) + } + 1 => format!("{}.{:02}", year, month), + 2 => { + let minor: u32 = rng.gen_range(0, 15); + format!("{}.{}", year, minor) + } + 3 => { + let yy = year % 100; + format!("{:02}.{:02}", yy, month) + } + _ => { + let micro: u32 = rng.gen_range(0, 5); + format!("{}.{:02}.{}", year, month, micro) + } + } +} + +batch_of!( + generate_calvers, + generate_calver, + "Generate a batch of CalVer strings in mixed schemes." 
+); + +// === SPDX licenses === + +/// Generate a single SPDX license identifier (e.g., `"MIT"`, `"Apache-2.0"`). +#[inline] +pub fn generate_spdx_license(rng: &mut ForgeryRng) -> String { + (*rng.choose(SPDX_LICENSES)).to_string() +} + +batch_of!( + generate_spdx_licenses, + generate_spdx_license, + "Generate a batch of SPDX license identifiers." +); + +// === Git-platform usernames === + +/// Generate a GitHub/GitLab/Bitbucket-compatible username. +/// +/// Guarantees: ASCII alphanumerics + single hyphens, no leading/trailing +/// hyphen, no consecutive hyphens, and ≤ 39 characters (GitHub's limit). +#[inline] +pub fn generate_git_username(rng: &mut ForgeryRng) -> String { + let primary = rng.choose(PACKAGE_KEYWORDS); + let modifier = rng.choose(PACKAGE_MODIFIERS); + let result = match rng.gen_range(0u8, 4) { + 0 => format!("{}-{}", modifier, primary), + 1 => format!("{}-{}", primary, modifier), + 2 => { + let num: u32 = rng.gen_range(1, 999); + format!("{}{}", primary, num) + } + _ => { + let num: u32 = rng.gen_range(1, 99); + format!("{}-{}{}", modifier, primary, num) + } + }; + if result.len() > 39 { + // Safe to byte-slice: every data entry is ASCII. + let mut truncated = result[..39].to_string(); + // Strip trailing hyphens, but keep at least one character so we never + // return an empty username if a future data entry is a run of hyphens. + while truncated.len() > 1 && truncated.ends_with('-') { + truncated.pop(); + } + truncated + } else { + result + } +} + +batch_of!( + generate_git_usernames, + generate_git_username, + "Generate a batch of GitHub/GitLab/Bitbucket-compatible usernames." +); + +// === PEP 440 versions (PyPI) === + +/// Generate a PEP 440–compliant version string. +/// +/// Produces a mix of plain releases, pre-releases (`aN`, `bN`, `rcN`), +/// post-releases (`.postN`), and developmental releases (`.devN`). 
+#[inline] +pub fn generate_pypi_version(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + let base = format!("{}.{}.{}", major, minor, patch); + match rng.gen_range(0u8, 19) { + 0..=11 => base, + 12..=14 => { + let tag = rng.choose(&["a", "b", "rc"]); + let num: u32 = rng.gen_range(0, 9); + format!("{}{}{}", base, tag, num) + } + 15..=16 => { + let num: u32 = rng.gen_range(1, 5); + format!("{}.post{}", base, num) + } + _ => { + let num: u32 = rng.gen_range(0, 9); + format!("{}.dev{}", base, num) + } + } +} + +batch_of!( + generate_pypi_versions, + generate_pypi_version, + "Generate a batch of PEP 440 version strings." +); + +// === Maven versions === + +/// Generate a Maven-style version, sometimes with a qualifier +/// (`-SNAPSHOT`, `.RELEASE`, `-RC1`, `.Final`, etc.). +#[inline] +pub fn generate_maven_version(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + let base = format!("{}.{}.{}", major, minor, patch); + if rng.gen_range(0u8, 9) < 3 { + let qualifier = *rng.choose(MAVEN_QUALIFIERS); + // `.Final` / `.RELEASE` are dot-separated; everything else uses a hyphen. + if qualifier == "RELEASE" || qualifier == "Final" { + format!("{}.{}", base, qualifier) + } else { + format!("{}-{}", base, qualifier) + } + } else { + base + } +} + +batch_of!( + generate_maven_versions, + generate_maven_version, + "Generate a batch of Maven-style version strings." +); + +// === Version constraints === + +/// Generate a PEP 440 version specifier (e.g., `>=1.2.3`, `~=1.2`, `==1.2.*`). 
+#[inline] +pub fn generate_pypi_version_specifier(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + match rng.gen_range(0u8, 5) { + 0 => format!(">={}.{}.{}", major, minor, patch), + 1 => format!("~={}.{}", major, minor), + 2 => format!("=={}.{}.*", major, minor), + 3 => format!( + ">={}.{}.{},<{}.0.0", + major, + minor, + patch, + major.saturating_add(1) + ), + _ => format!(">={}.0,<{}.0", major, major.saturating_add(1)), + } +} + +batch_of!( + generate_pypi_version_specifiers, + generate_pypi_version_specifier, + "Generate a batch of PEP 440 version specifiers." +); + +/// Generate an npm-style version range (e.g., `^1.2.3`, `~1.2.3`, `1.x`). +#[inline] +pub fn generate_npm_version_range(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + match rng.gen_range(0u8, 5) { + 0 => format!("^{}.{}.{}", major, minor, patch), + 1 => format!("~{}.{}.{}", major, minor, patch), + 2 => format!( + ">={}.{}.{} <{}.0.0", + major, + minor, + patch, + major.saturating_add(1) + ), + 3 => format!("{}.x", major), + _ => format!("{}.{}.x", major, minor), + } +} + +batch_of!( + generate_npm_version_ranges, + generate_npm_version_range, + "Generate a batch of npm-style version ranges." +); + +/// Generate a Cargo version requirement (e.g., `^1.2.3`, `~1.2`, `>=1.0, <2.0`). +#[inline] +pub fn generate_cargo_version_req(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + match rng.gen_range(0u8, 6) { + 0 => format!("{}.{}.{}", major, minor, patch), + 1 => format!("^{}.{}.{}", major, minor, patch), + 2 => format!("~{}.{}", major, minor), + 3 => format!("~{}.{}.{}", major, minor, patch), + 4 => format!(">={}.{}", major, minor), + _ => format!(">={}.{}, <{}.0", major, minor, major.saturating_add(1)), + } +} + +batch_of!( + generate_cargo_version_reqs, + generate_cargo_version_req, + "Generate a batch of Cargo version requirements." 
+); + +/// Generate a Maven version range (e.g., `[1.2.3]`, `[1.0.0,2.0.0)`, `(,1.5]`). +#[inline] +pub fn generate_maven_version_range(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + match rng.gen_range(0u8, 5) { + 0 => format!("[{}.{}.{}]", major, minor, patch), + 1 => format!("[{}.{}.{},)", major, minor, patch), + 2 => format!("(,{}.{}.{}]", major, minor, patch), + 3 => format!("[{}.0.0,{}.0.0)", major, major.saturating_add(1)), + _ => format!( + "[{}.{}.{},{}.0.0)", + major, + minor, + patch, + major.saturating_add(1) + ), + } +} + +batch_of!( + generate_maven_version_ranges, + generate_maven_version_range, + "Generate a batch of Maven version ranges." +); + +/// Generate a RubyGems requirement (e.g., `~> 1.2`, `>= 1.0`, `~> 1.2, < 2.0`). +#[inline] +pub fn generate_gem_version_requirement(rng: &mut ForgeryRng) -> String { + let (major, minor, patch) = semver_parts(rng); + match rng.gen_range(0u8, 5) { + 0 => format!("~> {}.{}", major, minor), + 1 => format!("~> {}.{}.{}", major, minor, patch), + 2 => format!(">= {}.{}", major, minor), + 3 => format!(">= {}.{}.{}", major, minor, patch), + _ => format!( + ">= {}.{}.{}, < {}.0", + major, + minor, + patch, + major.saturating_add(1) + ), + } +} + +batch_of!( + generate_gem_version_requirements, + generate_gem_version_requirement, + "Generate a batch of RubyGems version requirements." +); + +// === Package names === + +/// Generate a PEP 503-normalized PyPI package name. +/// +/// Output contains only `[a-z0-9-]`: lowercase ASCII, with hyphen as the +/// sole separator. This matches the canonical form that PyPI's simple +/// index serves and that package resolvers compare against (PEP 503 +/// §Normalized Names). 
+#[inline]
+pub fn generate_pypi_package_name(rng: &mut ForgeryRng) -> String {
+    // All three word draws happen up front, before the shape roll, so the
+    // RNG stream is consumed identically whichever shape is selected.
+    let head = rng.choose(PACKAGE_KEYWORDS);
+    let tail = rng.choose(PACKAGE_KEYWORDS);
+    let prefix = rng.choose(PACKAGE_MODIFIERS);
+    let shape = rng.gen_range(0u8, 4);
+    if shape == 0 {
+        // Bare keyword.
+        (*head).to_string()
+    } else if shape == 1 {
+        format!("py-{}", head)
+    } else if shape == 2 {
+        format!("{}-{}", prefix, head)
+    } else {
+        format!("{}-{}", head, tail)
+    }
+}
+
+batch_of!(
+    generate_pypi_package_names,
+    generate_pypi_package_name,
+    "Generate a batch of PEP 503-normalized PyPI package names."
+);
+
+/// Generate an npm package name, either plain (`name`) or scoped
+/// (`@scope/name`).
+///
+/// The unscoped part is a keyword, `modifier-keyword`, `keyword-keyword`,
+/// or `keyword.js`; a second roll decides whether a scope taken from
+/// `NPM_SCOPE_PREFIXES` is prepended.
+#[inline]
+pub fn generate_npm_package_name(rng: &mut ForgeryRng) -> String {
+    let keyword = rng.choose(PACKAGE_KEYWORDS);
+    let modifier = rng.choose(PACKAGE_MODIFIERS);
+    let shape = rng.gen_range(0u8, 4);
+    let unscoped = if shape == 0 {
+        (*keyword).to_string()
+    } else if shape == 1 {
+        format!("{}-{}", modifier, keyword)
+    } else if shape == 2 {
+        // The partner keyword is drawn only for this shape.
+        let partner = rng.choose(PACKAGE_KEYWORDS);
+        format!("{}-{}", keyword, partner)
+    } else {
+        format!("{}.js", keyword)
+    };
+    if rng.gen_range(0u8, 9) >= 3 {
+        // Plain, unscoped name.
+        return unscoped;
+    }
+    let scope = rng.choose(NPM_SCOPE_PREFIXES);
+    format!("@{}/{}", scope, unscoped)
+}
+
+batch_of!(
+    generate_npm_package_names,
+    generate_npm_package_name,
+    "Generate a batch of npm package names (mix of plain and scoped)."
+);
+
+/// Generate a Cargo-compatible crate name (lowercase ASCII with `-`
+/// or `_` separators — a single name never mixes the two).
+#[inline]
+pub fn generate_cargo_package_name(rng: &mut ForgeryRng) -> String {
+    // The separator is fixed once per name, so `-` and `_` never mix.
+    let joiner = match rng.gen_range(0u8, 9) {
+        0..=4 => "-",
+        _ => "_",
+    };
+    let keyword = rng.choose(PACKAGE_KEYWORDS);
+    let modifier = rng.choose(PACKAGE_MODIFIERS);
+    let shape = rng.gen_range(0u8, 4);
+    if shape == 0 {
+        return (*keyword).to_string();
+    }
+    if shape == 1 {
+        return format!("{}{}{}", modifier, joiner, keyword);
+    }
+    if shape == 2 {
+        return format!("{}{}rs", keyword, joiner);
+    }
+    // Two-keyword form; the second keyword is drawn only here.
+    let partner = rng.choose(PACKAGE_KEYWORDS);
+    format!("{}{}{}", keyword, joiner, partner)
+}
+
+batch_of!(
+    generate_cargo_package_names,
+    generate_cargo_package_name,
+    "Generate a batch of Cargo-compatible crate names."
+);
+
+/// Generate a RubyGems-style gem name.
+///
+/// Shapes emitted: bare keyword, two keywords, `ruby` + keyword, and
+/// keyword + `rb`. Compound shapes are joined by a single separator
+/// (`-` or `_`) chosen once per name.
+#[inline]
+pub fn generate_gem_name(rng: &mut ForgeryRng) -> String {
+    let joiner = match rng.gen_range(0u8, 9) {
+        0..=4 => "-",
+        _ => "_",
+    };
+    let keyword = rng.choose(PACKAGE_KEYWORDS);
+    let shape = rng.gen_range(0u8, 4);
+    if shape == 0 {
+        (*keyword).to_string()
+    } else if shape == 1 {
+        let partner = rng.choose(PACKAGE_KEYWORDS);
+        format!("{}{}{}", keyword, joiner, partner)
+    } else if shape == 2 {
+        format!("ruby{}{}", joiner, keyword)
+    } else {
+        format!("{}{}rb", keyword, joiner)
+    }
+}
+
+batch_of!(
+    generate_gem_names,
+    generate_gem_name,
+    "Generate a batch of RubyGems gem names."
+);
+
+/// Generate a Maven `groupId` in reverse-domain form: `tld.org`, or
+/// `tld.org.sub` (e.g., `com.example.tools`) when the depth roll asks
+/// for a third component.
+#[inline]
+pub fn generate_maven_group_id(rng: &mut ForgeryRng) -> String {
+    let tld = rng.choose(MAVEN_TLDS);
+    let org = rng.choose(MAVEN_ORG_COMPONENTS);
+    match rng.gen_range(0u8, 1) {
+        0 => format!("{}.{}", tld, org),
+        _ => {
+            // Third component comes from the same pool as the organisation.
+            let sub = rng.choose(MAVEN_ORG_COMPONENTS);
+            format!("{}.{}.{}", tld, org, sub)
+        }
+    }
+}
+
+batch_of!(
+    generate_maven_group_ids,
+    generate_maven_group_id,
+    "Generate a batch of Maven reverse-domain group IDs."
+);
+
+/// Generate a Maven `artifactId` (lowercase ASCII with hyphens).
+#[inline] +pub fn generate_maven_artifact_id(rng: &mut ForgeryRng) -> String { + let primary = rng.choose(PACKAGE_KEYWORDS); + let modifier = rng.choose(PACKAGE_MODIFIERS); + match rng.gen_range(0u8, 4) { + 0 => (*primary).to_string(), + 1 => format!("{}-{}", modifier, primary), + 2 => format!("{}-{}", primary, rng.choose(PACKAGE_KEYWORDS)), + _ => format!("{}-core", primary), + } +} + +batch_of!( + generate_maven_artifact_ids, + generate_maven_artifact_id, + "Generate a batch of Maven artifact IDs." +); + +/// Generate a Maven GAV coordinate (`groupId:artifactId:version`). +#[inline] +pub fn generate_maven_coordinate(rng: &mut ForgeryRng) -> String { + format!( + "{}:{}:{}", + generate_maven_group_id(rng), + generate_maven_artifact_id(rng), + generate_maven_version(rng) + ) +} + +batch_of!( + generate_maven_coordinates, + generate_maven_coordinate, + "Generate a batch of Maven GAV coordinates." +); + +// === Full requirements === + +/// Generate a full pip-install–style requirement line, e.g., +/// `requests>=2.0.0,<3.0.0`. +#[inline] +pub fn generate_pypi_requirement(rng: &mut ForgeryRng) -> String { + format!( + "{}{}", + generate_pypi_package_name(rng), + generate_pypi_version_specifier(rng) + ) +} + +batch_of!( + generate_pypi_requirements, + generate_pypi_requirement, + "Generate a batch of full pip-install requirement lines." 
+); + +#[cfg(test)] +mod tests { + use super::*; + + fn seeded() -> ForgeryRng { + let mut rng = ForgeryRng::new(); + rng.seed(42); + rng + } + + // --- commit SHAs --- + + #[test] + fn commit_sha_is_40_lowercase_hex() { + let mut rng = seeded(); + for _ in 0..100 { + let sha = generate_commit_sha(&mut rng); + assert_eq!(sha.len(), 40); + assert!(sha.chars().all(|c| c.is_ascii_hexdigit())); + assert_eq!(sha, sha.to_lowercase()); + } + } + + #[test] + fn short_commit_sha_is_7_lowercase_hex() { + let mut rng = seeded(); + for _ in 0..100 { + let sha = generate_short_commit_sha(&mut rng); + assert_eq!(sha.len(), 7); + assert!(sha.chars().all(|c| c.is_ascii_hexdigit())); + } + } + + #[test] + fn commit_shas_batch_size_respected() { + let mut rng = seeded(); + assert_eq!(generate_commit_shas(&mut rng, 0).len(), 0); + assert_eq!(generate_commit_shas(&mut rng, 250).len(), 250); + } + + #[test] + fn commit_sha_is_deterministic() { + let mut a = ForgeryRng::new(); + let mut b = ForgeryRng::new(); + a.seed(123); + b.seed(123); + assert_eq!( + generate_commit_shas(&mut a, 50), + generate_commit_shas(&mut b, 50) + ); + } + + // --- SemVer --- + + fn split_dots(v: &str) -> Vec<&str> { + v.split('.').collect() + } + + #[test] + fn semver_is_three_numeric_parts() { + let mut rng = seeded(); + for _ in 0..200 { + let v = generate_semver(&mut rng); + let parts = split_dots(&v); + assert_eq!(parts.len(), 3, "unexpected shape: {}", v); + for p in parts { + p.parse::() + .unwrap_or_else(|_| panic!("not numeric: {}", v)); + } + } + } + + #[test] + fn semver_prerelease_has_hyphen_tag() { + let mut rng = seeded(); + for _ in 0..200 { + let v = generate_semver_prerelease(&mut rng); + assert!(v.contains('-'), "missing pre-release: {}", v); + let (base, rest) = v.split_once('-').unwrap(); + assert_eq!(split_dots(base).len(), 3, "bad base: {}", v); + // rest starts with a known tag + assert!( + PRERELEASE_TAGS.iter().any(|t| rest.starts_with(*t)), + "bad tag: {}", + v + ); + } + } + + // --- 
CalVer --- + + #[test] + fn calver_has_at_least_two_components() { + let mut rng = seeded(); + for _ in 0..200 { + let v = generate_calver(&mut rng); + let parts = split_dots(&v); + assert!((2..=3).contains(&parts.len()), "unexpected shape: {}", v); + for p in parts { + p.parse::() + .unwrap_or_else(|_| panic!("not numeric: {}", v)); + } + } + } + + // --- SPDX --- + + #[test] + fn spdx_license_is_from_list() { + let mut rng = seeded(); + for _ in 0..200 { + let id = generate_spdx_license(&mut rng); + assert!(SPDX_LICENSES.contains(&id.as_str())); + } + } + + #[test] + fn spdx_list_has_50_entries() { + assert_eq!(SPDX_LICENSES.len(), 50); + } + + // --- git username --- + + #[test] + fn git_username_obeys_github_rules() { + let mut rng = seeded(); + for _ in 0..500 { + let u = generate_git_username(&mut rng); + assert!(!u.is_empty()); + assert!(u.len() <= 39, "too long: {}", u); + assert!(!u.starts_with('-'), "leading hyphen: {}", u); + assert!(!u.ends_with('-'), "trailing hyphen: {}", u); + assert!(!u.contains("--"), "consecutive hyphens: {}", u); + for c in u.chars() { + assert!( + c.is_ascii_alphanumeric() || c == '-', + "invalid char in {}: {:?}", + u, + c + ); + } + } + } + + // --- PyPI / Maven versions --- + + #[test] + fn pypi_version_is_ascii_and_starts_with_digit() { + let mut rng = seeded(); + for _ in 0..200 { + let v = generate_pypi_version(&mut rng); + assert!(v.is_ascii()); + assert!(v.chars().next().unwrap().is_ascii_digit(), "{}", v); + } + } + + #[test] + fn maven_version_is_ascii_and_starts_with_digit() { + let mut rng = seeded(); + for _ in 0..200 { + let v = generate_maven_version(&mut rng); + assert!(v.is_ascii()); + assert!(v.chars().next().unwrap().is_ascii_digit(), "{}", v); + } + } + + // --- Version constraints --- + + #[test] + fn pypi_specifier_starts_with_operator() { + let mut rng = seeded(); + for _ in 0..200 { + let s = generate_pypi_version_specifier(&mut rng); + assert!( + s.starts_with(">=") + || s.starts_with("~=") + || 
s.starts_with("==") + || s.starts_with("<=") + || s.starts_with('<') + || s.starts_with('>'), + "bad prefix: {}", + s + ); + } + } + + #[test] + fn npm_range_contains_caret_tilde_or_x() { + let mut rng = seeded(); + for _ in 0..200 { + let r = generate_npm_version_range(&mut rng); + let has_known = + r.starts_with('^') || r.starts_with('~') || r.starts_with(">=") || r.contains('x'); + assert!(has_known, "unrecognised npm range: {}", r); + } + } + + #[test] + fn cargo_req_well_formed() { + let mut rng = seeded(); + for _ in 0..200 { + let r = generate_cargo_version_req(&mut rng); + assert!(!r.is_empty()); + assert!(r.is_ascii()); + } + } + + #[test] + fn maven_range_has_brackets() { + let mut rng = seeded(); + for _ in 0..200 { + let r = generate_maven_version_range(&mut rng); + assert!( + r.starts_with('[') || r.starts_with('('), + "bad maven range: {}", + r + ); + assert!( + r.ends_with(']') || r.ends_with(')'), + "bad maven range: {}", + r + ); + } + } + + #[test] + fn gem_requirement_uses_rubygems_operators() { + let mut rng = seeded(); + for _ in 0..200 { + let r = generate_gem_version_requirement(&mut rng); + assert!(r.starts_with("~>") || r.starts_with(">="), "bad: {}", r); + } + } + + // --- Package names --- + + #[test] + fn pypi_package_name_is_pep503_normalized() { + let mut rng = seeded(); + for _ in 0..300 { + let n = generate_pypi_package_name(&mut rng); + assert!(!n.is_empty()); + assert_eq!(n, n.to_lowercase()); + for c in n.chars() { + assert!( + c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-', + "bad char in {} (PEP 503 normalized accepts only [a-z0-9-]): {:?}", + n, + c + ); + } + assert!(!n.starts_with('-'), "leading hyphen: {}", n); + assert!(!n.ends_with('-'), "trailing hyphen: {}", n); + assert!(!n.contains("--"), "double hyphen: {}", n); + } + } + + #[test] + fn npm_package_name_valid_plain_or_scoped() { + let mut rng = seeded(); + let mut saw_scoped = false; + for _ in 0..1000 { + let n = generate_npm_package_name(&mut rng); + 
assert!(!n.is_empty() && n.len() <= 214); + assert_eq!(n, n.to_lowercase()); + if let Some(rest) = n.strip_prefix('@') { + saw_scoped = true; + let (scope, pkg) = rest + .split_once('/') + .unwrap_or_else(|| panic!("bad scope: {}", n)); + assert!(!scope.is_empty() && !pkg.is_empty()); + } else { + assert!(!n.starts_with('.') && !n.starts_with('_')); + } + } + assert!(saw_scoped, "expected at least one scoped name in 1000"); + } + + #[test] + fn cargo_package_name_separator_consistent() { + let mut rng = seeded(); + for _ in 0..300 { + let n = generate_cargo_package_name(&mut rng); + // Should not mix both - and _ in a single name. + let has_hyphen = n.contains('-'); + let has_underscore = n.contains('_'); + assert!(!(has_hyphen && has_underscore), "mixed separators: {}", n); + } + } + + #[test] + fn maven_group_id_is_reverse_domain() { + let mut rng = seeded(); + for _ in 0..300 { + let g = generate_maven_group_id(&mut rng); + let parts = split_dots(&g); + assert!((2..=3).contains(&parts.len()), "bad group id: {}", g); + assert!(MAVEN_TLDS.contains(&parts[0]), "bad tld: {}", g); + } + } + + #[test] + fn maven_coordinate_is_gav() { + let mut rng = seeded(); + for _ in 0..300 { + let c = generate_maven_coordinate(&mut rng); + let parts: Vec<&str> = c.split(':').collect(); + assert_eq!(parts.len(), 3, "not GAV: {}", c); + assert!(!parts[0].is_empty() && !parts[1].is_empty() && !parts[2].is_empty()); + } + } + + // --- PyPI requirement --- + + #[test] + fn pypi_requirement_has_operator_after_name() { + let mut rng = seeded(); + for _ in 0..300 { + let r = generate_pypi_requirement(&mut rng); + // Find the first operator char. + let op_idx = r + .find(['>', '<', '=', '~', '!']) + .unwrap_or_else(|| panic!("no operator: {}", r)); + assert!(op_idx > 0, "empty name: {}", r); + } + } + + // --- Determinism / batch sanity --- + + #[test] + fn all_generators_are_deterministic() { + macro_rules! 
check { + ($fn:ident) => {{ + let mut a = ForgeryRng::new(); + let mut b = ForgeryRng::new(); + a.seed(7); + b.seed(7); + assert_eq!($fn(&mut a, 20), $fn(&mut b, 20), stringify!($fn)); + }}; + } + check!(generate_commit_shas); + check!(generate_short_commit_shas); + check!(generate_semvers); + check!(generate_semver_prereleases); + check!(generate_calvers); + check!(generate_spdx_licenses); + check!(generate_git_usernames); + check!(generate_pypi_versions); + check!(generate_maven_versions); + check!(generate_pypi_version_specifiers); + check!(generate_npm_version_ranges); + check!(generate_cargo_version_reqs); + check!(generate_maven_version_ranges); + check!(generate_gem_version_requirements); + check!(generate_pypi_package_names); + check!(generate_npm_package_names); + check!(generate_cargo_package_names); + check!(generate_gem_names); + check!(generate_maven_group_ids); + check!(generate_maven_artifact_ids); + check!(generate_maven_coordinates); + check!(generate_pypi_requirements); + } + + #[test] + fn empty_batch_is_empty() { + let mut rng = seeded(); + assert!(generate_semvers(&mut rng, 0).is_empty()); + assert!(generate_maven_coordinates(&mut rng, 0).is_empty()); + assert!(generate_spdx_licenses(&mut rng, 0).is_empty()); + } +} + +#[cfg(test)] +mod proptest_tests { + use super::*; + use proptest::prelude::*; + + proptest! 
{ + #[test] + fn prop_commit_sha_batch(n in 0usize..1000) { + let mut rng = ForgeryRng::new(); + rng.seed(1); + prop_assert_eq!(generate_commit_shas(&mut rng, n).len(), n); + } + + #[test] + fn prop_semver_batch(n in 0usize..1000) { + let mut rng = ForgeryRng::new(); + rng.seed(1); + prop_assert_eq!(generate_semvers(&mut rng, n).len(), n); + } + + #[test] + fn prop_seed_determinism_semver(seed in any::(), n in 1usize..50) { + let mut a = ForgeryRng::new(); + let mut b = ForgeryRng::new(); + a.seed(seed); + b.seed(seed); + prop_assert_eq!(generate_semvers(&mut a, n), generate_semvers(&mut b, n)); + } + + #[test] + fn prop_git_username_well_formed(n in 1usize..200) { + let mut rng = ForgeryRng::new(); + rng.seed(1); + for u in generate_git_usernames(&mut rng, n) { + prop_assert!(!u.is_empty() && u.len() <= 39); + prop_assert!(!u.starts_with('-') && !u.ends_with('-')); + prop_assert!(!u.contains("--")); + } + } + + #[test] + fn prop_maven_coord_gav(n in 1usize..200) { + let mut rng = ForgeryRng::new(); + rng.seed(1); + for c in generate_maven_coordinates(&mut rng, n) { + prop_assert_eq!(c.split(':').count(), 3); + } + } + } +} diff --git a/tests/test_packages.py b/tests/test_packages.py new file mode 100644 index 0000000..aca850e --- /dev/null +++ b/tests/test_packages.py @@ -0,0 +1,562 @@ +"""Tests for package-registry data generation. + +Covers cross-ecosystem primitives (commit SHAs, SemVer, CalVer, SPDX, Git +usernames) and ecosystem-specific shapes (PyPI / npm / Cargo / RubyGems / +Maven package names, versions, and constraint syntaxes). 
+""" + +from __future__ import annotations + +import re +from typing import ClassVar + +import pytest + +from forgery import Faker + +# --------------------------------------------------------------------------- +# Shared fixtures / helpers +# --------------------------------------------------------------------------- + + +@pytest.fixture +def fake() -> Faker: + f = Faker() + f.seed(42) + return f + + +SEMVER_RE = re.compile(r"^\d+\.\d+\.\d+$") +HEX_RE = re.compile(r"^[0-9a-f]+$") + + +# --------------------------------------------------------------------------- +# Cross-ecosystem primitives +# --------------------------------------------------------------------------- + + +class TestCommitSha: + def test_single_is_40_lowercase_hex(self, fake: Faker) -> None: + sha = fake.commit_sha() + assert len(sha) == 40 + assert HEX_RE.match(sha) + assert sha == sha.lower() + + def test_batch_size_respected(self, fake: Faker) -> None: + assert fake.commit_shas(0) == [] + assert len(fake.commit_shas(500)) == 500 + + def test_deterministic(self) -> None: + a, b = Faker(), Faker() + a.seed(123) + b.seed(123) + assert a.commit_shas(50) == b.commit_shas(50) + + def test_short_form_is_7_hex(self, fake: Faker) -> None: + for _ in range(50): + s = fake.short_commit_sha() + assert len(s) == 7 + assert HEX_RE.match(s) + + def test_short_batch(self, fake: Faker) -> None: + assert len(fake.short_commit_shas(250)) == 250 + + +class TestSemver: + def test_format(self, fake: Faker) -> None: + for _ in range(200): + assert SEMVER_RE.match(fake.semver()) + + def test_batch(self, fake: Faker) -> None: + batch = fake.semvers(300) + assert len(batch) == 300 + for v in batch: + assert SEMVER_RE.match(v) + + def test_deterministic(self) -> None: + a, b = Faker(), Faker() + a.seed(7) + b.seed(7) + assert a.semvers(100) == b.semvers(100) + + +class TestSemverPrerelease: + PRERELEASE_TAGS: ClassVar[set[str]] = {"alpha", "beta", "rc", "pre", "dev"} + + def test_has_prerelease_tag(self, fake: Faker) -> 
None: + for _ in range(200): + v = fake.semver_prerelease() + assert "-" in v + base, _, rest = v.partition("-") + assert SEMVER_RE.match(base) + # rest starts with one of the known tags + tag_match = any(rest.startswith(tag) for tag in self.PRERELEASE_TAGS) + assert tag_match, f"unknown tag in: {v}" + + def test_sometimes_has_build_metadata(self, fake: Faker) -> None: + batch = fake.semver_prereleases(500) + assert any("+build." in v for v in batch), "expected at least one build suffix" + + +class TestCalver: + def test_numeric_parts_only(self, fake: Faker) -> None: + for _ in range(200): + v = fake.calver() + parts = v.split(".") + assert 2 <= len(parts) <= 3 + for p in parts: + assert p.isdigit(), f"non-numeric in: {v}" + + def test_batch(self, fake: Faker) -> None: + assert len(fake.calvers(100)) == 100 + + +class TestSpdx: + KNOWN_IDS: ClassVar[set[str]] = { + "MIT", + "Apache-2.0", + "BSD-2-Clause", + "BSD-3-Clause", + "GPL-3.0-only", + "ISC", + "MPL-2.0", + "Unlicense", + } + + def test_from_fixed_list(self, fake: Faker) -> None: + seen = {fake.spdx_license() for _ in range(200)} + # At least a few of the very common ones should appear across 200 draws. + assert seen & self.KNOWN_IDS, "none of the common SPDX IDs appeared" + + def test_batch(self, fake: Faker) -> None: + batch = fake.spdx_licenses(50) + assert len(batch) == 50 + for v in batch: + # Every entry is a non-empty string with no internal spaces. 
+ assert v and " " not in v + + +class TestGitUsername: + RULES = re.compile(r"^[A-Za-z0-9]([A-Za-z0-9]|-(?!-))*[A-Za-z0-9]$|^[A-Za-z0-9]$") + + def test_github_rules(self, fake: Faker) -> None: + for _ in range(500): + u = fake.git_username() + assert u, "empty username" + assert len(u) <= 39, f"too long: {u}" + assert not u.startswith("-"), f"leading hyphen: {u}" + assert not u.endswith("-"), f"trailing hyphen: {u}" + assert "--" not in u, f"double hyphen: {u}" + assert self.RULES.match(u), f"invalid chars: {u}" + + def test_batch(self, fake: Faker) -> None: + assert len(fake.git_usernames(200)) == 200 + + +# --------------------------------------------------------------------------- +# Ecosystem-specific versions +# --------------------------------------------------------------------------- + + +class TestPypiVersion: + """PEP 440 coverage. Uses packaging.version if available; otherwise + falls back to a structural check.""" + + def test_starts_with_digit(self, fake: Faker) -> None: + for _ in range(200): + v = fake.pypi_version() + assert v[0].isdigit(), f"bad leading char: {v}" + assert v.isascii() + + def test_parseable_by_packaging_if_available(self, fake: Faker) -> None: + packaging_version = pytest.importorskip("packaging.version") + for _ in range(200): + v = fake.pypi_version() + # raises InvalidVersion on failure + packaging_version.Version(v) + + def test_batch(self, fake: Faker) -> None: + assert len(fake.pypi_versions(150)) == 150 + + +class TestMavenVersion: + def test_starts_with_digit(self, fake: Faker) -> None: + for _ in range(200): + v = fake.maven_version() + assert v[0].isdigit(), v + assert v.isascii() + + def test_sometimes_has_qualifier(self, fake: Faker) -> None: + # Maven qualifiers appear as either a hyphen (`-SNAPSHOT`) or a + # fourth dotted segment (`.Final`, `.RELEASE`). The base is + # `MAJOR.MINOR.PATCH` — exactly two dots — so a dot qualifier is + # anything with more than two dots. 
+ batch = fake.maven_versions(500) + assert any("-" in v or v.count(".") > 2 for v in batch) + + +# --------------------------------------------------------------------------- +# Version constraints +# --------------------------------------------------------------------------- + + +class TestPypiSpecifier: + def test_starts_with_operator(self, fake: Faker) -> None: + ops = (">=", "<=", "==", "~=", ">", "<") + for _ in range(200): + s = fake.pypi_version_specifier() + assert s.startswith(ops), f"bad prefix: {s}" + + def test_parseable_by_packaging_if_available(self, fake: Faker) -> None: + packaging_specifiers = pytest.importorskip("packaging.specifiers") + for _ in range(200): + s = fake.pypi_version_specifier() + packaging_specifiers.SpecifierSet(s) + + +class TestNpmRange: + def test_contains_known_syntax(self, fake: Faker) -> None: + for _ in range(200): + r = fake.npm_version_range() + assert r.startswith("^") or r.startswith("~") or r.startswith(">=") or "x" in r, ( + f"bad range: {r}" + ) + + +class TestCargoReq: + def test_ascii_and_nonempty(self, fake: Faker) -> None: + for _ in range(200): + r = fake.cargo_version_req() + assert r + assert r.isascii() + + +class TestMavenRange: + def test_has_brackets(self, fake: Faker) -> None: + for _ in range(200): + r = fake.maven_version_range() + assert r[0] in "[(" and r[-1] in "])", f"bad range: {r}" + + +class TestGemRequirement: + def test_rubygems_operators(self, fake: Faker) -> None: + for _ in range(200): + r = fake.gem_version_requirement() + assert r.startswith("~>") or r.startswith(">="), r + + +# --------------------------------------------------------------------------- +# Package names +# --------------------------------------------------------------------------- + + +class TestPypiPackageName: + # PEP 503 normalized form: lowercase ASCII + hyphens only + PEP503_NORMALIZED = re.compile(r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$") + + def test_pep503_normalized(self, fake: Faker) -> None: + for _ in range(300): + 
n = fake.pypi_package_name() + assert self.PEP503_NORMALIZED.match(n), f"not PEP 503 normalized: {n}" + # PEP 503 normalization collapses `[-_.]+` to `-`; output should + # never contain the pre-normalized separators. + assert "_" not in n, f"underscore in normalized name: {n}" + assert "." not in n, f"dot in normalized name: {n}" + + def test_lowercase(self, fake: Faker) -> None: + for n in fake.pypi_package_names(200): + assert n == n.lower() + + +class TestNpmPackageName: + def test_length_bound(self, fake: Faker) -> None: + for n in fake.npm_package_names(500): + assert 0 < len(n) <= 214 + + def test_lowercase(self, fake: Faker) -> None: + for n in fake.npm_package_names(200): + assert n == n.lower() + + def test_some_scoped(self, fake: Faker) -> None: + batch = fake.npm_package_names(500) + scoped = [n for n in batch if n.startswith("@")] + assert scoped, "expected at least one scoped name" + for s in scoped: + assert "/" in s + scope, pkg = s[1:].split("/", 1) + assert scope and pkg + + +class TestCargoPackageName: + def test_ascii_lowercase(self, fake: Faker) -> None: + for n in fake.cargo_package_names(300): + assert n.isascii() + assert n == n.lower() + + def test_no_mixed_separators(self, fake: Faker) -> None: + # A given name should not mix both '-' and '_'. 
+ for n in fake.cargo_package_names(300): + assert not ("-" in n and "_" in n), f"mixed separators in {n}" + + +class TestGemName: + def test_ascii_lowercase(self, fake: Faker) -> None: + for n in fake.gem_names(200): + assert n.isascii() + assert n == n.lower() + + +class TestMaven: + TLDS: ClassVar[set[str]] = { + "com", + "org", + "io", + "net", + "dev", + "co", + "tech", + "app", + "xyz", + } + + def test_group_id_is_reverse_domain(self, fake: Faker) -> None: + for _ in range(300): + g = fake.maven_group_id() + parts = g.split(".") + assert 2 <= len(parts) <= 3 + assert parts[0] in self.TLDS + + def test_artifact_id_nonempty_ascii(self, fake: Faker) -> None: + for _ in range(200): + a = fake.maven_artifact_id() + assert a and a.isascii() and a == a.lower() + + def test_coordinate_is_gav(self, fake: Faker) -> None: + for _ in range(300): + c = fake.maven_coordinate() + parts = c.split(":") + assert len(parts) == 3, f"not GAV: {c}" + for p in parts: + assert p + + +# --------------------------------------------------------------------------- +# Full requirement lines +# --------------------------------------------------------------------------- + + +class TestPypiRequirement: + OPERATORS: ClassVar[tuple[str, ...]] = ("==", "!=", ">=", "<=", "~=", ">", "<") + + def test_has_name_and_specifier(self, fake: Faker) -> None: + for _ in range(200): + r = fake.pypi_requirement() + # Find the operator boundary. 
+ op_idx = next( + (i for i, c in enumerate(r) if c in "<>=~!"), + -1, + ) + assert op_idx > 0, f"no operator in: {r}" + name, specifier = r[:op_idx], r[op_idx:] + assert name + assert specifier.startswith(self.OPERATORS) + + +# --------------------------------------------------------------------------- +# Determinism, parallel output, and error paths +# --------------------------------------------------------------------------- + + +class TestDeterminism: + @pytest.mark.parametrize( + "method, n", + [ + ("commit_shas", 50), + ("short_commit_shas", 50), + ("semvers", 50), + ("semver_prereleases", 50), + ("calvers", 50), + ("spdx_licenses", 50), + ("git_usernames", 50), + ("pypi_versions", 50), + ("maven_versions", 50), + ("pypi_version_specifiers", 50), + ("npm_version_ranges", 50), + ("cargo_version_reqs", 50), + ("maven_version_ranges", 50), + ("gem_version_requirements", 50), + ("pypi_package_names", 50), + ("npm_package_names", 50), + ("cargo_package_names", 50), + ("gem_names", 50), + ("maven_group_ids", 50), + ("maven_artifact_ids", 50), + ("maven_coordinates", 50), + ("pypi_requirements", 50), + ], + ) + def test_seed_determinism(self, method: str, n: int) -> None: + a, b = Faker(), Faker() + a.seed(2026) + b.seed(2026) + assert getattr(a, method)(n) == getattr(b, method)(n) + + +class TestParallelProducesSameShape: + """set_parallel(True) changes ordering but every element should still + satisfy the same shape invariants.""" + + def test_commit_sha_parallel(self) -> None: + fake = Faker() + fake.seed(11) + fake.set_parallel(True, num_threads=4) + for s in fake.commit_shas(2000): + assert len(s) == 40 and HEX_RE.match(s) + + def test_maven_coordinate_parallel(self) -> None: + fake = Faker() + fake.seed(11) + fake.set_parallel(True, num_threads=4) + for c in fake.maven_coordinates(2000): + assert c.count(":") == 2 + + +class TestErrorPaths: + def test_empty_batch(self, fake: Faker) -> None: + assert fake.semvers(0) == [] + assert fake.maven_coordinates(0) == 
[] + + def test_batch_size_over_limit_raises(self, fake: Faker) -> None: + with pytest.raises(ValueError): + fake.semvers(10_000_001) + + +class TestUnique: + """`unique=True` deduplicates and raises when the pool is exhausted.""" + + UNIQUE_METHODS: ClassVar[list[str]] = [ + "pypi_package_names", + "npm_package_names", + "cargo_package_names", + "gem_names", + "maven_group_ids", + "maven_artifact_ids", + "maven_coordinates", + "git_usernames", + "spdx_licenses", + ] + + @pytest.mark.parametrize("method", UNIQUE_METHODS) + def test_unique_batch_has_no_duplicates(self, method: str) -> None: + f = Faker() + f.seed(42) + batch = getattr(f, method)(50, unique=True) + assert len(batch) == 50 + assert len(set(batch)) == 50, f"{method} produced duplicates under unique=True" + + @pytest.mark.parametrize("method", UNIQUE_METHODS) + def test_unique_is_deterministic(self, method: str) -> None: + a, b = Faker(), Faker() + a.seed(2026) + b.seed(2026) + assert getattr(a, method)(30, unique=True) == getattr(b, method)(30, unique=True) + + def test_spdx_exhaustion_raises(self, fake: Faker) -> None: + # The built-in SPDX list has 50 entries; asking for 60 unique is impossible. + with pytest.raises(ValueError): + fake.spdx_licenses(60, unique=True) + + def test_non_unique_still_works(self, fake: Faker) -> None: + # Sanity: unique=False (the default) path is unchanged. + batch = fake.pypi_package_names(200) + assert len(batch) == 200 + # Almost certainly has some duplicates across 200 draws from the pool. + # This is a loose assertion — we just want to confirm non-unique returns n. 
+ + +# --------------------------------------------------------------------------- +# Module-level convenience functions +# --------------------------------------------------------------------------- + + +class TestModuleConvenience: + def test_convenience_functions_importable(self) -> None: + from forgery import ( + calver, + cargo_package_name, + commit_sha, + gem_name, + git_username, + maven_coordinate, + npm_package_name, + pypi_package_name, + pypi_requirement, + semver, + spdx_license, + ) + + assert isinstance(commit_sha(), str) + assert isinstance(semver(), str) + assert isinstance(calver(), str) + assert isinstance(spdx_license(), str) + assert isinstance(git_username(), str) + assert isinstance(pypi_package_name(), str) + assert isinstance(npm_package_name(), str) + assert isinstance(cargo_package_name(), str) + assert isinstance(gem_name(), str) + assert isinstance(maven_coordinate(), str) + assert isinstance(pypi_requirement(), str) + + def test_seed_applies_to_convenience(self) -> None: + from forgery import seed, semvers + + seed(99) + first = semvers(10) + seed(99) + second = semvers(10) + assert first == second + + # One entry per module-level function (single + batch) so every path is + # exercised and coverage stays above threshold. 
+ _SINGLE_FUNCS: ClassVar[list[str]] = [ + "commit_sha", + "short_commit_sha", + "semver", + "semver_prerelease", + "calver", + "spdx_license", + "git_username", + "pypi_version", + "maven_version", + "pypi_version_specifier", + "npm_version_range", + "cargo_version_req", + "maven_version_range", + "gem_version_requirement", + "pypi_package_name", + "npm_package_name", + "cargo_package_name", + "gem_name", + "maven_group_id", + "maven_artifact_id", + "maven_coordinate", + "pypi_requirement", + ] + + @pytest.mark.parametrize("fn_name", _SINGLE_FUNCS) + def test_every_single_convenience_returns_nonempty_str(self, fn_name: str) -> None: + import forgery + + fn = getattr(forgery, fn_name) + value = fn() + assert isinstance(value, str) and value + + @pytest.mark.parametrize("fn_name", _SINGLE_FUNCS) + def test_every_batch_convenience_returns_list(self, fn_name: str) -> None: + import forgery + + batch_fn = getattr(forgery, f"{fn_name}s") + result = batch_fn(5) + assert isinstance(result, list) + assert len(result) == 5 + assert all(isinstance(v, str) and v for v in result)