Speed up bulk card import logic

import requests
import gzip
import os
import pandas as pd
import json

# -----------------------
# Configuration
# -----------------------
# Endpoint requested by get_bulk_info(); its JSON response carries a "data" list
# with one entry per available bulk file.
SCRYFALL_BULK_URL = "https://api.scryfall.com/bulk-data"
# Value matched against each entry's "type" field to select the bulk file.
BULK_TYPE = "default_cards"
# CSV inputs; the main script reads the "scryfallId" column from each of them.
DRAFTED_DECKS_PATH = "data/processed/drafted_decks.csv"
CUBE_MAINBOARD_PATH = "data/processed/cube_mainboard.csv"
# Directory (created on startup) and gzip-compressed JSON file for the filtered cards.
OUTPUT_DIR = "data/cards"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "scryfall_filtered_cards.json.gz")
# Sidecar cache files stored next to OUTPUT_FILE: the bulk file's "updated_at"
# value and the JSON list of card IDs used for the last download. The main
# script compares both against current values to decide whether to skip work.
TIMESTAMP_FILE = OUTPUT_FILE + ".timestamp"
IDS_FILE = OUTPUT_FILE + ".ids"

# -----------------------
# Cache Helpers
# -----------------------
def load_cached_timestamp() -> str | None:
    if os.path.exists(TIMESTAMP_FILE):
        with open(TIMESTAMP_FILE) as f:
            return f.read().strip()
    return None

def save_cached_timestamp(updated_at: str):
    """Store updated_at in TIMESTAMP_FILE, replacing any previous contents."""
    with open(TIMESTAMP_FILE, mode="w") as handle:
        handle.write(updated_at)

def load_cached_ids() -> set:
    """Return the set of card IDs stored in IDS_FILE.

    Returns an empty set when the file is missing, cannot be parsed as JSON
    (e.g. it was truncated by an interrupted write), or does not contain a
    JSON array. An empty result never equals a non-empty ID set, so callers
    that compare against it will simply refresh their data instead of
    crashing on every run until the cache file is deleted by hand.
    """
    try:
        with open(IDS_FILE, encoding="utf-8") as f:
            payload = json.load(f)
    except FileNotFoundError:
        return set()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; either one means the cache content is unusable.
        print(f"Ignoring unreadable ID cache: {IDS_FILE}")
        return set()
    if not isinstance(payload, list):
        print(f"Ignoring malformed ID cache: {IDS_FILE}")
        return set()
    return set(payload)

def save_cached_ids(drafted_ids: set):
    """Write drafted_ids to IDS_FILE as a JSON array, replacing earlier contents."""
    with open(IDS_FILE, mode="w") as handle:
        # JSON has no set type, so the IDs are serialized as a list.
        json.dump([*drafted_ids], handle)

# -----------------------
# Helper Functions
# -----------------------
def get_bulk_info(bulk_type: str = BULK_TYPE, timeout: float = 30.0) -> dict:
    """Fetch metadata for a specific bulk data type.

    Args:
        bulk_type: Value to match against each listing entry's "type" field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits forever on a stalled connection.

    Returns:
        The first entry of the listing's "data" array whose "type" equals
        bulk_type.

    Raises:
        requests.HTTPError: If the listing request returns an error status.
        requests.Timeout: If the server does not respond within timeout.
        ValueError: If no entry matches bulk_type.
    """
    response = requests.get(SCRYFALL_BULK_URL, timeout=timeout)
    response.raise_for_status()
    for bulk in response.json()["data"]:
        if bulk["type"] == bulk_type:
            return bulk
    raise ValueError(f"No bulk file found for type '{bulk_type}'")

def download_and_filter_bulk(
    url: str, drafted_ids: set, save_path: str, timeout: float = 60.0
):
    """Download the bulk file, filter by drafted_ids, and save gzip JSON.

    Args:
        url: Location of the bulk JSON file (an array of card objects).
        drafted_ids: Card IDs to keep; a card is kept when its "id" is in
            this set.
        save_path: Destination for the gzip-compressed JSON array.
        timeout: Seconds to wait on the connection and between received
            bytes before giving up. Without it, a stalled transfer would
            hang the script forever.

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.Timeout: If the server stops responding within timeout.

    The result is written to a temporary sibling file and moved into place
    only once complete, so a failure part-way through never leaves a
    truncated archive at save_path.
    """
    print("Downloading Scryfall bulk data...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    filtered_cards = [card for card in data if card["id"] in drafted_ids]
    total_cards = len(data)
    # The raw body and full card list are no longer needed; drop them so
    # they are not held in memory while the archive is being written.
    del data, response
    print(f"Filtered {len(filtered_cards)} cards out of {total_cards} total.")

    tmp_path = f"{save_path}.tmp"
    try:
        with gzip.open(tmp_path, "wt", encoding="utf-8") as f:
            json.dump(filtered_cards, f)
        # Atomic swap: readers see either the old file or the complete new one.
        os.replace(tmp_path, save_path)
    finally:
        # Only still present when the write or the swap failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Saved filtered cards to: {save_path}")

# -----------------------
# Main Script
# -----------------------
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Gather every Scryfall ID referenced by the drafted decks or the cube
    # mainboard; rows without an ID are ignored.
    print("Loading drafted deck IDs...")
    drafted_decks = pd.read_csv(DRAFTED_DECKS_PATH)
    cube_mainboard = pd.read_csv(CUBE_MAINBOARD_PATH)
    drafted_ids = (
        set(drafted_decks["scryfallId"].dropna().unique())
        | set(cube_mainboard["scryfallId"].dropna().unique())
    )
    print(f"Found {len(drafted_ids)} unique card IDs.")

    # Check remote Scryfall metadata
    print("Fetching Scryfall bulk metadata...")
    bulk_info = get_bulk_info()
    remote_updated_at = bulk_info["updated_at"]

    # Compare against the state recorded by the previous successful run.
    cached_timestamp = load_cached_timestamp()
    cached_ids = load_cached_ids()
    ids_changed = drafted_ids != cached_ids
    scryfall_updated = remote_updated_at != cached_timestamp
    output_missing = not os.path.exists(OUTPUT_FILE)

    if not (output_missing or scryfall_updated or ids_changed):
        print(f"Scryfall data is current ({remote_updated_at}) and cube is unchanged. Skipping download.")
    else:
        # Report the first applicable reason for refreshing.
        if output_missing:
            print("No local card data found, downloading...")
        elif scryfall_updated:
            print(f"Scryfall updated ({cached_timestamp} -> {remote_updated_at}), downloading...")
        else:
            # Only the ID set differs. Report removals as well as additions so
            # a removal-only change is not announced as "0 new card(s)".
            new_ids = drafted_ids - cached_ids
            removed_ids = cached_ids - drafted_ids
            print(
                f"Cube has {len(new_ids)} new and {len(removed_ids)} removed card(s), downloading..."
            )

        download_and_filter_bulk(bulk_info["download_uri"], drafted_ids, OUTPUT_FILE)
        # Record the cache state only after the download succeeded, so a
        # failed run is retried next time.
        save_cached_timestamp(remote_updated_at)
        save_cached_ids(drafted_ids)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Speed up bulk card import logic #52

-----------------------

Configuration

-----------------------

-----------------------

Cache Helpers

-----------------------

-----------------------

Helper Functions

-----------------------

-----------------------

Main Script

-----------------------

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Speed up bulk card import logic #52

Description

-----------------------

Configuration

-----------------------

-----------------------

Cache Helpers

-----------------------

-----------------------

Helper Functions

-----------------------

-----------------------

Main Script

-----------------------

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions