Skip to content

Speed up bulk card import logic #52

@GuySchnidrig

Description

@GuySchnidrig

import requests
import gzip
import os
import pandas as pd
import json

# -----------------------
# Configuration
# -----------------------

# Scryfall endpoint whose JSON "data" list describes the available bulk files.
SCRYFALL_BULK_URL = "https://api.scryfall.com/bulk-data"
# Value matched against each listing entry's "type" field to pick one bulk file.
BULK_TYPE = "default_cards"

# CSV inputs; both are read for their "scryfallId" column.
DRAFTED_DECKS_PATH = "data/processed/drafted_decks.csv"
CUBE_MAINBOARD_PATH = "data/processed/cube_mainboard.csv"

# Where the filtered, gzip-compressed card JSON is written.
OUTPUT_DIR = "data/cards"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "scryfall_filtered_cards.json.gz")

# Sidecar cache files stored next to the output: the "updated_at" value of the
# bulk file last downloaded, and the JSON list of card IDs it was filtered by.
TIMESTAMP_FILE = f"{OUTPUT_FILE}.timestamp"
IDS_FILE = f"{OUTPUT_FILE}.ids"

# -----------------------
# Cache Helpers
# -----------------------

def load_cached_timestamp() -> str | None:
if os.path.exists(TIMESTAMP_FILE):
with open(TIMESTAMP_FILE) as f:
return f.read().strip()
return None

def save_cached_timestamp(updated_at: str):
    """Write ``updated_at`` to TIMESTAMP_FILE, replacing any previous contents."""
    with open(TIMESTAMP_FILE, mode="w") as timestamp_out:
        timestamp_out.write(updated_at)

def load_cached_ids() -> set:
    """Return the set of card IDs stored in IDS_FILE.

    The file is a cache, so anything that prevents reading a JSON list from it
    is treated as a cache miss rather than an error.

    Returns:
        The IDs as a set, or an empty set when the file is missing, is not
        valid JSON text, or does not contain a JSON array.
    """
    try:
        with open(IDS_FILE) as f:
            payload = json.load(f)
    except FileNotFoundError:
        return set()
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError, e.g. a file left
        # truncated by an interrupted write. Returning an empty set makes the
        # caller see the IDs as changed instead of crashing on every run.
        return set()

    # Only a JSON array is a valid cache; set() on a dict or string would
    # silently produce keys or characters instead of IDs.
    if not isinstance(payload, list):
        return set()
    return set(payload)

def save_cached_ids(drafted_ids: set):
    """Write ``drafted_ids`` to IDS_FILE as a JSON array, in sorted order.

    Set iteration order for strings varies between interpreter runs, so
    dumping ``list(drafted_ids)`` directly would produce a different file for
    the same IDs. Sorting keeps the file byte-stable.

    Args:
        drafted_ids: The card IDs to persist.
    """
    # key=str keeps the ordering well-defined even if the set mixes types.
    ordered_ids = sorted(drafted_ids, key=str)
    with open(IDS_FILE, "w") as f:
        json.dump(ordered_ids, f)

# -----------------------
# Helper Functions
# -----------------------

def get_bulk_info(bulk_type: str = BULK_TYPE, timeout: float = 30.0) -> dict:
    """Fetch metadata for a specific bulk data type.

    Requests SCRYFALL_BULK_URL and scans the ``"data"`` list of the JSON
    response for the first entry whose ``"type"`` equals ``bulk_type``.

    Args:
        bulk_type: The ``"type"`` value to look for.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The matching entry from the listing.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        ValueError: If no entry has the requested type.
    """
    response = requests.get(SCRYFALL_BULK_URL, timeout=timeout)
    response.raise_for_status()
    for bulk in response.json()["data"]:
        if bulk["type"] == bulk_type:
            return bulk
    raise ValueError(f"No bulk file found for type '{bulk_type}'")

def download_and_filter_bulk(url: str, drafted_ids: set, save_path: str, timeout: float = 120.0):
    """Download the bulk file, filter by drafted_ids, and save gzip JSON.

    The response body is parsed as a JSON list of card objects; only cards
    whose ``"id"`` is in ``drafted_ids`` are kept.

    Args:
        url: Download location of the bulk file.
        drafted_ids: Card IDs to keep.
        save_path: Destination of the gzip-compressed JSON output.
        timeout: Seconds to wait on the connection before giving up. Without
            a timeout, ``requests`` can block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server stops responding for ``timeout``
            seconds.
    """
    print("Downloading Scryfall bulk data...")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    filtered_cards = [card for card in data if card["id"] in drafted_ids]
    print(f"Filtered {len(filtered_cards)} cards out of {len(data)} total.")

    # Write to a sibling temp file and swap it in, so an interrupted run never
    # leaves a truncated gzip at save_path for later readers.
    tmp_path = save_path + ".tmp"
    try:
        with gzip.open(tmp_path, "wt", encoding="utf-8") as f:
            json.dump(filtered_cards, f)
        os.replace(tmp_path, save_path)
    finally:
        # After a successful replace the temp file is already gone; this only
        # cleans up after a failed write.
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
    print(f"Saved filtered cards to: {save_path}")

# -----------------------
# Main Script
# -----------------------

# Entry point: refresh the filtered card file only when something changed.
# The guard must compare __name__ to "__main__"; a bare `name` is undefined
# and raises NameError as soon as the file is run.
if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load drafted deck Scryfall IDs
    print("Loading drafted deck IDs...")
    drafted_decks = pd.read_csv(DRAFTED_DECKS_PATH)
    cube_mainboard = pd.read_csv(CUBE_MAINBOARD_PATH)
    # Union of the non-null IDs from both CSVs.
    drafted_ids = (
        set(drafted_decks["scryfallId"].dropna().unique())
        | set(cube_mainboard["scryfallId"].dropna().unique())
    )
    print(f"Found {len(drafted_ids)} unique card IDs.")

    # Check remote Scryfall metadata
    print("Fetching Scryfall bulk metadata...")
    bulk_info = get_bulk_info()
    remote_updated_at = bulk_info["updated_at"]

    # Compare against cache
    cached_timestamp = load_cached_timestamp()
    cached_ids = load_cached_ids()
    ids_changed = drafted_ids != cached_ids
    scryfall_updated = remote_updated_at != cached_timestamp
    output_missing = not os.path.exists(OUTPUT_FILE)

    if not output_missing and not scryfall_updated and not ids_changed:
        print(f"Scryfall data is current ({remote_updated_at}) and cube is unchanged. Skipping download.")
    else:
        # Report only the first applicable reason; the download is the same
        # in every case.
        if output_missing:
            print("No local card data found, downloading...")
        elif scryfall_updated:
            print(f"Scryfall updated ({cached_timestamp} -> {remote_updated_at}), downloading...")
        elif ids_changed:
            new_ids = drafted_ids - cached_ids
            print(f"Cube has {len(new_ids)} new card(s), downloading...")

        download_and_filter_bulk(bulk_info["download_uri"], drafted_ids, OUTPUT_FILE)
        # Update the cache markers only after the download succeeded, so a
        # failed run is retried next time.
        save_cached_timestamp(remote_updated_at)
        save_cached_ids(drafted_ids)

Metadata

Metadata

Assignees

Labels

No labels
No labels

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions