-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetdata.py
More file actions
104 lines (88 loc) · 3.58 KB
/
Copy pathgetdata.py
File metadata and controls
104 lines (88 loc) · 3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import asyncio
import os
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout
# Constants
# Season identifiers that get interpolated into the league URLs.
SEASONS = [*range(2016, 2025)]
# Root output folder; scraped pages are stored in two sub-folders below it.
DATA_DIR = "data"
STANDINGS_DIR, SCORES_DIR = (
    os.path.join(DATA_DIR, sub_folder) for sub_folder in ("standings", "scores")
)
# Helper function to fetch HTML content
async def fetch_html_content(url, selector, sleep=9, retries=12):
    """
    Fetch the inner HTML of one element on a page using Playwright.

    Every attempt starts Playwright, launches a Chromium browser, navigates
    to ``url`` and reads the inner HTML of the element matching ``selector``.
    Before attempt number ``k`` (1-based) the coroutine waits ``sleep * k``
    seconds, so the pause grows linearly with each retry and the very first
    request is delayed as well.

    Args:
        url: Address of the page to load.
        selector: Selector of the element whose inner HTML is returned.
        sleep: Base delay in seconds; multiplied by the attempt number.
        retries: Maximum number of attempts.

    Returns:
        The element's inner HTML as a string, or ``None`` when every attempt
        timed out (or when ``retries`` is less than 1).
    """
    html = None
    for attempt in range(1, retries + 1):
        # Wait without blocking the event loop: time.sleep() here would
        # freeze every other coroutine for the whole back-off period.
        await asyncio.sleep(sleep * attempt)
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                try:
                    page = await browser.new_page()
                    await page.goto(url)
                    print(f"Fetching {url}: {await page.title()}")
                    html = await page.inner_html(selector)
                finally:
                    # Release the browser on success and on failure alike.
                    await browser.close()
        except PlaywrightTimeout:
            # Only timeouts are retried; any other error propagates.
            print(f"Timeout Error on attempt {attempt} for {url}")
            continue
        else:
            # The attempt succeeded, so no further retries are needed.
            break
    return html
# Function to scrape standings for a given season
async def scrape_season_data(season):
    """
    Download the pages linked from a season's games index and store them.

    The season's ``NBA_{season}_games.html`` page is loaded and every link
    found inside its ``#content .filter`` element is followed. For each
    linked page the ``#all_schedule`` element's HTML is written to
    ``STANDINGS_DIR`` under the last path segment of the page URL. Pages
    whose target file already exists are skipped, so an interrupted run can
    be resumed.

    Args:
        season: Season identifier interpolated into the games index URL.

    Returns:
        None. If the index page cannot be retrieved, a message is printed
        and nothing is written.
    """
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_games.html"
    html = await fetch_html_content(url, "#content .filter")
    if not html:
        print(f"Failed to retrieve season data for {season}")
        return

    soup = BeautifulSoup(html, 'html.parser')
    # Use .get() and drop empty results: an anchor without an href has no
    # target to follow and would raise KeyError with link['href'].
    hrefs = (link.get("href") for link in soup.find_all("a"))
    standings_pages = [
        f"https://www.basketball-reference.com{href}" for href in hrefs if href
    ]

    for page_url in standings_pages:
        save_path = os.path.join(STANDINGS_DIR, page_url.split("/")[-1])
        if os.path.exists(save_path):
            print(f"File already exists: {save_path}, skipping.")
            continue
        page_html = await fetch_html_content(page_url, "#all_schedule")
        # A falsy result means the fetch failed; do not create an empty file,
        # so the page is retried on the next run.
        if page_html:
            with open(save_path, "w+", encoding='utf-8') as file:
                file.write(page_html)
# Function to scrape game data from standings files
async def scrape_game_data(standings_file):
    """
    Download every box score linked from one saved standings file.

    The file is parsed for anchors whose href contains "boxscore". Each
    such page is fetched and its ``#content`` HTML is written to
    ``SCORES_DIR`` under the last path segment of its URL. Box scores
    already present on disk are skipped.
    """
    with open(standings_file, 'r', encoding='utf-8') as handle:
        markup = handle.read()
    parsed = BeautifulSoup(markup, 'html.parser')

    # Collect the absolute URLs of all box score links first.
    box_score_urls = []
    for anchor in parsed.find_all("a"):
        href = anchor.get('href')
        if not href or "boxscore" not in href:
            continue
        box_score_urls.append(f"https://www.basketball-reference.com{href}")

    for game_url in box_score_urls:
        target = os.path.join(SCORES_DIR, game_url.split("/")[-1])
        if os.path.exists(target):
            print(f"File already exists: {target}, skipping.")
        else:
            content = await fetch_html_content(game_url, "#content")
            # Nothing is written when the fetch produced no HTML.
            if content:
                with open(target, "w+", encoding='utf-8') as out:
                    out.write(content)
# Main function to orchestrate scraping tasks
async def main():
    """
    Run the full scrape: season pages first, then the games they link to.
    """
    # Make sure both output folders exist before anything is written.
    for directory in (STANDINGS_DIR, SCORES_DIR):
        os.makedirs(directory, exist_ok=True)

    # Phase 1: store the standings pages of every season.
    for season in SEASONS:
        await scrape_season_data(season)

    # Phase 2: walk the stored standings files season by season and fetch
    # the games each one references.
    saved_pages = os.listdir(STANDINGS_DIR)
    for season in SEASONS:
        season_tag = str(season)
        for page_name in saved_pages:
            if season_tag not in page_name:
                continue
            await scrape_game_data(os.path.join(STANDINGS_DIR, page_name))
# Run the main function
if __name__ == "__main__":
    # Only start scraping when executed as a script, not when imported.
    import asyncio

    scrape_job = main()
    asyncio.run(scrape_job)