diff --git a/.gitignore b/.gitignore
index be2baa1..ffbec06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,3 +103,12 @@ venv.bak/
# mypy
.mypy_cache/
.vscode/
+
+# csv
+*.csv
+
+# ipynb
+*.ipynb
+
+# ignore this folder
+testing/
\ No newline at end of file
diff --git a/README.md b/README.md
index e557119..c7d9d6f 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,41 @@
-vgchartzfull is a python script based on BeautifulSoup.
+
+vgchartzfull is a Python script that uses BeautifulSoup and multiprocessing.
+Proxy support is built into the script; it can be disabled by setting `proxy_enabled` to False.
+
It creates a dataset based on data from
http://www.vgchartz.com/gamedb/
-The dataset is saved as vgsales.csv.
+The dataset is saved as vgsales-%Y-%m-%d_%H_%M_%S.csv.
+
+You will need to have the following dependencies installed:
+```
+BeautifulSoup4
+pandas
+numpy
+requests
+unidecode
+user_agent
+```
+
+Thanks to:
+- https://www.kdnuggets.com/2018/02/web-scraping-tutorial-python.html
+- http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments
+- https://medium.com/datadriveninvestor/speed-up-web-scraping-using-multiprocessing-in-python-af434ff310c5
-You will need to have BeautifulSoup added.
-It can be installed by pip.
-sudo pip install BeautifulSoup
+Free proxies:
+[1](https://proxyscrape.com/free-proxy-list)
+[2](http://multiproxy.org/txt_all/proxy.txt)
+[3](https://proxy.rudnkh.me/txt)
+[4](https://www.us-proxy.org/)
-Thanks to Chris Albon.
-http://chrisalbon.com/python/beautiful_soup_scrape_table.html
+- [x] added multiprocessing for faster results, with a maximum of 24 workers.
+- [x] added proxies to avoid being blocked
+- [x] handle a couple of exceptions
+- [x] scraped data gets saved before an unexpected error is re-raised
+- [x] add the option to continue where we left off after an unexpected error
+- [x] the clean version removes the print statements, which should result in better performance!
+- [ ] optimize it
+- [ ] create a log file
+- [ ] convert the script to a class or use scrapy, reference
+ - https://edmundmartin.com/multi-threaded-crawler-in-python/
\ No newline at end of file
diff --git a/clean-vgchartzfull.py b/clean-vgchartzfull.py
new file mode 100644
index 0000000..39cf3ed
--- /dev/null
+++ b/clean-vgchartzfull.py
@@ -0,0 +1,273 @@
+from bs4 import BeautifulSoup, element
+import pandas as pd
+import numpy as np
+import requests
+import time
+import unidecode
+from user_agent import generate_user_agent
+from proxies_gen import get_proxies, test_proxies
+from itertools import cycle
+from lxml.html import fromstring
+from multiprocessing import Pool, cpu_count # Pool runs its workers in separate processes
+from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException
+from urllib3.exceptions import ProtocolError
+import sys
+import os
+sys.setrecursionlimit(10000) # need to optimize code.
+proxy_enabled = True
+
+
+def _to_float(cell, drop_last=False):
+    """convert a table cell to float, np.nan when the cell reads N/A
+
+    parameters:
+        cell: a <td> tag holding a single text string
+        drop_last: drop the last character before converting, as done
+            for the shipped/sales cells (presumably a unit suffix)
+    """
+    text = cell.string
+    if text.startswith("N/A"):
+        return np.nan
+    return float(text[:-1] if drop_last else text)
+
+
+def _to_year(cell):
+    """expand the two-digit year ending a date cell, 'N/A' when unknown"""
+    year = cell.string.split()[-1]
+    if year.startswith('N/A'):
+        return 'N/A'
+    # two-digit years from 70 upwards are read as 19xx, the others as 20xx
+    century = "19" if int(year) >= 70 else "20"
+    return np.int32(century + year)
+
+
+def parse_games(game_tags):
+    """
+    parse the games table on current page and add one row per game to
+    the module-level dataframe
+    parameters:
+        game_tags: the <a> tags of the games found on the html page
+    side effects:
+        rebinds the global df with the new rows appended and adds the
+        number of parsed games to the global rec_count; tags whose table
+        row has no <td> cells are skipped
+    """
+    global rec_count
+    global df
+    # (column, cell position) of the cells parsed with the last char dropped
+    sales_cells = (
+        ("Total_Shipped", 9), ("Global_Sales", 10), ("NA_Sales", 11),
+        ("PAL_Sales", 12), ("JP_Sales", 13), ("Other_Sales", 14),
+    )
+    games = []
+    for tag in game_tags:
+        game = {"Name": " ".join(tag.string.split())}
+        data = tag.parent.parent.find_all("td")
+        if not data:
+            continue
+        game["Rank"] = np.int32(data[0].string)
+        game["img_url"] = data[1].a.img.get('src')
+        game["url"] = data[2].a.get('href')
+        if "/" in game["Name"]:
+            # replace accented chars with ascii
+            game["basename"] = unidecode.unidecode(
+                game['Name'].strip().split('/')[0].strip().replace(' ', '-'))
+        else:
+            game["basename"] = game["url"].rsplit('/', 2)[1]
+        game["Platform"] = data[3].img.get('alt')
+        game["Publisher"] = data[4].get_text().strip()
+        game["Developer"] = data[5].get_text().strip()
+        # same spelling as the column declared for the dataframe, so the
+        # score fills that column instead of creating a second one
+        game["VGChartz_Score"] = data[6].get_text().strip()
+        game["Critic_Score"] = _to_float(data[7])
+        game["User_Score"] = _to_float(data[8])
+        for column, position in sales_cells:
+            game[column] = _to_float(data[position], drop_last=True)
+        game["Year"] = _to_year(data[15])
+        game["Last_Update"] = data[16].get_text().strip()
+        # filled in later from the game's own page
+        game['Genre'] = 'N/A'
+        game['ESRB_Rating'] = 'N/A'
+        game['status'] = 0
+        games.append(game)
+
+    if games:
+        # one concat per page instead of one dataframe copy per game
+        df = pd.concat([df, pd.DataFrame(games)], ignore_index=True)
+        rec_count += len(games)
+
+
+def parse_genre_esrb(df):
+    """loads every game's url to get genre and esrb rating
+
+    parameters:
+        df: dataframe of games with the 'url', 'Genre', 'ESRB_Rating'
+            and 'status' columns
+    returns:
+        the same dataframe, modified in place: rows whose page was
+        scraped get status 1, every other row is left as it was so
+        that it can be retried
+    """
+    headers = {'User-Agent': generate_user_agent(
+        device_type='desktop', os=('mac', 'linux'))}
+    proxies = None
+    proxy = None
+    if proxy_enabled:
+        # get_proxies downloads a proxy list and keeps those that
+        # answer a test request
+        proxies = cycle(get_proxies(5))
+        proxy = next(proxies, None)
+        if proxy is None:
+            # no working proxy right now: leave the rows untouched so
+            # they are retried, instead of crashing the worker
+            return df
+
+    for index in df.index:
+        # rebuilt on every pass because the proxy may have been rotated
+        request_proxies = {"http": proxy, "https": proxy} if proxy else None
+        try:
+            game_page = requests.get(
+                df.at[index, 'url'], headers=headers,
+                proxies=request_proxies, timeout=5)
+            if game_page.status_code == 200:
+                sub_soup = BeautifulSoup(game_page.text, "lxml")
+                # again, the info box is inconsistent among games so we
+                # have to find all the h2 and traverse from that to the genre
+                gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"})
+                genre_headers = [
+                    h2 for h2 in gamebox.find_all('h2') if h2.string == 'Genre']
+                # a box without a Genre header raises IndexError here; the
+                # generic handler below then skips the row (status stays 0)
+                df.loc[index, 'Genre'] = genre_headers[-1].next_sibling.string
+
+                # find the ESRB rating
+                game_rating = gamebox.find('img').get('src')
+                if 'esrb' in game_rating:
+                    df.loc[index, 'ESRB_Rating'] = game_rating.split(
+                        '_')[1].split('.')[0].upper()
+                # we successfully got the genre and rating
+                df.loc[index, 'status'] = 1
+
+        except ProxyError:
+            # the current proxy failed: switch to the next one, the row
+            # keeps status 0 and is retried later
+            if proxies is not None:
+                proxy = next(proxies)
+
+        except (ConnectionError, Timeout, ProtocolError, TimeoutError):
+            continue
+
+        except Exception:
+            # best effort: any other failure only skips this game
+            continue
+        # wait for 1 second between every call,
+        # we do not want to get blocked or abuse the server
+        time.sleep(1)
+    return df
+
+
+def retry_game(df):
+    """scrape genre and rating for the given games and return the result
+
+    thin wrapper around parse_genre_esrb; this is the function that
+    process_games maps over the worker pool
+    """
+    updated = parse_genre_esrb(df)
+    return updated
+
+
+if __name__ == "__main__":
+    def process_games(df):
+        """scrape genre and rating of every game with status 0, in parallel
+
+        parameters:
+            df: dataframe of games with a 'status' column (0 = not scraped)
+        returns:
+            the rows that already had status 1 followed by the rows
+            returned by the workers; df itself when no row has status 0
+            or when the results could not be joined
+        """
+        pending = df[df['status'] == 0]
+        if pending.empty:
+            # nothing left to scrape: do not start a pool for zero rows
+            return df
+        # at most two workers per cpu and never more workers than rows,
+        # so that no worker receives an empty subset
+        num_workers = min(cpu_count() * 2, len(pending))
+        chunks = np.array_split(np.arange(len(pending)), num_workers)
+        df_subsets = [pending.iloc[chunk] for chunk in chunks]
+        # the with block releases the workers even if map raises
+        with Pool(processes=num_workers) as pool:
+            results = pool.map(retry_game, df_subsets)
+        try:
+            df_updated = pd.concat(results)
+            df = pd.concat([df[df['status'] == 1], df_updated])
+        except Exception as err:
+            print('error occurred while joining dataframe')
+            print("Exception: ", err)
+        return df
+
+ # set-up: record counter, timers and crash-recovery state
+ rec_count = 0
+ start_time = time.time()
+ # NOTE(review): current_time is not read anywhere in the visible code
+ current_time = time.time()
+ # the crash handlers below save the dataframe as crashed_tag + csvfilename;
+ # any entry of the working directory containing the tag counts as such a dump
+ crashed_tag = 'before_crashing_'
+ exists = [s for s in os.listdir() if crashed_tag in s]
+ if exists:
+ print("found a data saved from a crash, will continue on it")
+ csvfilename = exists[0].replace(crashed_tag, '')
+ df = pd.read_csv(exists[0])
+ # resume page derived from the highest saved Rank, at 1000 results per page
+ rec_count = df['Rank'].max()
+ page = int(rec_count/1000) + 1 # because we already scraped current
+ df = process_games(df)
+ else:
+ csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv"
+
+ # initialize a panda dataframe to store all games with the following columns:
+ # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer,
+ # publisher, release year, critic score, user score, na sales, pal sales,
+ # jp sales, other sales, total sales, total shipped, last update, url, status
+ # last two columns for debugging
+ # NOTE(review): 'img_url' is not in the list below, and parse_games stores the
+ # score under 'Vgchartzscore' rather than 'VGChartz_Score' - confirm intent
+ if not exists:
+ df = pd.DataFrame(columns=[
+ 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher',
+ 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score',
+ 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales',
+ 'Other_Sales', 'Year', 'Last_Update', 'url', 'status'])
+
+ # listing url = urlhead + page number + urltail (1000 results per page)
+ urlhead = 'http://www.vgchartz.com/games/games.php?page='
+ urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both'
+ urltail += '&banner=Both&showdeleted=&region=All&goty_year=&developer='
+ urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1'
+ urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1'
+ urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1'
+
+ # get the number of pages
+ # the count is read from the text in parentheses of the first th[colspan=3]
+ # cell, keeping what precedes the first comma
+ # NOTE(review): presumably a thousands-separated total, which at 1000
+ # results per page gives the page count - confirm
+ vglink = requests.get('http://www.vgchartz.com/gamedb/').text
+ x = fromstring(vglink).xpath(
+ "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0]
+ pages = int(x.split(',')[0])
+
+ # when resuming from a crash dump, page was already set above
+ if not exists: page = 1
+ while True:
+ if page > pages:
+ break
+ try:
+ # a fresh tested proxy and user agent for every listing page
+ # NOTE(review): an empty get_proxies result raises IndexError here, which
+ # is not in the network handler's list and so takes the save-and-raise branch
+ proxy = get_proxies(1)[0]
+ headers = {'User-Agent': generate_user_agent(
+ device_type='desktop', os=('mac', 'linux'))}
+ surl = urlhead + str(page) + urltail
+ r = requests.get(surl, headers=headers, proxies={
+ 'http': proxy, 'https': proxy}, timeout=10)
+ if r.status_code == 200:
+ soup = BeautifulSoup(r.text, 'lxml')
+ print("******Scraping page " + str(page) + "******'\n")
+
+ # vgchartz website is really weird so we have to search for
+ # tags with game urls
+ # NOTE(review): x.attrs['href'] raises KeyError for an <a> without href
+ game_tags = list(filter(
+ lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:]
+ # discard the first 10 elements because those
+ # links are in the navigation bar
+
+ parse_games(game_tags)
+ page += 1
+ print('\n******begin scraping for Genre and Rating******\n')
+ df = process_games(df)
+
+ # network errors: wait and run the loop again
+ except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError, TimeoutError):
+ print('Something went wrong while connecting to page: ',
+ page, ', will try again later')
+ #proxy = get_proxies(1)
+ time.sleep(10)
+
+ # anything else: dump what was scraped so far, then re-raise
+ except Exception as e:
+ print("something went wrong! We're on page: " +
+ str(page) + '\nSaving successfully crawled data')
+ print("Exception: ", e)
+ df.to_csv(crashed_tag + csvfilename, sep=",",
+ encoding='utf-8', index=False)
+ raise e
+
+
+ # second phase: keep retrying the games whose status is still 0
+ failed_games = len(df[df['status'] == 0])
+ print("******Finished scraping games, will try to scrape missing data******")
+ # 36 hours max, should be enough to scrape everything
+ t_end = start_time + 60 * 60 * 36
+ while True:
+ try:
+ df = process_games(df)
+ failed_games = len(df[df['status'] == 0])
+ # stop once every game is scraped or the deadline has passed
+ if failed_games == 0 or time.time() > t_end:
+ break
+ #print('Number of not scraped yet:', failed_games, '\n')
+ time.sleep(10) # wait for 10 seconds for the server to recover?
+ # dump what was scraped so far, then re-raise
+ except Exception as e:
+ print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data')
+ print("Exception: ", e)
+ df.to_csv(crashed_tag + csvfilename, sep=",",
+ encoding='utf-8', index=False)
+ raise e
+
+ elapsed_time = time.time() - start_time
+ print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.")
+
+ # the full dataframe goes to complete-vgchartz.csv,
+ # the column subset below goes to csvfilename
+ # select only these columns in the final dataset
+ df = df.sort_values(by=['Rank'])
+ df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False)
+ df_final = df[[
+ 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating',
+ 'Publisher', 'Developer', 'Critic_Score', 'User_Score',
+ 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']]
+
+ df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False)
+ print("Wrote scraper data to", csvfilename)
diff --git a/proxies_gen.py b/proxies_gen.py
new file mode 100644
index 0000000..6c5a54e
--- /dev/null
+++ b/proxies_gen.py
@@ -0,0 +1,60 @@
+from lxml.html import fromstring
+import requests
+import numpy as np
+from itertools import cycle
+
+
+def get_proxies(num=None):
+    """Download a list of free HTTP proxies and return the working ones.
+
+    parameters:
+        num: maximum number of working proxies to return; None means
+            as many as there are proxies in the downloaded list
+    returns:
+        list of the proxy strings accepted by test_proxies (may be empty)
+    """
+    link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100"
+    proxies = requests.get(link).text.split()
+    if not proxies:
+        # nothing listed with uptime=100: ask again with uptime=99
+        proxies = requests.get(
+            link[:-3]+'99').text.split()  # change uptime to 99
+    # shuffle so that repeated calls do not always test the same proxies first
+    np.random.shuffle(proxies)
+
+    if num is None:
+        num = len(proxies)
+    return test_proxies(proxies, num)
+
+
+def test_proxies(proxies, num):
+    """Return up to num proxies that complete a test request.
+
+    parameters:
+        proxies: candidate proxy strings, tried in the given order
+        num: stop as soon as this many working proxies were found;
+            0 returns an empty list without any request
+    returns:
+        list of the proxies through which https://httpbin.org/ip
+        answered within the one second timeout
+    """
+    url = 'https://httpbin.org/ip'
+    working_proxies = []
+    # every candidate is tried once, the last one and a lone one included
+    for proxy in proxies:
+        if len(working_proxies) == num:
+            break
+        try:
+            requests.get(
+                url, proxies={"http": proxy, "https": proxy}, timeout=1)
+        except Exception:
+            # Most free proxies fail to connect: skip to the next one.
+            # KeyboardInterrupt is deliberately not swallowed.
+            continue
+        working_proxies.append(proxy)
+    return working_proxies
+
+
+# proxies = get_proxies(5)
+# # with open('proxies.txt') as f:
+# # proxies = f.read().splitlines()
+# # test_proxies(proxies, 10)
+# print(proxies)
diff --git a/vgchartzfull.py b/vgchartzfull.py
index b1d75a4..7938e72 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -1,130 +1,270 @@
from bs4 import BeautifulSoup, element
-import urllib
import pandas as pd
import numpy as np
+import requests
+import time
+import unidecode
+from user_agent import generate_user_agent
+from proxies_gen import get_proxies, test_proxies
+from itertools import cycle
+from lxml.html import fromstring
+from multiprocessing import Pool, cpu_count # Pool runs its workers in separate processes
+from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException
+from urllib3.exceptions import ProtocolError
+import sys
+import os
+sys.setrecursionlimit(10000) # need to optimize code.
+proxy_enabled = True
-pages = 19
-rec_count = 0
-rank = []
-gname = []
-platform = []
-year = []
-genre = []
-critic_score = []
-user_score = []
-publisher = []
-developer = []
-sales_na = []
-sales_pal = []
-sales_jp = []
-sales_ot = []
-sales_gl = []
-
-urlhead = 'http://www.vgchartz.com/gamedb/?page='
-urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
-urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
-urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
-urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
-urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-
-for page in range(1, pages):
- surl = urlhead + str(page) + urltail
- r = urllib.request.urlopen(surl).read()
- soup = BeautifulSoup(r)
- print(f"Page: {page}")
-
- # vgchartz website is really weird so we have to search for
- # tags with game urls
- game_tags = list(filter(
- lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
- # discard the first 10 elements because those
- # links are in the navigation bar
- soup.find_all("a")
- ))[10:]
+def parse_games(game_tags):
+ """
+ parse the games table on current page
+ parameters:
+ game_tags: games tags after reading the html page
+ globals:
+ df: the dataframe where we will store the games (rebound here)
+ rec_count: running count of parsed games
+ """
+ global rec_count
+ global df
for tag in game_tags:
+ game = {}
+ game["Name"] = " ".join(tag.string.split())
+ print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name']))
- # add name to list
- gname.append(" ".join(tag.string.split()))
- print(f"{rec_count + 1} Fetch data for game {gname[-1]}")
-
- # get different attributes
- # traverse up the DOM tree
data = tag.parent.parent.find_all("td")
- rank.append(np.int32(data[0].string))
- platform.append(data[3].find('img').attrs['alt'])
- publisher.append(data[4].string)
- developer.append(data[5].string)
- critic_score.append(
- float(data[6].string) if
- not data[6].string.startswith("N/A") else np.nan)
- user_score.append(
- float(data[7].string) if
- not data[7].string.startswith("N/A") else np.nan)
- sales_na.append(
- float(data[9].string[:-1]) if
- not data[9].string.startswith("N/A") else np.nan)
- sales_pal.append(
- float(data[10].string[:-1]) if
- not data[10].string.startswith("N/A") else np.nan)
- sales_jp.append(
- float(data[11].string[:-1]) if
- not data[11].string.startswith("N/A") else np.nan)
- sales_ot.append(
- float(data[12].string[:-1]) if
- not data[12].string.startswith("N/A") else np.nan)
- sales_gl.append(
- float(data[8].string[:-1]) if
- not data[8].string.startswith("N/A") else np.nan)
- release_year = data[13].string.split()[-1]
- # different format for year
- if release_year.startswith('N/A'):
- year.append('N/A')
- else:
- if int(release_year) >= 80:
- year_to_add = np.int32("19" + release_year)
+ if data:
+ game["Rank"] = np.int32(data[0].string)
+ game["img_url"] = data[1].a.img.get('src')
+ game["url"] = data[2].a.get('href')
+ if len(game["Name"].split("/")) > 1:
+ # replace accented chars with ascii
+ game["basename"] = unidecode.unidecode(
+ game['Name'].strip().split('/')[0].strip().replace(' ', '-'))
else:
- year_to_add = np.int32("20" + release_year)
- year.append(year_to_add)
-
- # go to every individual website to get genre info
- url_to_game = tag.attrs['href']
- site_raw = urllib.request.urlopen(url_to_game).read()
- sub_soup = BeautifulSoup(site_raw, "html.parser")
- # again, the info box is inconsistent among games so we
- # have to find all the h2 and traverse from that to the genre name
- h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
- # make a temporary tag here to search for the one that contains
- # the word "Genre"
- temp_tag = element.Tag
- for h2 in h2s:
- if h2.string == 'Genre':
- temp_tag = h2
- genre.append(temp_tag.next_sibling.string)
-
+ game["basename"] = game["url"].rsplit('/', 2)[1]
+ game["Platform"] = data[3].img.get('alt')
+ game["Publisher"] = data[4].get_text().strip()
+ game["Developer"] = data[5].get_text().strip()
+ # NOTE(review): the dataframe created in __main__ declares this column as
+ # 'VGChartz_Score'; this key adds a separate 'Vgchartzscore' column - confirm
+ game["Vgchartzscore"] = data[6].get_text().strip()
+ # scores are plain numbers; the shipped/sales cells lose their last
+ # character before conversion (presumably a unit suffix); N/A becomes NaN
+ game["Critic_Score"] = float(
+ data[7].string) if not data[7].string.startswith("N/A") else np.nan
+ game["User_Score"] = float(
+ data[8].string) if not data[8].string.startswith("N/A") else np.nan
+ game["Total_Shipped"] = float(
+ data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan
+ game["Global_Sales"] = float(
+ data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan
+ game["NA_Sales"] = float(
+ data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan
+ game["PAL_Sales"] = float(
+ data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan
+ game["JP_Sales"] = float(
+ data[13].string[:-1]) if not data[13].string.startswith("N/A") else np.nan
+ game["Other_Sales"] = float(
+ data[14].string[:-1]) if not data[14].string.startswith("N/A") else np.nan
+ # two-digit years from 70 upwards are read as 19xx, the others as 20xx
+ year = data[15].string.split()[-1]
+ if year.startswith('N/A'):
+ game["Year"] = 'N/A'
+ else:
+ if int(year) >= 70:
+ year_to_add = np.int32("19" + year)
+ else:
+ year_to_add = np.int32("20" + year)
+ game["Year"] = year_to_add
+ game["Last_Update"] = data[16].get_text().strip()
+ # placeholders, filled in later by parse_genre_esrb
+ game['Genre'] = 'N/A'
+ game['ESRB_Rating'] = 'N/A'
+ game['status'] = 0
+ # NOTE(review): DataFrame.append copies the frame on every call and was
+ # removed in pandas 2.0
+ df = df.append(game, ignore_index=True)
rec_count += 1
-columns = {
- 'Rank': rank,
- 'Name': gname,
- 'Platform': platform,
- 'Year': year,
- 'Genre': genre,
- 'Critic_Score': critic_score,
- 'User_Score': user_score,
- 'Publisher': publisher,
- 'Developer': developer,
- 'NA_Sales': sales_na,
- 'PAL_Sales': sales_pal,
- 'JP_Sales': sales_jp,
- 'Other_Sales': sales_ot,
- 'Global_Sales': sales_gl
-}
-print(rec_count)
-df = pd.DataFrame(columns)
-print(df.columns)
-df = df[[
- 'Rank', 'Name', 'Platform', 'Year', 'Genre',
- 'Publisher', 'Developer', 'Critic_Score', 'User_Score',
- 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
-df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)
+
+def parse_genre_esrb(df):
+    """loads every game's url to get genre and esrb rating
+
+    parameters:
+        df: dataframe of games with the 'Name', 'url', 'Genre',
+            'ESRB_Rating' and 'status' columns
+    returns:
+        the same dataframe, modified in place: rows whose page was
+        scraped get status 1, every other row is left as it was so
+        that it can be retried
+    """
+    headers = {'User-Agent': generate_user_agent(
+        device_type='desktop', os=('mac', 'linux'))}
+    proxies = None
+    proxy = None
+    if proxy_enabled:
+        print("\n******getting list of proxies and testing them******'\n")
+        # get_proxies downloads a proxy list and keeps those that
+        # answer a test request
+        proxies = cycle(get_proxies(5))
+        proxy = next(proxies, None)
+        if proxy is None:
+            # no working proxy right now: leave the rows untouched so
+            # they are retried, instead of crashing the worker
+            return df
+
+    for index in df.index:
+        # rebuilt on every pass because the proxy may have been rotated
+        request_proxies = {"http": proxy, "https": proxy} if proxy else None
+        try:
+            game_page = requests.get(
+                df.at[index, 'url'], headers=headers,
+                proxies=request_proxies, timeout=5)
+            if game_page.status_code == 200:
+                sub_soup = BeautifulSoup(game_page.text, "lxml")
+                # again, the info box is inconsistent among games so we
+                # have to find all the h2 and traverse from that to the genre
+                gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"})
+                genre_headers = [
+                    h2 for h2 in gamebox.find_all('h2') if h2.string == 'Genre']
+                # a box without a Genre header raises IndexError here; the
+                # generic handler below reports it and status stays 0
+                df.loc[index, 'Genre'] = genre_headers[-1].next_sibling.string
+
+                # find the ESRB rating
+                game_rating = gamebox.find('img').get('src')
+                if 'esrb' in game_rating:
+                    df.loc[index, 'ESRB_Rating'] = game_rating.split(
+                        '_')[1].split('.')[0].upper()
+                # we successfully got the genre and rating
+                df.loc[index, 'status'] = 1
+                print('Successfully scraped genre and rating for :', df.at[index, 'Name'])
+
+        except ProxyError:
+            # the current proxy failed: switch to the next one, the row
+            # keeps status 0 and is retried later
+            if proxies is not None:
+                proxy = next(proxies)
+
+        except (ConnectionError, Timeout, ProtocolError):
+            print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later')
+
+        except Exception:
+            # best effort: any other failure only skips this game
+            print('different error occurred while connecting, will pass')
+        # wait for 1 second between every call,
+        # we do not want to get blocked or abuse the server
+        time.sleep(1)
+    return df
+
+
+def retry_game(df):
+    """scrape genre and rating for the given games and return the result
+
+    thin wrapper around parse_genre_esrb; this is the function that
+    process_games maps over the worker pool
+    """
+    updated = parse_genre_esrb(df)
+    return updated
+
+
+if __name__ == "__main__":
+    def process_games(df):
+        """scrape genre and rating of every game with status 0, in parallel
+
+        parameters:
+            df: dataframe of games with a 'status' column (0 = not scraped)
+        returns:
+            the rows that already had status 1 followed by the rows
+            returned by the workers; df itself when no row has status 0
+            or when the results could not be joined
+        """
+        pending = df[df['status'] == 0]
+        if pending.empty:
+            # nothing left to scrape; Pool(processes=0) would raise ValueError
+            return df
+        # at most two workers per cpu and never more workers than rows,
+        # so that no worker receives an empty subset
+        num_workers = min(cpu_count() * 2, len(pending))
+        chunks = np.array_split(np.arange(len(pending)), num_workers)
+        df_subsets = [pending.iloc[chunk] for chunk in chunks]
+        # the with block releases the workers even if map raises
+        with Pool(processes=num_workers) as pool:
+            results = pool.map(retry_game, df_subsets)
+        try:
+            df_updated = pd.concat(results)
+            df = pd.concat([df[df['status'] == 1], df_updated])
+        except Exception as err:
+            print('error occurred while joining dataframe')
+            print("Exception: ", err)
+        return df
+
+ # set-up: record counter, timers and crash-recovery state
+ rec_count = 0
+ start_time = time.time()
+ # NOTE(review): current_time is not read anywhere in the visible code
+ current_time = time.time()
+ # the crash handlers below save the dataframe as crashed_tag + csvfilename;
+ # any entry of the working directory containing the tag counts as such a dump
+ crashed_tag = 'before_crashing_'
+ exists = [s for s in os.listdir() if crashed_tag in s]
+ if exists:
+ print("found a data saved from a crash, will continue on it")
+ csvfilename = exists[0].replace(crashed_tag, '')
+ df = pd.read_csv(exists[0])
+ # resume page derived from the highest saved Rank, at 1000 results per page
+ rec_count = df['Rank'].max()
+ page = int(rec_count/1000) + 1 # because we already scraped current
+ df = process_games(df)
+ else:
+ csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv"
+
+ # initialize a panda dataframe to store all games with the following columns:
+ # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer,
+ # publisher, release year, critic score, user score, na sales, pal sales,
+ # jp sales, other sales, total sales, total shipped, last update, url, status
+ # last two columns for debugging
+ # NOTE(review): 'img_url' is not in the list below, and parse_games stores the
+ # score under 'Vgchartzscore' rather than 'VGChartz_Score' - confirm intent
+ if not exists:
+ df = pd.DataFrame(columns=[
+ 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher',
+ 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score',
+ 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales',
+ 'Other_Sales', 'Year', 'Last_Update', 'url', 'status'])
+
+ # listing url = urlhead + page number + urltail (1000 results per page)
+ urlhead = 'http://www.vgchartz.com/games/games.php?page='
+ urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both'
+ urltail += '&banner=Both&showdeleted=&region=All&goty_year=&developer='
+ urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1'
+ urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1'
+ urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1'
+
+ # get the number of pages
+ # the count is read from the text in parentheses of the first th[colspan=3]
+ # cell, keeping what precedes the first comma
+ # NOTE(review): presumably a thousands-separated total, which at 1000
+ # results per page gives the page count - confirm
+ vglink = requests.get('http://www.vgchartz.com/gamedb/').text
+ x = fromstring(vglink).xpath(
+ "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0]
+ pages = int(x.split(',')[0])
+
+ # when resuming from a crash dump, page was already set above
+ if not exists: page = 1
+ while True:
+ if page > pages:
+ break
+ try:
+ # a fresh tested proxy and user agent for every listing page
+ # NOTE(review): an empty get_proxies result raises IndexError here, which
+ # is not in the network handler's list and so takes the save-and-raise branch
+ proxy = get_proxies(1)[0]
+ headers = {'User-Agent': generate_user_agent(
+ device_type='desktop', os=('mac', 'linux'))}
+ surl = urlhead + str(page) + urltail
+ r = requests.get(surl, headers=headers, proxies={
+ 'http': proxy, 'https': proxy}, timeout=10)
+ if r.status_code == 200:
+ soup = BeautifulSoup(r.text, 'lxml')
+ print("******Scraping page " + str(page) + "******'\n")
+
+ # vgchartz website is really weird so we have to search for
+ # tags with game urls
+ # NOTE(review): x.attrs['href'] raises KeyError for an <a> without href
+ game_tags = list(filter(
+ lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:]
+ # discard the first 10 elements because those
+ # links are in the navigation bar
+
+ parse_games(game_tags)
+ print('\n******begin scraping for Genre and Rating******\n')
+ df = process_games(df)
+ page += 1
+
+ # network errors: wait and run the loop again
+ except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError):
+ print('Something went wrong while connecting to page: ',
+ page, ', will try again later')
+ #proxy = get_proxies(1)
+ time.sleep(10)
+
+ # anything else: dump what was scraped so far, then re-raise
+ except Exception as e:
+ print("something went wrong! We're on page: " +
+ str(page) + '\nSaving successfully crawled data')
+ print("Exception: ", e)
+ df.to_csv(crashed_tag + csvfilename, sep=",",
+ encoding='utf-8', index=False)
+ raise e
+
+
+ # second phase: keep retrying the games whose status is still 0
+ failed_games = len(df[df['status'] == 0])
+ print("******Finished scraping games, will try to scrape missing data******")
+ # 36 hours max, should be enough to scrape everything
+ t_end = start_time + 60 * 60 * 36
+ while True:
+ try:
+ df = process_games(df)
+ failed_games = len(df[df['status'] == 0])
+ # stop once every game is scraped or the deadline has passed
+ if failed_games == 0 or time.time() > t_end:
+ break
+ print('Number of not scraped yet:', failed_games, '\n')
+ time.sleep(10) # wait for 10 seconds for the server to recover?
+ # dump what was scraped so far, then re-raise
+ except Exception as e:
+ print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data')
+ print("Exception: ", e)
+ df.to_csv(crashed_tag + csvfilename, sep=",",
+ encoding='utf-8', index=False)
+ raise e
+
+ elapsed_time = time.time() - start_time
+ print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.")
+
+ # the full dataframe goes to complete-vgchartz.csv,
+ # the column subset below goes to csvfilename
+ # select only these columns in the final dataset
+ df = df.sort_values(by=['Rank'])
+ df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False)
+ df_final = df[[
+ 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating',
+ 'Publisher', 'Developer', 'Critic_Score', 'User_Score',
+ 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']]
+
+ df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False)
+ print("Wrote scraper data to", csvfilename)