diff --git a/.gitignore b/.gitignore index be2baa1..ffbec06 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,12 @@ venv.bak/ # mypy .mypy_cache/ .vscode/ + +# csv +*.csv + +# ipynb +*.ipynb + +# ignore this folder +testing/ \ No newline at end of file diff --git a/README.md b/README.md index e557119..c7d9d6f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,41 @@ -vgchartzfull is a python script based on BeautifulSoup. + +vgchartzfull is a python script with multiprocessing based on BeautifulSoup. +Proxies are implemented in the script; they can be disabled by setting `proxy_enabled` to False + It creates a dataset based on data from http://www.vgchartz.com/gamedb/ -The dataset is saved as vgsales.csv. +The dataset is saved as vgsales-%Y-%m-%d_%H_%M_%S.csv. + +You will need to have the following dependencies installed: +``` +BeautifulSoup4 +pandas +numpy +requests +unidecode +user_agent +``` + +Thanks to: +- https://www.kdnuggets.com/2018/02/web-scraping-tutorial-python.html +- http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments +- https://medium.com/datadriveninvestor/speed-up-web-scraping-using-multiprocessing-in-python-af434ff310c5 -You will need to have BeautifulSoup added. -It can be installed by pip. -sudo pip install BeautifulSoup +Free proxies: +[1](https://proxyscrape.com/free-proxy-list) +[2](http://multiproxy.org/txt_all/proxy.txt) +[3](https://proxy.rudnkh.me/txt) +[4](https://www.us-proxy.org/) -Thanks to Chris Albon. -http://chrisalbon.com/python/beautiful_soup_scrape_table.html +- [x] added multiprocessing for faster results with a maximum of 24 workers. +- [x] added proxies to avoid being blocked +- [x] handling a couple of exceptions +- [x] scraped data gets saved before raising an unexpected error +- [x] add the option to continue where we left off due to an unexpected error +- [x] clean version removes the print statements, should result in better performance! 
def _cell_float(cell, drop_last=False):
    """Return the numeric value of a table cell, or NaN when it is blank or "N/A".

    parameters:
        cell: a parsed <td> element (anything exposing ``get_text()``)
        drop_last: drop the final character before converting; the shipped and
            sales cells carry a one-character suffix after the number
            (presumably a unit such as "m" -- confirm against the live table)
    """
    text = cell.get_text().strip()
    if not text or text.startswith("N/A"):
        return np.nan
    return float(text[:-1] if drop_last else text)


def _release_year(cell):
    """Return the release year of a date cell as np.int32, or 'N/A' if unknown.

    Only the last whitespace-separated token (a two-digit year) is used;
    values of 70 and above are read as 19xx, everything else as 20xx.
    """
    tokens = cell.get_text().split()
    year = tokens[-1] if tokens else 'N/A'
    if year.startswith('N/A'):
        return 'N/A'
    century = "19" if int(year) >= 70 else "20"
    return np.int32(century + year)


def parse_games(game_tags):
    """
    parse the games table on current page and add its rows to the global df
    parameters:
        game_tags: the game link tags found on the page; the table row of
                   each tag is reached through tag.parent.parent
    side effects:
        df (global): one row is added per game, with Genre/ESRB_Rating set to
                     'N/A' and status set to 0 until the game page is scraped
        rec_count (global): incremented by the number of rows added

    Tags whose row has no <td> cells are skipped. The rows of a page are
    added in one step, so a parsing error leaves df without a partial page.
    """
    global rec_count
    global df
    rows = []
    for tag in game_tags:
        data = tag.parent.parent.find_all("td")
        if not data:
            continue
        # collapse runs of whitespace inside the link text
        name = " ".join((tag.string or tag.get_text()).split())
        url = data[2].a.get('href')
        if "/" in name:
            # replace accented chars with ascii
            basename = unidecode.unidecode(
                name.split('/')[0].strip().replace(' ', '-'))
        else:
            # second-to-last path component of the game url
            basename = url.rsplit('/', 2)[1]
        rows.append({
            "Name": name,
            "Rank": np.int32(data[0].get_text().strip()),
            "img_url": data[1].a.img.get('src'),
            "url": url,
            "basename": basename,
            "Platform": data[3].img.get('alt'),
            "Publisher": data[4].get_text().strip(),
            "Developer": data[5].get_text().strip(),
            # key matches the 'VGChartz_Score' column the dataframe declares
            "VGChartz_Score": data[6].get_text().strip(),
            "Critic_Score": _cell_float(data[7]),
            "User_Score": _cell_float(data[8]),
            "Total_Shipped": _cell_float(data[9], drop_last=True),
            "Global_Sales": _cell_float(data[10], drop_last=True),
            "NA_Sales": _cell_float(data[11], drop_last=True),
            "PAL_Sales": _cell_float(data[12], drop_last=True),
            "JP_Sales": _cell_float(data[13], drop_last=True),
            "Other_Sales": _cell_float(data[14], drop_last=True),
            "Year": _release_year(data[15]),
            "Last_Update": data[16].get_text().strip(),
            "Genre": 'N/A',
            "ESRB_Rating": 'N/A',
            "status": 0,
        })

    if rows:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call; build the page once and concatenate instead
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
        rec_count += len(rows)
def process_games(df):
    """Scrape genre and rating for every row whose status is 0, in parallel.

    The pending rows are split into at most ``cpu_count() * 2`` chunks and
    each chunk is passed to ``retry_game`` in its own worker process.

    parameters:
        df: dataframe of games with a 'status' column (0 = not scraped yet)
    returns:
        df itself when nothing is pending; otherwise a new dataframe made of
        the rows that were not pending followed by the rows returned by the
        workers (index labels are kept, row order may differ from the input)
    """
    pending_mask = df['status'] == 0
    pending = df[pending_mask]
    if pending.empty:
        # nothing to do: do not start worker processes for an empty job
        return df

    # never more workers than rows, so no worker receives an empty chunk
    workers = min(cpu_count() * 2, len(pending))
    # split row positions rather than the frame itself; np.array_split on a
    # DataFrame is deprecated in recent pandas
    chunks = [pending.iloc[positions]
              for positions in np.array_split(np.arange(len(pending)), workers)]

    # the context manager releases the worker processes even if map() raises
    with Pool(processes=workers) as pool:
        results = pool.map(retry_game, chunks)

    try:
        # keep every row that was not pending, not only those with status 1
        return pd.concat([df[~pending_mask], *results])
    except (TypeError, ValueError) as exc:
        # best effort: keep the previous data if a worker returned bad output
        print('error occurred while joining dataframe:', exc)
        return df
crashed_tag = 'before_crashing_' + exists = [s for s in os.listdir() if crashed_tag in s] + if exists: + print("found a data saved from a crash, will continue on it") + csvfilename = exists[0].replace(crashed_tag, '') + df = pd.read_csv(exists[0]) + rec_count = df['Rank'].max() + page = int(rec_count/1000) + 1 # because we already scraped current + df = process_games(df) + else: + csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" + + # initialize a panda dataframe to store all games with the following columns: + # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer, + # publisher, release year, critic score, user score, na sales, pal sales, + # jp sales, other sales, total sales, total shipped, last update, url, status + # last two columns for debugging + if not exists: + df = pd.DataFrame(columns=[ + 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', + 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score', + 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', + 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) + + urlhead = 'http://www.vgchartz.com/games/games.php?page=' + urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' + urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' + urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' + urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' + urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' + + # get the number of pages + vglink = requests.get('http://www.vgchartz.com/gamedb/').text + x = fromstring(vglink).xpath( + "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] + pages = int(x.split(',')[0]) + + if not exists: page = 1 + while True: + if page > pages: + break + try: + proxy = 
get_proxies(1)[0] + headers = {'User-Agent': generate_user_agent( + device_type='desktop', os=('mac', 'linux'))} + surl = urlhead + str(page) + urltail + r = requests.get(surl, headers=headers, proxies={ + 'http': proxy, 'https': proxy}, timeout=10) + if r.status_code == 200: + soup = BeautifulSoup(r.text, 'lxml') + print("******Scraping page " + str(page) + "******'\n") + + # vgchartz website is really weird so we have to search for + # tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:] + # discard the first 10 elements because those + # links are in the navigation bar + + parse_games(game_tags) + page += 1 + print('\n******begin scraping for Genre and Rating******\n') + df = process_games(df) + + except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError, TimeoutError): + print('Something went wrong while connecting to page: ', + page, ', will try again later') + #proxy = get_proxies(1) + time.sleep(10) + + except Exception as e: + print("something went wrong! We're on page: " + + str(page) + '\nSaving successfully crawled data') + print("Exception: ", e) + df.to_csv(crashed_tag + csvfilename, sep=",", + encoding='utf-8', index=False) + raise e + + + failed_games = len(df[df['status'] == 0]) + print("******Finished scraping games, will try to scrape missing data******") + # 36 hours max, should be enough to scrape everything + t_end = start_time + 60 * 60 * 36 + while True: + try: + df = process_games(df) + failed_games = len(df[df['status'] == 0]) + if failed_games == 0 or time.time() > t_end: + break + #print('Number of not scraped yet:', failed_games, '\n') + time.sleep(10) # wait for 10 seconds for the server to recover? + except Exception as e: + print("something went wrong! 
def get_proxies(num=None, timeout=10):
    """Download a list of HTTP proxies and return the ones that respond.

    The proxyscrape API is queried for proxies with 100% uptime; only when
    that list is empty is the query repeated with uptime=99. The list is
    shuffled and then filtered through ``test_proxies``.

    parameters:
        num: maximum number of working proxies to return; None means test the
             whole downloaded list
        timeout: seconds to wait for the proxy list API before requests
                 raises a Timeout
    returns:
        list of proxy address strings exactly as returned by the API
        (presumably "host:port" -- confirm against the API output)
    """
    api = ("https://api.proxyscrape.com/?request=getproxies&proxytype=http"
           "&timeout=1000&country=all&ssl=all&anonymity=all&uptime={}")
    proxies = []
    for uptime in (100, 99):
        proxies = requests.get(api.format(uptime), timeout=timeout).text.split()
        if proxies:
            # the stricter list is good enough, no need for the fallback
            break
    np.random.shuffle(proxies)

    if num is None:
        num = len(proxies)
    return test_proxies(proxies, num)
def test_proxies(proxies, num):
    """Return up to ``num`` proxies that complete a request to httpbin.org.

    parameters:
        proxies: candidate proxy address strings, tried in the given order
        num: stop as soon as this many working proxies have been found
    returns:
        list of the proxies that answered within one second; empty when
        ``proxies`` is empty or ``num`` is zero or negative

    A proxy counts as working when the request completes; the status code of
    the response is not inspected.
    """
    url = 'https://httpbin.org/ip'
    working_proxies = []
    # iterate over every candidate, including the last one
    for proxy in proxies:
        if len(working_proxies) >= num:
            break
        try:
            requests.get(
                url, proxies={"http": proxy, "https": proxy}, timeout=1)
        except (requests.RequestException, ValueError):
            # Most free proxies fail to connect; skip to the next candidate.
            continue
        working_proxies.append(proxy)
    return working_proxies
def _cell_float(cell, drop_last=False):
    """Return the numeric value of a table cell, or NaN when it is blank or "N/A".

    parameters:
        cell: a parsed <td> element (anything exposing ``get_text()``)
        drop_last: drop the final character before converting; the shipped and
            sales cells carry a one-character suffix after the number
            (presumably a unit such as "m" -- confirm against the live table)
    """
    text = cell.get_text().strip()
    if not text or text.startswith("N/A"):
        return np.nan
    return float(text[:-1] if drop_last else text)


def _release_year(cell):
    """Return the release year of a date cell as np.int32, or 'N/A' if unknown.

    Only the last whitespace-separated token (a two-digit year) is used;
    values of 70 and above are read as 19xx, everything else as 20xx.
    """
    tokens = cell.get_text().split()
    year = tokens[-1] if tokens else 'N/A'
    if year.startswith('N/A'):
        return 'N/A'
    century = "19" if int(year) >= 70 else "20"
    return np.int32(century + year)


def parse_games(game_tags):
    """
    parse the games table on current page and add its rows to the global df
    parameters:
        game_tags: the game link tags found on the page; the table row of
                   each tag is reached through tag.parent.parent
    side effects:
        df (global): one row is added per game, with Genre/ESRB_Rating set to
                     'N/A' and status set to 0 until the game page is scraped
        rec_count (global): incremented by the number of rows added
        a progress line is printed for every tag

    Tags whose row has no <td> cells are skipped. The rows of a page are
    added in one step, so a parsing error leaves df without a partial page.
    """
    global rec_count
    global df
    rows = []
    for tag in game_tags:
        # collapse runs of whitespace inside the link text
        name = " ".join((tag.string or tag.get_text()).split())
        print(rec_count + len(rows) + 1, 'Fetch Data for game',
              unidecode.unidecode(name))

        data = tag.parent.parent.find_all("td")
        if not data:
            continue
        url = data[2].a.get('href')
        if "/" in name:
            # replace accented chars with ascii
            basename = unidecode.unidecode(
                name.split('/')[0].strip().replace(' ', '-'))
        else:
            # second-to-last path component of the game url
            basename = url.rsplit('/', 2)[1]
        rows.append({
            "Name": name,
            "Rank": np.int32(data[0].get_text().strip()),
            "img_url": data[1].a.img.get('src'),
            "url": url,
            "basename": basename,
            "Platform": data[3].img.get('alt'),
            "Publisher": data[4].get_text().strip(),
            "Developer": data[5].get_text().strip(),
            # key matches the 'VGChartz_Score' column the dataframe declares
            "VGChartz_Score": data[6].get_text().strip(),
            "Critic_Score": _cell_float(data[7]),
            "User_Score": _cell_float(data[8]),
            "Total_Shipped": _cell_float(data[9], drop_last=True),
            "Global_Sales": _cell_float(data[10], drop_last=True),
            "NA_Sales": _cell_float(data[11], drop_last=True),
            "PAL_Sales": _cell_float(data[12], drop_last=True),
            "JP_Sales": _cell_float(data[13], drop_last=True),
            "Other_Sales": _cell_float(data[14], drop_last=True),
            "Year": _release_year(data[15]),
            "Last_Update": data[16].get_text().strip(),
            "Genre": 'N/A',
            "ESRB_Rating": 'N/A',
            "status": 0,
        })

    if rows:
        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # frame on every call; build the page once and concatenate instead
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)
        rec_count += len(rows)
def _scrape_info_box(html):
    """Return (genre, esrb_rating) read from a game page, or None if the
    "gameGenInfoBox" div is missing. Either value is None when not found."""
    gamebox = BeautifulSoup(html, "lxml").find("div", {"id": "gameGenInfoBox"})
    if gamebox is None:
        return None

    # the info box is inconsistent among games, so walk all the h2 headings
    # and read the node that follows the one titled "Genre"
    genre = None
    for h2 in gamebox.find_all('h2'):
        if h2.string == 'Genre' and h2.next_sibling is not None:
            genre = h2.next_sibling.string

    # the rating is taken from the file name of the first image in the box,
    # the part between the first '_' and the following '.'
    rating = None
    img = gamebox.find('img')
    src = img.get('src') if img is not None else None
    if src and 'esrb' in src:
        pieces = src.split('_')
        if len(pieces) > 1:
            rating = pieces[1].split('.')[0].upper()
    return genre, rating


def parse_genre_esrb(df):
    """loads every game's url to get genre and esrb rating

    parameters:
        df: dataframe with the 'url', 'Name', 'Genre', 'ESRB_Rating' and
            'status' columns
    returns:
        a copy of df; rows whose page was fetched and read get status 1
        (Genre / ESRB_Rating are only overwritten when the page provides
        them), rows that failed keep status 0 so the caller can retry them

    When proxy_enabled is set, five tested proxies are requested and rotated
    on every ProxyError; if none is available the pages are fetched directly.
    """
    # work on a copy so the caller's frame (often a slice) is not modified
    df = df.copy()
    if df.empty:
        return df

    headers = {'User-Agent': generate_user_agent(
        device_type='desktop', os=('mac', 'linux'))}
    proxies = None
    proxy = None
    if proxy_enabled:
        print("\n******getting list of proxies and testing them******'\n")
        # this an api call which returns a list of working proxies
        working = get_proxies(5)
        if working:
            proxies = cycle(working)
            proxy = next(proxies)

    for index in df.index:
        # requests connects directly when proxies is None
        proxy_map = {"http": proxy, "https": proxy} if proxy else None
        try:
            game_page = requests.get(
                df.at[index, 'url'], headers=headers, proxies=proxy_map, timeout=5)
            if game_page.status_code == 200:
                details = _scrape_info_box(game_page.text)
                if details is None:
                    print('No info box found for', df.at[index, 'Name'],
                          ', will try again later')
                else:
                    genre, rating = details
                    if genre is not None:
                        df.loc[index, 'Genre'] = genre
                    if rating is not None:
                        df.loc[index, 'ESRB_Rating'] = rating
                    # the page was read: do not retry it, even when it
                    # lists no genre or rating
                    df.loc[index, 'status'] = 1
                    print('Successfully scraped genre and rating for :', df.at[index, 'Name'])

        except ProxyError:
            # switch to the next proxy; this row is retried on a later pass
            if proxies is not None:
                proxy = next(proxies)

        except (ConnectionError, Timeout, ProtocolError):
            print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later')

        except Exception as e:
            # deliberately broad: one bad page must not stop the whole batch
            print('different error occurred while connecting, will pass:', repr(e))
        # wait for 1 seconds between every call,
        # we do not want to get blocked or abuse the server
        time.sleep(1)
    return df
'http://www.vgchartz.com/games/games.php?page=' + urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' + urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' + urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' + urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' + urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' + + # get the number of pages + vglink = requests.get('http://www.vgchartz.com/gamedb/').text + x = fromstring(vglink).xpath( + "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] + pages = int(x.split(',')[0]) + + if not exists: page = 1 + while True: + if page > pages: + break + try: + proxy = get_proxies(1)[0] + headers = {'User-Agent': generate_user_agent( + device_type='desktop', os=('mac', 'linux'))} + surl = urlhead + str(page) + urltail + r = requests.get(surl, headers=headers, proxies={ + 'http': proxy, 'https': proxy}, timeout=10) + if r.status_code == 200: + soup = BeautifulSoup(r.text, 'lxml') + print("******Scraping page " + str(page) + "******'\n") + + # vgchartz website is really weird so we have to search for + # tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:] + # discard the first 10 elements because those + # links are in the navigation bar + + parse_games(game_tags) + print('\n******begin scraping for Genre and Rating******\n') + df = process_games(df) + page += 1 + + except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError): + print('Something went wrong while connecting to page: ', + page, ', will try again later') + #proxy = get_proxies(1) + time.sleep(10) + + except Exception as e: + print("something went wrong! 
We're on page: " + + str(page) + '\nSaving successfully crawled data') + print("Exception: ", e) + df.to_csv(crashed_tag + csvfilename, sep=",", + encoding='utf-8', index=False) + raise e + + + failed_games = len(df[df['status'] == 0]) + print("******Finished scraping games, will try to scrape missing data******") + # 36 hours max, should be enough to scrape everything + t_end = start_time + 60 * 60 * 36 + while True: + try: + df = process_games(df) + failed_games = len(df[df['status'] == 0]) + if failed_games == 0 or time.time() > t_end: + break + print('Number of not scraped yet:', failed_games, '\n') + time.sleep(10) # wait for 10 seconds for the server to recover? + except Exception as e: + print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data') + print("Exception: ", e) + df.to_csv(crashed_tag + csvfilename, sep=",", + encoding='utf-8', index=False) + raise e + + elapsed_time = time.time() - start_time + print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.") + + # select only these columns in the final dataset + df = df.sort_values(by=['Rank']) + df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False) + df_final = df[[ + 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', + 'Publisher', 'Developer', 'Critic_Score', 'User_Score', + 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']] + + df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False) + print("Wrote scraper data to", csvfilename)