From 607055b0817d123695092f6472a3783f159a11d0 Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 21:09:09 +0200 Subject: [PATCH 01/35] Add file for listing used libraries --- requirements.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7c120b1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +beautifulsoup4==4.7.1 +bs4==0.0.1 +numpy==1.16.3 +pandas==0.24.2 +python-dateutil==2.8.0 +pytz==2019.1 +six==1.12.0 +soupsieve==1.9.1 From f0e242c788b26fd89662d8d240da218e71b76e16 Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 21:31:04 +0200 Subject: [PATCH 02/35] Add method for random headers generation, sleep between requests. --- vgchartzfull.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/vgchartzfull.py b/vgchartzfull.py index b1d75a4..13a39c9 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -27,6 +27,30 @@ urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' +def create_random_header(): + browsers = ["Mozilla", "Chrome"] + os = ["Windows NT 6.1; Win64; x64" "X11; Linux x86_64"], + #user_agent = 'Mozilla/{}.{} (Windows NT 6.1; Win64; x64)'.format(randint(1,56)) + major_version = randint(1, 56) + minor_version = randint(1, 10) + chosen_browser = random.choice(browsers) + chosen_os = random.choice(os) + + user_agent = '{}/{}.{} ({})'.format( + chosen_browser, + major_version, + minor_version, + chosen_os) + header = { 'User-Agent' : user_agent} + print(header) + return header +def get_page(url): + header = create_random_header() + request = urllib.request.Request(url, headers=header) + result = urllib.request.urlopen(request).read() + time.sleep(randint(6,15)) + return result + for page in range(1, pages): surl = urlhead + str(page) + urltail r = urllib.request.urlopen(surl).read() 
From f68ea44c2a1d3dc6346d53656e592360d501ced6 Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 21:38:42 +0200 Subject: [PATCH 03/35] Externalise the get genre part, wrap all code into a function --- vgchartzfull.py | 240 +++++++++++++++++++++++++----------------------- 1 file changed, 123 insertions(+), 117 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 13a39c9..7ad271f 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -3,30 +3,6 @@ import pandas as pd import numpy as np -pages = 19 -rec_count = 0 -rank = [] -gname = [] -platform = [] -year = [] -genre = [] -critic_score = [] -user_score = [] -publisher = [] -developer = [] -sales_na = [] -sales_pal = [] -sales_jp = [] -sales_ot = [] -sales_gl = [] - -urlhead = 'http://www.vgchartz.com/gamedb/?page=' -urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both' -urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0' -urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' -urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' -urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' - def create_random_header(): browsers = ["Mozilla", "Chrome"] os = ["Windows NT 6.1; Win64; x64" "X11; Linux x86_64"], @@ -50,105 +26,135 @@ def get_page(url): result = urllib.request.urlopen(request).read() time.sleep(randint(6,15)) return result +def get_genre(): + # go to every individual website to get genre info + url_to_game = tag.attrs['href'] + site_raw = urllib.request.urlopen(url_to_game).read() + sub_soup = BeautifulSoup(site_raw, "html.parser") + # again, the info box is inconsistent among games so we + # have to find all the h2 and traverse from that to the genre name + h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2') + # make a temporary tag here to search for the one that contains + # the word "Genre" + temp_tag = 
element.Tag + for h2 in h2s: + if h2.string == 'Genre': + temp_tag = h2 + genre.append(temp_tag.next_sibling.string) -for page in range(1, pages): - surl = urlhead + str(page) + urltail - r = urllib.request.urlopen(surl).read() - soup = BeautifulSoup(r) - print(f"Page: {page}") +def download_data(): + rec_count = 0 + for page in range(1, pages): + surl = urlhead + str(page) + urltail + r = urllib.request.urlopen(surl).read() + soup = BeautifulSoup(r) + print(f"Page: {page}") - # vgchartz website is really weird so we have to search for - # tags with game urls - game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), - # discard the first 10 elements because those - # links are in the navigation bar - soup.find_all("a") - ))[10:] + # vgchartz website is really weird so we have to search for + # tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), + # discard the first 10 elements because those + # links are in the navigation bar + soup.find_all("a") + ))[10:] - for tag in game_tags: + for tag in game_tags: - # add name to list - gname.append(" ".join(tag.string.split())) - print(f"{rec_count + 1} Fetch data for game {gname[-1]}") + # add name to list + gname.append(" ".join(tag.string.split())) + print(f"{rec_count + 1} Fetch data for game {gname[-1]}") - # get different attributes - # traverse up the DOM tree - data = tag.parent.parent.find_all("td") - rank.append(np.int32(data[0].string)) - platform.append(data[3].find('img').attrs['alt']) - publisher.append(data[4].string) - developer.append(data[5].string) - critic_score.append( - float(data[6].string) if - not data[6].string.startswith("N/A") else np.nan) - user_score.append( - float(data[7].string) if - not data[7].string.startswith("N/A") else np.nan) - sales_na.append( - float(data[9].string[:-1]) if - not data[9].string.startswith("N/A") else np.nan) - sales_pal.append( - float(data[10].string[:-1]) 
if - not data[10].string.startswith("N/A") else np.nan) - sales_jp.append( - float(data[11].string[:-1]) if - not data[11].string.startswith("N/A") else np.nan) - sales_ot.append( - float(data[12].string[:-1]) if - not data[12].string.startswith("N/A") else np.nan) - sales_gl.append( - float(data[8].string[:-1]) if - not data[8].string.startswith("N/A") else np.nan) - release_year = data[13].string.split()[-1] - # different format for year - if release_year.startswith('N/A'): - year.append('N/A') - else: - if int(release_year) >= 80: - year_to_add = np.int32("19" + release_year) + # get different attributes + # traverse up the DOM tree + data = tag.parent.parent.find_all("td") + rank.append(np.int32(data[0].string)) + platform.append(data[3].find('img').attrs['alt']) + publisher.append(data[4].string) + developer.append(data[5].string) + critic_score.append( + float(data[6].string) if + not data[6].string.startswith("N/A") else np.nan) + user_score.append( + float(data[7].string) if + not data[7].string.startswith("N/A") else np.nan) + sales_na.append( + float(data[9].string[:-1]) if + not data[9].string.startswith("N/A") else np.nan) + sales_pal.append( + float(data[10].string[:-1]) if + not data[10].string.startswith("N/A") else np.nan) + sales_jp.append( + float(data[11].string[:-1]) if + not data[11].string.startswith("N/A") else np.nan) + sales_ot.append( + float(data[12].string[:-1]) if + not data[12].string.startswith("N/A") else np.nan) + sales_gl.append( + float(data[8].string[:-1]) if + not data[8].string.startswith("N/A") else np.nan) + release_year = data[13].string.split()[-1] + # different format for year + if release_year.startswith('N/A'): + year.append('N/A') else: - year_to_add = np.int32("20" + release_year) - year.append(year_to_add) + if int(release_year) >= 80: + year_to_add = np.int32("19" + release_year) + else: + year_to_add = np.int32("20" + release_year) + year.append(year_to_add) + + + + rec_count += 1 - # go to every individual website 
to get genre info - url_to_game = tag.attrs['href'] - site_raw = urllib.request.urlopen(url_to_game).read() - sub_soup = BeautifulSoup(site_raw, "html.parser") - # again, the info box is inconsistent among games so we - # have to find all the h2 and traverse from that to the genre name - h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2') - # make a temporary tag here to search for the one that contains - # the word "Genre" - temp_tag = element.Tag - for h2 in h2s: - if h2.string == 'Genre': - temp_tag = h2 - genre.append(temp_tag.next_sibling.string) + columns = { + 'Rank': rank, + 'Name': gname, + 'Platform': platform, + 'Year': year, + 'Genre': genre, + 'Critic_Score': critic_score, + 'User_Score': user_score, + 'Publisher': publisher, + 'Developer': developer, + 'NA_Sales': sales_na, + 'PAL_Sales': sales_pal, + 'JP_Sales': sales_jp, + 'Other_Sales': sales_ot, + 'Global_Sales': sales_gl + } + print(rec_count) + df = pd.DataFrame(columns) + print(df.columns) + df = df[[ + 'Rank', 'Name', 'Platform', 'Year', 'Genre', + 'Publisher', 'Developer', 'Critic_Score', 'User_Score', + 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] + df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False) - rec_count += 1 +if __name__ == "__main__": + pages = 19 + rec_count = 0 + rank = [] + gname = [] + platform = [] + year = [] + genre = [] + critic_score = [] + user_score = [] + publisher = [] + developer = [] + sales_na = [] + sales_pal = [] + sales_jp = [] + sales_ot = [] + sales_gl = [] -columns = { - 'Rank': rank, - 'Name': gname, - 'Platform': platform, - 'Year': year, - 'Genre': genre, - 'Critic_Score': critic_score, - 'User_Score': user_score, - 'Publisher': publisher, - 'Developer': developer, - 'NA_Sales': sales_na, - 'PAL_Sales': sales_pal, - 'JP_Sales': sales_jp, - 'Other_Sales': sales_ot, - 'Global_Sales': sales_gl -} -print(rec_count) -df = pd.DataFrame(columns) -print(df.columns) -df = df[[ - 'Rank', 'Name', 'Platform', 'Year', 
'Genre', - 'Publisher', 'Developer', 'Critic_Score', 'User_Score', - 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] -df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False) + urlhead = 'http://www.vgchartz.com/gamedb/?page=' + urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both' + urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0' + urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' + urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' + urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' + download_data() From 65a0596fffdbad61b3fd8cd29f4414d52404c5fe Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 21:44:20 +0200 Subject: [PATCH 04/35] fixed bugs such as libraries, get_page usage and os random pick for user agent --- vgchartzfull.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 7ad271f..0821e78 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -2,15 +2,17 @@ import urllib import pandas as pd import numpy as np +from random import randint, choice +import time def create_random_header(): browsers = ["Mozilla", "Chrome"] - os = ["Windows NT 6.1; Win64; x64" "X11; Linux x86_64"], + os = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"], #user_agent = 'Mozilla/{}.{} (Windows NT 6.1; Win64; x64)'.format(randint(1,56)) major_version = randint(1, 56) minor_version = randint(1, 10) - chosen_browser = random.choice(browsers) - chosen_os = random.choice(os) + chosen_browser = choice(browsers) + chosen_os = choice(os) user_agent = '{}/{}.{} ({})'.format( chosen_browser, @@ -46,8 +48,8 @@ def download_data(): rec_count = 0 for page in range(1, pages): surl = urlhead + str(page) + urltail - r = urllib.request.urlopen(surl).read() - soup = BeautifulSoup(r) + current_page = 
get_page(surl) + soup = BeautifulSoup(current_page) print(f"Page: {page}") # vgchartz website is really weird so we have to search for From 2abab6059d78ff7daf5d9bc7a7dea437bfaefb6f Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 21:47:30 +0200 Subject: [PATCH 05/35] first working version with random user agent and sleep between requests --- vgchartzfull.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 0821e78..76d11db 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -7,12 +7,11 @@ def create_random_header(): browsers = ["Mozilla", "Chrome"] - os = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"], - #user_agent = 'Mozilla/{}.{} (Windows NT 6.1; Win64; x64)'.format(randint(1,56)) + os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"] major_version = randint(1, 56) minor_version = randint(1, 10) chosen_browser = choice(browsers) - chosen_os = choice(os) + chosen_os = choice(os_list) user_agent = '{}/{}.{} ({})'.format( chosen_browser, From ea44a6977716bcc96ad0dfb2cbb251772b77125d Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 23:10:05 +0200 Subject: [PATCH 06/35] Add basic documentation, modularized code --- vgchartzfull.py | 231 +++++++++++++++++++++++++++++++----------------- 1 file changed, 149 insertions(+), 82 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 76d11db..2b14450 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -5,11 +5,21 @@ from random import randint, choice import time -def create_random_header(): + +def create_random_header(lb_major=1, + ub_major=56, + lb_minor=1, + ub_minor=10): + """ + Create a random user agent in order to better mimic user behaviour. 
+ Optional parameters for defining the: + - range of browser's major version (lower and upper bound) + - range of browser's minor version (lower and upper bound) + """ browsers = ["Mozilla", "Chrome"] os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"] - major_version = randint(1, 56) - minor_version = randint(1, 10) + major_version = randint(lb_major, ub_major) + minor_version = randint(lb_minor, ub_minor) chosen_browser = choice(browsers) chosen_os = choice(os_list) @@ -21,16 +31,30 @@ def create_random_header(): header = { 'User-Agent' : user_agent} print(header) return header -def get_page(url): + + +def get_page(url, + lower_bound_sleep=6, + upper_bound_sleep=15): + """ + Perform a GET request to the given URL and return results. + Add a wait logic that, combined with random header, will help avoiding + HTTP 429 error. + The optional parameters will allow further customization of waiting periods. + """ header = create_random_header() request = urllib.request.Request(url, headers=header) result = urllib.request.urlopen(request).read() - time.sleep(randint(6,15)) + time.sleep(randint(lower_bound_sleep, upper_bound_sleep)) return result -def get_genre(): - # go to every individual website to get genre info - url_to_game = tag.attrs['href'] - site_raw = urllib.request.urlopen(url_to_game).read() + + +def get_genre(game_url): + """ + Return the game genre retrieved from the given url + """ + + site_raw = get_page(game_url) sub_soup = BeautifulSoup(site_raw, "html.parser") # again, the info box is inconsistent among games so we # have to find all the h2 and traverse from that to the genre name @@ -41,74 +65,60 @@ def get_genre(): for h2 in h2s: if h2.string == 'Genre': temp_tag = h2 - genre.append(temp_tag.next_sibling.string) -def download_data(): - rec_count = 0 - for page in range(1, pages): - surl = urlhead + str(page) + urltail - current_page = get_page(surl) - soup = BeautifulSoup(current_page) - print(f"Page: {page}") + genre_value = 
temp_tag.next_sibling.string + return genre_value - # vgchartz website is really weird so we have to search for - # tags with game urls - game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), - # discard the first 10 elements because those - # links are in the navigation bar - soup.find_all("a") - ))[10:] - for tag in game_tags: +def get_release_year(raw_year): + """ + Return the release year of the given game in a 4 digit format or N/A. + """ + if raw_year.startswith('N/A'): + final_year = 'N/A' + elif int(raw_year) >= 80: + final_year = np.int32("19" + raw_year) + else: + final_year = np.int32("20" + raw_year) + return final_year - # add name to list - gname.append(" ".join(tag.string.split())) - print(f"{rec_count + 1} Fetch data for game {gname[-1]}") - # get different attributes - # traverse up the DOM tree - data = tag.parent.parent.find_all("td") - rank.append(np.int32(data[0].string)) - platform.append(data[3].find('img').attrs['alt']) - publisher.append(data[4].string) - developer.append(data[5].string) - critic_score.append( - float(data[6].string) if - not data[6].string.startswith("N/A") else np.nan) - user_score.append( - float(data[7].string) if - not data[7].string.startswith("N/A") else np.nan) - sales_na.append( - float(data[9].string[:-1]) if - not data[9].string.startswith("N/A") else np.nan) - sales_pal.append( - float(data[10].string[:-1]) if - not data[10].string.startswith("N/A") else np.nan) - sales_jp.append( - float(data[11].string[:-1]) if - not data[11].string.startswith("N/A") else np.nan) - sales_ot.append( - float(data[12].string[:-1]) if - not data[12].string.startswith("N/A") else np.nan) - sales_gl.append( - float(data[8].string[:-1]) if - not data[8].string.startswith("N/A") else np.nan) - release_year = data[13].string.split()[-1] - # different format for year - if release_year.startswith('N/A'): - year.append('N/A') - else: - if int(release_year) >= 80: - year_to_add = np.int32("19" + 
release_year) - else: - year_to_add = np.int32("20" + release_year) - year.append(year_to_add) - - - - rec_count += 1 +def add_current_game_data(current_critic_score, + current_developer, + current_gname, + current_platform, + current_publisher, + current_rank, + current_release_year, + current_sales_gl, + current_sales_jp, + current_sales_na, + current_sales_ot, + current_sales_pal, + current_user_score): + + """ + Add all the game data to the related lists + """ + gname.append(current_gname) + rank.append(current_rank) + platform.append(current_platform) + publisher.append(current_publisher) + developer.append(current_developer) + critic_score.append(current_critic_score) + user_score.append(current_user_score) + sales_na.append(current_sales_na) + sales_pal.append(current_sales_pal) + sales_jp.append(current_sales_jp) + sales_ot.append(current_sales_ot) + sales_gl.append(current_sales_gl) + year.append(current_release_year) + +def save_games_data(filename = "vgsales.csv", separator=",", enc="utf-8"): + """ + Save all the downloaded data into the specified file + """ columns = { 'Rank': rank, 'Name': gname, @@ -125,32 +135,87 @@ def download_data(): 'Other_Sales': sales_ot, 'Global_Sales': sales_gl } - print(rec_count) df = pd.DataFrame(columns) print(df.columns) df = df[[ 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'Developer', 'Critic_Score', 'User_Score', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] - df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False) + df.to_csv(filename, sep=separator, encoding=enc, index=False) + + +def download_data(start_page, end_page, download_genre=False): + """ + Download games data from vgchartz: only data whose pages are in the range (start_page, end_page) will be downloaded + :param start_page: + :param end_page: + :param download_genre: + :return: + """ + game_rank = 1 # Results are decreasingly ordered according to Shipped units + for page in range(start_page, end_page): + surl 
= urlhead + str(page) + urltail + current_page = get_page(surl) + soup = BeautifulSoup(current_page) + print(f"Page: {page}") + + # vgchartz website is really weird so we have to search for + # tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), + # discard the first 10 elements because those + # links are in the navigation bar + soup.find_all("a") + ))[10:] + + for tag in game_tags: + + current_gname = " ".join(tag.string.split()) # add game name to list + print(f"{game_rank} Fetch data for game {current_gname}") + + # Get different attributes + # traverse up the DOM tree + data = tag.parent.parent.find_all("td") + current_rank = np.int32(data[0].string) + current_platform = data[3].find('img').attrs['alt'] + current_publisher = data[4].string + current_developer = data[5].string + current_critic_score = float(data[6].string) if not data[6].string.startswith("N/A") else np.nan + current_user_score = float(data[7].string) if not data[7].string.startswith("N/A") else np.nan + current_sales_na = float(data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan + current_sales_pal = float(data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan + current_sales_jp = float(data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan + current_sales_ot = float(data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan + current_sales_gl = float(data[8].string[:-1]) if not data[8].string.startswith("N/A") else np.nan + current_release_year = get_release_year(data[13].string.split()[-1]) + + add_current_game_data(current_critic_score, current_developer, current_gname, current_platform, + current_publisher, current_rank, current_release_year, current_sales_gl, + current_sales_jp, current_sales_na,current_sales_ot, current_sales_pal, + current_user_score) + + game_url = tag.attrs['href'] + game_genre = "" + if download_genre: + game_genre = 
get_genre(game_url) + genre.append(game_genre) + + game_rank += 1 + + print("Number of downloaded resources: {}".format(game_rank)) + if __name__ == "__main__": pages = 19 - rec_count = 0 rank = [] gname = [] platform = [] year = [] genre = [] - critic_score = [] - user_score = [] + critic_score, user_score = [], [] publisher = [] developer = [] - sales_na = [] - sales_pal = [] - sales_jp = [] - sales_ot = [] - sales_gl = [] + sales_na, sales_pal, sales_jp, sales_ot, sales_gl = [], [], [], [], [] urlhead = 'http://www.vgchartz.com/gamedb/?page=' urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both' @@ -158,4 +223,6 @@ def download_data(): urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' - download_data() + download_data(1, 2) + save_games_data() + From a55b284a66604de6dd654466e54832910592dd3b Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 23:34:14 +0200 Subject: [PATCH 07/35] Example property file --- resources.json | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 resources.json diff --git a/resources.json b/resources.json new file mode 100644 index 0000000..fd40629 --- /dev/null +++ b/resources.json @@ -0,0 +1,16 @@ +{ + "output_filename": "vgsales.csv", + "separator": ",", + "encoding": "utf-8", + "start_page": 1, + "end_page": 2, + "include_genre": false, + "minimum_sleep_time": 6, + "maximum_sleep_time": 15, + "minimum_major_version": 1, + "maximum_major_version": 56, + "minimum_minor_version": 1, + "maximum_minor_version": 10, + "base_page_url": "http://www.vgchartz.com/gamedb/?page=", + "remaining_url": 
"&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL" +} From 70922cf270f0e9ee3ee1fd03a950594e1b26c837 Mon Sep 17 00:00:00 2001 From: hechmik Date: Tue, 14 May 2019 23:35:04 +0200 Subject: [PATCH 08/35] Completed refactor, read many parameter from property file --- vgchartzfull.py | 97 ++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 2b14450..58636b4 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -4,22 +4,17 @@ import numpy as np from random import randint, choice import time +import json -def create_random_header(lb_major=1, - ub_major=56, - lb_minor=1, - ub_minor=10): +def create_random_header(): """ Create a random user agent in order to better mimic user behaviour. - Optional parameters for defining the: - - range of browser's major version (lower and upper bound) - - range of browser's minor version (lower and upper bound) """ browsers = ["Mozilla", "Chrome"] os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"] - major_version = randint(lb_major, ub_major) - minor_version = randint(lb_minor, ub_minor) + major_version = randint(properties['minimum_major_version'], properties['maximum_major_version']) + minor_version = randint(properties['minimum_minor_version'], properties['maximum_minor_version']) chosen_browser = choice(browsers) chosen_os = choice(os_list) @@ -33,19 +28,16 @@ def create_random_header(lb_major=1, return header -def get_page(url, - lower_bound_sleep=6, - upper_bound_sleep=15): +def get_page(url): """ Perform a GET request to the given URL and return results. 
Add a wait logic that, combined with random header, will help avoiding HTTP 429 error. - The optional parameters will allow further customization of waiting periods. """ header = create_random_header() request = urllib.request.Request(url, headers=header) result = urllib.request.urlopen(request).read() - time.sleep(randint(lower_bound_sleep, upper_bound_sleep)) + time.sleep(randint(properties['minimum_sleep_time'], properties['maximum_sleep_time'])) return result @@ -115,45 +107,16 @@ def add_current_game_data(current_critic_score, year.append(current_release_year) -def save_games_data(filename = "vgsales.csv", separator=",", enc="utf-8"): - """ - Save all the downloaded data into the specified file - """ - columns = { - 'Rank': rank, - 'Name': gname, - 'Platform': platform, - 'Year': year, - 'Genre': genre, - 'Critic_Score': critic_score, - 'User_Score': user_score, - 'Publisher': publisher, - 'Developer': developer, - 'NA_Sales': sales_na, - 'PAL_Sales': sales_pal, - 'JP_Sales': sales_jp, - 'Other_Sales': sales_ot, - 'Global_Sales': sales_gl - } - df = pd.DataFrame(columns) - print(df.columns) - df = df[[ - 'Rank', 'Name', 'Platform', 'Year', 'Genre', - 'Publisher', 'Developer', 'Critic_Score', 'User_Score', - 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] - df.to_csv(filename, sep=separator, encoding=enc, index=False) - - -def download_data(start_page, end_page, download_genre=False): +def download_data(start_page, end_page, include_genre): """ Download games data from vgchartz: only data whose pages are in the range (start_page, end_page) will be downloaded :param start_page: :param end_page: - :param download_genre: + :param include_genre: :return: """ game_rank = 1 # Results are decreasingly ordered according to Shipped units - for page in range(start_page, end_page): + for page in range(start_page, end_page + 1): surl = urlhead + str(page) + urltail current_page = get_page(surl) soup = BeautifulSoup(current_page) @@ -196,7 +159,7 @@ def 
download_data(start_page, end_page, download_genre=False): game_url = tag.attrs['href'] game_genre = "" - if download_genre: + if include_genre: game_genre = get_genre(game_url) genre.append(game_genre) @@ -205,8 +168,36 @@ def download_data(start_page, end_page, download_genre=False): print("Number of downloaded resources: {}".format(game_rank)) +def save_games_data(filename, separator, enc): + """ + Save all the downloaded data into the specified file + """ + columns = { + 'Rank': rank, + 'Name': gname, + 'Platform': platform, + 'Year': year, + 'Genre': genre, + 'Critic_Score': critic_score, + 'User_Score': user_score, + 'Publisher': publisher, + 'Developer': developer, + 'NA_Sales': sales_na, + 'PAL_Sales': sales_pal, + 'JP_Sales': sales_jp, + 'Other_Sales': sales_ot, + 'Global_Sales': sales_gl + } + df = pd.DataFrame(columns) + print(df.columns) + df = df[[ + 'Rank', 'Name', 'Platform', 'Year', 'Genre', + 'Publisher', 'Developer', 'Critic_Score', 'User_Score', + 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] + df.to_csv(filename, sep=separator, encoding=enc, index=False) + + if __name__ == "__main__": - pages = 19 rank = [] gname = [] platform = [] @@ -223,6 +214,12 @@ def download_data(start_page, end_page, download_genre=False): urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' - download_data(1, 2) - save_games_data() + + properties = None + + with open("resources.json") as file: + properties = json.load(file) + print(properties) + download_data(properties['start_page'], properties['end_page'], properties['include_genre']) + save_games_data(properties['output_filename'], properties['separator'], properties['encoding']) From e65e51a431ce4b01717ed2d9f561899609479ebf Mon Sep 17 00:00:00 2001 From: hechmik Date: Wed, 15 May 2019 
10:46:42 +0200 Subject: [PATCH 09/35] Read vgchartz url from config json --- vgchartzfull.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 58636b4..f84c697 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -117,7 +117,7 @@ def download_data(start_page, end_page, include_genre): """ game_rank = 1 # Results are decreasingly ordered according to Shipped units for page in range(start_page, end_page + 1): - surl = urlhead + str(page) + urltail + surl = base_url + str(page) + remaining_url current_page = get_page(surl) soup = BeautifulSoup(current_page) print(f"Page: {page}") @@ -208,18 +208,13 @@ def save_games_data(filename, separator, enc): developer = [] sales_na, sales_pal, sales_jp, sales_ot, sales_gl = [], [], [], [], [] - urlhead = 'http://www.vgchartz.com/gamedb/?page=' - urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both' - urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0' - urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' - urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' - urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' - properties = None with open("resources.json") as file: properties = json.load(file) print(properties) + base_url = properties['base_page_url'] + remaining_url = properties['remaining_url'] download_data(properties['start_page'], properties['end_page'], properties['include_genre']) save_games_data(properties['output_filename'], properties['separator'], properties['encoding']) From d5ab1f45470cc221e1c35e364cd42bdc624a4dd9 Mon Sep 17 00:00:00 2001 From: hechmik Date: Wed, 15 May 2019 13:04:48 +0200 Subject: [PATCH 10/35] Add entry for log filename --- resources.json | 3 ++- vgchartzfull.py | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git 
a/resources.json b/resources.json index fd40629..e3559b7 100644 --- a/resources.json +++ b/resources.json @@ -12,5 +12,6 @@ "minimum_minor_version": 1, "maximum_minor_version": 10, "base_page_url": "http://www.vgchartz.com/gamedb/?page=", - "remaining_url": "&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL" + "remaining_url": "&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL", + "application_log_filename": "app.log" } diff --git a/vgchartzfull.py b/vgchartzfull.py index f84c697..0ee3b96 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -5,6 +5,7 @@ from random import randint, choice import time import json +import logging def create_random_header(): @@ -92,7 +93,7 @@ def add_current_game_data(current_critic_score, """ Add all the game data to the related lists """ - gname.append(current_gname) + game_name.append(current_gname) rank.append(current_rank) platform.append(current_platform) publisher.append(current_publisher) @@ -174,7 +175,7 @@ def save_games_data(filename, separator, enc): """ columns = { 'Rank': rank, - 'Name': gname, + 'Name': game_name, 'Platform': platform, 'Year': year, 'Genre': genre, @@ -198,8 +199,10 @@ def save_games_data(filename, separator, enc): if __name__ == "__main__": + + rank = [] - gname = [] + game_name = [] platform = [] year = [] genre = [] @@ -212,7 +215,12 @@ def save_games_data(filename, separator, 
enc): with open("resources.json") as file: properties = json.load(file) - print(properties) + + logging.basicConfig(filename=properties["application_log_filename"], + filemode='w', + format='%(asctime)s|%(name)s|%(levelname)s| %(message)s', + datefmt='%d-%m-%y %H:%M:%S') + logging.warning('Application started') base_url = properties['base_page_url'] remaining_url = properties['remaining_url'] download_data(properties['start_page'], properties['end_page'], properties['include_genre']) From 4ed64bf40e767972010a4388852469853ce0f71c Mon Sep 17 00:00:00 2001 From: hechmik Date: Wed, 15 May 2019 13:05:26 +0200 Subject: [PATCH 11/35] Improved documentation, add logging to both stdout and file --- vgchartzfull.py | 96 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 24 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 0ee3b96..8d82715 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -11,7 +11,9 @@ def create_random_header(): """ Create a random user agent in order to better mimic user behaviour. + :return JSON with User-Agent as key and random browser-os combo as value """ + logging.info("create_random_header >>>") browsers = ["Mozilla", "Chrome"] os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"] major_version = randint(properties['minimum_major_version'], properties['maximum_major_version']) @@ -24,8 +26,9 @@ def create_random_header(): major_version, minor_version, chosen_os) - header = { 'User-Agent' : user_agent} - print(header) + header = {'User-Agent': user_agent} + logging.debug("Current user_agent: {}".format(header)) + logging.info("create_random_header <<<") return header @@ -34,19 +37,27 @@ def get_page(url): Perform a GET request to the given URL and return results. Add a wait logic that, combined with random header, will help avoiding HTTP 429 error. 
+ :param url: webpage URL + :return: HTML page's body """ + logging.info("get_page >>>") + logging.debug("Current URL: {}".format(url)) header = create_random_header() request = urllib.request.Request(url, headers=header) result = urllib.request.urlopen(request).read() time.sleep(randint(properties['minimum_sleep_time'], properties['maximum_sleep_time'])) + logging.info("get_page <<<") return result def get_genre(game_url): """ Return the game genre retrieved from the given url + :param game_url: + :return: Genre of the input game """ - + logging.info("get_genre >>>") + logging.debug("Page to download: {}".format(game_url)) site_raw = get_page(game_url) sub_soup = BeautifulSoup(site_raw, "html.parser") # again, the info box is inconsistent among games so we @@ -60,25 +71,32 @@ def get_genre(game_url): temp_tag = h2 genre_value = temp_tag.next_sibling.string + logging.debug("Game genre: {}".format(genre_value)) + logging.info("get_genre <<<") return genre_value def get_release_year(raw_year): """ Return the release year of the given game in a 4 digit format or N/A. 
+ :param raw_year: + :return: Game Release year """ + logging.info("get_release_year >>>") if raw_year.startswith('N/A'): final_year = 'N/A' elif int(raw_year) >= 80: final_year = np.int32("19" + raw_year) else: final_year = np.int32("20" + raw_year) + logging.debug("Release Year: {}".format(final_year)) + logging.info("get_release_year <<<") return final_year def add_current_game_data(current_critic_score, current_developer, - current_gname, + current_game_name, current_platform, current_publisher, current_rank, @@ -89,11 +107,26 @@ def add_current_game_data(current_critic_score, current_sales_ot, current_sales_pal, current_user_score): - """ Add all the game data to the related lists + + :param current_critic_score: + :param current_developer: + :param current_game_name: + :param current_platform: + :param current_publisher: + :param current_rank: + :param current_release_year: + :param current_sales_gl: + :param current_sales_jp: + :param current_sales_na: + :param current_sales_ot: + :param current_sales_pal: + :param current_user_score: + :return: """ - game_name.append(current_gname) + logging.info("add_current_game_data >>>") + game_name.append(current_game_name) rank.append(current_rank) platform.append(current_platform) publisher.append(current_publisher) @@ -106,6 +139,7 @@ def add_current_game_data(current_critic_score, sales_ot.append(current_sales_ot) sales_gl.append(current_sales_gl) year.append(current_release_year) + logging.info("add_current_game_data <<<") def download_data(start_page, end_page, include_genre): @@ -116,12 +150,13 @@ def download_data(start_page, end_page, include_genre): :param include_genre: :return: """ - game_rank = 1 # Results are decreasingly ordered according to Shipped units + logging.info("download_data >>>") + downloaded_games = 0 # Results are decreasingly ordered according to Shipped units for page in range(start_page, end_page + 1): - surl = base_url + str(page) + remaining_url - current_page = get_page(surl) + 
page_url = "{}{}{}".format(base_url, str(page), remaining_url) + current_page = get_page(page_url) soup = BeautifulSoup(current_page) - print(f"Page: {page}") + logging.info("Downloaded page {}".format(page)) # vgchartz website is really weird so we have to search for # tags with game urls @@ -134,8 +169,8 @@ def download_data(start_page, end_page, include_genre): for tag in game_tags: - current_gname = " ".join(tag.string.split()) # add game name to list - print(f"{game_rank} Fetch data for game {current_gname}") + current_gname = " ".join(tag.string.split()) # add game name to list + logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_gname)) # Get different attributes # traverse up the DOM tree @@ -155,7 +190,7 @@ def download_data(start_page, end_page, include_genre): add_current_game_data(current_critic_score, current_developer, current_gname, current_platform, current_publisher, current_rank, current_release_year, current_sales_gl, - current_sales_jp, current_sales_na,current_sales_ot, current_sales_pal, + current_sales_jp, current_sales_na, current_sales_ot, current_sales_pal, current_user_score) game_url = tag.attrs['href'] @@ -164,15 +199,20 @@ def download_data(start_page, end_page, include_genre): game_genre = get_genre(game_url) genre.append(game_genre) - game_rank += 1 + downloaded_games += 1 - print("Number of downloaded resources: {}".format(game_rank)) + logging.info("Number of downloaded resources: {}".format(downloaded_games)) + logging.info("download_data <<<") def save_games_data(filename, separator, enc): """ Save all the downloaded data into the specified file + :param filename + :param separator + :param enc """ + logging.info("save_games_data >>>") columns = { 'Rank': rank, 'Name': game_name, @@ -190,17 +230,16 @@ def save_games_data(filename, separator, enc): 'Global_Sales': sales_gl } df = pd.DataFrame(columns) - print(df.columns) + logging.debug("Dataframe column name: {}".format(df.columns)) df = df[[ 
'Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'Developer', 'Critic_Score', 'User_Score', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] df.to_csv(filename, sep=separator, encoding=enc, index=False) + logging.info("save_games_data <<<") if __name__ == "__main__": - - rank = [] game_name = [] platform = [] @@ -216,13 +255,22 @@ def save_games_data(filename, separator, enc): with open("resources.json") as file: properties = json.load(file) - logging.basicConfig(filename=properties["application_log_filename"], - filemode='w', - format='%(asctime)s|%(name)s|%(levelname)s| %(message)s', - datefmt='%d-%m-%y %H:%M:%S') - logging.warning('Application started') + logging.root.handlers = [] + logging.basicConfig(format='%(asctime)s|%(name)s|%(levelname)s| %(message)s', + level=logging.DEBUG, + filename=properties["application_log_filename"]) + + # set up logging to console + console = logging.StreamHandler() + console.setLevel(logging.DEBUG) + # set a format which is simpler for console use + formatter = logging.Formatter(fmt='%(asctime)s|%(name)s|%(levelname)s| %(message)s', + datefmt="%d-%m-%Y %H:%M:%S") + console.setFormatter(formatter) + logging.getLogger("").addHandler(console) + + logging.info('Application started') base_url = properties['base_page_url'] remaining_url = properties['remaining_url'] download_data(properties['start_page'], properties['end_page'], properties['include_genre']) save_games_data(properties['output_filename'], properties['separator'], properties['encoding']) - From 8a3e28fb57713f6314539824d465960ed673adc2 Mon Sep 17 00:00:00 2001 From: Khaled Hechmi Date: Wed, 4 Mar 2020 15:40:28 +0100 Subject: [PATCH 12/35] Upgraded to HTTPS --- resources.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources.json b/resources.json index e3559b7..d6d4167 100644 --- a/resources.json +++ b/resources.json @@ -11,7 +11,7 @@ "maximum_major_version": 56, "minimum_minor_version": 1, "maximum_minor_version": 
10, - "base_page_url": "http://www.vgchartz.com/gamedb/?page=", + "base_page_url": "https://www.vgchartz.com/gamedb/?page=", "remaining_url": "&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL", "application_log_filename": "app.log" } From b31b21d7638ee5e44c6f92985210334d98fba5e1 Mon Sep 17 00:00:00 2001 From: Khaled Hechmi Date: Wed, 4 Mar 2020 15:50:33 +0100 Subject: [PATCH 13/35] Use https in lambda for skipping first 10 elements --- vgchartzfull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 8d82715..425238b 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -161,7 +161,7 @@ def download_data(start_page, end_page, include_genre): # vgchartz website is really weird so we have to search for # tags with game urls game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), + lambda x: x.attrs['href'].startswith('https://www.vgchartz.com/game/'), # discard the first 10 elements because those # links are in the navigation bar soup.find_all("a") From 1c86411d705485e8f9a71d9de35210fb88d3bb86 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 11:18:09 +0200 Subject: [PATCH 14/35] Update README.md --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e557119..27dc238 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,19 @@ -vgchartzfull is a python script based on BeautifulSoup. -It creates a dataset based on data from -http://www.vgchartz.com/gamedb/ +# vgchartzfull + + +vgchartzfull.py is a python@3 script based on BeautifulSoup. 
It creates a dataset based on data from http://www.vgchartz.com/gamedb/ The dataset is saved as vgsales.csv. -You will need to have BeautifulSoup added. +You will need to have some depencies compiled at **requirements.txt**. + It can be installed by pip. -sudo pip install BeautifulSoup +```bash + + $> pip install -r requirements.txt + +``` -Thanks to Chris Albon. +Thanks to Chris Albon & Gregor UT http://chrisalbon.com/python/beautiful_soup_scrape_table.html From fb81a1358ad2681a5e7ee7e321ec15bd1a5547a0 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 11:25:40 +0200 Subject: [PATCH 15/35] Update README.md --- README.md | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 27dc238..666d4a1 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,13 @@ -# vgchartzfull - +# vgchartzfull - A crawler to download and analyze Video Game Sales data from more than 16,500 games. vgchartzfull.py is a python@3 script based on BeautifulSoup. It creates a dataset based on data from http://www.vgchartz.com/gamedb/ +## Output + The dataset is saved as vgsales.csv. +## Install & execution + You will need to have some depencies compiled at **requirements.txt**. It can be installed by pip. @@ -15,5 +18,30 @@ It can be installed by pip. ``` +## Dictionary + +| Field | Description | +|-------|--------------------------| +| Rank | Ranking of overall sales | +| Name | The games name | +| Platform | Platform of the games release (i.e. PC,PS4, etc.) | +| Year | Year of the game's release | +| Genre | Genre of the game | +| Publisher | Publisher of the game | +| NA_Sales | Sales in North America (in millions) | +| EU_Sales | Sales in Europe (in millions) | +| JP_Sales | Sales in Japan (in millions) | +| Other_Sales | Sales in the rest of the world (in millions) | +| Global_Sales | Total worldwide sales. 
| + + +## Links + +* [vgchartz.com](https://www.vgchartz.com) +* [Original Crawler](https://github.com/GregorUT/vgchartzScrape) +* [Kaggle Dataset](https://www.kaggle.com/gregorut/videogamesales) + +## Greetings + Thanks to Chris Albon & Gregor UT http://chrisalbon.com/python/beautiful_soup_scrape_table.html From dd64927d1cf9ef497ec505412177dfb145c2a9d5 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 11:26:11 +0200 Subject: [PATCH 16/35] Create requirements.txt --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f546999 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +beautifulsoup4==4.8.2 +numpy==1.16.4 +pandas==0.25.0 From c732358cc03669709eca0e882ec8fc7af6d8715b Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 11:26:37 +0200 Subject: [PATCH 17/35] Update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index be2baa1..9b00b42 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +.idea +.vcs + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 35cff1d295f8aba89b168e54202e1e0d7bd06efb Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 11:28:31 +0200 Subject: [PATCH 18/35] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 666d4a1..8e15845 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,12 @@ It can be installed by pip. 
```bash + # Install dependencies $> pip install -r requirements.txt + + # Run + $> python vgchartzfull.py + ``` From 0f5aec361aa87118329034a54a17e80de4b5a3a2 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 11:29:16 +0200 Subject: [PATCH 19/35] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8e15845..bbec132 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ -# vgchartzfull - A crawler to download and analyze Video Game Sales data from more than 16,500 games. +# vgchartzfull - A crawler to download and analyze Video Game Sales -vgchartzfull.py is a python@3 script based on BeautifulSoup. It creates a dataset based on data from http://www.vgchartz.com/gamedb/ +vgchartzfull.py is a python@3 script based on BeautifulSoup. + +It creates a dataset with data from more than 16,500 games. based on data from http://www.vgchartz.com/gamedb/ ## Output From d8f2173f23b32c83795723a6f209d602eccd2440 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 11:29:51 +0200 Subject: [PATCH 20/35] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index bbec132..0854b43 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# vgchartzfull - A crawler to download and analyze Video Game Sales +# vgchartzfull - A crawler to download data from Global Videogame Sales vgchartzfull.py is a python@3 script based on BeautifulSoup. 
From 6685ad393a2f41d1d5207e914976f407b07af2a3 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 12:36:23 +0200 Subject: [PATCH 21/35] Update .gitignore Avoiding to upload csv data partial files --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 9b00b42..fffebb9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,8 @@ .idea .vcs +*.csv + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 33a53d830b04ca62500ec1293b75c7f7315c8594 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 13:56:40 +0200 Subject: [PATCH 22/35] Refactor in functions --- vgchartzfull.py | 344 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 235 insertions(+), 109 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index b1d75a4..bb98ac1 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -2,11 +2,16 @@ import urllib import pandas as pd import numpy as np +import datetime +import time -pages = 19 +# Environment & buffers rec_count = 0 +page_size = 10 +pages = 4 # 57,453 / 1000 = 58 (At the time of this writing) + rank = [] -gname = [] +game_name = [] platform = [] year = [] genre = [] @@ -20,111 +25,232 @@ sales_ot = [] sales_gl = [] -urlhead = 'http://www.vgchartz.com/gamedb/?page=' -urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both' -urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0' -urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' -urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' -urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' - -for page in range(1, pages): - surl = urlhead + str(page) + urltail - r = urllib.request.urlopen(surl).read() - soup = BeautifulSoup(r) - print(f"Page: {page}") - - # vgchartz website is really weird so we have to search for - # 
tags with game urls - game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), - # discard the first 10 elements because those - # links are in the navigation bar - soup.find_all("a") - ))[10:] - - for tag in game_tags: - - # add name to list - gname.append(" ".join(tag.string.split())) - print(f"{rec_count + 1} Fetch data for game {gname[-1]}") - - # get different attributes - # traverse up the DOM tree - data = tag.parent.parent.find_all("td") - rank.append(np.int32(data[0].string)) - platform.append(data[3].find('img').attrs['alt']) - publisher.append(data[4].string) - developer.append(data[5].string) - critic_score.append( - float(data[6].string) if - not data[6].string.startswith("N/A") else np.nan) - user_score.append( - float(data[7].string) if - not data[7].string.startswith("N/A") else np.nan) - sales_na.append( - float(data[9].string[:-1]) if - not data[9].string.startswith("N/A") else np.nan) - sales_pal.append( - float(data[10].string[:-1]) if - not data[10].string.startswith("N/A") else np.nan) - sales_jp.append( - float(data[11].string[:-1]) if - not data[11].string.startswith("N/A") else np.nan) - sales_ot.append( - float(data[12].string[:-1]) if - not data[12].string.startswith("N/A") else np.nan) - sales_gl.append( - float(data[8].string[:-1]) if - not data[8].string.startswith("N/A") else np.nan) - release_year = data[13].string.split()[-1] - # different format for year - if release_year.startswith('N/A'): - year.append('N/A') +def main (): + """ + Main Crawler Loop + + :return: a csv file :) + """ + + for page in range(1, pages): + + try: + surl = generate_uri(page_number=str(page), page_size=page_size) + r = urllib.request.urlopen(surl).read() + soup = BeautifulSoup(r, features="html.parser") + print(f"Crawling page: {page} of {pages}") + + # We locate the game from tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), + # discard the first 
10 elements because those links are in the navigation bar + soup.find_all("a") + ))[10:] + + # Loop for each line received + for tag in game_tags: + parse_game(tag=tag) + + except urllib.error.HTTPError as e: + print("Unexpected error:", sys.exc_info()[0]) + print(e.code) + print(e.read()) + + time.sleep(15) + + finally: + # Crawlers: The Friend Nobody Likes + time.sleep(60) + + # Generate and export to CSV + timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M") + df.to_csv(f"vgsales-{timestamp}.csv", sep=",", encoding='utf-8', index=False) + + +def generate_uri(*, page_number, page_size): + """ + + Generate the uri from page number + + :param page_number: + :return: + """ + + urlhead = 'http://www.vgchartz.com/gamedb/?page=' + # page_number... <= here comes the param received + urltail = f'&results={page_size}' + urltail += '&order=Sales' + urltail += '®ion=All' + urltail += '&boxart=Both' + urltail += '&banner=Both' + urltail += '&ownership=Both' + urltail += '&keyword=' + urltail += '&console=' + urltail += '&developer=' + urltail += '&publisher=' + urltail += '&goty_year=' + urltail += '&genre=' + urltail += '&showmultiplat=No' + urltail += '&showtotalsales=0' + urltail += '&showtotalsales=1' + urltail += '&showpublisher=0' + urltail += '&showpublisher=1' + urltail += '&showvgchartzscore=0' + urltail += '&showvgchartzscore=1' + urltail += '&shownasales=0' + urltail += '&shownasales=1' + urltail += '&showdeveloper=0' + urltail += '&showdeveloper=1' + urltail += '&showcriticscore=0' + urltail += '&showcriticscore=1' + urltail += '&showpalsales=0' + urltail += '&showpalsales=1' + urltail += '&showreleasedate=0' + urltail += '&showreleasedate=1' + urltail += '&showuserscore=0' + urltail += '&showuserscore=1' + urltail += '&showjapansales=0' + urltail += '&showjapansales=1' + urltail += '&showlastupdate=0' + urltail += '&showlastupdate=1' + urltail += '&showothersales=0' + urltail += '&showothersales=1' + urltail += '&showshipped=0' + urltail += 
'&showshipped=1' + + return urlhead + str(page_number) + urltail + +def parse_game(*, tag): + """ + Parse a game and navigate to its particular url to grab its data + + :param tag: + :return: + """ + + # Add name to list + game_name.append(" ".join(tag.string.split())) + print(f"{rec_count + 1} Fetch data for game {game_name[-1]}") + + # Get different attributes traverse up the DOM tree + data = tag.parent.parent.find_all("td") + rank.append(np.int32(data[0].string)) + platform.append(data[3].find('img').attrs['alt']) + publisher.append(data[4].string) + developer.append(data[5].string) + + critic_score.append(float(data[6].string) if + not data[6].string.startswith("N/A") else np.nan) + + user_score.append( + float(data[7].string) if + not data[7].string.startswith("N/A") else np.nan) + + sales_na.append( + float(data[9].string[:-1]) if + not data[9].string.startswith("N/A") else np.nan) + + sales_pal.append( + float(data[10].string[:-1]) if + not data[10].string.startswith("N/A") else np.nan) + + sales_jp.append( + float(data[11].string[:-1]) if + not data[11].string.startswith("N/A") else np.nan) + + sales_ot.append( + float(data[12].string[:-1]) if + not data[12].string.startswith("N/A") else np.nan) + + sales_gl.append( + float(data[8].string[:-1]) if + not data[8].string.startswith("N/A") else np.nan) + + release_year = data[13].string.split()[-1] + + # different format for year i.e. 
2K year effect XD + if release_year.startswith('N/A'): + year.append('N/A') + else: + if int(release_year) >= 80: + year_to_add = np.int32("19" + release_year) else: - if int(release_year) >= 80: - year_to_add = np.int32("19" + release_year) - else: - year_to_add = np.int32("20" + release_year) - year.append(year_to_add) - - # go to every individual website to get genre info - url_to_game = tag.attrs['href'] - site_raw = urllib.request.urlopen(url_to_game).read() - sub_soup = BeautifulSoup(site_raw, "html.parser") - # again, the info box is inconsistent among games so we - # have to find all the h2 and traverse from that to the genre name - h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2') - # make a temporary tag here to search for the one that contains - # the word "Genre" - temp_tag = element.Tag - for h2 in h2s: - if h2.string == 'Genre': - temp_tag = h2 - genre.append(temp_tag.next_sibling.string) - - rec_count += 1 - -columns = { - 'Rank': rank, - 'Name': gname, - 'Platform': platform, - 'Year': year, - 'Genre': genre, - 'Critic_Score': critic_score, - 'User_Score': user_score, - 'Publisher': publisher, - 'Developer': developer, - 'NA_Sales': sales_na, - 'PAL_Sales': sales_pal, - 'JP_Sales': sales_jp, - 'Other_Sales': sales_ot, - 'Global_Sales': sales_gl -} -print(rec_count) -df = pd.DataFrame(columns) -print(df.columns) -df = df[[ - 'Rank', 'Name', 'Platform', 'Year', 'Genre', - 'Publisher', 'Developer', 'Critic_Score', 'User_Score', - 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] -df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False) + year_to_add = np.int32("20" + release_year) + year.append(year_to_add) + + # go to every individual website to get genre info + url_to_game = tag.attrs['href'] + site_raw = urllib.request.urlopen(url_to_game).read() + sub_soup = BeautifulSoup(site_raw, "html.parser") + + # again, the info box is inconsistent among games so we + # have to find all the h2 and traverse from 
that to the genre name + h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2') + + # make a temporary tag here to search for the one that contains + # the word "Genre" + temp_tag = element.Tag + + for h2 in h2s: + if h2.string == 'Genre': + temp_tag = h2 + genre.append(temp_tag.next_sibling.string) + + rec_count += 1 + + # Crawlers: The Friend Nobody Likes + time.sleep(10) + + +def assemble_response(*, rank, game_name, platform, year, genre, critic_score, user_score, publisher, developer, sales_na, sales_pal, sales_jp, sales_ot, sales_gl): + """ + + Assemble from buffers to a Panda DataFrame + + :param rank: + :param game_name: + :param platform: + :param year: + :param genre: + :param critic_score: + :param user_score: + :param publisher: + :param developer: + :param sales_na: + :param sales_pal: + :param sales_jp: + :param sales_ot: + :param sales_gl: + :return: + """ + + # Assembler + columns = { + 'Rank': rank, + 'Name': game_name, + 'Platform': platform, + 'Year': year, + 'Genre': genre, + 'Critic_Score': critic_score, + 'User_Score': user_score, + 'Publisher': publisher, + 'Developer': developer, + 'NA_Sales': sales_na, + 'PAL_Sales': sales_pal, + 'JP_Sales': sales_jp, + 'Other_Sales': sales_ot, + 'Global_Sales': sales_gl + } + + # Final Report + print(rec_count) + df = pd.DataFrame(columns) + print(df.columns) + df = df[[ + 'Rank', 'Name', 'Platform', 'Year', 'Genre', + 'Publisher', 'Developer', 'Critic_Score', 'User_Score', + 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] + + return df + +main() From 414602cf9e75ad0922a8cdd4e4de3c958c739c6f Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 17:32:19 +0200 Subject: [PATCH 23/35] Add an exception to manage the main loop wrapping the functions call --- vgchartzfull.py | 84 +++++++++---------------------------------------- 1 file changed, 15 insertions(+), 69 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 0fa096e..b59cab9 
100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -1,12 +1,12 @@ from bs4 import BeautifulSoup, element -import urllib +from random import randint, choice import pandas as pd import numpy as np -from random import randint, choice -import datetime +import logging +import urllib +import sys import time import json -import logging def create_random_header(): """ @@ -31,59 +31,6 @@ def create_random_header(): logging.info("create_random_header <<<") return header -def generate_uri(*, page_number, page_size): - """ - - Generate the uri from page number - - :param page_number: - :return: - """ - - urlhead = 'http://www.vgchartz.com/gamedb/?page=' - # page_number... <= here comes the param received - urltail = f'&results={page_size}' - urltail += '&order=Sales' - urltail += '®ion=All' - urltail += '&boxart=Both' - urltail += '&banner=Both' - urltail += '&ownership=Both' - urltail += '&keyword=' - urltail += '&console=' - urltail += '&developer=' - urltail += '&publisher=' - urltail += '&goty_year=' - urltail += '&genre=' - urltail += '&showmultiplat=No' - urltail += '&showtotalsales=0' - urltail += '&showtotalsales=1' - urltail += '&showpublisher=0' - urltail += '&showpublisher=1' - urltail += '&showvgchartzscore=0' - urltail += '&showvgchartzscore=1' - urltail += '&shownasales=0' - urltail += '&shownasales=1' - urltail += '&showdeveloper=0' - urltail += '&showdeveloper=1' - urltail += '&showcriticscore=0' - urltail += '&showcriticscore=1' - urltail += '&showpalsales=0' - urltail += '&showpalsales=1' - urltail += '&showreleasedate=0' - urltail += '&showreleasedate=1' - urltail += '&showuserscore=0' - urltail += '&showuserscore=1' - urltail += '&showjapansales=0' - urltail += '&showjapansales=1' - urltail += '&showlastupdate=0' - urltail += '&showlastupdate=1' - urltail += '&showothersales=0' - urltail += '&showothersales=1' - urltail += '&showshipped=0' - urltail += '&showshipped=1' - - return urlhead + str(page_number) + urltail - def get_page(url): """ Perform a GET 
request to the given URL and return results. @@ -210,8 +157,7 @@ def download_data(start_page, end_page, include_genre): soup = BeautifulSoup(current_page) logging.info("Downloaded page {}".format(page)) - # vgchartz website is really weird so we have to search for - # tags with game urls + # We locate the game through search tags with game urls in the main table game_tags = list(filter( lambda x: x.attrs['href'].startswith('https://www.vgchartz.com/game/'), # discard the first 10 elements because those @@ -224,8 +170,7 @@ def download_data(start_page, end_page, include_genre): current_gname = " ".join(tag.string.split()) # add game name to list logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_gname)) - # Get different attributes - # traverse up the DOM tree + # Get different attributes traverse up the DOM tree data = tag.parent.parent.find_all("td") current_rank = np.int32(data[0].string) current_platform = data[3].find('img').attrs['alt'] @@ -290,7 +235,6 @@ def save_games_data(filename, separator, enc): df.to_csv(filename, sep=separator, encoding=enc, index=False) logging.info("save_games_data <<<") - if __name__ == "__main__": rank = [] game_name = [] @@ -315,16 +259,18 @@ def save_games_data(filename, separator, enc): # set up logging to console console = logging.StreamHandler() console.setLevel(logging.DEBUG) + # set a format which is simpler for console use formatter = logging.Formatter(fmt='%(asctime)s|%(name)s|%(levelname)s| %(message)s', datefmt="%d-%m-%Y %H:%M:%S") console.setFormatter(formatter) logging.getLogger("").addHandler(console) - logging.info('Application started') - base_url = properties['base_page_url'] - remaining_url = properties['remaining_url'] - download_data(properties['start_page'], properties['end_page'], properties['include_genre']) - save_games_data(properties['output_filename'], properties['separator'], properties['encoding']) - - + try: + logging.info('Application started') + base_url = 
properties['base_page_url'] + remaining_url = properties['remaining_url'] + download_data(properties['start_page'], properties['end_page'], properties['include_genre']) + save_games_data(properties['output_filename'], properties['separator'], properties['encoding']) + except: + print("Unexpected error:", sys.exc_info()[0]) From 43dbe5edf211d203a90a3d11c7be2b1b2fc3730c Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 18:23:50 +0200 Subject: [PATCH 24/35] Explode query parameters --- resources.json | 37 ++++++++++++++++++++++++++++++++----- vgchartzfull.py | 18 +++++++++++++++++- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/resources.json b/resources.json index d6d4167..a45ded9 100644 --- a/resources.json +++ b/resources.json @@ -1,17 +1,44 @@ { + "include_genre": false, + "application_log_filename": "app.log", "output_filename": "vgsales.csv", "separator": ",", "encoding": "utf-8", "start_page": 1, "end_page": 2, - "include_genre": false, + "base_page_url": "https://www.vgchartz.com/gamedb/?page=", + "query_parameters": { + "results": 1000, + "console": null, + "region": "All", + "developer": null, + "publisher": null, + "genre": null, + "boxart": "Both", + "ownership": "Both", + "order": "Sales", + "showtotalsales": 0, + "showtotalsales": 1, + "showpublisher": 0, + "showpublisher": 1, + "showvgchartzscore": 0, + "shownasales": 1, + "showdeveloper": 1, + "showcriticscore": 1, + "showpalsales": 0, + "showpalsales": 1, + "showreleasedate": 1, + "showuserscore": 1, + "showjapansales": 1, + "showlastupdate": 0, + "showothersales": 1, + "showgenre": 1, + "sort": "GL" + }, "minimum_sleep_time": 6, "maximum_sleep_time": 15, "minimum_major_version": 1, "maximum_major_version": 56, "minimum_minor_version": 1, - "maximum_minor_version": 10, - "base_page_url": "https://www.vgchartz.com/gamedb/?page=", - "remaining_url": 
"&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL", - "application_log_filename": "app.log" + "maximum_minor_version": 10 } diff --git a/vgchartzfull.py b/vgchartzfull.py index b59cab9..4b805d5 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -31,6 +31,20 @@ def create_random_header(): logging.info("create_random_header <<<") return header +def generate_remaining_url(*, query_parameters): + """ + Generate an url with a list of videogames from the query params configured at resources.json + :return: Url with page number + """ + logging.info("generate_remaining_url >>>") + reply='' + for param in query_parameters: + value=query_parameters.get(param, None) + reply += f"&{param}={value}" if value is not None else f"&{param}=" + logging.debug(f"Url Generated: {base_url}?{reply}") + logging.info("generate_remaining_url <<<") + return reply + def get_page(url): """ Perform a GET request to the given URL and return results. 
@@ -269,8 +283,10 @@ def save_games_data(filename, separator, enc): try: logging.info('Application started') base_url = properties['base_page_url'] - remaining_url = properties['remaining_url'] + remaining_url=generate_remaining_url(query_parameters=properties['query_parameters']) download_data(properties['start_page'], properties['end_page'], properties['include_genre']) save_games_data(properties['output_filename'], properties['separator'], properties['encoding']) + except: print("Unexpected error:", sys.exc_info()[0]) + pass From 779fcad13a1684f1ec617c84ee63dfda23cd6d8b Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 18:30:39 +0200 Subject: [PATCH 25/35] I love functions with named parameters sorry XD --- vgchartzfull.py | 53 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 4b805d5..923002f 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -45,7 +45,7 @@ def generate_remaining_url(*, query_parameters): logging.info("generate_remaining_url <<<") return reply -def get_page(url): +def get_page(*, url): """ Perform a GET request to the given URL and return results. 
Add a wait logic that, combined with random header, will help avoiding @@ -63,7 +63,7 @@ def get_page(url): return result -def get_genre(game_url): +def get_genre(*, game_url): """ Return the game genre retrieved from the given url :param game_url: @@ -71,7 +71,7 @@ def get_genre(game_url): """ logging.info("get_genre >>>") logging.debug("Page to download: {}".format(game_url)) - site_raw = get_page(game_url) + site_raw = get_page(url=game_url) sub_soup = BeautifulSoup(site_raw, "html.parser") # again, the info box is inconsistent among games so we # have to find all the h2 and traverse from that to the genre name @@ -89,7 +89,7 @@ def get_genre(game_url): return genre_value -def get_release_year(raw_year): +def get_release_year(*, raw_year): """ Return the release year of the given game in a 4 digit format or N/A. :param raw_year: @@ -107,7 +107,8 @@ def get_release_year(raw_year): return final_year -def add_current_game_data(current_critic_score, +def add_current_game_data(*, + current_critic_score, current_developer, current_game_name, current_platform, @@ -155,7 +156,7 @@ def add_current_game_data(current_critic_score, logging.info("add_current_game_data <<<") -def download_data(start_page, end_page, include_genre): +def download_data(*, start_page, end_page, include_genre): """ Download games data from vgchartz: only data whose pages are in the range (start_page, end_page) will be downloaded :param start_page: @@ -167,7 +168,7 @@ def download_data(start_page, end_page, include_genre): downloaded_games = 0 # Results are decreasingly ordered according to Shipped units for page in range(start_page, end_page + 1): page_url = "{}{}{}".format(base_url, str(page), remaining_url) - current_page = get_page(page_url) + current_page = get_page(url=page_url) soup = BeautifulSoup(current_page) logging.info("Downloaded page {}".format(page)) @@ -197,17 +198,27 @@ def download_data(start_page, end_page, include_genre): current_sales_jp = float(data[11].string[:-1]) if not 
data[11].string.startswith("N/A") else np.nan current_sales_ot = float(data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan current_sales_gl = float(data[8].string[:-1]) if not data[8].string.startswith("N/A") else np.nan - current_release_year = get_release_year(data[13].string.split()[-1]) - - add_current_game_data(current_critic_score, current_developer, current_gname, current_platform, - current_publisher, current_rank, current_release_year, current_sales_gl, - current_sales_jp, current_sales_na, current_sales_ot, current_sales_pal, - current_user_score) + current_release_year = get_release_year(raw_year=data[13].string.split()[-1]) + + add_current_game_data( + current_critic_score=current_critic_score, + current_developer=current_developer, + current_game_name=current_gname, + current_platform=current_platform, + current_publisher=current_publisher, + current_rank=current_rank, + current_release_year=current_release_year, + current_sales_gl=current_sales_gl, + current_sales_jp=current_sales_jp, + current_sales_na=current_sales_na, + current_sales_ot=current_sales_ot, + current_sales_pal=current_sales_pal, + current_user_score=current_user_score) game_url = tag.attrs['href'] game_genre = "" if include_genre: - game_genre = get_genre(game_url) + game_genre = get_genre(game_url=game_url) genre.append(game_genre) downloaded_games += 1 @@ -216,7 +227,7 @@ def download_data(start_page, end_page, include_genre): logging.info("download_data <<<") -def save_games_data(filename, separator, enc): +def save_games_data(*, filename, separator, enc): """ Save all the downloaded data into the specified file :param filename @@ -284,8 +295,16 @@ def save_games_data(filename, separator, enc): logging.info('Application started') base_url = properties['base_page_url'] remaining_url=generate_remaining_url(query_parameters=properties['query_parameters']) - download_data(properties['start_page'], properties['end_page'], properties['include_genre']) - 
save_games_data(properties['output_filename'], properties['separator'], properties['encoding']) + + download_data( + start_page=properties['start_page'], + end_page=properties['end_page'], + include_genre=properties['include_genre']) + + save_games_data( + filename=properties['output_filename'], + separator=properties['separator'], + enc=properties['encoding']) except: print("Unexpected error:", sys.exc_info()[0]) From 826476d65a28fa2b28a0614697b5ec0af86bd63f Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Mon, 30 Mar 2020 18:41:20 +0200 Subject: [PATCH 26/35] Fix padding spaces due to html parsing --- vgchartzfull.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index 923002f..b4791a3 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -143,8 +143,8 @@ def add_current_game_data(*, game_name.append(current_game_name) rank.append(current_rank) platform.append(current_platform) - publisher.append(current_publisher) - developer.append(current_developer) + publisher.append(current_publisher.strip()) + developer.append(current_developer.strip()) critic_score.append(current_critic_score) user_score.append(current_user_score) sales_na.append(current_sales_na) @@ -169,7 +169,7 @@ def download_data(*, start_page, end_page, include_genre): for page in range(start_page, end_page + 1): page_url = "{}{}{}".format(base_url, str(page), remaining_url) current_page = get_page(url=page_url) - soup = BeautifulSoup(current_page) + soup = BeautifulSoup(current_page, features="html.parser") logging.info("Downloaded page {}".format(page)) # We locate the game through search tags with game urls in the main table From a8bf65696e3128e197e1cd52ccb5167591e6ee23 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 00:34:00 +0200 Subject: [PATCH 27/35] Updating doc Signed-off-by: Manuel Eusebio de Paz Carmona --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff 
--git a/README.md b/README.md index 0854b43..618d6b0 100644 --- a/README.md +++ b/README.md @@ -50,5 +50,4 @@ It can be installed by pip. ## Greetings -Thanks to Chris Albon & Gregor UT -http://chrisalbon.com/python/beautiful_soup_scrape_table.html +Thanks to [Chris Albon](http://chrisalbon.com/python/beautiful_soup_scrape_table.html) From 6aa2b6c312c0004ca62e2ac85fd12fbfca754766 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 09:42:09 +0200 Subject: [PATCH 28/35] Update README.md Update total count --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 618d6b0..e2a2262 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ vgchartzfull.py is a python@3 script based on BeautifulSoup. -It creates a dataset with data from more than 16,500 games. based on data from http://www.vgchartz.com/gamedb/ +It creates a dataset with data from more than 57,000 games. based on data from http://www.vgchartz.com/gamedb/ ## Output From 6a8c2a2d2d317c529d227bb16b8d28d462473554 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 14:03:31 +0200 Subject: [PATCH 29/35] Folder reorganize and bump dependencies --- resources.json => cfg/resources.json | 8 ++++---- dataset/.gitkeep | 1 + log/.gitkeep | 1 + requirements.txt | 14 ++++++-------- vgchartzfull.py => vgchartz-full-crawler.py | 18 ++++++++++++------ 5 files changed, 24 insertions(+), 18 deletions(-) rename resources.json => cfg/resources.json (89%) create mode 100644 dataset/.gitkeep create mode 100644 log/.gitkeep rename vgchartzfull.py => vgchartz-full-crawler.py (96%) diff --git a/resources.json b/cfg/resources.json similarity index 89% rename from resources.json rename to cfg/resources.json index a45ded9..2fa636b 100644 --- a/resources.json +++ b/cfg/resources.json @@ -1,14 +1,14 @@ { - "include_genre": false, - "application_log_filename": "app.log", - "output_filename": "vgsales.csv", + 
"application_log_filename": "log/app.log", + "output_filename": "dataset/vgsales.csv", "separator": ",", "encoding": "utf-8", "start_page": 1, "end_page": 2, + "include_genre": false, "base_page_url": "https://www.vgchartz.com/gamedb/?page=", "query_parameters": { - "results": 1000, + "results": 10, "console": null, "region": "All", "developer": null, diff --git a/dataset/.gitkeep b/dataset/.gitkeep new file mode 100644 index 0000000..fe91d07 --- /dev/null +++ b/dataset/.gitkeep @@ -0,0 +1 @@ +Git doesn't like empty folders \ No newline at end of file diff --git a/log/.gitkeep b/log/.gitkeep new file mode 100644 index 0000000..fe91d07 --- /dev/null +++ b/log/.gitkeep @@ -0,0 +1 @@ +Git doesn't like empty folders \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e59be52..3311a7a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,8 @@ beautifulsoup4==4.8.2 bs4==0.0.1 -numpy==1.16.3 -numpy==1.16.4 -pandas==0.24.2 -pandas==0.25.0 -python-dateutil==2.8.0 -pytz==2019.1 -six==1.12.0 -soupsieve==1.9.1 \ No newline at end of file +numpy==1.18.2 +pandas==1.0.3 +python-dateutil==2.8.1 +pytz==2019.3 +six==1.14.0 +soupsieve==2.0 \ No newline at end of file diff --git a/vgchartzfull.py b/vgchartz-full-crawler.py similarity index 96% rename from vgchartzfull.py rename to vgchartz-full-crawler.py index b4791a3..4f4a894 100644 --- a/vgchartzfull.py +++ b/vgchartz-full-crawler.py @@ -1,9 +1,10 @@ from bs4 import BeautifulSoup, element from random import randint, choice +import urllib +import urllib.request import pandas as pd import numpy as np import logging -import urllib import sys import time import json @@ -41,7 +42,7 @@ def generate_remaining_url(*, query_parameters): for param in query_parameters: value=query_parameters.get(param, None) reply += f"&{param}={value}" if value is not None else f"&{param}=" - logging.debug(f"Url Generated: {base_url}?{reply}") + logging.debug(f"Url Generated: {base_url}N{reply}") 
logging.info("generate_remaining_url <<<") return reply @@ -66,6 +67,7 @@ def get_page(*, url): def get_genre(*, game_url): """ Return the game genre retrieved from the given url + (It involves another http request) :param game_url: :return: Genre of the input game """ @@ -73,12 +75,14 @@ def get_genre(*, game_url): logging.debug("Page to download: {}".format(game_url)) site_raw = get_page(url=game_url) sub_soup = BeautifulSoup(site_raw, "html.parser") - # again, the info box is inconsistent among games so we + + # Eventually the info box is inconsistent among games so we # have to find all the h2 and traverse from that to the genre name + # and make a temporary tag here to search + # for the one that contains the word "Genre" h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2') - # make a temporary tag here to search for the one that contains - # the word "Genre" temp_tag = element.Tag + for h2 in h2s: if h2.string == 'Genre': temp_tag = h2 @@ -187,6 +191,7 @@ def download_data(*, start_page, end_page, include_genre): # Get different attributes traverse up the DOM tree data = tag.parent.parent.find_all("td") + #print(data) current_rank = np.int32(data[0].string) current_platform = data[3].find('img').attrs['alt'] current_publisher = data[4].string @@ -273,7 +278,7 @@ def save_games_data(*, filename, separator, enc): properties = None - with open("resources.json") as file: + with open("cfg/resources.json") as file: properties = json.load(file) logging.root.handlers = [] @@ -307,5 +312,6 @@ def save_games_data(*, filename, separator, enc): enc=properties['encoding']) except: + print("Global exception") print("Unexpected error:", sys.exc_info()[0]) pass From c4f9ff812606134214f7115f0e92fea38b36d131 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 14:03:49 +0200 Subject: [PATCH 30/35] script to easy run --- run.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100755 run.sh diff --git a/run.sh b/run.sh new 
file mode 100755 index 0000000..df4b431 --- /dev/null +++ b/run.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +python --version >/dev/null 2>&1 || { echo >&2 "I require python@3 utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; } +pip --version >/dev/null 2>&1 || { echo >&2 "I require pip utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; } + +clear + +pip install -r requirements.txt +python vgchartz-full-crawler.py From 0e48d8d8bf8821b926853a7620c7700dfc1c60ea Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 18:19:06 +0200 Subject: [PATCH 31/35] Update documentation and add some TODOs --- README.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e2a2262..2d1dc0e 100644 --- a/README.md +++ b/README.md @@ -27,20 +27,34 @@ It can be installed by pip. ## Dictionary +The dataset is composed of the following fields, and the data is collected with this [methodology](https://www.vgchartz.com/methodology.php). + | Field | Description | |-------|--------------------------| | Rank | Ranking of overall sales | | Name | The games name | -| Platform | Platform of the games release (i.e. PC,PS4, etc.) | -| Year | Year of the game's release | | Genre | Genre of the game | +| Platform | Platform of the games release (i.e. PC,PS4, etc.) | +| Developer | Developer of the game | | Publisher | Publisher of the game | +| Vgchartz_Score | Score at VGChartz site | +| Critic_Score | Score given by critics | +| User_Score | Score given by VGChartz site users | +| Total_Shipped | Total worldwide shipments (in millions) | +| Total_Sales | Total worldwide sales (in millions) | | NA_Sales | Sales in North America (in millions) | | EU_Sales | Sales in Europe (in millions) | | JP_Sales | Sales in Japan (in millions) | | Other_Sales | Sales in the rest of the world (in millions) | -| Global_Sales | Total worldwide sales. 
| +| Release_Date | Year of the game's release | +| Last_Update | Last update of this record | + +## TODO +- [ ] Remap the columns according to the selected values at resources.json +- [ ] Add some unit testing +- [ ] Dockerize (w/ alpine-python) to ease use and avoid installations +- [ ] Publish on Docker Hub ## Links From 7f5719e7cefaaef830d6b16688ea22670a319ed2 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 18:19:27 +0200 Subject: [PATCH 32/35] Improve script output --- run.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/run.sh b/run.sh index df4b431..60ece4b 100755 --- a/run.sh +++ b/run.sh @@ -5,5 +5,9 @@ pip --version >/dev/null 2>&1 || { echo >&2 "I require pip utility but it's not clear +echo "\nInstalling deps... " pip install -r requirements.txt + +echo "\nStart crawling... (remember a crawler is the friend nobody likes)" python vgchartz-full-crawler.py + From 214a8538db3b3ea36305ac75c0b9c404bf741e99 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 18:20:29 +0200 Subject: [PATCH 33/35] Refactor data saving to add more data and parse full dates instead only year. 
--- cfg/resources.json | 24 ++--- vgchartz-full-crawler.py | 186 ++++++++++++++++++++++----------------- 2 files changed, 116 insertions(+), 94 deletions(-) diff --git a/cfg/resources.json b/cfg/resources.json index 2fa636b..4e65ceb 100644 --- a/cfg/resources.json +++ b/cfg/resources.json @@ -8,32 +8,32 @@ "include_genre": false, "base_page_url": "https://www.vgchartz.com/gamedb/?page=", "query_parameters": { - "results": 10, - "console": null, + "results": 100, "region": "All", - "developer": null, - "publisher": null, - "genre": null, "boxart": "Both", + "banner": "Both", "ownership": "Both", + "showmultiplat": "No", "order": "Sales", - "showtotalsales": 0, "showtotalsales": 1, - "showpublisher": 0, "showpublisher": 1, - "showvgchartzscore": 0, + "showvgchartzscore": 1, "shownasales": 1, "showdeveloper": 1, "showcriticscore": 1, - "showpalsales": 0, "showpalsales": 1, "showreleasedate": 1, "showuserscore": 1, "showjapansales": 1, - "showlastupdate": 0, + "showlastupdate": 1, "showothersales": 1, - "showgenre": 1, - "sort": "GL" + "showshipped": 1, + "keyword": null, + "console": null, + "developer": null, + "publisher": null, + "goty_year": null, + "genre": null }, "minimum_sleep_time": 6, "maximum_sleep_time": 15, diff --git a/vgchartz-full-crawler.py b/vgchartz-full-crawler.py index 4f4a894..7c6c30c 100644 --- a/vgchartz-full-crawler.py +++ b/vgchartz-full-crawler.py @@ -92,71 +92,79 @@ def get_genre(*, game_url): logging.info("get_genre <<<") return genre_value +def parse_number(*, number_string): + """ + Return string parsed to float with custom format for millions (m) + :param number_string: + :return: a float number right parsed + """ + logging.info("parse_number >>>") + print(number_string) + if "m" in number_string: + reply = number_string.strip('m') + reply = str(float(reply) * 1000000) + else: + reply=number_string -def get_release_year(*, raw_year): + logging.info("parse_number <<<") + return float(reply) if not reply.startswith("N/A") else np.nan + 
+def parse_date(*, date_string): """ - Return the release year of the given game in a 4 digit format or N/A. - :param raw_year: - :return: Game Release year + Return the date received as string onto timestamp or N/A. + :param date_string: + :return: A timestamp in panda date format """ - logging.info("get_release_year >>>") - if raw_year.startswith('N/A'): - final_year = 'N/A' - elif int(raw_year) >= 80: - final_year = np.int32("19" + raw_year) + logging.info("parse_date >>>") + if date_string.startswith('N/A'): + date_formatted = 'N/A' else: - final_year = np.int32("20" + raw_year) - logging.debug("Release Year: {}".format(final_year)) - logging.info("get_release_year <<<") - return final_year + #i.e. date_string = '18th Feb 20' + date_formatted = pd.to_datetime(date_string) + logging.debug("Date parsed: {}".format(date_formatted)) + logging.info("parse_date <<<") + return date_formatted def add_current_game_data(*, - current_critic_score, - current_developer, + current_rank, current_game_name, + current_game_genre, current_platform, current_publisher, - current_rank, - current_release_year, - current_sales_gl, - current_sales_jp, + current_developer, + current_vgchartz_score, + current_critic_score, + current_user_score, + current_total_shipped, + current_total_sales, current_sales_na, - current_sales_ot, current_sales_pal, - current_user_score): + current_sales_jp, + current_sales_ot, + current_release_date, + current_last_update): """ Add all the game data to the related lists - - :param current_critic_score: - :param current_developer: - :param current_game_name: - :param current_platform: - :param current_publisher: - :param current_rank: - :param current_release_year: - :param current_sales_gl: - :param current_sales_jp: - :param current_sales_na: - :param current_sales_ot: - :param current_sales_pal: - :param current_user_score: - :return: """ logging.info("add_current_game_data >>>") game_name.append(current_game_name) rank.append(current_rank) 
platform.append(current_platform) + genre.append(current_game_genre) publisher.append(current_publisher.strip()) developer.append(current_developer.strip()) + vgchartz_score.append(current_vgchartz_score) critic_score.append(current_critic_score) user_score.append(current_user_score) + total_shipped.append(current_total_shipped) + total_sales.append(current_total_sales) sales_na.append(current_sales_na) sales_pal.append(current_sales_pal) sales_jp.append(current_sales_jp) sales_ot.append(current_sales_ot) - sales_gl.append(current_sales_gl) - year.append(current_release_year) + release_date.append(current_release_date) + last_update.append(current_last_update) logging.info("add_current_game_data <<<") @@ -186,45 +194,52 @@ def download_data(*, start_page, end_page, include_genre): for tag in game_tags: - current_gname = " ".join(tag.string.split()) # add game name to list - logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_gname)) - - # Get different attributes traverse up the DOM tree + current_game_name = " ".join(tag.string.split()) data = tag.parent.parent.find_all("td") - #print(data) + + logging.debug("Downloaded game: {}. 
Name: {}".format(downloaded_games + 1, current_game_name)) + + # Get the resto of attributes traverse up the DOM tree looking for the cells in results' table current_rank = np.int32(data[0].string) current_platform = data[3].find('img').attrs['alt'] current_publisher = data[4].string current_developer = data[5].string - current_critic_score = float(data[6].string) if not data[6].string.startswith("N/A") else np.nan - current_user_score = float(data[7].string) if not data[7].string.startswith("N/A") else np.nan - current_sales_na = float(data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan - current_sales_pal = float(data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan - current_sales_jp = float(data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan - current_sales_ot = float(data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan - current_sales_gl = float(data[8].string[:-1]) if not data[8].string.startswith("N/A") else np.nan - current_release_year = get_release_year(raw_year=data[13].string.split()[-1]) + current_vgchartz_score = parse_number(number_string=data[6].string) + current_critic_score = parse_number(number_string=data[7].string) + current_user_score = parse_number(number_string=data[8].string) + current_total_shipped = parse_number(number_string=data[9].string) + current_total_sales = parse_number(number_string=data[10].string) + current_sales_na = parse_number(number_string=data[11].string) + current_sales_pal = parse_number(number_string=data[12].string) + current_sales_jp = parse_number(number_string=data[13].string) + current_sales_ot = parse_number(number_string=data[14].string) + current_release_date = parse_date(date_string=data[15].string) + current_last_update = parse_date(date_string=data[16].string) + + # The genre requires another HTTP Request, so it's made at the end + game_url = tag.attrs['href'] + current_game_genre = "" + if include_genre: + 
current_game_genre = get_genre(game_url=game_url) add_current_game_data( - current_critic_score=current_critic_score, - current_developer=current_developer, - current_game_name=current_gname, + current_rank=current_rank, + current_game_name=current_game_name, + current_game_genre=current_game_genre, current_platform=current_platform, current_publisher=current_publisher, - current_rank=current_rank, - current_release_year=current_release_year, - current_sales_gl=current_sales_gl, - current_sales_jp=current_sales_jp, + current_developer=current_developer, + current_vgchartz_score=current_vgchartz_score, + current_critic_score=current_critic_score, + current_user_score=current_user_score, + current_total_shipped=current_total_shipped, + current_total_sales=current_total_sales, current_sales_na=current_sales_na, - current_sales_ot=current_sales_ot, current_sales_pal=current_sales_pal, - current_user_score=current_user_score) - - game_url = tag.attrs['href'] - game_genre = "" - if include_genre: - game_genre = get_genre(game_url=game_url) - genre.append(game_genre) + current_sales_jp=current_sales_jp, + current_sales_ot=current_sales_ot, + current_release_date=current_release_date, + current_last_update=current_last_update) downloaded_games += 1 @@ -243,38 +258,45 @@ def save_games_data(*, filename, separator, enc): columns = { 'Rank': rank, 'Name': game_name, - 'Platform': platform, - 'Year': year, 'Genre': genre, - 'Critic_Score': critic_score, - 'User_Score': user_score, + 'Platform': platform, 'Publisher': publisher, 'Developer': developer, + 'Vgchartz_Score': vgchartz_score, + 'Critic_Score': critic_score, + 'User_Score': user_score, + 'Total_Shipped': total_shipped, + 'Total_Sales': total_sales, 'NA_Sales': sales_na, 'PAL_Sales': sales_pal, 'JP_Sales': sales_jp, 'Other_Sales': sales_ot, - 'Global_Sales': sales_gl + 'Release_Date': release_date, + 'Last_Update': last_update } + df = pd.DataFrame(columns) logging.debug("Dataframe column name: {}".format(df.columns)) 
- df = df[[ - 'Rank', 'Name', 'Platform', 'Year', 'Genre', - 'Publisher', 'Developer', 'Critic_Score', 'User_Score', - 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] + df = df[[ 'Rank', 'Name', 'Genre', 'Platform', 'Publisher', 'Developer', + 'Vgchartz_Score', 'Critic_Score', 'User_Score', 'Total_Shipped', + 'Total_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', + 'Release_Date', 'Last_Update' ]] + df.to_csv(filename, sep=separator, encoding=enc, index=False) logging.info("save_games_data <<<") if __name__ == "__main__": + + # Buffers rank = [] game_name = [] - platform = [] - year = [] genre = [] - critic_score, user_score = [], [] - publisher = [] - developer = [] - sales_na, sales_pal, sales_jp, sales_ot, sales_gl = [], [], [], [], [] + platform = [] + publisher, developer = [], [] + critic_score, user_score, vgchartz_score = [], [], [] + total_shipped = [] + total_sales, sales_na, sales_pal, sales_jp, sales_ot = [], [], [], [], [] + release_date, last_update = [], [] properties = None @@ -313,5 +335,5 @@ def save_games_data(*, filename, separator, enc): except: print("Global exception") - print("Unexpected error:", sys.exc_info()[0]) + print("Unexpected error:", sys.exc_info()) pass From 381c264401577494d67f6b0619a6ab1c57761259 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 18:28:04 +0200 Subject: [PATCH 34/35] Updating Doc --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d1dc0e..8b59cf5 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ It creates a dataset with data from more than 57,000 games. based on data from ## Output -The dataset is saved as vgsales.csv. +The dataset is saved in the file specified at cfg/resources.json, by default "dataset/vgsales.csv". 
## Install & execution From 1b88322d3cf1a189c6fc8b7b5d09fa1ff0dcf2b3 Mon Sep 17 00:00:00 2001 From: Manuel Eusebio de Paz Carmona Date: Tue, 31 Mar 2020 18:29:02 +0200 Subject: [PATCH 35/35] Updating Doc --- README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8b59cf5..40aae9d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # vgchartzfull - A crawler to download data from Global Videogame Sales -vgchartzfull.py is a python@3 script based on BeautifulSoup. - -It creates a dataset with data from more than 57,000 games. based on data from http://www.vgchartz.com/gamedb/ +vgchartz-full-crawler.py is a python@3 crawler script based on BeautifulSoup. +It creates a CSV dataset with data from more than 57,000 games, based on data from the [VGChartz site](http://www.vgchartz.com/gamedb/). ## Output