diff --git a/.gitignore b/.gitignore index be2baa1..fffebb9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +.idea +.vcs + +*.csv + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index e557119..40aae9d 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,66 @@ -vgchartzfull is a python script based on BeautifulSoup. -It creates a dataset based on data from -http://www.vgchartz.com/gamedb/ +# vgchartzfull - A crawler to download data from Global Videogame Sales -The dataset is saved as vgsales.csv. +vgchartz-full-crawler.py is a python@3 crawler script based on BeautifulSoup. +It creates a CSV dataset with data from more than 57,000 games, based on data from the [VGChartz site](http://www.vgchartz.com/gamedb/). + +## Output + +The dataset is saved in the file specified in cfg/resources.json, by default "dataset/vgsales.csv". + +## Install & execution + +You will need to install the dependencies listed in **requirements.txt**. -You will need to have BeautifulSoup added. It can be installed by pip. -sudo pip install BeautifulSoup +```bash + + # Install dependencies + $> pip install -r requirements.txt + + # Run + $> python vgchartz-full-crawler.py + + +``` + +## Dictionary + +The dataset is composed of these fields, and the data is collected following this [methodology](https://www.vgchartz.com/methodology.php). + +| Field | Description | +|-------|--------------------------| +| Rank | Ranking of overall sales | +| Name | The game's name | +| Genre | Genre of the game | +| Platform | Platform of the game's release (e.g. PC, PS4, etc.) 
| +| Developer | Developer of the game | +| Publisher | Publisher of the game | +| Vgchartz_Score | Score at VGChartz site | +| Critic_Score | Score given by critics | +| User_Score | Score given by VGChartz site users | +| Total_Shipped | Total worldwide shipments (in millions) | +| Total_Sales | Total worldwide sales (in millions) | +| NA_Sales | Sales in North America (in millions) | +| PAL_Sales | Sales in Europe (in millions) | +| JP_Sales | Sales in Japan (in millions) | +| Other_Sales | Sales in the rest of the world (in millions) | +| Release_Date | Year of the game's release | +| Last_Update | Last update of this record | + +## TODO + +- [ ] Remap the columns according to the selected values at resources.json +- [ ] Add some unit testing +- [ ] Dockerize (w/ alpine-python) to ease use and avoid installations +- [ ] Publish at Docker hub + +## Links + +* [vgchartz.com](https://www.vgchartz.com) +* [Original Crawler](https://github.com/GregorUT/vgchartzScrape) +* [Kaggle Dataset](https://www.kaggle.com/gregorut/videogamesales) + +## Greetings -Thanks to Chris Albon. 
-http://chrisalbon.com/python/beautiful_soup_scrape_table.html +Thanks to [Chris Albon](http://chrisalbon.com/python/beautiful_soup_scrape_table.html) diff --git a/cfg/resources.json b/cfg/resources.json new file mode 100644 index 0000000..4e65ceb --- /dev/null +++ b/cfg/resources.json @@ -0,0 +1,44 @@ +{ + "application_log_filename": "log/app.log", + "output_filename": "dataset/vgsales.csv", + "separator": ",", + "encoding": "utf-8", + "start_page": 1, + "end_page": 2, + "include_genre": false, + "base_page_url": "https://www.vgchartz.com/gamedb/?page=", + "query_parameters": { + "results": 100, + "region": "All", + "boxart": "Both", + "banner": "Both", + "ownership": "Both", + "showmultiplat": "No", + "order": "Sales", + "showtotalsales": 1, + "showpublisher": 1, + "showvgchartzscore": 1, + "shownasales": 1, + "showdeveloper": 1, + "showcriticscore": 1, + "showpalsales": 1, + "showreleasedate": 1, + "showuserscore": 1, + "showjapansales": 1, + "showlastupdate": 1, + "showothersales": 1, + "showshipped": 1, + "keyword": null, + "console": null, + "developer": null, + "publisher": null, + "goty_year": null, + "genre": null + }, + "minimum_sleep_time": 6, + "maximum_sleep_time": 15, + "minimum_major_version": 1, + "maximum_major_version": 56, + "minimum_minor_version": 1, + "maximum_minor_version": 10 +} diff --git a/dataset/.gitkeep b/dataset/.gitkeep new file mode 100644 index 0000000..fe91d07 --- /dev/null +++ b/dataset/.gitkeep @@ -0,0 +1 @@ +Git doesn't like empty folders \ No newline at end of file diff --git a/log/.gitkeep b/log/.gitkeep new file mode 100644 index 0000000..fe91d07 --- /dev/null +++ b/log/.gitkeep @@ -0,0 +1 @@ +Git doesn't like empty folders \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..3311a7a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +beautifulsoup4==4.8.2 +bs4==0.0.1 +numpy==1.18.2 +pandas==1.0.3 +python-dateutil==2.8.1 +pytz==2019.3 +six==1.14.0 +soupsieve==2.0 
\ No newline at end of file diff --git a/run.sh b/run.sh new file mode 100755 index 0000000..60ece4b --- /dev/null +++ b/run.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +python --version >/dev/null 2>&1 || { echo >&2 "I require python@3 utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; } +pip --version >/dev/null 2>&1 || { echo >&2 "I require pip utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; } + +clear + +echo "\nInstalling deps... " +pip install -r requirements.txt + +echo "\nStart crawling... (remember a crawler is the friend nobody likes)" +python vgchartz-full-crawler.py + diff --git a/vgchartz-full-crawler.py b/vgchartz-full-crawler.py new file mode 100644 index 0000000..7c6c30c --- /dev/null +++ b/vgchartz-full-crawler.py @@ -0,0 +1,339 @@ +from bs4 import BeautifulSoup, element +from random import randint, choice +import urllib +import urllib.request +import pandas as pd +import numpy as np +import logging +import sys +import time +import json + +def create_random_header(): + """ + Create a random user agent in order to better mimic user behaviour. 
def create_random_header():
    """
    Create a random user agent in order to better mimic user behaviour.

    The version bounds are read from the module-level ``properties`` mapping
    (keys ``minimum_major_version``, ``maximum_major_version``,
    ``minimum_minor_version`` and ``maximum_minor_version``).

    :return: dict with User-Agent as key and a random browser-os combo as value
    """
    logging.info("create_random_header >>>")
    browsers = ["Mozilla", "Chrome"]
    os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"]
    major_version = randint(properties['minimum_major_version'],
                            properties['maximum_major_version'])
    minor_version = randint(properties['minimum_minor_version'],
                            properties['maximum_minor_version'])
    chosen_browser = choice(browsers)
    chosen_os = choice(os_list)

    user_agent = '{}/{}.{} ({})'.format(
        chosen_browser,
        major_version,
        minor_version,
        chosen_os)
    header = {'User-Agent': user_agent}
    logging.debug("Current user_agent: %s", header)
    logging.info("create_random_header <<<")
    return header


def generate_remaining_url(*, query_parameters):
    """
    Build the query-string tail that is appended after the page number.

    Every key/value pair is percent-encoded, so values containing spaces,
    ``&`` or ``=`` cannot corrupt the URL. A ``None`` value is emitted as an
    empty parameter (``&key=``).

    :param query_parameters: mapping of query parameter names to values
    :return: string starting with ``&`` (empty string when there are no
        parameters)
    """
    # Local import keeps the block self-contained; the file only imports
    # urllib and urllib.request at module level.
    from urllib.parse import urlencode

    logging.info("generate_remaining_url >>>")
    encoded = urlencode({
        param: "" if value is None else value
        for param, value in query_parameters.items()
    })
    reply = f"&{encoded}" if encoded else ""
    # base_url is a module-level value set by the script entry point; it is
    # only used for this log line, so tolerate its absence instead of raising.
    logging.debug("Url Generated: %sN%s", globals().get("base_url", ""), reply)
    logging.info("generate_remaining_url <<<")
    return reply


def get_page(*, url):
    """
    Perform a GET request to the given URL and return results.

    A random User-Agent header is sent and, after the body has been read,
    the function sleeps for a random number of seconds between
    ``properties['minimum_sleep_time']`` and
    ``properties['maximum_sleep_time']``; both measures are intended to
    reduce the chance of HTTP 429 responses.

    :param url: webpage URL
    :return: HTML page's body (bytes as returned by ``read()``)
    """
    logging.info("get_page >>>")
    logging.debug("Current URL: %s", url)
    header = create_random_header()
    request = urllib.request.Request(url, headers=header)
    # Context manager guarantees the connection is closed even if read() fails.
    with urllib.request.urlopen(request) as response:
        result = response.read()
    time.sleep(randint(properties['minimum_sleep_time'], properties['maximum_sleep_time']))
    logging.info("get_page <<<")
    return result
def get_genre(*, game_url):
    """
    Return the game genre retrieved from the given url
    (It involves another http request)

    :param game_url: URL of the game's own page
    :return: Genre of the input game, or an empty string when the page has
        no info box or no "Genre" heading
    """
    logging.info("get_genre >>>")
    logging.debug("Page to download: %s", game_url)
    site_raw = get_page(url=game_url)
    sub_soup = BeautifulSoup(site_raw, "html.parser")

    # The info box is inconsistent among games, so look through all of its
    # h2 headings for the one whose text is "Genre"; the genre name is the
    # node right after that heading.
    info_box = sub_soup.find("div", {"id": "gameGenInfoBox"})
    headings = info_box.find_all('h2') if info_box is not None else []
    genre_heading = next((h2 for h2 in headings if h2.string == 'Genre'), None)
    genre_node = genre_heading.next_sibling if genre_heading is not None else None

    if genre_node is None:
        # Missing markup is treated as "genre unknown" rather than aborting
        # the whole crawl.
        logging.warning("Genre not found for %s", game_url)
        genre_value = ""
    else:
        genre_value = genre_node.string

    logging.debug("Game genre: %s", genre_value)
    logging.info("get_genre <<<")
    return genre_value


def parse_number(*, number_string):
    """
    Return string parsed to float with custom format for millions (m)

    ``"1.5m"`` becomes ``1500000.0``; a plain number is converted as is.
    ``None``, blank text and values starting with ``"N/A"`` yield NaN.

    :param number_string: cell text, possibly None
    :return: the parsed float, or ``np.nan`` when no value is available
    """
    logging.info("parse_number >>>")
    text = "" if number_string is None else str(number_string).strip()

    if not text or text.startswith("N/A"):
        reply = np.nan
    elif "m" in text:
        # "m" marks a figure expressed in millions
        reply = float(text.strip('m')) * 1000000
    else:
        reply = float(text)

    logging.info("parse_number <<<")
    return reply


def parse_date(*, date_string):
    """
    Return the date received as string onto timestamp or N/A.

    :param date_string: cell text such as '18th Feb 20', possibly None
    :return: A timestamp in pandas date format, or the string 'N/A' when the
        input is None, blank or starts with 'N/A'
    """
    logging.info("parse_date >>>")
    text = "" if date_string is None else str(date_string).strip()

    if not text or text.startswith('N/A'):
        date_formatted = 'N/A'
    else:
        # i.e. date_string = '18th Feb 20'
        date_formatted = pd.to_datetime(text)

    logging.debug("Date parsed: %s", date_formatted)
    logging.info("parse_date <<<")
    return date_formatted
def add_current_game_data(*,
                          current_rank,
                          current_game_name,
                          current_game_genre,
                          current_platform,
                          current_publisher,
                          current_developer,
                          current_vgchartz_score,
                          current_critic_score,
                          current_user_score,
                          current_total_shipped,
                          current_total_sales,
                          current_sales_na,
                          current_sales_pal,
                          current_sales_jp,
                          current_sales_ot,
                          current_release_date,
                          current_last_update):
    """
    Add all the game data to the related lists

    Each argument is appended to the module-level list that buffers its
    column. Publisher and developer are stripped of surrounding whitespace;
    a ``None`` publisher/developer is stored as ``None`` instead of raising.
    """
    logging.info("add_current_game_data >>>")
    game_name.append(current_game_name)
    rank.append(current_rank)
    platform.append(current_platform)
    genre.append(current_game_genre)
    # The cell text can be None when the HTML cell holds nested markup, so
    # only strip real strings.
    publisher.append(current_publisher.strip() if current_publisher is not None else None)
    developer.append(current_developer.strip() if current_developer is not None else None)
    vgchartz_score.append(current_vgchartz_score)
    critic_score.append(current_critic_score)
    user_score.append(current_user_score)
    total_shipped.append(current_total_shipped)
    total_sales.append(current_total_sales)
    sales_na.append(current_sales_na)
    sales_pal.append(current_sales_pal)
    sales_jp.append(current_sales_jp)
    sales_ot.append(current_sales_ot)
    release_date.append(current_release_date)
    last_update.append(current_last_update)
    logging.info("add_current_game_data <<<")
def download_data(*, start_page, end_page, include_genre):
    """
    Download games data from vgchartz: every results page in the inclusive
    range [start_page, end_page] is fetched and each game row is handed to
    add_current_game_data.

    The page URL is built from the module-level ``base_url`` and
    ``remaining_url`` values.

    :param start_page: first results page to download
    :param end_page: last results page to download (inclusive)
    :param include_genre: when true, each game's own page is requested to
        read its genre (one extra HTTP request per game)
    :return: None
    """
    logging.info("download_data >>>")
    game_url_prefix = 'https://www.vgchartz.com/game/'
    navigation_links = 10   # leading game links that belong to the navigation bar
    expected_cells = 17     # cells read from each results row (indexes 0..16)
    downloaded_games = 0

    def cell_text(cell):
        # .string is None when a cell contains nested markup; get_text()
        # always returns a str.
        return cell.get_text(strip=True)

    for page in range(start_page, end_page + 1):
        page_url = "{}{}{}".format(base_url, str(page), remaining_url)
        current_page = get_page(url=page_url)
        soup = BeautifulSoup(current_page, features="html.parser")
        logging.info("Downloaded page %s", page)

        # We locate the games through the anchors with game urls in the main
        # table. href=True skips anchors without an href attribute, which
        # would otherwise raise KeyError.
        game_tags = [
            anchor for anchor in soup.find_all("a", href=True)
            if anchor.attrs['href'].startswith(game_url_prefix)
        ][navigation_links:]

        for tag in game_tags:

            current_game_name = " ".join(tag.get_text().split())
            # Traverse up the DOM tree to the row and read its cells
            data = tag.parent.parent.find_all("td")

            if len(data) < expected_cells:
                logging.warning("Skipping row with %s cells for game: %s",
                                len(data), current_game_name)
                continue

            logging.debug("Downloaded game: %s. Name: %s",
                          downloaded_games + 1, current_game_name)

            current_rank = np.int32(cell_text(data[0]))
            current_platform = data[3].find('img').attrs['alt']
            current_publisher = cell_text(data[4])
            current_developer = cell_text(data[5])
            current_vgchartz_score = parse_number(number_string=cell_text(data[6]))
            current_critic_score = parse_number(number_string=cell_text(data[7]))
            current_user_score = parse_number(number_string=cell_text(data[8]))
            current_total_shipped = parse_number(number_string=cell_text(data[9]))
            current_total_sales = parse_number(number_string=cell_text(data[10]))
            current_sales_na = parse_number(number_string=cell_text(data[11]))
            current_sales_pal = parse_number(number_string=cell_text(data[12]))
            current_sales_jp = parse_number(number_string=cell_text(data[13]))
            current_sales_ot = parse_number(number_string=cell_text(data[14]))
            current_release_date = parse_date(date_string=cell_text(data[15]))
            current_last_update = parse_date(date_string=cell_text(data[16]))

            # The genre requires another HTTP Request, so it's made at the end
            game_url = tag.attrs['href']
            current_game_genre = ""
            if include_genre:
                current_game_genre = get_genre(game_url=game_url)

            add_current_game_data(
                current_rank=current_rank,
                current_game_name=current_game_name,
                current_game_genre=current_game_genre,
                current_platform=current_platform,
                current_publisher=current_publisher,
                current_developer=current_developer,
                current_vgchartz_score=current_vgchartz_score,
                current_critic_score=current_critic_score,
                current_user_score=current_user_score,
                current_total_shipped=current_total_shipped,
                current_total_sales=current_total_sales,
                current_sales_na=current_sales_na,
                current_sales_pal=current_sales_pal,
                current_sales_jp=current_sales_jp,
                current_sales_ot=current_sales_ot,
                current_release_date=current_release_date,
                current_last_update=current_last_update)

            downloaded_games += 1

    logging.info("Number of downloaded resources: %s", downloaded_games)
    logging.info("download_data <<<")
def save_games_data(*, filename, separator, enc):
    """
    Save all the downloaded data into the specified file

    The module-level column buffers are assembled into a DataFrame and
    written as CSV without the index. The parent directory of ``filename``
    is created when it does not exist yet.

    :param filename: path of the CSV file to write
    :param separator: field separator passed to ``DataFrame.to_csv``
    :param enc: text encoding of the output file
    """
    # Local import: the file's module-level imports do not include pathlib.
    from pathlib import Path

    logging.info("save_games_data >>>")
    columns = {
        'Rank': rank,
        'Name': game_name,
        'Genre': genre,
        'Platform': platform,
        'Publisher': publisher,
        'Developer': developer,
        'Vgchartz_Score': vgchartz_score,
        'Critic_Score': critic_score,
        'User_Score': user_score,
        'Total_Shipped': total_shipped,
        'Total_Sales': total_sales,
        'NA_Sales': sales_na,
        'PAL_Sales': sales_pal,
        'JP_Sales': sales_jp,
        'Other_Sales': sales_ot,
        'Release_Date': release_date,
        'Last_Update': last_update,
    }

    # Passing the key order explicitly fixes the CSV column order in one
    # step instead of re-selecting the columns afterwards.
    df = pd.DataFrame(columns, columns=list(columns))
    logging.debug("Dataframe column name: %s", df.columns)

    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(filename, sep=separator, encoding=enc, index=False)
    logging.info("save_games_data <<<")
if __name__ == "__main__":

    # Buffers: one module-level list per output column. They are filled by
    # add_current_game_data and read by save_games_data.
    rank = []
    game_name = []
    genre = []
    platform = []
    publisher, developer = [], []
    critic_score, user_score, vgchartz_score = [], [], []
    total_shipped = []
    total_sales, sales_na, sales_pal, sales_jp, sales_ot = [], [], [], [], []
    release_date, last_update = [], []

    # Crawler configuration, read by the helper functions as a global
    properties = None

    with open("cfg/resources.json", encoding="utf-8") as config_file:
        properties = json.load(config_file)

    # Drop any pre-existing root handlers so basicConfig takes effect
    logging.root.handlers = []
    logging.basicConfig(format='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
                        level=logging.DEBUG,
                        filename=properties["application_log_filename"])

    # set up logging to console
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)

    # set a format which is simpler for console use
    formatter = logging.Formatter(fmt='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
                                  datefmt="%d-%m-%Y %H:%M:%S")
    console.setFormatter(formatter)
    logging.getLogger("").addHandler(console)

    try:
        logging.info('Application started')
        base_url = properties['base_page_url']
        remaining_url = generate_remaining_url(query_parameters=properties['query_parameters'])

        download_data(
            start_page=properties['start_page'],
            end_page=properties['end_page'],
            include_genre=properties['include_genre'])

        save_games_data(
            filename=properties['output_filename'],
            separator=properties['separator'],
            enc=properties['encoding'])

    except Exception:
        # Record the full traceback in the log (file and console) and report
        # the failure through the exit status. KeyboardInterrupt and
        # SystemExit are deliberately not intercepted.
        logging.exception("Unexpected error")
        sys.exit(1)
'&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both' -urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0' -urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' -urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' -urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' - -for page in range(1, pages): - surl = urlhead + str(page) + urltail - r = urllib.request.urlopen(surl).read() - soup = BeautifulSoup(r) - print(f"Page: {page}") - - # vgchartz website is really weird so we have to search for - # tags with game urls - game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), - # discard the first 10 elements because those - # links are in the navigation bar - soup.find_all("a") - ))[10:] - - for tag in game_tags: - - # add name to list - gname.append(" ".join(tag.string.split())) - print(f"{rec_count + 1} Fetch data for game {gname[-1]}") - - # get different attributes - # traverse up the DOM tree - data = tag.parent.parent.find_all("td") - rank.append(np.int32(data[0].string)) - platform.append(data[3].find('img').attrs['alt']) - publisher.append(data[4].string) - developer.append(data[5].string) - critic_score.append( - float(data[6].string) if - not data[6].string.startswith("N/A") else np.nan) - user_score.append( - float(data[7].string) if - not data[7].string.startswith("N/A") else np.nan) - sales_na.append( - float(data[9].string[:-1]) if - not data[9].string.startswith("N/A") else np.nan) - sales_pal.append( - float(data[10].string[:-1]) if - not data[10].string.startswith("N/A") else np.nan) - sales_jp.append( - float(data[11].string[:-1]) if - not data[11].string.startswith("N/A") else np.nan) - sales_ot.append( - float(data[12].string[:-1]) if - not data[12].string.startswith("N/A") else np.nan) - sales_gl.append( - float(data[8].string[:-1]) if - not 
data[8].string.startswith("N/A") else np.nan) - release_year = data[13].string.split()[-1] - # different format for year - if release_year.startswith('N/A'): - year.append('N/A') - else: - if int(release_year) >= 80: - year_to_add = np.int32("19" + release_year) - else: - year_to_add = np.int32("20" + release_year) - year.append(year_to_add) - - # go to every individual website to get genre info - url_to_game = tag.attrs['href'] - site_raw = urllib.request.urlopen(url_to_game).read() - sub_soup = BeautifulSoup(site_raw, "html.parser") - # again, the info box is inconsistent among games so we - # have to find all the h2 and traverse from that to the genre name - h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2') - # make a temporary tag here to search for the one that contains - # the word "Genre" - temp_tag = element.Tag - for h2 in h2s: - if h2.string == 'Genre': - temp_tag = h2 - genre.append(temp_tag.next_sibling.string) - - rec_count += 1 - -columns = { - 'Rank': rank, - 'Name': gname, - 'Platform': platform, - 'Year': year, - 'Genre': genre, - 'Critic_Score': critic_score, - 'User_Score': user_score, - 'Publisher': publisher, - 'Developer': developer, - 'NA_Sales': sales_na, - 'PAL_Sales': sales_pal, - 'JP_Sales': sales_jp, - 'Other_Sales': sales_ot, - 'Global_Sales': sales_gl -} -print(rec_count) -df = pd.DataFrame(columns) -print(df.columns) -df = df[[ - 'Rank', 'Name', 'Platform', 'Year', 'Genre', - 'Publisher', 'Developer', 'Critic_Score', 'User_Score', - 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] -df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)