diff --git a/.gitignore b/.gitignore
index be2baa1..fffebb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+.idea
+.vcs
+
+*.csv
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/README.md b/README.md
index e557119..40aae9d 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,66 @@
-vgchartzfull is a python script based on BeautifulSoup.
-It creates a dataset based on data from
-http://www.vgchartz.com/gamedb/
+# vgchartzfull - A crawler to download data from Global Videogame Sales
-The dataset is saved as vgsales.csv.
+vgchartz-full-crawler.py is a python@3 crawler script based on BeautifulSoup.
+It creates a CSV dataset with data for more than 57,000 games, based on the [VGChartz site](http://www.vgchartz.com/gamedb/).
+
+## Output
+
+The dataset is saved to the file specified by `output_filename` in `cfg/resources.json`, by default "dataset/vgsales.csv".
+
+## Install & execution
+
+You will need to install the dependencies listed in **requirements.txt**.
-You will need to have BeautifulSoup added.
It can be installed by pip.
-sudo pip install BeautifulSoup
+```bash
+
+ # Install dependencies
+ $> pip install -r requirements.txt
+
+ # Run
+ $> python vgchartz-full-crawler.py
+
+
+```
+
+## Dictionary
+
+The dataset is composed of the following fields, and the data is collected following this [methodology](https://www.vgchartz.com/methodology.php).
+
+| Field | Description |
+|-------|--------------------------|
+| Rank | Ranking of overall sales |
+| Name | The game's name |
+| Genre | Genre of the game |
+| Platform | Platform of the game's release (e.g. PC, PS4) |
+| Developer | Developer of the game |
+| Publisher | Publisher of the game |
+| Vgchartz_Score | Score on the VGChartz site |
+| Critic_Score | Score given by critics |
+| User_Score | Score given by VGChartz site users |
+| Total_Shipped | Total worldwide shipments (in millions) |
+| Total_Sales | Total worldwide sales (in millions) |
+| NA_Sales | Sales in North America (in millions) |
+| PAL_Sales | Sales in Europe (PAL region, in millions) |
+| JP_Sales | Sales in Japan (in millions) |
+| Other_Sales | Sales in the rest of the world (in millions) |
+| Release_Date | Date of the game's release |
+| Last_Update | Last update of this record |
+
+## TODO
+
+- [ ] Remap the columns according to the values selected in resources.json
+- [ ] Add some unit testing
+- [ ] Dockerize (w/ alpine-python) to ease use and avoid installations
+- [ ] Publish at Docker hub
+
+## Links
+
+* [vgchartz.com](https://www.vgchartz.com)
+* [Original Crawler](https://github.com/GregorUT/vgchartzScrape)
+* [Kaggle Dataset](https://www.kaggle.com/gregorut/videogamesales)
+
+## Greetings
-Thanks to Chris Albon.
-http://chrisalbon.com/python/beautiful_soup_scrape_table.html
+Thanks to [Chris Albon](http://chrisalbon.com/python/beautiful_soup_scrape_table.html)
diff --git a/cfg/resources.json b/cfg/resources.json
new file mode 100644
index 0000000..4e65ceb
--- /dev/null
+++ b/cfg/resources.json
@@ -0,0 +1,44 @@
+{
+ "application_log_filename": "log/app.log",
+ "output_filename": "dataset/vgsales.csv",
+ "separator": ",",
+ "encoding": "utf-8",
+ "start_page": 1,
+ "end_page": 2,
+ "include_genre": false,
+ "base_page_url": "https://www.vgchartz.com/gamedb/?page=",
+ "query_parameters": {
+ "results": 100,
+ "region": "All",
+ "boxart": "Both",
+ "banner": "Both",
+ "ownership": "Both",
+ "showmultiplat": "No",
+ "order": "Sales",
+ "showtotalsales": 1,
+ "showpublisher": 1,
+ "showvgchartzscore": 1,
+ "shownasales": 1,
+ "showdeveloper": 1,
+ "showcriticscore": 1,
+ "showpalsales": 1,
+ "showreleasedate": 1,
+ "showuserscore": 1,
+ "showjapansales": 1,
+ "showlastupdate": 1,
+ "showothersales": 1,
+ "showshipped": 1,
+ "keyword": null,
+ "console": null,
+ "developer": null,
+ "publisher": null,
+ "goty_year": null,
+ "genre": null
+ },
+ "minimum_sleep_time": 6,
+ "maximum_sleep_time": 15,
+ "minimum_major_version": 1,
+ "maximum_major_version": 56,
+ "minimum_minor_version": 1,
+ "maximum_minor_version": 10
+}
diff --git a/dataset/.gitkeep b/dataset/.gitkeep
new file mode 100644
index 0000000..fe91d07
--- /dev/null
+++ b/dataset/.gitkeep
@@ -0,0 +1 @@
+Git doesn't like empty folders
\ No newline at end of file
diff --git a/log/.gitkeep b/log/.gitkeep
new file mode 100644
index 0000000..fe91d07
--- /dev/null
+++ b/log/.gitkeep
@@ -0,0 +1 @@
+Git doesn't like empty folders
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3311a7a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+beautifulsoup4==4.8.2
+bs4==0.0.1
+numpy==1.18.2
+pandas==1.0.3
+python-dateutil==2.8.1
+pytz==2019.3
+six==1.14.0
+soupsieve==2.0
\ No newline at end of file
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..60ece4b
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Install the crawler's dependencies and then start the crawler.
+
+# Fail early when a required tool is missing.
+python --version >/dev/null 2>&1 || { echo >&2 "I require python@3 utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; }
+pip --version >/dev/null 2>&1 || { echo >&2 "I require pip utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; }
+
+clear
+
+# printf is used because bash's builtin echo prints "\n" literally unless -e is given.
+printf '\nInstalling deps... \n'
+# Do not start crawling with missing dependencies.
+pip install -r requirements.txt || { echo >&2 "Installing dependencies failed. Aborting."; exit 1; }
+
+printf '\nStart crawling... (remember a crawler is the friend nobody likes)\n'
+python vgchartz-full-crawler.py
+
diff --git a/vgchartz-full-crawler.py b/vgchartz-full-crawler.py
new file mode 100644
index 0000000..7c6c30c
--- /dev/null
+++ b/vgchartz-full-crawler.py
@@ -0,0 +1,339 @@
+from bs4 import BeautifulSoup, element
+from random import randint, choice
+import urllib
+import urllib.request
+import pandas as pd
+import numpy as np
+import logging
+import sys
+import time
+import json
+
+def create_random_header():
+    """
+    Build a request header carrying a randomly composed User-Agent.
+
+    The version bounds come from the module-level ``properties`` mapping
+    (``minimum_major_version``/``maximum_major_version`` and
+    ``minimum_minor_version``/``maximum_minor_version``).
+    :return: dict whose only key is 'User-Agent', valued
+        '<browser>/<major>.<minor> (<os>)'
+    """
+    logging.info("create_random_header >>>")
+    browser_names = ("Mozilla", "Chrome")
+    os_tokens = ("Windows NT 6.1; Win64; x64", "X11; Linux x86_64")
+
+    # Values are drawn in the order: major, minor, browser, os.
+    major = randint(properties['minimum_major_version'], properties['maximum_major_version'])
+    minor = randint(properties['minimum_minor_version'], properties['maximum_minor_version'])
+    browser = choice(browser_names)
+    os_token = choice(os_tokens)
+
+    header = {'User-Agent': f"{browser}/{major}.{minor} ({os_token})"}
+    logging.debug(f"Current user_agent: {header}")
+    logging.info("create_random_header <<<")
+    return header
+
+def generate_remaining_url(*, query_parameters):
+    """
+    Build the query-string tail that follows the page number in the listing URL.
+
+    Every entry of ``query_parameters`` becomes ``&<name>=<value>``; a ``None``
+    value yields an empty value (``&<name>=``). Names and values are
+    URL-encoded so that free-text filters (e.g. a keyword containing spaces
+    or ``&``) cannot corrupt the URL.
+    :param query_parameters: mapping of query parameter name to value, as
+        configured under "query_parameters" in resources.json
+    :return: the encoded query-string tail (empty string for an empty mapping)
+    """
+    # Imported locally so the function does not rely on urllib.parse having
+    # been loaded as a side effect of another import.
+    from urllib.parse import quote_plus
+
+    logging.info("generate_remaining_url >>>")
+    reply = "".join(
+        "&{}={}".format(
+            quote_plus(str(param)),
+            "" if value is None else quote_plus(str(value)))
+        for param, value in query_parameters.items()
+    )
+    # Only the generated part is logged: the function must not depend on the
+    # module-level base_url being defined before it is called.
+    logging.debug("Query string generated: %s", reply)
+    logging.info("generate_remaining_url <<<")
+    return reply
+
+def get_page(*, url):
+    """
+    Perform a GET request to the given URL and return the response body.
+
+    A randomly generated User-Agent header is sent and, after the download,
+    the call sleeps a random whole number of seconds between the configured
+    ``minimum_sleep_time`` and ``maximum_sleep_time``. Combined, these are
+    meant to help avoid HTTP 429 errors.
+    :param url: webpage URL
+    :return: HTML page's body as returned by ``read()``
+    """
+    logging.info("get_page >>>")
+    logging.debug("Current URL: {}".format(url))
+    header = create_random_header()
+    request = urllib.request.Request(url, headers=header)
+    # The context manager closes the connection even when reading fails.
+    with urllib.request.urlopen(request) as response:
+        result = response.read()
+    time.sleep(randint(properties['minimum_sleep_time'], properties['maximum_sleep_time']))
+    logging.info("get_page <<<")
+    return result
+
+
+def get_genre(*, game_url):
+    """
+    Return the game genre retrieved from the given game page.
+
+    This involves one extra HTTP request per game.
+    :param game_url: URL of the game's own page
+    :return: the genre text, or 'N/A' when the page has no info box, no
+        "Genre" heading, or no text right after that heading
+    """
+    logging.info("get_genre >>>")
+    logging.debug("Page to download: {}".format(game_url))
+    site_raw = get_page(url=game_url)
+    sub_soup = BeautifulSoup(site_raw, "html.parser")
+
+    genre_value = None
+    # The info box is inconsistent among games, so the genre is located by
+    # finding the <h2> whose text is "Genre" and reading the node after it.
+    info_box = sub_soup.find("div", {"id": "gameGenInfoBox"})
+    if info_box is not None:
+        genre_headings = [h2 for h2 in info_box.find_all('h2') if h2.string == 'Genre']
+        if genre_headings:
+            # When several headings match, the last one is used.
+            sibling = genre_headings[-1].next_sibling
+            if sibling is not None:
+                genre_value = sibling.string
+
+    if genre_value is None:
+        # One odd page must not abort the whole crawl.
+        logging.warning("Genre not found at: {}".format(game_url))
+        genre_value = 'N/A'
+
+    logging.debug("Game genre: {}".format(genre_value))
+    logging.info("get_genre <<<")
+    return genre_value
+
+def parse_number(*, number_string):
+    """
+    Parse the text of a numeric table cell into a float.
+
+    A trailing "m" marks millions, so "1.5m" becomes 1500000.0. Missing
+    values (None, blank text, or text starting with "N/A") become NaN.
+    :param number_string: cell text, or None when the cell has no text
+    :return: the parsed float, or numpy.nan for a missing value
+    :raises ValueError: if the text is neither a number nor a missing marker
+    """
+    logging.info("parse_number >>>")
+    text = "" if number_string is None else number_string.strip()
+    if not text or text.startswith("N/A"):
+        reply = np.nan
+    elif text.endswith("m"):
+        reply = float(text[:-1]) * 1000000
+    else:
+        reply = float(text)
+
+    logging.info("parse_number <<<")
+    return reply
+
+def parse_date(*, date_string):
+    """
+    Convert the text of a date cell into a pandas timestamp.
+
+    :param date_string: cell text such as '18th Feb 20', or None when the
+        cell has no text
+    :return: the value returned by ``pandas.to_datetime``, or the string
+        'N/A' when the value is missing (None, blank, or starting with 'N/A')
+    """
+    logging.info("parse_date >>>")
+    text = "" if date_string is None else date_string.strip()
+    if not text or text.startswith('N/A'):
+        date_formatted = 'N/A'
+    else:
+        date_formatted = pd.to_datetime(text)
+
+    logging.debug("Date parsed: {}".format(date_formatted))
+    logging.info("parse_date <<<")
+    return date_formatted
+
+def add_current_game_data(*,
+                          current_rank,
+                          current_game_name,
+                          current_game_genre,
+                          current_platform,
+                          current_publisher,
+                          current_developer,
+                          current_vgchartz_score,
+                          current_critic_score,
+                          current_user_score,
+                          current_total_shipped,
+                          current_total_sales,
+                          current_sales_na,
+                          current_sales_pal,
+                          current_sales_jp,
+                          current_sales_ot,
+                          current_release_date,
+                          current_last_update):
+    """
+    Append one game's values to the module-level column buffers.
+
+    Each argument goes to the buffer of the same name without the
+    ``current_`` prefix (``current_game_genre`` goes to ``genre``).
+    Publisher and developer are stripped of surrounding whitespace; a
+    missing (None) publisher or developer is stored as an empty string.
+    """
+    logging.info("add_current_game_data >>>")
+    # Clean the text fields before touching any buffer, so a failure here
+    # cannot leave the buffers with different lengths.
+    clean_publisher = (current_publisher or "").strip()
+    clean_developer = (current_developer or "").strip()
+
+    game_name.append(current_game_name)
+    rank.append(current_rank)
+    platform.append(current_platform)
+    genre.append(current_game_genre)
+    publisher.append(clean_publisher)
+    developer.append(clean_developer)
+    vgchartz_score.append(current_vgchartz_score)
+    critic_score.append(current_critic_score)
+    user_score.append(current_user_score)
+    total_shipped.append(current_total_shipped)
+    total_sales.append(current_total_sales)
+    sales_na.append(current_sales_na)
+    sales_pal.append(current_sales_pal)
+    sales_jp.append(current_sales_jp)
+    sales_ot.append(current_sales_ot)
+    release_date.append(current_release_date)
+    last_update.append(current_last_update)
+    logging.info("add_current_game_data <<<")
+
+
+def download_data(*, start_page, end_page, include_genre):
+    """
+    Download games data from vgchartz into the module-level buffers.
+
+    Every listing page from start_page to end_page (both inclusive) is
+    fetched, and each game row found is handed to add_current_game_data.
+    :param start_page: number of the first page to download
+    :param end_page: number of the last page to download (inclusive)
+    :param include_genre: when true, each game's own page is also requested
+        to read its genre (one extra HTTP request per game); otherwise the
+        genre is stored as an empty string
+    :return: None
+    """
+    logging.info("download_data >>>")
+    game_url_prefix = 'https://www.vgchartz.com/game/'
+    downloaded_games = 0
+    for page in range(start_page, end_page + 1):
+        page_url = "{}{}{}".format(base_url, str(page), remaining_url)
+        current_page = get_page(url=page_url)
+        soup = BeautifulSoup(current_page, features="html.parser")
+        logging.info("Downloaded page {}".format(page))
+
+        # Games are located through the anchors that point to game pages.
+        # Anchors without an href are ignored instead of raising KeyError.
+        # The first 10 matches are discarded because those links are in the
+        # navigation bar.
+        game_tags = [
+            anchor for anchor in soup.find_all("a")
+            if anchor.attrs.get('href', '').startswith(game_url_prefix)
+        ][10:]
+
+        for tag in game_tags:
+
+            current_game_name = " ".join(tag.string.split())
+            # Walk up from the link to its table row and take all the cells.
+            data = tag.parent.parent.find_all("td")
+
+            logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_game_name))
+
+            # The rest of the attributes are read from fixed cell positions.
+            current_rank = np.int32(data[0].string)
+            current_platform = data[3].find('img').attrs['alt']
+            current_publisher = data[4].string
+            current_developer = data[5].string
+            current_vgchartz_score = parse_number(number_string=data[6].string)
+            current_critic_score = parse_number(number_string=data[7].string)
+            current_user_score = parse_number(number_string=data[8].string)
+            current_total_shipped = parse_number(number_string=data[9].string)
+            current_total_sales = parse_number(number_string=data[10].string)
+            current_sales_na = parse_number(number_string=data[11].string)
+            current_sales_pal = parse_number(number_string=data[12].string)
+            current_sales_jp = parse_number(number_string=data[13].string)
+            current_sales_ot = parse_number(number_string=data[14].string)
+            current_release_date = parse_date(date_string=data[15].string)
+            current_last_update = parse_date(date_string=data[16].string)
+
+            # The genre requires another HTTP request, so it is fetched last.
+            game_url = tag.attrs['href']
+            current_game_genre = ""
+            if include_genre:
+                current_game_genre = get_genre(game_url=game_url)
+
+            add_current_game_data(
+                current_rank=current_rank,
+                current_game_name=current_game_name,
+                current_game_genre=current_game_genre,
+                current_platform=current_platform,
+                current_publisher=current_publisher,
+                current_developer=current_developer,
+                current_vgchartz_score=current_vgchartz_score,
+                current_critic_score=current_critic_score,
+                current_user_score=current_user_score,
+                current_total_shipped=current_total_shipped,
+                current_total_sales=current_total_sales,
+                current_sales_na=current_sales_na,
+                current_sales_pal=current_sales_pal,
+                current_sales_jp=current_sales_jp,
+                current_sales_ot=current_sales_ot,
+                current_release_date=current_release_date,
+                current_last_update=current_last_update)
+
+            downloaded_games += 1
+
+    logging.info("Number of downloaded resources: {}".format(downloaded_games))
+    logging.info("download_data <<<")
+
+
+def save_games_data(*, filename, separator, enc):
+    """
+    Write the module-level column buffers to a delimited text file.
+
+    :param filename: path of the file to write
+    :param separator: field delimiter passed to ``DataFrame.to_csv``
+    :param enc: encoding of the output file
+    """
+    logging.info("save_games_data >>>")
+    # (column name, buffer) pairs, listed in the order the columns are written.
+    fields = [
+        ('Rank', rank),
+        ('Name', game_name),
+        ('Genre', genre),
+        ('Platform', platform),
+        ('Publisher', publisher),
+        ('Developer', developer),
+        ('Vgchartz_Score', vgchartz_score),
+        ('Critic_Score', critic_score),
+        ('User_Score', user_score),
+        ('Total_Shipped', total_shipped),
+        ('Total_Sales', total_sales),
+        ('NA_Sales', sales_na),
+        ('PAL_Sales', sales_pal),
+        ('JP_Sales', sales_jp),
+        ('Other_Sales', sales_ot),
+        ('Release_Date', release_date),
+        ('Last_Update', last_update),
+    ]
+    column_order = [name for name, _ in fields]
+
+    frame = pd.DataFrame(dict(fields))
+    logging.debug("Dataframe column name: {}".format(frame.columns))
+    frame = frame[column_order]
+
+    frame.to_csv(filename, sep=separator, encoding=enc, index=False)
+    logging.info("save_games_data <<<")
+
+if __name__ == "__main__":
+
+    # Module-level buffers, one list per output column. They are filled by
+    # add_current_game_data and written out by save_games_data.
+    rank = []
+    game_name = []
+    genre = []
+    platform = []
+    publisher, developer = [], []
+    critic_score, user_score, vgchartz_score = [], [], []
+    total_shipped = []
+    total_sales, sales_na, sales_pal, sales_jp, sales_ot = [], [], [], [], []
+    release_date, last_update = [], []
+
+    # Configuration read by the functions above through the global name.
+    with open("cfg/resources.json", encoding="utf-8") as file:
+        properties = json.load(file)
+
+    # Drop any pre-existing root handlers so basicConfig takes effect.
+    logging.root.handlers = []
+    logging.basicConfig(format='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
+                        level=logging.DEBUG,
+                        filename=properties["application_log_filename"])
+
+    # set up logging to console
+    console = logging.StreamHandler()
+    console.setLevel(logging.DEBUG)
+
+    # console output uses the same layout with an explicit date format
+    formatter = logging.Formatter(fmt='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
+                                  datefmt="%d-%m-%Y %H:%M:%S")
+    console.setFormatter(formatter)
+    logging.getLogger("").addHandler(console)
+
+    try:
+        logging.info('Application started')
+        base_url = properties['base_page_url']
+        remaining_url = generate_remaining_url(query_parameters=properties['query_parameters'])
+
+        download_data(
+            start_page=properties['start_page'],
+            end_page=properties['end_page'],
+            include_genre=properties['include_genre'])
+
+        save_games_data(
+            filename=properties['output_filename'],
+            separator=properties['separator'],
+            enc=properties['encoding'])
+
+    except Exception:
+        # Only real errors are handled here; Ctrl+C (KeyboardInterrupt) and
+        # SystemExit propagate. The traceback goes to the log file and the
+        # console, and the failure is reported through the exit status.
+        logging.exception("Unexpected error, crawling aborted")
+        sys.exit(1)
diff --git a/vgchartzfull.py b/vgchartzfull.py
deleted file mode 100644
index b1d75a4..0000000
--- a/vgchartzfull.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from bs4 import BeautifulSoup, element
-import urllib
-import pandas as pd
-import numpy as np
-
-pages = 19
-rec_count = 0
-rank = []
-gname = []
-platform = []
-year = []
-genre = []
-critic_score = []
-user_score = []
-publisher = []
-developer = []
-sales_na = []
-sales_pal = []
-sales_jp = []
-sales_ot = []
-sales_gl = []
-
-urlhead = 'http://www.vgchartz.com/gamedb/?page='
-urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
-urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
-urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
-urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
-urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-
-for page in range(1, pages):
- surl = urlhead + str(page) + urltail
- r = urllib.request.urlopen(surl).read()
- soup = BeautifulSoup(r)
- print(f"Page: {page}")
-
- # vgchartz website is really weird so we have to search for
- # tags with game urls
- game_tags = list(filter(
- lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
- # discard the first 10 elements because those
- # links are in the navigation bar
- soup.find_all("a")
- ))[10:]
-
- for tag in game_tags:
-
- # add name to list
- gname.append(" ".join(tag.string.split()))
- print(f"{rec_count + 1} Fetch data for game {gname[-1]}")
-
- # get different attributes
- # traverse up the DOM tree
- data = tag.parent.parent.find_all("td")
- rank.append(np.int32(data[0].string))
- platform.append(data[3].find('img').attrs['alt'])
- publisher.append(data[4].string)
- developer.append(data[5].string)
- critic_score.append(
- float(data[6].string) if
- not data[6].string.startswith("N/A") else np.nan)
- user_score.append(
- float(data[7].string) if
- not data[7].string.startswith("N/A") else np.nan)
- sales_na.append(
- float(data[9].string[:-1]) if
- not data[9].string.startswith("N/A") else np.nan)
- sales_pal.append(
- float(data[10].string[:-1]) if
- not data[10].string.startswith("N/A") else np.nan)
- sales_jp.append(
- float(data[11].string[:-1]) if
- not data[11].string.startswith("N/A") else np.nan)
- sales_ot.append(
- float(data[12].string[:-1]) if
- not data[12].string.startswith("N/A") else np.nan)
- sales_gl.append(
- float(data[8].string[:-1]) if
- not data[8].string.startswith("N/A") else np.nan)
- release_year = data[13].string.split()[-1]
- # different format for year
- if release_year.startswith('N/A'):
- year.append('N/A')
- else:
- if int(release_year) >= 80:
- year_to_add = np.int32("19" + release_year)
- else:
- year_to_add = np.int32("20" + release_year)
- year.append(year_to_add)
-
- # go to every individual website to get genre info
- url_to_game = tag.attrs['href']
- site_raw = urllib.request.urlopen(url_to_game).read()
- sub_soup = BeautifulSoup(site_raw, "html.parser")
- # again, the info box is inconsistent among games so we
- # have to find all the h2 and traverse from that to the genre name
- h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
- # make a temporary tag here to search for the one that contains
- # the word "Genre"
- temp_tag = element.Tag
- for h2 in h2s:
- if h2.string == 'Genre':
- temp_tag = h2
- genre.append(temp_tag.next_sibling.string)
-
- rec_count += 1
-
-columns = {
- 'Rank': rank,
- 'Name': gname,
- 'Platform': platform,
- 'Year': year,
- 'Genre': genre,
- 'Critic_Score': critic_score,
- 'User_Score': user_score,
- 'Publisher': publisher,
- 'Developer': developer,
- 'NA_Sales': sales_na,
- 'PAL_Sales': sales_pal,
- 'JP_Sales': sales_jp,
- 'Other_Sales': sales_ot,
- 'Global_Sales': sales_gl
-}
-print(rec_count)
-df = pd.DataFrame(columns)
-print(df.columns)
-df = df[[
- 'Rank', 'Name', 'Platform', 'Year', 'Genre',
- 'Publisher', 'Developer', 'Critic_Score', 'User_Score',
- 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
-df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)