From 607055b0817d123695092f6472a3783f159a11d0 Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 21:09:09 +0200
Subject: [PATCH 01/35] Add file for listing used libraries

---
 requirements.txt | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7c120b1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+beautifulsoup4==4.7.1
+bs4==0.0.1
+numpy==1.16.3
+pandas==0.24.2
+python-dateutil==2.8.0
+pytz==2019.1
+six==1.12.0
+soupsieve==1.9.1

From f0e242c788b26fd89662d8d240da218e71b76e16 Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 21:31:04 +0200
Subject: [PATCH 02/35] Add method for random headers generation, sleep between
 requests.

---
 vgchartzfull.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index b1d75a4..13a39c9 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -27,6 +27,30 @@
 urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
 urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
 
+def create_random_header():
+    browsers = ["Mozilla", "Chrome"]
+    os = ["Windows NT 6.1; Win64; x64" "X11; Linux x86_64"],
+    #user_agent = 'Mozilla/{}.{} (Windows NT 6.1; Win64; x64)'.format(randint(1,56))
+    major_version = randint(1, 56)
+    minor_version = randint(1, 10)
+    chosen_browser = random.choice(browsers)
+    chosen_os = random.choice(os)
+
+    user_agent = '{}/{}.{} ({})'.format(
+        chosen_browser,
+        major_version,
+        minor_version,
+        chosen_os)
+    header = { 'User-Agent' :  user_agent}
+    print(header)
+    return header
+def get_page(url):
+    header = create_random_header()
+    request = urllib.request.Request(url, headers=header)
+    result = urllib.request.urlopen(request).read()
+    time.sleep(randint(6,15))
+    return result
+
 for page in range(1, pages):
     surl = urlhead + str(page) + urltail
     r = urllib.request.urlopen(surl).read()

From f68ea44c2a1d3dc6346d53656e592360d501ced6 Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 21:38:42 +0200
Subject: [PATCH 03/35] Externalise the get genre part, wrap all code into a
 function

---
 vgchartzfull.py | 240 +++++++++++++++++++++++++-----------------------
 1 file changed, 123 insertions(+), 117 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 13a39c9..7ad271f 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -3,30 +3,6 @@
 import pandas as pd
 import numpy as np
 
-pages = 19
-rec_count = 0
-rank = []
-gname = []
-platform = []
-year = []
-genre = []
-critic_score = []
-user_score = []
-publisher = []
-developer = []
-sales_na = []
-sales_pal = []
-sales_jp = []
-sales_ot = []
-sales_gl = []
-
-urlhead = 'http://www.vgchartz.com/gamedb/?page='
-urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
-urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
-urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
-urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
-urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-
 def create_random_header():
     browsers = ["Mozilla", "Chrome"]
     os = ["Windows NT 6.1; Win64; x64" "X11; Linux x86_64"],
@@ -50,105 +26,135 @@ def get_page(url):
     result = urllib.request.urlopen(request).read()
     time.sleep(randint(6,15))
     return result
+def get_genre():
+    # go to every individual website to get genre info
+    url_to_game = tag.attrs['href']
+    site_raw = urllib.request.urlopen(url_to_game).read()
+    sub_soup = BeautifulSoup(site_raw, "html.parser")
+    # again, the info box is inconsistent among games so we
+    # have to find all the h2 and traverse from that to the genre name
+    h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
+    # make a temporary tag here to search for the one that contains
+    # the word "Genre"
+    temp_tag = element.Tag
+    for h2 in h2s:
+        if h2.string == 'Genre':
+            temp_tag = h2
+    genre.append(temp_tag.next_sibling.string)
 
-for page in range(1, pages):
-    surl = urlhead + str(page) + urltail
-    r = urllib.request.urlopen(surl).read()
-    soup = BeautifulSoup(r)
-    print(f"Page: {page}")
+def download_data():
+    rec_count = 0
+    for page in range(1, pages):
+        surl = urlhead + str(page) + urltail
+        r = urllib.request.urlopen(surl).read()
+        soup = BeautifulSoup(r)
+        print(f"Page: {page}")
 
-    # vgchartz website is really weird so we have to search for
-    # <a> tags with game urls
-    game_tags = list(filter(
-        lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
-        # discard the first 10 elements because those
-        # links are in the navigation bar
-        soup.find_all("a")
-    ))[10:]
+        # vgchartz website is really weird so we have to search for
+        # <a> tags with game urls
+        game_tags = list(filter(
+            lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
+            # discard the first 10 elements because those
+            # links are in the navigation bar
+            soup.find_all("a")
+        ))[10:]
 
-    for tag in game_tags:
+        for tag in game_tags:
 
-        # add name to list
-        gname.append(" ".join(tag.string.split()))
-        print(f"{rec_count + 1} Fetch data for game {gname[-1]}")
+            # add name to list
+            gname.append(" ".join(tag.string.split()))
+            print(f"{rec_count + 1} Fetch data for game {gname[-1]}")
 
-        # get different attributes
-        # traverse up the DOM tree
-        data = tag.parent.parent.find_all("td")
-        rank.append(np.int32(data[0].string))
-        platform.append(data[3].find('img').attrs['alt'])
-        publisher.append(data[4].string)
-        developer.append(data[5].string)
-        critic_score.append(
-            float(data[6].string) if
-            not data[6].string.startswith("N/A") else np.nan)
-        user_score.append(
-            float(data[7].string) if
-            not data[7].string.startswith("N/A") else np.nan)
-        sales_na.append(
-            float(data[9].string[:-1]) if
-            not data[9].string.startswith("N/A") else np.nan)
-        sales_pal.append(
-            float(data[10].string[:-1]) if
-            not data[10].string.startswith("N/A") else np.nan)
-        sales_jp.append(
-            float(data[11].string[:-1]) if
-            not data[11].string.startswith("N/A") else np.nan)
-        sales_ot.append(
-            float(data[12].string[:-1]) if
-            not data[12].string.startswith("N/A") else np.nan)
-        sales_gl.append(
-            float(data[8].string[:-1]) if
-            not data[8].string.startswith("N/A") else np.nan)
-        release_year = data[13].string.split()[-1]
-        # different format for year
-        if release_year.startswith('N/A'):
-            year.append('N/A')
-        else:
-            if int(release_year) >= 80:
-                year_to_add = np.int32("19" + release_year)
+            # get different attributes
+            # traverse up the DOM tree
+            data = tag.parent.parent.find_all("td")
+            rank.append(np.int32(data[0].string))
+            platform.append(data[3].find('img').attrs['alt'])
+            publisher.append(data[4].string)
+            developer.append(data[5].string)
+            critic_score.append(
+                float(data[6].string) if
+                not data[6].string.startswith("N/A") else np.nan)
+            user_score.append(
+                float(data[7].string) if
+                not data[7].string.startswith("N/A") else np.nan)
+            sales_na.append(
+                float(data[9].string[:-1]) if
+                not data[9].string.startswith("N/A") else np.nan)
+            sales_pal.append(
+                float(data[10].string[:-1]) if
+                not data[10].string.startswith("N/A") else np.nan)
+            sales_jp.append(
+                float(data[11].string[:-1]) if
+                not data[11].string.startswith("N/A") else np.nan)
+            sales_ot.append(
+                float(data[12].string[:-1]) if
+                not data[12].string.startswith("N/A") else np.nan)
+            sales_gl.append(
+                float(data[8].string[:-1]) if
+                not data[8].string.startswith("N/A") else np.nan)
+            release_year = data[13].string.split()[-1]
+            # different format for year
+            if release_year.startswith('N/A'):
+                year.append('N/A')
             else:
-                year_to_add = np.int32("20" + release_year)
-            year.append(year_to_add)
+                if int(release_year) >= 80:
+                    year_to_add = np.int32("19" + release_year)
+                else:
+                    year_to_add = np.int32("20" + release_year)
+                year.append(year_to_add)
+
+
+
+            rec_count += 1
 
-        # go to every individual website to get genre info
-        url_to_game = tag.attrs['href']
-        site_raw = urllib.request.urlopen(url_to_game).read()
-        sub_soup = BeautifulSoup(site_raw, "html.parser")
-        # again, the info box is inconsistent among games so we
-        # have to find all the h2 and traverse from that to the genre name
-        h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
-        # make a temporary tag here to search for the one that contains
-        # the word "Genre"
-        temp_tag = element.Tag
-        for h2 in h2s:
-            if h2.string == 'Genre':
-                temp_tag = h2
-        genre.append(temp_tag.next_sibling.string)
+    columns = {
+        'Rank': rank,
+        'Name': gname,
+        'Platform': platform,
+        'Year': year,
+        'Genre': genre,
+        'Critic_Score': critic_score,
+        'User_Score': user_score,
+        'Publisher': publisher,
+        'Developer': developer,
+        'NA_Sales': sales_na,
+        'PAL_Sales': sales_pal,
+        'JP_Sales': sales_jp,
+        'Other_Sales': sales_ot,
+        'Global_Sales': sales_gl
+    }
+    print(rec_count)
+    df = pd.DataFrame(columns)
+    print(df.columns)
+    df = df[[
+        'Rank', 'Name', 'Platform', 'Year', 'Genre',
+        'Publisher', 'Developer', 'Critic_Score', 'User_Score',
+        'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
+    df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)
 
-        rec_count += 1
+if __name__ == "__main__":
+    pages = 19
+    rec_count = 0
+    rank = []
+    gname = []
+    platform = []
+    year = []
+    genre = []
+    critic_score = []
+    user_score = []
+    publisher = []
+    developer = []
+    sales_na = []
+    sales_pal = []
+    sales_jp = []
+    sales_ot = []
+    sales_gl = []
 
-columns = {
-    'Rank': rank,
-    'Name': gname,
-    'Platform': platform,
-    'Year': year,
-    'Genre': genre,
-    'Critic_Score': critic_score,
-    'User_Score': user_score,
-    'Publisher': publisher,
-    'Developer': developer,
-    'NA_Sales': sales_na,
-    'PAL_Sales': sales_pal,
-    'JP_Sales': sales_jp,
-    'Other_Sales': sales_ot,
-    'Global_Sales': sales_gl
-}
-print(rec_count)
-df = pd.DataFrame(columns)
-print(df.columns)
-df = df[[
-    'Rank', 'Name', 'Platform', 'Year', 'Genre',
-    'Publisher', 'Developer', 'Critic_Score', 'User_Score',
-    'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
-df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)
+    urlhead = 'http://www.vgchartz.com/gamedb/?page='
+    urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
+    urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
+    urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
+    urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
+    urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
+    download_data()

From 65a0596fffdbad61b3fd8cd29f4414d52404c5fe Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 21:44:20 +0200
Subject: [PATCH 04/35] Fix missing imports, get_page usage and random OS
 pick for user agent

---
 vgchartzfull.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 7ad271f..0821e78 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -2,15 +2,17 @@
 import urllib
 import pandas as pd
 import numpy as np
+from random import randint, choice
+import time
 
 def create_random_header():
     browsers = ["Mozilla", "Chrome"]
-    os = ["Windows NT 6.1; Win64; x64" "X11; Linux x86_64"],
+    os = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"],
     #user_agent = 'Mozilla/{}.{} (Windows NT 6.1; Win64; x64)'.format(randint(1,56))
     major_version = randint(1, 56)
     minor_version = randint(1, 10)
-    chosen_browser = random.choice(browsers)
-    chosen_os = random.choice(os)
+    chosen_browser = choice(browsers)
+    chosen_os = choice(os)
 
     user_agent = '{}/{}.{} ({})'.format(
         chosen_browser,
@@ -46,8 +48,8 @@ def download_data():
     rec_count = 0
     for page in range(1, pages):
         surl = urlhead + str(page) + urltail
-        r = urllib.request.urlopen(surl).read()
-        soup = BeautifulSoup(r)
+        current_page = get_page(surl)
+        soup = BeautifulSoup(current_page)
         print(f"Page: {page}")
 
         # vgchartz website is really weird so we have to search for

From 2abab6059d78ff7daf5d9bc7a7dea437bfaefb6f Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 21:47:30 +0200
Subject: [PATCH 05/35] first working version with random user agent and sleep
 between requests

---
 vgchartzfull.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 0821e78..76d11db 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -7,12 +7,11 @@
 
 def create_random_header():
     browsers = ["Mozilla", "Chrome"]
-    os = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"],
-    #user_agent = 'Mozilla/{}.{} (Windows NT 6.1; Win64; x64)'.format(randint(1,56))
+    os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"]
     major_version = randint(1, 56)
     minor_version = randint(1, 10)
     chosen_browser = choice(browsers)
-    chosen_os = choice(os)
+    chosen_os = choice(os_list)
 
     user_agent = '{}/{}.{} ({})'.format(
         chosen_browser,

From ea44a6977716bcc96ad0dfb2cbb251772b77125d Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 23:10:05 +0200
Subject: [PATCH 06/35] Add basic documentation, modularized code

---
 vgchartzfull.py | 231 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 149 insertions(+), 82 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 76d11db..2b14450 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -5,11 +5,21 @@
 from random import randint, choice
 import time
 
-def create_random_header():
+
+def create_random_header(lb_major=1,
+                         ub_major=56,
+                         lb_minor=1,
+                         ub_minor=10):
+    """
+    Create a random user agent in order to better mimic user behaviour.
+    Optional parameters for defining the:
+    - range of browser's major version (lower and upper bound)
+    - range of browser's minor version (lower and upper bound)
+    """
     browsers = ["Mozilla", "Chrome"]
     os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"]
-    major_version = randint(1, 56)
-    minor_version = randint(1, 10)
+    major_version = randint(lb_major, ub_major)
+    minor_version = randint(lb_minor, ub_minor)
     chosen_browser = choice(browsers)
     chosen_os = choice(os_list)
 
@@ -21,16 +31,30 @@ def create_random_header():
     header = { 'User-Agent' :  user_agent}
     print(header)
     return header
-def get_page(url):
+
+
+def get_page(url,
+             lower_bound_sleep=6,
+             upper_bound_sleep=15):
+    """
+    Perform a GET request to the given URL and return results.
+    Add a wait logic that, combined with random header, will help avoiding
+    HTTP 429 error.
+    The optional parameters will allow further customization of waiting periods.
+    """
     header = create_random_header()
     request = urllib.request.Request(url, headers=header)
     result = urllib.request.urlopen(request).read()
-    time.sleep(randint(6,15))
+    time.sleep(randint(lower_bound_sleep, upper_bound_sleep))
     return result
-def get_genre():
-    # go to every individual website to get genre info
-    url_to_game = tag.attrs['href']
-    site_raw = urllib.request.urlopen(url_to_game).read()
+
+
+def get_genre(game_url):
+    """
+    Return the game genre retrieved from the given url
+    """
+
+    site_raw = get_page(game_url)
     sub_soup = BeautifulSoup(site_raw, "html.parser")
     # again, the info box is inconsistent among games so we
     # have to find all the h2 and traverse from that to the genre name
@@ -41,74 +65,60 @@ def get_genre():
     for h2 in h2s:
         if h2.string == 'Genre':
             temp_tag = h2
-    genre.append(temp_tag.next_sibling.string)
 
-def download_data():
-    rec_count = 0
-    for page in range(1, pages):
-        surl = urlhead + str(page) + urltail
-        current_page = get_page(surl)
-        soup = BeautifulSoup(current_page)
-        print(f"Page: {page}")
+    genre_value = temp_tag.next_sibling.string
+    return genre_value
 
-        # vgchartz website is really weird so we have to search for
-        # <a> tags with game urls
-        game_tags = list(filter(
-            lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
-            # discard the first 10 elements because those
-            # links are in the navigation bar
-            soup.find_all("a")
-        ))[10:]
 
-        for tag in game_tags:
+def get_release_year(raw_year):
+    """
+    Return the release year of the given game in a 4 digit format or N/A.
+    """
+    if raw_year.startswith('N/A'):
+        final_year = 'N/A'
+    elif int(raw_year) >= 80:
+        final_year = np.int32("19" + raw_year)
+    else:
+        final_year = np.int32("20" + raw_year)
+    return final_year
 
-            # add name to list
-            gname.append(" ".join(tag.string.split()))
-            print(f"{rec_count + 1} Fetch data for game {gname[-1]}")
 
-            # get different attributes
-            # traverse up the DOM tree
-            data = tag.parent.parent.find_all("td")
-            rank.append(np.int32(data[0].string))
-            platform.append(data[3].find('img').attrs['alt'])
-            publisher.append(data[4].string)
-            developer.append(data[5].string)
-            critic_score.append(
-                float(data[6].string) if
-                not data[6].string.startswith("N/A") else np.nan)
-            user_score.append(
-                float(data[7].string) if
-                not data[7].string.startswith("N/A") else np.nan)
-            sales_na.append(
-                float(data[9].string[:-1]) if
-                not data[9].string.startswith("N/A") else np.nan)
-            sales_pal.append(
-                float(data[10].string[:-1]) if
-                not data[10].string.startswith("N/A") else np.nan)
-            sales_jp.append(
-                float(data[11].string[:-1]) if
-                not data[11].string.startswith("N/A") else np.nan)
-            sales_ot.append(
-                float(data[12].string[:-1]) if
-                not data[12].string.startswith("N/A") else np.nan)
-            sales_gl.append(
-                float(data[8].string[:-1]) if
-                not data[8].string.startswith("N/A") else np.nan)
-            release_year = data[13].string.split()[-1]
-            # different format for year
-            if release_year.startswith('N/A'):
-                year.append('N/A')
-            else:
-                if int(release_year) >= 80:
-                    year_to_add = np.int32("19" + release_year)
-                else:
-                    year_to_add = np.int32("20" + release_year)
-                year.append(year_to_add)
-
-
-
-            rec_count += 1
+def add_current_game_data(current_critic_score,
+                          current_developer,
+                          current_gname,
+                          current_platform,
+                          current_publisher,
+                          current_rank,
+                          current_release_year,
+                          current_sales_gl,
+                          current_sales_jp,
+                          current_sales_na,
+                          current_sales_ot,
+                          current_sales_pal,
+                          current_user_score):
+
+    """
+    Add all the game data to the related lists
+    """
+    gname.append(current_gname)
+    rank.append(current_rank)
+    platform.append(current_platform)
+    publisher.append(current_publisher)
+    developer.append(current_developer)
+    critic_score.append(current_critic_score)
+    user_score.append(current_user_score)
+    sales_na.append(current_sales_na)
+    sales_pal.append(current_sales_pal)
+    sales_jp.append(current_sales_jp)
+    sales_ot.append(current_sales_ot)
+    sales_gl.append(current_sales_gl)
+    year.append(current_release_year)
 
+
+def save_games_data(filename = "vgsales.csv", separator=",", enc="utf-8"):
+    """
+    Save all the downloaded data into the specified file
+    """
     columns = {
         'Rank': rank,
         'Name': gname,
@@ -125,32 +135,87 @@ def download_data():
         'Other_Sales': sales_ot,
         'Global_Sales': sales_gl
     }
-    print(rec_count)
     df = pd.DataFrame(columns)
     print(df.columns)
     df = df[[
         'Rank', 'Name', 'Platform', 'Year', 'Genre',
         'Publisher', 'Developer', 'Critic_Score', 'User_Score',
         'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
-    df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)
+    df.to_csv(filename, sep=separator, encoding=enc, index=False)
+
+
+def download_data(start_page, end_page, download_genre=False):
+    """
+    Download games data from vgchartz: only data whose pages are in the range (start_page, end_page) will be downloaded
+    :param start_page:
+    :param end_page:
+    :param download_genre:
+    :return:
+    """
+    game_rank = 1 # Results are decreasingly ordered according to Shipped units
+    for page in range(start_page, end_page):
+        surl = urlhead + str(page) + urltail
+        current_page = get_page(surl)
+        soup = BeautifulSoup(current_page)
+        print(f"Page: {page}")
+
+        # vgchartz website is really weird so we have to search for
+        # <a> tags with game urls
+        game_tags = list(filter(
+            lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
+            # discard the first 10 elements because those
+            # links are in the navigation bar
+            soup.find_all("a")
+        ))[10:]
+
+        for tag in game_tags:
+
+            current_gname = " ".join(tag.string.split()) # add game name to list
+            print(f"{game_rank} Fetch data for game {current_gname}")
+
+            # Get different attributes
+            # traverse up the DOM tree
+            data = tag.parent.parent.find_all("td")
+            current_rank = np.int32(data[0].string)
+            current_platform = data[3].find('img').attrs['alt']
+            current_publisher = data[4].string
+            current_developer = data[5].string
+            current_critic_score = float(data[6].string) if not data[6].string.startswith("N/A") else np.nan
+            current_user_score = float(data[7].string) if not data[7].string.startswith("N/A") else np.nan
+            current_sales_na = float(data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan
+            current_sales_pal = float(data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan
+            current_sales_jp = float(data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan
+            current_sales_ot = float(data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan
+            current_sales_gl = float(data[8].string[:-1]) if not data[8].string.startswith("N/A") else np.nan
+            current_release_year = get_release_year(data[13].string.split()[-1])
+
+            add_current_game_data(current_critic_score, current_developer, current_gname, current_platform,
+                                  current_publisher, current_rank, current_release_year, current_sales_gl,
+                                  current_sales_jp, current_sales_na,current_sales_ot, current_sales_pal,
+                                  current_user_score)
+
+            game_url = tag.attrs['href']
+            game_genre = ""
+            if download_genre:
+                game_genre = get_genre(game_url)
+            genre.append(game_genre)
+
+            game_rank += 1
+
+    print("Number of downloaded resources: {}".format(game_rank))
+
 
 if __name__ == "__main__":
     pages = 19
-    rec_count = 0
     rank = []
     gname = []
     platform = []
     year = []
     genre = []
-    critic_score = []
-    user_score = []
+    critic_score, user_score = [], []
     publisher = []
     developer = []
-    sales_na = []
-    sales_pal = []
-    sales_jp = []
-    sales_ot = []
-    sales_gl = []
+    sales_na, sales_pal, sales_jp, sales_ot, sales_gl = [], [], [], [], []
 
     urlhead = 'http://www.vgchartz.com/gamedb/?page='
     urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
@@ -158,4 +223,6 @@ def download_data():
     urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
     urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
     urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-    download_data()
+    download_data(1, 2)
+    save_games_data()
+

From a55b284a66604de6dd654466e54832910592dd3b Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 23:34:14 +0200
Subject: [PATCH 07/35] Example property file

---
 resources.json | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 resources.json

diff --git a/resources.json b/resources.json
new file mode 100644
index 0000000..fd40629
--- /dev/null
+++ b/resources.json
@@ -0,0 +1,16 @@
+{
+  "output_filename": "vgsales.csv",
+  "separator": ",",
+  "encoding": "utf-8",
+  "start_page": 1,
+  "end_page": 2,
+  "include_genre": false,
+  "minimum_sleep_time": 6,
+  "maximum_sleep_time": 15,
+  "minimum_major_version": 1,
+  "maximum_major_version": 56,
+  "minimum_minor_version": 1,
+  "maximum_minor_version": 10,
+  "base_page_url": "http://www.vgchartz.com/gamedb/?page=",
+  "remaining_url": "&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL"
+}

From 70922cf270f0e9ee3ee1fd03a950594e1b26c837 Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Tue, 14 May 2019 23:35:04 +0200
Subject: [PATCH 08/35] Completed refactor, read many parameters from property
 file

---
 vgchartzfull.py | 97 ++++++++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 50 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 2b14450..58636b4 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -4,22 +4,17 @@
 import numpy as np
 from random import randint, choice
 import time
+import json
 
 
-def create_random_header(lb_major=1,
-                         ub_major=56,
-                         lb_minor=1,
-                         ub_minor=10):
+def create_random_header():
     """
     Create a random user agent in order to better mimic user behaviour.
-    Optional parameters for defining the:
-    - range of browser's major version (lower and upper bound)
-    - range of browser's minor version (lower and upper bound)
     """
     browsers = ["Mozilla", "Chrome"]
     os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"]
-    major_version = randint(lb_major, ub_major)
-    minor_version = randint(lb_minor, ub_minor)
+    major_version = randint(properties['minimum_major_version'], properties['maximum_major_version'])
+    minor_version = randint(properties['minimum_minor_version'], properties['maximum_minor_version'])
     chosen_browser = choice(browsers)
     chosen_os = choice(os_list)
 
@@ -33,19 +28,16 @@ def create_random_header(lb_major=1,
     return header
 
 
-def get_page(url,
-             lower_bound_sleep=6,
-             upper_bound_sleep=15):
+def get_page(url):
     """
     Perform a GET request to the given URL and return results.
     Add a wait logic that, combined with random header, will help avoiding
     HTTP 429 error.
-    The optional parameters will allow further customization of waiting periods.
     """
     header = create_random_header()
     request = urllib.request.Request(url, headers=header)
     result = urllib.request.urlopen(request).read()
-    time.sleep(randint(lower_bound_sleep, upper_bound_sleep))
+    time.sleep(randint(properties['minimum_sleep_time'], properties['maximum_sleep_time']))
     return result
 
 
@@ -115,45 +107,16 @@ def add_current_game_data(current_critic_score,
     year.append(current_release_year)
 
 
-def save_games_data(filename = "vgsales.csv", separator=",", enc="utf-8"):
-    """
-    Save all the downloaded data into the specified file
-    """
-    columns = {
-        'Rank': rank,
-        'Name': gname,
-        'Platform': platform,
-        'Year': year,
-        'Genre': genre,
-        'Critic_Score': critic_score,
-        'User_Score': user_score,
-        'Publisher': publisher,
-        'Developer': developer,
-        'NA_Sales': sales_na,
-        'PAL_Sales': sales_pal,
-        'JP_Sales': sales_jp,
-        'Other_Sales': sales_ot,
-        'Global_Sales': sales_gl
-    }
-    df = pd.DataFrame(columns)
-    print(df.columns)
-    df = df[[
-        'Rank', 'Name', 'Platform', 'Year', 'Genre',
-        'Publisher', 'Developer', 'Critic_Score', 'User_Score',
-        'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
-    df.to_csv(filename, sep=separator, encoding=enc, index=False)
-
-
-def download_data(start_page, end_page, download_genre=False):
+def download_data(start_page, end_page, include_genre):
     """
     Download games data from vgchartz: only data whose pages are in the range (start_page, end_page) will be downloaded
     :param start_page:
     :param end_page:
-    :param download_genre:
+    :param include_genre:
     :return:
     """
     game_rank = 1 # Results are decreasingly ordered according to Shipped units
-    for page in range(start_page, end_page):
+    for page in range(start_page, end_page + 1):
         surl = urlhead + str(page) + urltail
         current_page = get_page(surl)
         soup = BeautifulSoup(current_page)
@@ -196,7 +159,7 @@ def download_data(start_page, end_page, download_genre=False):
 
             game_url = tag.attrs['href']
             game_genre = ""
-            if download_genre:
+            if include_genre:
                 game_genre = get_genre(game_url)
             genre.append(game_genre)
 
@@ -205,8 +168,36 @@ def download_data(start_page, end_page, download_genre=False):
     print("Number of downloaded resources: {}".format(game_rank))
 
 
+def save_games_data(filename, separator, enc):
+    """
+    Save all the downloaded data into the specified file
+    """
+    columns = {
+        'Rank': rank,
+        'Name': gname,
+        'Platform': platform,
+        'Year': year,
+        'Genre': genre,
+        'Critic_Score': critic_score,
+        'User_Score': user_score,
+        'Publisher': publisher,
+        'Developer': developer,
+        'NA_Sales': sales_na,
+        'PAL_Sales': sales_pal,
+        'JP_Sales': sales_jp,
+        'Other_Sales': sales_ot,
+        'Global_Sales': sales_gl
+    }
+    df = pd.DataFrame(columns)
+    print(df.columns)
+    df = df[[
+        'Rank', 'Name', 'Platform', 'Year', 'Genre',
+        'Publisher', 'Developer', 'Critic_Score', 'User_Score',
+        'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
+    df.to_csv(filename, sep=separator, encoding=enc, index=False)
+
+
 if __name__ == "__main__":
-    pages = 19
     rank = []
     gname = []
     platform = []
@@ -223,6 +214,12 @@ def download_data(start_page, end_page, download_genre=False):
     urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
     urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
     urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-    download_data(1, 2)
-    save_games_data()
+
+    properties = None
+
+    with open("resources.json") as file:
+        properties = json.load(file)
+    print(properties)
+    download_data(properties['start_page'], properties['end_page'], properties['include_genre'])
+    save_games_data(properties['output_filename'], properties['separator'], properties['encoding'])
 

From e65e51a431ce4b01717ed2d9f561899609479ebf Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Wed, 15 May 2019 10:46:42 +0200
Subject: [PATCH 09/35] Read vgchartz url from config json

---
 vgchartzfull.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 58636b4..f84c697 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -117,7 +117,7 @@ def download_data(start_page, end_page, include_genre):
     """
     game_rank = 1 # Results are decreasingly ordered according to Shipped units
     for page in range(start_page, end_page + 1):
-        surl = urlhead + str(page) + urltail
+        surl = base_url + str(page) + remaining_url
         current_page = get_page(surl)
         soup = BeautifulSoup(current_page)
         print(f"Page: {page}")
@@ -208,18 +208,13 @@ def save_games_data(filename, separator, enc):
     developer = []
     sales_na, sales_pal, sales_jp, sales_ot, sales_gl = [], [], [], [], []
 
-    urlhead = 'http://www.vgchartz.com/gamedb/?page='
-    urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
-    urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
-    urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
-    urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
-    urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-
     properties = None
 
     with open("resources.json") as file:
         properties = json.load(file)
     print(properties)
+    base_url = properties['base_page_url']
+    remaining_url = properties['remaining_url']
     download_data(properties['start_page'], properties['end_page'], properties['include_genre'])
     save_games_data(properties['output_filename'], properties['separator'], properties['encoding'])
 

From d5ab1f45470cc221e1c35e364cd42bdc624a4dd9 Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Wed, 15 May 2019 13:04:48 +0200
Subject: [PATCH 10/35] Add entry for log filename

---
 resources.json  |  3 ++-
 vgchartzfull.py | 16 ++++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/resources.json b/resources.json
index fd40629..e3559b7 100644
--- a/resources.json
+++ b/resources.json
@@ -12,5 +12,6 @@
   "minimum_minor_version": 1,
   "maximum_minor_version": 10,
   "base_page_url": "http://www.vgchartz.com/gamedb/?page=",
-  "remaining_url": "&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL"
+  "remaining_url": "&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL",
+  "application_log_filename": "app.log"
 }
diff --git a/vgchartzfull.py b/vgchartzfull.py
index f84c697..0ee3b96 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -5,6 +5,7 @@
 from random import randint, choice
 import time
 import json
+import logging
 
 
 def create_random_header():
@@ -92,7 +93,7 @@ def add_current_game_data(current_critic_score,
     """
     Add all the game data to the related lists
     """
-    gname.append(current_gname)
+    game_name.append(current_gname)
     rank.append(current_rank)
     platform.append(current_platform)
     publisher.append(current_publisher)
@@ -174,7 +175,7 @@ def save_games_data(filename, separator, enc):
     """
     columns = {
         'Rank': rank,
-        'Name': gname,
+        'Name': game_name,
         'Platform': platform,
         'Year': year,
         'Genre': genre,
@@ -198,8 +199,10 @@ def save_games_data(filename, separator, enc):
 
 
 if __name__ == "__main__":
+
+
     rank = []
-    gname = []
+    game_name = []
     platform = []
     year = []
     genre = []
@@ -212,7 +215,12 @@ def save_games_data(filename, separator, enc):
 
     with open("resources.json") as file:
         properties = json.load(file)
-    print(properties)
+
+    logging.basicConfig(filename=properties["application_log_filename"],
+                        filemode='w',
+                        format='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
+                        datefmt='%d-%m-%y %H:%M:%S')
+    logging.warning('Application started')
     base_url = properties['base_page_url']
     remaining_url = properties['remaining_url']
     download_data(properties['start_page'], properties['end_page'], properties['include_genre'])

From 4ed64bf40e767972010a4388852469853ce0f71c Mon Sep 17 00:00:00 2001
From: hechmik <hechmi.khaled1995@gmail.com>
Date: Wed, 15 May 2019 13:05:26 +0200
Subject: [PATCH 11/35] Improve documentation, add logging to both console and
 file

---
 vgchartzfull.py | 96 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 72 insertions(+), 24 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 0ee3b96..8d82715 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -11,7 +11,9 @@
 def create_random_header():
     """
     Create a random user agent in order to better mimic user behaviour.
+    :return JSON with User-Agent as key and random browser-os combo as value
     """
+    logging.info("create_random_header >>>")
     browsers = ["Mozilla", "Chrome"]
     os_list = ["Windows NT 6.1; Win64; x64", "X11; Linux x86_64"]
     major_version = randint(properties['minimum_major_version'], properties['maximum_major_version'])
@@ -24,8 +26,9 @@ def create_random_header():
         major_version,
         minor_version,
         chosen_os)
-    header = { 'User-Agent' :  user_agent}
-    print(header)
+    header = {'User-Agent': user_agent}
+    logging.debug("Current user_agent: {}".format(header))
+    logging.info("create_random_header <<<")
     return header
 
 
@@ -34,19 +37,27 @@ def get_page(url):
     Perform a GET request to the given URL and return results.
     Add a wait logic that, combined with random header, will help avoiding
     HTTP 429 error.
+    :param url: webpage URL
+    :return: HTML page's body
     """
+    logging.info("get_page >>>")
+    logging.debug("Current URL: {}".format(url))
     header = create_random_header()
     request = urllib.request.Request(url, headers=header)
     result = urllib.request.urlopen(request).read()
     time.sleep(randint(properties['minimum_sleep_time'], properties['maximum_sleep_time']))
+    logging.info("get_page <<<")
     return result
 
 
 def get_genre(game_url):
     """
     Return the game genre retrieved from the given url
+    :param game_url:
+    :return: Genre of the input game
     """
-
+    logging.info("get_genre >>>")
+    logging.debug("Page to download: {}".format(game_url))
     site_raw = get_page(game_url)
     sub_soup = BeautifulSoup(site_raw, "html.parser")
     # again, the info box is inconsistent among games so we
@@ -60,25 +71,32 @@ def get_genre(game_url):
             temp_tag = h2
 
     genre_value = temp_tag.next_sibling.string
+    logging.debug("Game genre: {}".format(genre_value))
+    logging.info("get_genre <<<")
     return genre_value
 
 
 def get_release_year(raw_year):
     """
     Return the release year of the given game in a 4 digit format or N/A.
+    :param raw_year:
+    :return: Game Release year
     """
+    logging.info("get_release_year >>>")
     if raw_year.startswith('N/A'):
         final_year = 'N/A'
     elif int(raw_year) >= 80:
         final_year = np.int32("19" + raw_year)
     else:
         final_year = np.int32("20" + raw_year)
+    logging.debug("Release Year: {}".format(final_year))
+    logging.info("get_release_year <<<")
     return final_year
 
 
 def add_current_game_data(current_critic_score,
                           current_developer,
-                          current_gname,
+                          current_game_name,
                           current_platform,
                           current_publisher,
                           current_rank,
@@ -89,11 +107,26 @@ def add_current_game_data(current_critic_score,
                           current_sales_ot,
                           current_sales_pal,
                           current_user_score):
-
     """
     Add all the game data to the related lists
+
+    :param current_critic_score:
+    :param current_developer:
+    :param current_game_name:
+    :param current_platform:
+    :param current_publisher:
+    :param current_rank:
+    :param current_release_year:
+    :param current_sales_gl:
+    :param current_sales_jp:
+    :param current_sales_na:
+    :param current_sales_ot:
+    :param current_sales_pal:
+    :param current_user_score:
+    :return:
     """
-    game_name.append(current_gname)
+    logging.info("add_current_game_data >>>")
+    game_name.append(current_game_name)
     rank.append(current_rank)
     platform.append(current_platform)
     publisher.append(current_publisher)
@@ -106,6 +139,7 @@ def add_current_game_data(current_critic_score,
     sales_ot.append(current_sales_ot)
     sales_gl.append(current_sales_gl)
     year.append(current_release_year)
+    logging.info("add_current_game_data <<<")
 
 
 def download_data(start_page, end_page, include_genre):
@@ -116,12 +150,13 @@ def download_data(start_page, end_page, include_genre):
     :param include_genre:
     :return:
     """
-    game_rank = 1 # Results are decreasingly ordered according to Shipped units
+    logging.info("download_data >>>")
+    downloaded_games = 0  # Results are decreasingly ordered according to Shipped units
     for page in range(start_page, end_page + 1):
-        surl = base_url + str(page) + remaining_url
-        current_page = get_page(surl)
+        page_url = "{}{}{}".format(base_url, str(page), remaining_url)
+        current_page = get_page(page_url)
         soup = BeautifulSoup(current_page)
-        print(f"Page: {page}")
+        logging.info("Downloaded page {}".format(page))
 
         # vgchartz website is really weird so we have to search for
         # <a> tags with game urls
@@ -134,8 +169,8 @@ def download_data(start_page, end_page, include_genre):
 
         for tag in game_tags:
 
-            current_gname = " ".join(tag.string.split()) # add game name to list
-            print(f"{game_rank} Fetch data for game {current_gname}")
+            current_gname = " ".join(tag.string.split())  # add game name to list
+            logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_gname))
 
             # Get different attributes
             # traverse up the DOM tree
@@ -155,7 +190,7 @@ def download_data(start_page, end_page, include_genre):
 
             add_current_game_data(current_critic_score, current_developer, current_gname, current_platform,
                                   current_publisher, current_rank, current_release_year, current_sales_gl,
-                                  current_sales_jp, current_sales_na,current_sales_ot, current_sales_pal,
+                                  current_sales_jp, current_sales_na, current_sales_ot, current_sales_pal,
                                   current_user_score)
 
             game_url = tag.attrs['href']
@@ -164,15 +199,20 @@ def download_data(start_page, end_page, include_genre):
                 game_genre = get_genre(game_url)
             genre.append(game_genre)
 
-            game_rank += 1
+            downloaded_games += 1
 
-    print("Number of downloaded resources: {}".format(game_rank))
+    logging.info("Number of downloaded resources: {}".format(downloaded_games))
+    logging.info("download_data <<<")
 
 
 def save_games_data(filename, separator, enc):
     """
     Save all the downloaded data into the specified file
+    :param filename
+    :param separator
+    :param enc
     """
+    logging.info("save_games_data >>>")
     columns = {
         'Rank': rank,
         'Name': game_name,
@@ -190,17 +230,16 @@ def save_games_data(filename, separator, enc):
         'Global_Sales': sales_gl
     }
     df = pd.DataFrame(columns)
-    print(df.columns)
+    logging.debug("Dataframe column name: {}".format(df.columns))
     df = df[[
         'Rank', 'Name', 'Platform', 'Year', 'Genre',
         'Publisher', 'Developer', 'Critic_Score', 'User_Score',
         'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
     df.to_csv(filename, sep=separator, encoding=enc, index=False)
+    logging.info("save_games_data <<<")
 
 
 if __name__ == "__main__":
-
-
     rank = []
     game_name = []
     platform = []
@@ -216,13 +255,22 @@ def save_games_data(filename, separator, enc):
     with open("resources.json") as file:
         properties = json.load(file)
 
-    logging.basicConfig(filename=properties["application_log_filename"],
-                        filemode='w',
-                        format='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
-                        datefmt='%d-%m-%y %H:%M:%S')
-    logging.warning('Application started')
+    logging.root.handlers = []
+    logging.basicConfig(format='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
+                        level=logging.DEBUG,
+                        filename=properties["application_log_filename"])
+
+    # set up logging to console
+    console = logging.StreamHandler()
+    console.setLevel(logging.DEBUG)
+    # set a format which is simpler for console use
+    formatter = logging.Formatter(fmt='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
+                                  datefmt="%d-%m-%Y %H:%M:%S")
+    console.setFormatter(formatter)
+    logging.getLogger("").addHandler(console)
+
+    logging.info('Application started')
     base_url = properties['base_page_url']
     remaining_url = properties['remaining_url']
     download_data(properties['start_page'], properties['end_page'], properties['include_genre'])
     save_games_data(properties['output_filename'], properties['separator'], properties['encoding'])
-

From 8a3e28fb57713f6314539824d465960ed673adc2 Mon Sep 17 00:00:00 2001
From: Khaled Hechmi <hechmi.khaled1995@gmail.com>
Date: Wed, 4 Mar 2020 15:40:28 +0100
Subject: [PATCH 12/35] Upgraded to HTTPS

---
 resources.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/resources.json b/resources.json
index e3559b7..d6d4167 100644
--- a/resources.json
+++ b/resources.json
@@ -11,7 +11,7 @@
   "maximum_major_version": 56,
   "minimum_minor_version": 1,
   "maximum_minor_version": 10,
-  "base_page_url": "http://www.vgchartz.com/gamedb/?page=",
+  "base_page_url": "https://www.vgchartz.com/gamedb/?page=",
   "remaining_url": "&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL",
   "application_log_filename": "app.log"
 }

From b31b21d7638ee5e44c6f92985210334d98fba5e1 Mon Sep 17 00:00:00 2001
From: Khaled Hechmi <hechmi.khaled1995@gmail.com>
Date: Wed, 4 Mar 2020 15:50:33 +0100
Subject: [PATCH 13/35] Use https in lambda for skipping first 10 elements

---
 vgchartzfull.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 8d82715..425238b 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -161,7 +161,7 @@ def download_data(start_page, end_page, include_genre):
         # vgchartz website is really weird so we have to search for
         # <a> tags with game urls
         game_tags = list(filter(
-            lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
+            lambda x: x.attrs['href'].startswith('https://www.vgchartz.com/game/'),
             # discard the first 10 elements because those
             # links are in the navigation bar
             soup.find_all("a")

From 1c86411d705485e8f9a71d9de35210fb88d3bb86 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 11:18:09 +0200
Subject: [PATCH 14/35] Update README.md

---
 README.md | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index e557119..27dc238 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,19 @@
-vgchartzfull is a python script based on BeautifulSoup.
-It creates a dataset based on data from 
-http://www.vgchartz.com/gamedb/
+# vgchartzfull
+
+
+vgchartzfull.py is a python@3 script based on BeautifulSoup. It creates a dataset based on data from  http://www.vgchartz.com/gamedb/ 
 
 The dataset is saved as vgsales.csv.
 
-You will need to have BeautifulSoup added.
+You will need to have some depencies compiled at **requirements.txt**.
+
 It can be installed by pip.
 
-sudo pip install BeautifulSoup
+```bash
+
+  $> pip install -r requirements.txt
+
+```
 
-Thanks to Chris Albon.
+Thanks to Chris Albon & Gregor UT
 http://chrisalbon.com/python/beautiful_soup_scrape_table.html

From fb81a1358ad2681a5e7ee7e321ec15bd1a5547a0 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 11:25:40 +0200
Subject: [PATCH 15/35] Update README.md

---
 README.md | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 27dc238..666d4a1 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,13 @@
-# vgchartzfull
-
+# vgchartzfull - A crawler to download and analyze Video Game Sales data from more than 16,500 games.
 
 vgchartzfull.py is a python@3 script based on BeautifulSoup. It creates a dataset based on data from  http://www.vgchartz.com/gamedb/ 
 
+## Output
+
 The dataset is saved as vgsales.csv.
 
+## Install & execution
+
 You will need to have some depencies compiled at **requirements.txt**.
 
 It can be installed by pip.
@@ -15,5 +18,30 @@ It can be installed by pip.
 
 ```
 
+## Dictionary
+
+| Field | Description              |
+|-------|--------------------------|
+| Rank  | Ranking of overall sales |
+| Name | The games name |
+| Platform | Platform of the games release (i.e. PC,PS4, etc.) |
+| Year | Year of the game's release |
+| Genre | Genre of the game |
+| Publisher | Publisher of the game |
+| NA_Sales | Sales in North America (in millions) |
+| EU_Sales | Sales in Europe (in millions) |
+| JP_Sales | Sales in Japan (in millions) |
+| Other_Sales | Sales in the rest of the world (in millions) |
+| Global_Sales | Total worldwide sales. |
+
+
+## Links
+
+* [vgchartz.com](https://www.vgchartz.com)
+* [Original Crawler](https://github.com/GregorUT/vgchartzScrape)
+* [Kaggle Dataset](https://www.kaggle.com/gregorut/videogamesales)
+
+## Greetings
+
 Thanks to Chris Albon & Gregor UT
 http://chrisalbon.com/python/beautiful_soup_scrape_table.html

From dd64927d1cf9ef497ec505412177dfb145c2a9d5 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 11:26:11 +0200
Subject: [PATCH 16/35] Create requirements.txt

---
 requirements.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f546999
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4==4.8.2
+numpy==1.16.4
+pandas==0.25.0

From c732358cc03669709eca0e882ec8fc7af6d8715b Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 11:26:37 +0200
Subject: [PATCH 17/35] Update .gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index be2baa1..9b00b42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+.idea
+.vcs
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

From 35cff1d295f8aba89b168e54202e1e0d7bd06efb Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 11:28:31 +0200
Subject: [PATCH 18/35] Update README.md

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 666d4a1..8e15845 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,12 @@ It can be installed by pip.
 
 ```bash
 
+  # Install dependencies
   $> pip install -r requirements.txt
+  
+  # Run
+  $> python vgchartzfull.py
+  
 
 ```
 

From 0f5aec361aa87118329034a54a17e80de4b5a3a2 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 11:29:16 +0200
Subject: [PATCH 19/35] Update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 8e15845..bbec132 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
-# vgchartzfull - A crawler to download and analyze Video Game Sales data from more than 16,500 games.
+# vgchartzfull - A crawler to download and analyze Video Game Sales
 
-vgchartzfull.py is a python@3 script based on BeautifulSoup. It creates a dataset based on data from  http://www.vgchartz.com/gamedb/ 
+vgchartzfull.py is a python@3 script based on BeautifulSoup.
+
+It creates a dataset with data from more than 16,500 games. based on data from  http://www.vgchartz.com/gamedb/ 
 
 ## Output
 

From d8f2173f23b32c83795723a6f209d602eccd2440 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 11:29:51 +0200
Subject: [PATCH 20/35] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index bbec132..0854b43 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# vgchartzfull - A crawler to download and analyze Video Game Sales
+# vgchartzfull - A crawler to download data from Global Videogame Sales
 
 vgchartzfull.py is a python@3 script based on BeautifulSoup.
 

From 6685ad393a2f41d1d5207e914976f407b07af2a3 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 12:36:23 +0200
Subject: [PATCH 21/35] Update .gitignore

Avoid committing partial CSV data files
---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 9b00b42..fffebb9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 .idea
 .vcs
 
+*.csv
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

From 33a53d830b04ca62500ec1293b75c7f7315c8594 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 13:56:40 +0200
Subject: [PATCH 22/35] Refactor in functions

---
 vgchartzfull.py | 344 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 235 insertions(+), 109 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index b1d75a4..bb98ac1 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -2,11 +2,16 @@
 import urllib
 import pandas as pd
 import numpy as np
+import datetime
+import time
 
-pages = 19
+# Environment & buffers
 rec_count = 0
+page_size = 10
+pages = 4          # 57,453 / 1000 = 58 (At the time of this writing)
+
 rank = []
-gname = []
+game_name = []
 platform = []
 year = []
 genre = []
@@ -20,111 +25,232 @@
 sales_ot = []
 sales_gl = []
 
-urlhead = 'http://www.vgchartz.com/gamedb/?page='
-urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
-urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
-urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
-urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
-urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-
-for page in range(1, pages):
-    surl = urlhead + str(page) + urltail
-    r = urllib.request.urlopen(surl).read()
-    soup = BeautifulSoup(r)
-    print(f"Page: {page}")
-
-    # vgchartz website is really weird so we have to search for
-    # <a> tags with game urls
-    game_tags = list(filter(
-        lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
-        # discard the first 10 elements because those
-        # links are in the navigation bar
-        soup.find_all("a")
-    ))[10:]
-
-    for tag in game_tags:
-
-        # add name to list
-        gname.append(" ".join(tag.string.split()))
-        print(f"{rec_count + 1} Fetch data for game {gname[-1]}")
-
-        # get different attributes
-        # traverse up the DOM tree
-        data = tag.parent.parent.find_all("td")
-        rank.append(np.int32(data[0].string))
-        platform.append(data[3].find('img').attrs['alt'])
-        publisher.append(data[4].string)
-        developer.append(data[5].string)
-        critic_score.append(
-            float(data[6].string) if
-            not data[6].string.startswith("N/A") else np.nan)
-        user_score.append(
-            float(data[7].string) if
-            not data[7].string.startswith("N/A") else np.nan)
-        sales_na.append(
-            float(data[9].string[:-1]) if
-            not data[9].string.startswith("N/A") else np.nan)
-        sales_pal.append(
-            float(data[10].string[:-1]) if
-            not data[10].string.startswith("N/A") else np.nan)
-        sales_jp.append(
-            float(data[11].string[:-1]) if
-            not data[11].string.startswith("N/A") else np.nan)
-        sales_ot.append(
-            float(data[12].string[:-1]) if
-            not data[12].string.startswith("N/A") else np.nan)
-        sales_gl.append(
-            float(data[8].string[:-1]) if
-            not data[8].string.startswith("N/A") else np.nan)
-        release_year = data[13].string.split()[-1]
-        # different format for year
-        if release_year.startswith('N/A'):
-            year.append('N/A')
+def main ():
+    """
+    Main Crawler Loop
+
+    :return: a csv file :)
+    """
+
+    for page in range(1, pages):
+
+        try:
+            surl = generate_uri(page_number=str(page), page_size=page_size)
+            r = urllib.request.urlopen(surl).read()
+            soup = BeautifulSoup(r, features="html.parser")
+            print(f"Crawling page: {page} of {pages}")
+
+            # We locate the game from <a> tags with game urls
+            game_tags = list(filter(
+                lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
+                # discard the first 10 elements because those links are in the navigation bar
+                soup.find_all("a")
+            ))[10:]
+
+            # Loop for each line received
+            for tag in game_tags:
+                parse_game(tag=tag)
+
+        except urllib.error.HTTPError as e:
+            print("Unexpected error:", sys.exc_info()[0])
+            print(e.code)
+            print(e.read())
+
+            time.sleep(15)
+
+        finally:
+            # Crawlers: The Friend Nobody Likes
+            time.sleep(60)
+
+    # Generate and export to CSV
+    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M")
+    df.to_csv(f"vgsales-{timestamp}.csv", sep=",", encoding='utf-8', index=False)
+
+
+def generate_uri(*, page_number, page_size):
+    """
+
+    Generate the uri from page number
+
+    :param page_number:
+    :return:
+    """
+
+    urlhead = 'http://www.vgchartz.com/gamedb/?page='
+    # page_number... <= here comes the param received
+    urltail = f'&results={page_size}'
+    urltail += '&order=Sales'
+    urltail += '&region=All'
+    urltail += '&boxart=Both'
+    urltail += '&banner=Both'
+    urltail += '&ownership=Both'
+    urltail += '&keyword='
+    urltail += '&console='
+    urltail += '&developer='
+    urltail += '&publisher='
+    urltail += '&goty_year='
+    urltail += '&genre='
+    urltail += '&showmultiplat=No'
+    urltail += '&showtotalsales=0'
+    urltail += '&showtotalsales=1'
+    urltail += '&showpublisher=0'
+    urltail += '&showpublisher=1'
+    urltail += '&showvgchartzscore=0'
+    urltail += '&showvgchartzscore=1'
+    urltail += '&shownasales=0'
+    urltail += '&shownasales=1'
+    urltail += '&showdeveloper=0'
+    urltail += '&showdeveloper=1'
+    urltail += '&showcriticscore=0'
+    urltail += '&showcriticscore=1'
+    urltail += '&showpalsales=0'
+    urltail += '&showpalsales=1'
+    urltail += '&showreleasedate=0'
+    urltail += '&showreleasedate=1'
+    urltail += '&showuserscore=0'
+    urltail += '&showuserscore=1'
+    urltail += '&showjapansales=0'
+    urltail += '&showjapansales=1'
+    urltail += '&showlastupdate=0'
+    urltail += '&showlastupdate=1'
+    urltail += '&showothersales=0'
+    urltail += '&showothersales=1'
+    urltail += '&showshipped=0'
+    urltail += '&showshipped=1'
+
+    return urlhead + str(page_number) + urltail
+
+def parse_game(*, tag):
+    """
+    Parse a game and navigate to its particular url to grab its data
+
+    :param tag:
+    :return:
+    """
+
+    # Add name to list
+    game_name.append(" ".join(tag.string.split()))
+    print(f"{rec_count + 1} Fetch data for game {game_name[-1]}")
+
+    # Get different attributes traverse up the DOM tree
+    data = tag.parent.parent.find_all("td")
+    rank.append(np.int32(data[0].string))
+    platform.append(data[3].find('img').attrs['alt'])
+    publisher.append(data[4].string)
+    developer.append(data[5].string)
+
+    critic_score.append(float(data[6].string) if
+        not data[6].string.startswith("N/A") else np.nan)
+
+    user_score.append(
+        float(data[7].string) if
+        not data[7].string.startswith("N/A") else np.nan)
+
+    sales_na.append(
+        float(data[9].string[:-1]) if
+        not data[9].string.startswith("N/A") else np.nan)
+
+    sales_pal.append(
+        float(data[10].string[:-1]) if
+        not data[10].string.startswith("N/A") else np.nan)
+
+    sales_jp.append(
+        float(data[11].string[:-1]) if
+        not data[11].string.startswith("N/A") else np.nan)
+
+    sales_ot.append(
+        float(data[12].string[:-1]) if
+        not data[12].string.startswith("N/A") else np.nan)
+
+    sales_gl.append(
+        float(data[8].string[:-1]) if
+        not data[8].string.startswith("N/A") else np.nan)
+
+    release_year = data[13].string.split()[-1]
+
+    # different format for year i.e. 2K year effect XD
+    if release_year.startswith('N/A'):
+        year.append('N/A')
+    else:
+        if int(release_year) >= 80:
+            year_to_add = np.int32("19" + release_year)
         else:
-            if int(release_year) >= 80:
-                year_to_add = np.int32("19" + release_year)
-            else:
-                year_to_add = np.int32("20" + release_year)
-            year.append(year_to_add)
-
-        # go to every individual website to get genre info
-        url_to_game = tag.attrs['href']
-        site_raw = urllib.request.urlopen(url_to_game).read()
-        sub_soup = BeautifulSoup(site_raw, "html.parser")
-        # again, the info box is inconsistent among games so we
-        # have to find all the h2 and traverse from that to the genre name
-        h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
-        # make a temporary tag here to search for the one that contains
-        # the word "Genre"
-        temp_tag = element.Tag
-        for h2 in h2s:
-            if h2.string == 'Genre':
-                temp_tag = h2
-        genre.append(temp_tag.next_sibling.string)
-
-        rec_count += 1
-
-columns = {
-    'Rank': rank,
-    'Name': gname,
-    'Platform': platform,
-    'Year': year,
-    'Genre': genre,
-    'Critic_Score': critic_score,
-    'User_Score': user_score,
-    'Publisher': publisher,
-    'Developer': developer,
-    'NA_Sales': sales_na,
-    'PAL_Sales': sales_pal,
-    'JP_Sales': sales_jp,
-    'Other_Sales': sales_ot,
-    'Global_Sales': sales_gl
-}
-print(rec_count)
-df = pd.DataFrame(columns)
-print(df.columns)
-df = df[[
-    'Rank', 'Name', 'Platform', 'Year', 'Genre',
-    'Publisher', 'Developer', 'Critic_Score', 'User_Score',
-    'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
-df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)
+            year_to_add = np.int32("20" + release_year)
+        year.append(year_to_add)
+
+    # go to every individual website to get genre info
+    url_to_game = tag.attrs['href']
+    site_raw = urllib.request.urlopen(url_to_game).read()
+    sub_soup = BeautifulSoup(site_raw, "html.parser")
+
+    # again, the info box is inconsistent among games so we
+    # have to find all the h2 and traverse from that to the genre name
+    h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
+
+    # make a temporary tag here to search for the one that contains
+    # the word "Genre"
+    temp_tag = element.Tag
+
+    for h2 in h2s:
+        if h2.string == 'Genre':
+            temp_tag = h2
+    genre.append(temp_tag.next_sibling.string)
+
+    rec_count += 1
+
+    # Crawlers: The Friend Nobody Likes
+    time.sleep(10)
+
+
+def assemble_response(*, rank, game_name, platform, year, genre, critic_score, user_score, publisher, developer, sales_na, sales_pal, sales_jp, sales_ot, sales_gl):
+    """
+    
+    Assemble from buffers to a Panda DataFrame
+    
+    :param rank: 
+    :param game_name:
+    :param platform: 
+    :param year: 
+    :param genre: 
+    :param critic_score: 
+    :param user_score: 
+    :param publisher: 
+    :param developer: 
+    :param sales_na: 
+    :param sales_pal: 
+    :param sales_jp: 
+    :param sales_ot: 
+    :param sales_gl: 
+    :return: 
+    """
+
+    # Assembler
+    columns = {
+        'Rank': rank,
+        'Name': game_name,
+        'Platform': platform,
+        'Year': year,
+        'Genre': genre,
+        'Critic_Score': critic_score,
+        'User_Score': user_score,
+        'Publisher': publisher,
+        'Developer': developer,
+        'NA_Sales': sales_na,
+        'PAL_Sales': sales_pal,
+        'JP_Sales': sales_jp,
+        'Other_Sales': sales_ot,
+        'Global_Sales': sales_gl
+    }
+
+    # Final Report
+    print(rec_count)
+    df = pd.DataFrame(columns)
+    print(df.columns)
+    df = df[[
+        'Rank', 'Name', 'Platform', 'Year', 'Genre',
+        'Publisher', 'Developer', 'Critic_Score', 'User_Score',
+        'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
+
+    return df
+
+main()

From 414602cf9e75ad0922a8cdd4e4de3c958c739c6f Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 17:32:19 +0200
Subject: [PATCH 23/35] Wrap the main function calls in an exception handler

---
 vgchartzfull.py | 84 +++++++++----------------------------------------
 1 file changed, 15 insertions(+), 69 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 0fa096e..b59cab9 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -1,12 +1,12 @@
 from bs4 import BeautifulSoup, element
-import urllib
+from random import randint, choice
 import pandas as pd
 import numpy as np
-from random import randint, choice
-import datetime
+import logging
+import urllib
+import sys
 import time
 import json
-import logging
 
 def create_random_header():
     """
@@ -31,59 +31,6 @@ def create_random_header():
     logging.info("create_random_header <<<")
     return header
 
-def generate_uri(*, page_number, page_size):
-    """
-
-    Generate the uri from page number
-
-    :param page_number:
-    :return:
-    """
-
-    urlhead = 'http://www.vgchartz.com/gamedb/?page='
-    # page_number... <= here comes the param received
-    urltail = f'&results={page_size}'
-    urltail += '&order=Sales'
-    urltail += '&region=All'
-    urltail += '&boxart=Both'
-    urltail += '&banner=Both'
-    urltail += '&ownership=Both'
-    urltail += '&keyword='
-    urltail += '&console='
-    urltail += '&developer='
-    urltail += '&publisher='
-    urltail += '&goty_year='
-    urltail += '&genre='
-    urltail += '&showmultiplat=No'
-    urltail += '&showtotalsales=0'
-    urltail += '&showtotalsales=1'
-    urltail += '&showpublisher=0'
-    urltail += '&showpublisher=1'
-    urltail += '&showvgchartzscore=0'
-    urltail += '&showvgchartzscore=1'
-    urltail += '&shownasales=0'
-    urltail += '&shownasales=1'
-    urltail += '&showdeveloper=0'
-    urltail += '&showdeveloper=1'
-    urltail += '&showcriticscore=0'
-    urltail += '&showcriticscore=1'
-    urltail += '&showpalsales=0'
-    urltail += '&showpalsales=1'
-    urltail += '&showreleasedate=0'
-    urltail += '&showreleasedate=1'
-    urltail += '&showuserscore=0'
-    urltail += '&showuserscore=1'
-    urltail += '&showjapansales=0'
-    urltail += '&showjapansales=1'
-    urltail += '&showlastupdate=0'
-    urltail += '&showlastupdate=1'
-    urltail += '&showothersales=0'
-    urltail += '&showothersales=1'
-    urltail += '&showshipped=0'
-    urltail += '&showshipped=1'
-
-    return urlhead + str(page_number) + urltail
-
 def get_page(url):
     """
     Perform a GET request to the given URL and return results.
@@ -210,8 +157,7 @@ def download_data(start_page, end_page, include_genre):
         soup = BeautifulSoup(current_page)
         logging.info("Downloaded page {}".format(page))
 
-        # vgchartz website is really weird so we have to search for
-        # <a> tags with game urls
+        # We locate the game through search <a> tags with game urls in the main table
         game_tags = list(filter(
             lambda x: x.attrs['href'].startswith('https://www.vgchartz.com/game/'),
             # discard the first 10 elements because those
@@ -224,8 +170,7 @@ def download_data(start_page, end_page, include_genre):
             current_gname = " ".join(tag.string.split())  # add game name to list
             logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_gname))
 
-            # Get different attributes
-            # traverse up the DOM tree
+            # Get different attributes traverse up the DOM tree
             data = tag.parent.parent.find_all("td")
             current_rank = np.int32(data[0].string)
             current_platform = data[3].find('img').attrs['alt']
@@ -290,7 +235,6 @@ def save_games_data(filename, separator, enc):
     df.to_csv(filename, sep=separator, encoding=enc, index=False)
     logging.info("save_games_data <<<")
 
-
 if __name__ == "__main__":
     rank = []
     game_name = []
@@ -315,16 +259,18 @@ def save_games_data(filename, separator, enc):
     # set up logging to console
     console = logging.StreamHandler()
     console.setLevel(logging.DEBUG)
+
     # set a format which is simpler for console use
     formatter = logging.Formatter(fmt='%(asctime)s|%(name)s|%(levelname)s| %(message)s',
                                   datefmt="%d-%m-%Y %H:%M:%S")
     console.setFormatter(formatter)
     logging.getLogger("").addHandler(console)
 
-    logging.info('Application started')
-    base_url = properties['base_page_url']
-    remaining_url = properties['remaining_url']
-    download_data(properties['start_page'], properties['end_page'], properties['include_genre'])
-    save_games_data(properties['output_filename'], properties['separator'], properties['encoding'])
-
-
+    try:
+        logging.info('Application started')
+        base_url = properties['base_page_url']
+        remaining_url = properties['remaining_url']
+        download_data(properties['start_page'], properties['end_page'], properties['include_genre'])
+        save_games_data(properties['output_filename'], properties['separator'], properties['encoding'])
+    except:
+        print("Unexpected error:", sys.exc_info()[0])

From 43dbe5edf211d203a90a3d11c7be2b1b2fc3730c Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 18:23:50 +0200
Subject: [PATCH 24/35] Explode query parameters

---
 resources.json  | 37 ++++++++++++++++++++++++++++++++-----
 vgchartzfull.py | 18 +++++++++++++++++-
 2 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/resources.json b/resources.json
index d6d4167..a45ded9 100644
--- a/resources.json
+++ b/resources.json
@@ -1,17 +1,44 @@
 {
+  "include_genre": false,
+  "application_log_filename": "app.log",
   "output_filename": "vgsales.csv",
   "separator": ",",
   "encoding": "utf-8",
   "start_page": 1,
   "end_page": 2,
-  "include_genre": false,
+  "base_page_url": "https://www.vgchartz.com/gamedb/?page=",
+  "query_parameters": {
+    "results": 1000,
+    "console": null,
+    "region": "All",
+    "developer": null,
+    "publisher": null,
+    "genre": null,
+    "boxart": "Both",
+    "ownership": "Both",
+    "order": "Sales",
+    "showtotalsales": 0,
+    "showtotalsales": 1,
+    "showpublisher": 0,
+    "showpublisher": 1,
+    "showvgchartzscore": 0,
+    "shownasales": 1,
+    "showdeveloper": 1,
+    "showcriticscore": 1,
+    "showpalsales": 0,
+    "showpalsales": 1,
+    "showreleasedate": 1,
+    "showuserscore": 1,
+    "showjapansales": 1,
+    "showlastupdate": 0,
+    "showothersales": 1,
+    "showgenre": 1,
+    "sort": "GL"
+  },
   "minimum_sleep_time": 6,
   "maximum_sleep_time": 15,
   "minimum_major_version": 1,
   "maximum_major_version": 56,
   "minimum_minor_version": 1,
-  "maximum_minor_version": 10,
-  "base_page_url": "https://www.vgchartz.com/gamedb/?page=",
-  "remaining_url": "&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1&showlastupdate=0&showothersales=1&showgenre=1&sort=GL",
-  "application_log_filename": "app.log"
+  "maximum_minor_version": 10
 }
diff --git a/vgchartzfull.py b/vgchartzfull.py
index b59cab9..4b805d5 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -31,6 +31,20 @@ def create_random_header():
     logging.info("create_random_header <<<")
     return header
 
+def generate_remaining_url(*, query_parameters):
+    """
+    Generate an url with a list of videogames from the query params configured at resources.json
+    :return: Url with page number
+    """
+    logging.info("generate_remaining_url >>>")
+    reply=''
+    for param in query_parameters:
+        value=query_parameters.get(param, None)
+        reply += f"&{param}={value}" if value is not None else f"&{param}="
+    logging.debug(f"Url Generated: {base_url}?{reply}")
+    logging.info("generate_remaining_url <<<")
+    return reply
+
 def get_page(url):
     """
     Perform a GET request to the given URL and return results.
@@ -269,8 +283,10 @@ def save_games_data(filename, separator, enc):
     try:
         logging.info('Application started')
         base_url = properties['base_page_url']
-        remaining_url = properties['remaining_url']
+        remaining_url=generate_remaining_url(query_parameters=properties['query_parameters'])
         download_data(properties['start_page'], properties['end_page'], properties['include_genre'])
         save_games_data(properties['output_filename'], properties['separator'], properties['encoding'])
+
     except:
         print("Unexpected error:", sys.exc_info()[0])
+        pass

From 779fcad13a1684f1ec617c84ee63dfda23cd6d8b Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 18:30:39 +0200
Subject: [PATCH 25/35] I love functions with named parameters sorry XD

---
 vgchartzfull.py | 53 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 4b805d5..923002f 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -45,7 +45,7 @@ def generate_remaining_url(*, query_parameters):
     logging.info("generate_remaining_url <<<")
     return reply
 
-def get_page(url):
+def get_page(*, url):
     """
     Perform a GET request to the given URL and return results.
     Add a wait logic that, combined with random header, will help avoiding
@@ -63,7 +63,7 @@ def get_page(url):
     return result
 
 
-def get_genre(game_url):
+def get_genre(*, game_url):
     """
     Return the game genre retrieved from the given url
     :param game_url:
@@ -71,7 +71,7 @@ def get_genre(game_url):
     """
     logging.info("get_genre >>>")
     logging.debug("Page to download: {}".format(game_url))
-    site_raw = get_page(game_url)
+    site_raw = get_page(url=game_url)
     sub_soup = BeautifulSoup(site_raw, "html.parser")
     # again, the info box is inconsistent among games so we
     # have to find all the h2 and traverse from that to the genre name
@@ -89,7 +89,7 @@ def get_genre(game_url):
     return genre_value
 
 
-def get_release_year(raw_year):
+def get_release_year(*, raw_year):
     """
     Return the release year of the given game in a 4 digit format or N/A.
     :param raw_year:
@@ -107,7 +107,8 @@ def get_release_year(raw_year):
     return final_year
 
 
-def add_current_game_data(current_critic_score,
+def add_current_game_data(*,
+                          current_critic_score,
                           current_developer,
                           current_game_name,
                           current_platform,
@@ -155,7 +156,7 @@ def add_current_game_data(current_critic_score,
     logging.info("add_current_game_data <<<")
 
 
-def download_data(start_page, end_page, include_genre):
+def download_data(*, start_page, end_page, include_genre):
     """
     Download games data from vgchartz: only data whose pages are in the range (start_page, end_page) will be downloaded
     :param start_page:
@@ -167,7 +168,7 @@ def download_data(start_page, end_page, include_genre):
     downloaded_games = 0  # Results are decreasingly ordered according to Shipped units
     for page in range(start_page, end_page + 1):
         page_url = "{}{}{}".format(base_url, str(page), remaining_url)
-        current_page = get_page(page_url)
+        current_page = get_page(url=page_url)
         soup = BeautifulSoup(current_page)
         logging.info("Downloaded page {}".format(page))
 
@@ -197,17 +198,27 @@ def download_data(start_page, end_page, include_genre):
             current_sales_jp = float(data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan
             current_sales_ot = float(data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan
             current_sales_gl = float(data[8].string[:-1]) if not data[8].string.startswith("N/A") else np.nan
-            current_release_year = get_release_year(data[13].string.split()[-1])
-
-            add_current_game_data(current_critic_score, current_developer, current_gname, current_platform,
-                                  current_publisher, current_rank, current_release_year, current_sales_gl,
-                                  current_sales_jp, current_sales_na, current_sales_ot, current_sales_pal,
-                                  current_user_score)
+            current_release_year = get_release_year(raw_year=data[13].string.split()[-1])
+
+            add_current_game_data(
+                current_critic_score=current_critic_score,
+                current_developer=current_developer,
+                current_game_name=current_gname,
+                current_platform=current_platform,
+                current_publisher=current_publisher,
+                current_rank=current_rank,
+                current_release_year=current_release_year,
+                current_sales_gl=current_sales_gl,
+                current_sales_jp=current_sales_jp,
+                current_sales_na=current_sales_na,
+                current_sales_ot=current_sales_ot,
+                current_sales_pal=current_sales_pal,
+                current_user_score=current_user_score)
 
             game_url = tag.attrs['href']
             game_genre = ""
             if include_genre:
-                game_genre = get_genre(game_url)
+                game_genre = get_genre(game_url=game_url)
             genre.append(game_genre)
 
             downloaded_games += 1
@@ -216,7 +227,7 @@ def download_data(start_page, end_page, include_genre):
     logging.info("download_data <<<")
 
 
-def save_games_data(filename, separator, enc):
+def save_games_data(*, filename, separator, enc):
     """
     Save all the downloaded data into the specified file
     :param filename
@@ -284,8 +295,16 @@ def save_games_data(filename, separator, enc):
         logging.info('Application started')
         base_url = properties['base_page_url']
         remaining_url=generate_remaining_url(query_parameters=properties['query_parameters'])
-        download_data(properties['start_page'], properties['end_page'], properties['include_genre'])
-        save_games_data(properties['output_filename'], properties['separator'], properties['encoding'])
+
+        download_data(
+            start_page=properties['start_page'],
+            end_page=properties['end_page'],
+            include_genre=properties['include_genre'])
+
+        save_games_data(
+            filename=properties['output_filename'],
+            separator=properties['separator'],
+            enc=properties['encoding'])
 
     except:
         print("Unexpected error:", sys.exc_info()[0])

From 826476d65a28fa2b28a0614697b5ec0af86bd63f Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Mon, 30 Mar 2020 18:41:20 +0200
Subject: [PATCH 26/35] Fix padding spaces due to html parsing

---
 vgchartzfull.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vgchartzfull.py b/vgchartzfull.py
index 923002f..b4791a3 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -143,8 +143,8 @@ def add_current_game_data(*,
     game_name.append(current_game_name)
     rank.append(current_rank)
     platform.append(current_platform)
-    publisher.append(current_publisher)
-    developer.append(current_developer)
+    publisher.append(current_publisher.strip())
+    developer.append(current_developer.strip())
     critic_score.append(current_critic_score)
     user_score.append(current_user_score)
     sales_na.append(current_sales_na)
@@ -169,7 +169,7 @@ def download_data(*, start_page, end_page, include_genre):
     for page in range(start_page, end_page + 1):
         page_url = "{}{}{}".format(base_url, str(page), remaining_url)
         current_page = get_page(url=page_url)
-        soup = BeautifulSoup(current_page)
+        soup = BeautifulSoup(current_page, features="html.parser")
         logging.info("Downloaded page {}".format(page))
 
         # We locate the game through search <a> tags with game urls in the main table

From a8bf65696e3128e197e1cd52ccb5167591e6ee23 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 00:34:00 +0200
Subject: [PATCH 27/35] Updating doc

Signed-off-by: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0854b43..618d6b0 100644
--- a/README.md
+++ b/README.md
@@ -50,5 +50,4 @@ It can be installed by pip.
 
 ## Greetings
 
-Thanks to Chris Albon & Gregor UT
-http://chrisalbon.com/python/beautiful_soup_scrape_table.html
+Thanks to [Chris Albon](http://chrisalbon.com/python/beautiful_soup_scrape_table.html) 

From 6aa2b6c312c0004ca62e2ac85fd12fbfca754766 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <Pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 09:42:09 +0200
Subject: [PATCH 28/35] Update README.md

Update total count
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 618d6b0..e2a2262 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 vgchartzfull.py is a python@3 script based on BeautifulSoup.
 
-It creates a dataset with data from more than 16,500 games. based on data from  http://www.vgchartz.com/gamedb/ 
+It creates a dataset with data from more than 57,000 games. based on data from  http://www.vgchartz.com/gamedb/ 
 
 ## Output
 

From 6a8c2a2d2d317c529d227bb16b8d28d462473554 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 14:03:31 +0200
Subject: [PATCH 29/35] Folder reorganize and bump dependencies

---
 resources.json => cfg/resources.json        |  8 ++++----
 dataset/.gitkeep                            |  1 +
 log/.gitkeep                                |  1 +
 requirements.txt                            | 14 ++++++--------
 vgchartzfull.py => vgchartz-full-crawler.py | 18 ++++++++++++------
 5 files changed, 24 insertions(+), 18 deletions(-)
 rename resources.json => cfg/resources.json (89%)
 create mode 100644 dataset/.gitkeep
 create mode 100644 log/.gitkeep
 rename vgchartzfull.py => vgchartz-full-crawler.py (96%)

diff --git a/resources.json b/cfg/resources.json
similarity index 89%
rename from resources.json
rename to cfg/resources.json
index a45ded9..2fa636b 100644
--- a/resources.json
+++ b/cfg/resources.json
@@ -1,14 +1,14 @@
 {
-  "include_genre": false,
-  "application_log_filename": "app.log",
-  "output_filename": "vgsales.csv",
+  "application_log_filename": "log/app.log",
+  "output_filename": "dataset/vgsales.csv",
   "separator": ",",
   "encoding": "utf-8",
   "start_page": 1,
   "end_page": 2,
+  "include_genre": false,
   "base_page_url": "https://www.vgchartz.com/gamedb/?page=",
   "query_parameters": {
-    "results": 1000,
+    "results": 10,
     "console": null,
     "region": "All",
     "developer": null,
diff --git a/dataset/.gitkeep b/dataset/.gitkeep
new file mode 100644
index 0000000..fe91d07
--- /dev/null
+++ b/dataset/.gitkeep
@@ -0,0 +1 @@
+Git doesn't like empty folders
\ No newline at end of file
diff --git a/log/.gitkeep b/log/.gitkeep
new file mode 100644
index 0000000..fe91d07
--- /dev/null
+++ b/log/.gitkeep
@@ -0,0 +1 @@
+Git doesn't like empty folders
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e59be52..3311a7a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,8 @@
 beautifulsoup4==4.8.2
 bs4==0.0.1
-numpy==1.16.3
-numpy==1.16.4
-pandas==0.24.2
-pandas==0.25.0
-python-dateutil==2.8.0
-pytz==2019.1
-six==1.12.0
-soupsieve==1.9.1
\ No newline at end of file
+numpy==1.18.2
+pandas==1.0.3
+python-dateutil==2.8.1
+pytz==2019.3
+six==1.14.0
+soupsieve==2.0
\ No newline at end of file
diff --git a/vgchartzfull.py b/vgchartz-full-crawler.py
similarity index 96%
rename from vgchartzfull.py
rename to vgchartz-full-crawler.py
index b4791a3..4f4a894 100644
--- a/vgchartzfull.py
+++ b/vgchartz-full-crawler.py
@@ -1,9 +1,10 @@
 from bs4 import BeautifulSoup, element
 from random import randint, choice
+import urllib
+import urllib.request
 import pandas as pd
 import numpy as np
 import logging
-import urllib
 import sys
 import time
 import json
@@ -41,7 +42,7 @@ def generate_remaining_url(*, query_parameters):
     for param in query_parameters:
         value=query_parameters.get(param, None)
         reply += f"&{param}={value}" if value is not None else f"&{param}="
-    logging.debug(f"Url Generated: {base_url}?{reply}")
+    logging.debug(f"Url Generated: {base_url}N{reply}")
     logging.info("generate_remaining_url <<<")
     return reply
 
@@ -66,6 +67,7 @@ def get_page(*, url):
 def get_genre(*, game_url):
     """
     Return the game genre retrieved from the given url
+    (It involves another http request)
     :param game_url:
     :return: Genre of the input game
     """
@@ -73,12 +75,14 @@ def get_genre(*, game_url):
     logging.debug("Page to download: {}".format(game_url))
     site_raw = get_page(url=game_url)
     sub_soup = BeautifulSoup(site_raw, "html.parser")
-    # again, the info box is inconsistent among games so we
+
+    # The info box is inconsistent among games, so we
     # have to find all the h2 and traverse from that to the genre name
+    # and make a temporary tag here to search
+    # for the one that contains the word "Genre"
     h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
-    # make a temporary tag here to search for the one that contains
-    # the word "Genre"
     temp_tag = element.Tag
+
     for h2 in h2s:
         if h2.string == 'Genre':
             temp_tag = h2
@@ -187,6 +191,7 @@ def download_data(*, start_page, end_page, include_genre):
 
             # Get different attributes traverse up the DOM tree
             data = tag.parent.parent.find_all("td")
+            #print(data)
             current_rank = np.int32(data[0].string)
             current_platform = data[3].find('img').attrs['alt']
             current_publisher = data[4].string
@@ -273,7 +278,7 @@ def save_games_data(*, filename, separator, enc):
 
     properties = None
 
-    with open("resources.json") as file:
+    with open("cfg/resources.json") as file:
         properties = json.load(file)
 
     logging.root.handlers = []
@@ -307,5 +312,6 @@ def save_games_data(*, filename, separator, enc):
             enc=properties['encoding'])
 
     except:
+        print("Global exception")
         print("Unexpected error:", sys.exc_info()[0])
         pass

From c4f9ff812606134214f7115f0e92fea38b36d131 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 14:03:49 +0200
Subject: [PATCH 30/35] script to easy run

---
 run.sh | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100755 run.sh

diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..df4b431
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+
+python --version >/dev/null 2>&1 || { echo >&2 "I require python@3 utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; }
+pip --version >/dev/null 2>&1 || { echo >&2 "I require pip utility but it's not installed. ¯\_(ツ)_/¯ Aborting."; exit 1; }
+
+clear
+
+pip install -r requirements.txt
+python vgchartz-full-crawler.py

From 0e48d8d8bf8821b926853a7620c7700dfc1c60ea Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 18:19:06 +0200
Subject: [PATCH 31/35] Update documentation and add some TODOs

---
 README.md | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e2a2262..2d1dc0e 100644
--- a/README.md
+++ b/README.md
@@ -27,20 +27,34 @@ It can be installed by pip.
 
 ## Dictionary
 
+The dataset is composed of the following fields, and the data is collected following this [methodology](https://www.vgchartz.com/methodology.php).
+
 | Field | Description              |
 |-------|--------------------------|
 | Rank  | Ranking of overall sales |
 | Name | The games name |
-| Platform | Platform of the games release (i.e. PC,PS4, etc.) |
-| Year | Year of the game's release |
 | Genre | Genre of the game |
+| Platform | Platform of the game's release (e.g. PC, PS4, etc.) |
+| Developer | Developer of the game | 
 | Publisher | Publisher of the game |
+| Vgchartz_Score | Score on the VGChartz site |
+| Critic_Score | Score given by critics |
+| User_Score | Score given by users of the VGChartz site |
+| Total_Shipped | Total worldwide shipments (in millions) | 
+| Total_Sales | Total worldwide sales (in millions) |
 | NA_Sales | Sales in North America (in millions) |
 | EU_Sales | Sales in Europe (in millions) |
 | JP_Sales | Sales in Japan (in millions) |
 | Other_Sales | Sales in the rest of the world (in millions) |
-| Global_Sales | Total worldwide sales. |
+| Release_Date | Date of the game's release |
+| Last_Update | Date of the last update of this record |
+
+## TODO
 
+- [ ] Remap the columns according to the values selected in resources.json
+- [ ] Add some unit testing
+- [ ] Dockerize (w/ alpine-python) to ease use and avoid installations
+- [ ] Publish on Docker Hub
 
 ## Links
 

From 7f5719e7cefaaef830d6b16688ea22670a319ed2 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 18:19:27 +0200
Subject: [PATCH 32/35] Improve script output

---
 run.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/run.sh b/run.sh
index df4b431..60ece4b 100755
--- a/run.sh
+++ b/run.sh
@@ -5,5 +5,9 @@ pip --version >/dev/null 2>&1 || { echo >&2 "I require pip utility but it's not
 
 clear
 
+printf '\nInstalling deps... \n'  # printf, because bash's builtin echo prints "\n" literally without -e
 pip install -r requirements.txt
+
+printf '\nStart crawling... (remember a crawler is the friend nobody likes)\n'  # printf expands \n portably
 python vgchartz-full-crawler.py
+

From 214a8538db3b3ea36305ac75c0b9c404bf741e99 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 18:20:29 +0200
Subject: [PATCH 33/35] Refactor data saving to add more data and parse full
 dates instead only year.

---
 cfg/resources.json       |  24 ++---
 vgchartz-full-crawler.py | 186 ++++++++++++++++++++++-----------------
 2 files changed, 116 insertions(+), 94 deletions(-)

diff --git a/cfg/resources.json b/cfg/resources.json
index 2fa636b..4e65ceb 100644
--- a/cfg/resources.json
+++ b/cfg/resources.json
@@ -8,32 +8,32 @@
   "include_genre": false,
   "base_page_url": "https://www.vgchartz.com/gamedb/?page=",
   "query_parameters": {
-    "results": 10,
-    "console": null,
+    "results": 100,
     "region": "All",
-    "developer": null,
-    "publisher": null,
-    "genre": null,
     "boxart": "Both",
+    "banner": "Both",
     "ownership": "Both",
+    "showmultiplat": "No",
     "order": "Sales",
-    "showtotalsales": 0,
     "showtotalsales": 1,
-    "showpublisher": 0,
     "showpublisher": 1,
-    "showvgchartzscore": 0,
+    "showvgchartzscore": 1,
     "shownasales": 1,
     "showdeveloper": 1,
     "showcriticscore": 1,
-    "showpalsales": 0,
     "showpalsales": 1,
     "showreleasedate": 1,
     "showuserscore": 1,
     "showjapansales": 1,
-    "showlastupdate": 0,
+    "showlastupdate": 1,
     "showothersales": 1,
-    "showgenre": 1,
-    "sort": "GL"
+    "showshipped": 1,
+    "keyword": null,
+    "console": null,
+    "developer": null,
+    "publisher": null,
+    "goty_year": null,
+    "genre": null
   },
   "minimum_sleep_time": 6,
   "maximum_sleep_time": 15,
diff --git a/vgchartz-full-crawler.py b/vgchartz-full-crawler.py
index 4f4a894..7c6c30c 100644
--- a/vgchartz-full-crawler.py
+++ b/vgchartz-full-crawler.py
@@ -92,71 +92,79 @@ def get_genre(*, game_url):
     logging.info("get_genre <<<")
     return genre_value
 
+def parse_number(*, number_string):
+    """
+    Parse a numeric string into a float, expanding the "m" (millions) suffix
+    :param number_string:
+    :return: the parsed float, or NaN when the string starts with "N/A"
+    """
+    logging.info("parse_number >>>")
+    print(number_string)
+    if "m" in number_string:
+        reply = number_string.strip('m')
+        reply = str(float(reply) * 1000000)
+    else:
+        reply=number_string
 
-def get_release_year(*, raw_year):
+    logging.info("parse_number <<<")
+    return float(reply) if not reply.startswith("N/A") else np.nan
+
+def parse_date(*, date_string):
     """
-    Return the release year of the given game in a 4 digit format or N/A.
-    :param raw_year:
-    :return: Game Release year
+    Return the date string parsed into a pandas Timestamp, or 'N/A'.
+    :param date_string:
+    :return: A pandas Timestamp, or the string 'N/A' when the input starts with 'N/A'
     """
-    logging.info("get_release_year >>>")
-    if raw_year.startswith('N/A'):
-        final_year = 'N/A'
-    elif int(raw_year) >= 80:
-        final_year = np.int32("19" + raw_year)
+    logging.info("parse_date >>>")
+    if date_string.startswith('N/A'):
+        date_formatted = 'N/A'
     else:
-        final_year = np.int32("20" + raw_year)
-    logging.debug("Release Year: {}".format(final_year))
-    logging.info("get_release_year <<<")
-    return final_year
+        #i.e. date_string = '18th Feb 20'
+        date_formatted = pd.to_datetime(date_string)
 
+    logging.debug("Date parsed: {}".format(date_formatted))
+    logging.info("parse_date <<<")
+    return date_formatted
 
 def add_current_game_data(*,
-                          current_critic_score,
-                          current_developer,
+                          current_rank,
                           current_game_name,
+                          current_game_genre,
                           current_platform,
                           current_publisher,
-                          current_rank,
-                          current_release_year,
-                          current_sales_gl,
-                          current_sales_jp,
+                          current_developer,
+                          current_vgchartz_score,
+                          current_critic_score,
+                          current_user_score,
+                          current_total_shipped,
+                          current_total_sales,
                           current_sales_na,
-                          current_sales_ot,
                           current_sales_pal,
-                          current_user_score):
+                          current_sales_jp,
+                          current_sales_ot,
+                          current_release_date,
+                          current_last_update):
     """
     Add all the game data to the related lists
-
-    :param current_critic_score:
-    :param current_developer:
-    :param current_game_name:
-    :param current_platform:
-    :param current_publisher:
-    :param current_rank:
-    :param current_release_year:
-    :param current_sales_gl:
-    :param current_sales_jp:
-    :param current_sales_na:
-    :param current_sales_ot:
-    :param current_sales_pal:
-    :param current_user_score:
-    :return:
     """
     logging.info("add_current_game_data >>>")
     game_name.append(current_game_name)
     rank.append(current_rank)
     platform.append(current_platform)
+    genre.append(current_game_genre)
     publisher.append(current_publisher.strip())
     developer.append(current_developer.strip())
+    vgchartz_score.append(current_vgchartz_score)
     critic_score.append(current_critic_score)
     user_score.append(current_user_score)
+    total_shipped.append(current_total_shipped)
+    total_sales.append(current_total_sales)
     sales_na.append(current_sales_na)
     sales_pal.append(current_sales_pal)
     sales_jp.append(current_sales_jp)
     sales_ot.append(current_sales_ot)
-    sales_gl.append(current_sales_gl)
-    year.append(current_release_year)
+    release_date.append(current_release_date)
+    last_update.append(current_last_update)
     logging.info("add_current_game_data <<<")
 
 
@@ -186,45 +194,52 @@ def download_data(*, start_page, end_page, include_genre):
 
         for tag in game_tags:
 
-            current_gname = " ".join(tag.string.split())  # add game name to list
-            logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_gname))
-
-            # Get different attributes traverse up the DOM tree
+            current_game_name = " ".join(tag.string.split())
             data = tag.parent.parent.find_all("td")
-            #print(data)
+
+            logging.debug("Downloaded game: {}. Name: {}".format(downloaded_games + 1, current_game_name))
+
+            # Get the rest of the attributes by traversing up the DOM tree to the cells of the results table
             current_rank = np.int32(data[0].string)
             current_platform = data[3].find('img').attrs['alt']
             current_publisher = data[4].string
             current_developer = data[5].string
-            current_critic_score = float(data[6].string) if not data[6].string.startswith("N/A") else np.nan
-            current_user_score = float(data[7].string) if not data[7].string.startswith("N/A") else np.nan
-            current_sales_na = float(data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan
-            current_sales_pal = float(data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan
-            current_sales_jp = float(data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan
-            current_sales_ot = float(data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan
-            current_sales_gl = float(data[8].string[:-1]) if not data[8].string.startswith("N/A") else np.nan
-            current_release_year = get_release_year(raw_year=data[13].string.split()[-1])
+            current_vgchartz_score = parse_number(number_string=data[6].string)
+            current_critic_score = parse_number(number_string=data[7].string)
+            current_user_score = parse_number(number_string=data[8].string)
+            current_total_shipped = parse_number(number_string=data[9].string)
+            current_total_sales = parse_number(number_string=data[10].string)
+            current_sales_na = parse_number(number_string=data[11].string)
+            current_sales_pal = parse_number(number_string=data[12].string)
+            current_sales_jp = parse_number(number_string=data[13].string)
+            current_sales_ot = parse_number(number_string=data[14].string)
+            current_release_date = parse_date(date_string=data[15].string)
+            current_last_update = parse_date(date_string=data[16].string)
+
+            # The genre requires another HTTP Request, so it's made at the end
+            game_url = tag.attrs['href']
+            current_game_genre = ""
+            if include_genre:
+                current_game_genre = get_genre(game_url=game_url)
 
             add_current_game_data(
-                current_critic_score=current_critic_score,
-                current_developer=current_developer,
-                current_game_name=current_gname,
+                current_rank=current_rank,
+                current_game_name=current_game_name,
+                current_game_genre=current_game_genre,
                 current_platform=current_platform,
                 current_publisher=current_publisher,
-                current_rank=current_rank,
-                current_release_year=current_release_year,
-                current_sales_gl=current_sales_gl,
-                current_sales_jp=current_sales_jp,
+                current_developer=current_developer,
+                current_vgchartz_score=current_vgchartz_score,
+                current_critic_score=current_critic_score,
+                current_user_score=current_user_score,
+                current_total_shipped=current_total_shipped,
+                current_total_sales=current_total_sales,
                 current_sales_na=current_sales_na,
-                current_sales_ot=current_sales_ot,
                 current_sales_pal=current_sales_pal,
-                current_user_score=current_user_score)
-
-            game_url = tag.attrs['href']
-            game_genre = ""
-            if include_genre:
-                game_genre = get_genre(game_url=game_url)
-            genre.append(game_genre)
+                current_sales_jp=current_sales_jp,
+                current_sales_ot=current_sales_ot,
+                current_release_date=current_release_date,
+                current_last_update=current_last_update)
 
             downloaded_games += 1
 
@@ -243,38 +258,45 @@ def save_games_data(*, filename, separator, enc):
     columns = {
         'Rank': rank,
         'Name': game_name,
-        'Platform': platform,
-        'Year': year,
         'Genre': genre,
-        'Critic_Score': critic_score,
-        'User_Score': user_score,
+        'Platform': platform,
         'Publisher': publisher,
         'Developer': developer,
+        'Vgchartz_Score': vgchartz_score,
+        'Critic_Score': critic_score,
+        'User_Score': user_score,
+        'Total_Shipped': total_shipped,
+        'Total_Sales': total_sales,
         'NA_Sales': sales_na,
         'PAL_Sales': sales_pal,
         'JP_Sales': sales_jp,
         'Other_Sales': sales_ot,
-        'Global_Sales': sales_gl
+        'Release_Date': release_date,
+        'Last_Update': last_update
     }
+
     df = pd.DataFrame(columns)
     logging.debug("Dataframe column name: {}".format(df.columns))
-    df = df[[
-        'Rank', 'Name', 'Platform', 'Year', 'Genre',
-        'Publisher', 'Developer', 'Critic_Score', 'User_Score',
-        'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
+    df = df[[ 'Rank', 'Name', 'Genre', 'Platform', 'Publisher', 'Developer',
+              'Vgchartz_Score', 'Critic_Score', 'User_Score', 'Total_Shipped',
+              'Total_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales',
+              'Release_Date', 'Last_Update' ]]
+
     df.to_csv(filename, sep=separator, encoding=enc, index=False)
     logging.info("save_games_data <<<")
 
 if __name__ == "__main__":
+
+    # Buffers
     rank = []
     game_name = []
-    platform = []
-    year = []
     genre = []
-    critic_score, user_score = [], []
-    publisher = []
-    developer = []
-    sales_na, sales_pal, sales_jp, sales_ot, sales_gl = [], [], [], [], []
+    platform = []
+    publisher, developer = [], []
+    critic_score, user_score, vgchartz_score = [], [], []
+    total_shipped = []
+    total_sales, sales_na, sales_pal, sales_jp, sales_ot = [], [], [], [], []
+    release_date, last_update = [], []
 
     properties = None
 
@@ -313,5 +335,5 @@ def save_games_data(*, filename, separator, enc):
 
     except:
         print("Global exception")
-        print("Unexpected error:", sys.exc_info()[0])
+        print("Unexpected error:", sys.exc_info())
         pass

From 381c264401577494d67f6b0619a6ab1c57761259 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 18:28:04 +0200
Subject: [PATCH 34/35] Updating Doc

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2d1dc0e..8b59cf5 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@ It creates a dataset with data from more than 57,000 games. based on data from
 
 ## Output
 
-The dataset is saved as vgsales.csv.
+The dataset is saved in the file specified in cfg/resources.json, by default "dataset/vgsales.csv".
 
 ## Install & execution
 

From 1b88322d3cf1a189c6fc8b7b5d09fa1ff0dcf2b3 Mon Sep 17 00:00:00 2001
From: Manuel Eusebio de Paz Carmona <pelirrojo@users.noreply.github.com>
Date: Tue, 31 Mar 2020 18:29:02 +0200
Subject: [PATCH 35/35] Updating Doc

---
 README.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 8b59cf5..40aae9d 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
 # vgchartzfull - A crawler to download data from Global Videogame Sales
 
-vgchartzfull.py is a python@3 script based on BeautifulSoup.
-
-It creates a dataset with data from more than 57,000 games. based on data from  http://www.vgchartz.com/gamedb/ 
+vgchartz-full-crawler.py is a python@3 crawler script based on BeautifulSoup.
+It creates a CSV dataset with data from more than 57,000 games, based on data from the [VGChartz site](http://www.vgchartz.com/gamedb/).  
 
 ## Output