diff --git a/.gitignore b/.gitignore
index be2baa1..ffbec06 100644
--- a/.gitignore
+++ b/.gitignore
@@ -103,3 +103,12 @@ venv.bak/
 # mypy
 .mypy_cache/
 .vscode/
+
+# csv
+*.csv
+
+# ipynb
+*.ipynb
+
+# ignore this folder
+testing/
\ No newline at end of file
diff --git a/README.md b/README.md
index e557119..c7d9d6f 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,41 @@
-vgchartzfull is a python script based on BeautifulSoup.
+
+vgchartzfull is a Python script, based on BeautifulSoup, that uses multiprocessing.
+Proxies are enabled by default; they can be disabled by setting `proxy_enabled` to False in the script.
+
 It creates a dataset based on data from 
 http://www.vgchartz.com/gamedb/
 
-The dataset is saved as vgsales.csv.
+The dataset is saved as vgsales-%Y-%m-%d_%H_%M_%S.csv.
+
+You will need to have the following dependencies installed:
+```
+BeautifulSoup4 
+pandas
+numpy
+requests
+unidecode
+user_agent
+```
+
+Thanks to:
+- https://www.kdnuggets.com/2018/02/web-scraping-tutorial-python.html
+- http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments
+- https://medium.com/datadriveninvestor/speed-up-web-scraping-using-multiprocessing-in-python-af434ff310c5
 
-You will need to have BeautifulSoup added.
-It can be installed by pip.
 
-sudo pip install BeautifulSoup
+Free proxies:
+[1](https://proxyscrape.com/free-proxy-list)
+[2](http://multiproxy.org/txt_all/proxy.txt)
+[3](https://proxy.rudnkh.me/txt)
+[4](https://www.us-proxy.org/)
 
-Thanks to Chris Albon.
-http://chrisalbon.com/python/beautiful_soup_scrape_table.html
+- [x] added multiprocessing for faster results with a maximum of 24 workers.
+- [x] added proxies to avoid being blocked 
+- [x] handling a couple of exceptions
+- [x] scraped data gets saved before raising an unexpected error
+- [x] add the option to continue where we left off after an unexpected error
+- [x] the clean version (clean-vgchartzfull.py) removes the print statements, which should result in better performance!
+- [ ] optimize it
+- [ ] create a log file
+- [ ] convert the script to a class or use scrapy, reference
+    - https://edmundmartin.com/multi-threaded-crawler-in-python/
\ No newline at end of file
diff --git a/clean-vgchartzfull.py b/clean-vgchartzfull.py
new file mode 100644
index 0000000..39cf3ed
--- /dev/null
+++ b/clean-vgchartzfull.py
@@ -0,0 +1,273 @@
+from bs4 import BeautifulSoup, element
+import pandas as pd
+import numpy as np
+import requests
+import time
+import unidecode
+from user_agent import generate_user_agent
+from proxies_gen import get_proxies, test_proxies
+from itertools import cycle
+from lxml.html import fromstring
+from multiprocessing import Pool, cpu_count  # Pool runs its workers in separate processes (not threads)
+from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException
+from urllib3.exceptions import ProtocolError
+import sys
+import os
+sys.setrecursionlimit(10000)  # need to optimize code.
+proxy_enabled = True
+
+
+def parse_games(game_tags):
+    """
+    parse the games table on current page
+    parameters:
+    game_tags: games tags after reading the html page
+    df: not a parameter; the module-level (global) dataframe the parsed games are appended to
+    """
+    global rec_count
+    global df
+    for tag in game_tags:
+        game = {}
+        game["Name"] = " ".join(tag.string.split())
+        #print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name']))
+
+        data = tag.parent.parent.find_all("td")
+        if data:
+            game["Rank"] = np.int32(data[0].string)
+            game["img_url"] = data[1].a.img.get('src')
+            game["url"] = data[2].a.get('href')
+            if len(game["Name"].split("/")) > 1:
+                # replace accented chars with ascii
+                game["basename"] = unidecode.unidecode(
+                    game['Name'].strip().split('/')[0].strip().replace(' ', '-'))
+            else:
+                game["basename"] = game["url"].rsplit('/', 2)[1]
+            game["Platform"] = data[3].img.get('alt')
+            game["Publisher"] = data[4].get_text().strip()
+            game["Developer"] = data[5].get_text().strip()
+            game["Vgchartzscore"] = data[6].get_text().strip()
+            game["Critic_Score"] = float(
+                data[7].string) if not data[7].string.startswith("N/A") else np.nan
+            game["User_Score"] = float(
+                data[8].string) if not data[8].string.startswith("N/A") else np.nan
+            game["Total_Shipped"] = float(
+                data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan
+            game["Global_Sales"] = float(
+                data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan
+            game["NA_Sales"] = float(
+                data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan
+            game["PAL_Sales"] = float(
+                data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan
+            game["JP_Sales"] = float(
+                data[13].string[:-1]) if not data[13].string.startswith("N/A") else np.nan
+            game["Other_Sales"] = float(
+                data[14].string[:-1]) if not data[14].string.startswith("N/A") else np.nan
+            year = data[15].string.split()[-1]
+            if year.startswith('N/A'):
+                game["Year"] = 'N/A'
+            else:
+                if int(year) >= 70:
+                    year_to_add = np.int32("19" + year)
+                else:
+                    year_to_add = np.int32("20" + year)
+                game["Year"] = year_to_add
+            game["Last_Update"] = data[16].get_text().strip()
+            game['Genre'] = 'N/A'
+            game['ESRB_Rating'] = 'N/A'
+            game['status'] = 0
+            df = df.append(game, ignore_index=True)
+        rec_count += 1
+
+
+def parse_genre_esrb(df):
+    """loads every game's url to get genre and esrb rating"""
+    headers = {'User-Agent': generate_user_agent(
+        device_type='desktop', os=('mac', 'linux'))}
+    proxy = {}
+    if proxy_enabled:
+        #print("\n******getting list of proxies and testing them******'\n")
+        # this is an API call which returns a list of working proxies that get checked every 15 minutes
+        proxies = cycle(get_proxies(5))
+        proxy = next(proxies)
+
+    for index, row in df.iterrows():
+        try:
+            game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}, timeout=5)
+            if game_page.status_code == 200:
+                sub_soup = BeautifulSoup(game_page.text, "lxml")
+                # again, the info box is inconsistent among games so we
+                # have to find all the h2 and traverse from that to the genre
+                gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"})
+                h2s = gamebox.find_all('h2')
+                # make a temporary tag here to search for the one that contains
+                # the word "Genre"
+                temp_tag = element.Tag
+                for h2 in h2s:
+                    if h2.string == 'Genre':
+                        temp_tag = h2
+                df.loc[index, 'Genre'] = temp_tag.next_sibling.string
+
+                # find the ESRB rating
+                game_rating = gamebox.find('img').get('src')
+                if 'esrb' in game_rating:
+                    df.loc[index, 'ESRB_Rating'] = game_rating.split(
+                        '_')[1].split('.')[0].upper()
+                # we successfully got the genre and rating
+                df.loc[index, 'status'] = 1
+                #print('Successfully scraped genre and rating for :', df.at[index, 'Name'])
+
+        except(ProxyError):
+            proxy = next(proxies)
+
+        except (ConnectionError, Timeout, ProtocolError, TimeoutError):
+            #print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later')
+            continue
+
+        except Exception as e:
+            #print('different error occurred while connecting, will pass')
+            continue
+        # wait for 1 second between every call,
+        # we do not want to get blocked or abuse the server
+        time.sleep(1)
+    return df
+
+
+def retry_game(df):
+    """try to scrape the missing data again"""
+    return parse_genre_esrb(df)
+
+
+if __name__ == "__main__":
+    def process_games(df):
+        failed_games = len(df[df['status'] == 0])
+        NUM_WORKERS = cpu_count() * 2
+        df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS)
+        #update num_workers
+        df_subsets = [i for i in df_subsets if len(i) != 0]
+        if len(df_subsets) != 0:
+            NUM_WORKERS = len(df_subsets)# we don't want to have a worker for empty subsets
+            pool = Pool(processes=NUM_WORKERS)
+            results = pool.map(retry_game, df_subsets)
+            try:
+                df_updated = pd.concat(results)
+                df = pd.concat([df[df['status'] == 1], df_updated])
+            except: 
+                print('error occurred while joining dataframe')
+            pool.close()
+            pool.join()
+        return df
+
+    rec_count = 0
+    start_time = time.time()
+    current_time = time.time()
+    crashed_tag = 'before_crashing_'
+    exists = [s for s in os.listdir() if crashed_tag in s]
+    if exists:
+        print("found a data saved from a crash, will continue on it")
+        csvfilename = exists[0].replace(crashed_tag, '')
+        df = pd.read_csv(exists[0])
+        rec_count = df['Rank'].max()
+        page = int(rec_count/1000) + 1 # because we already scraped the current page
+        df = process_games(df)
+    else:
+        csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv"
+
+    # initialize a pandas DataFrame to store all games with the following columns:
+    # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer,
+    # publisher, release year, critic score, user score, na sales, pal sales,
+    # jp sales, other sales, total sales, total shipped, last update, url, status
+    # last two columns for debugging
+    if not exists:
+        df = pd.DataFrame(columns=[
+            'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher',
+            'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score',
+            'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales',
+            'Other_Sales', 'Year', 'Last_Update', 'url', 'status'])
+
+    urlhead = 'http://www.vgchartz.com/games/games.php?page='
+    urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both'
+    urltail += '&banner=Both&showdeleted=&region=All&goty_year=&developer='
+    urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1'
+    urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1'
+    urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1'
+
+    # get the number of pages
+    vglink = requests.get('http://www.vgchartz.com/gamedb/').text
+    x = fromstring(vglink).xpath(
+        "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0]
+    pages = int(x.split(',')[0])
+
+    if not exists: page = 1
+    while True:
+        if page > pages:
+            break
+        try:
+            proxy = get_proxies(1)[0]
+            headers = {'User-Agent': generate_user_agent(
+                device_type='desktop', os=('mac', 'linux'))}
+            surl = urlhead + str(page) + urltail
+            r = requests.get(surl, headers=headers, proxies={
+                            'http': proxy, 'https': proxy}, timeout=10)
+            if r.status_code == 200:
+                soup = BeautifulSoup(r.text, 'lxml')
+                print("******Scraping page " + str(page) + "******'\n")
+
+                # vgchartz website is really weird so we have to search for
+                # <a> tags with game urls
+                game_tags = list(filter(
+                    lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:]
+                # discard the first 10 elements because those
+                # links are in the navigation bar
+
+                parse_games(game_tags)
+                page += 1
+                print('\n******begin scraping for Genre and Rating******\n')
+                df = process_games(df)
+
+        except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError, TimeoutError):
+            print('Something went wrong while connecting to page: ',
+                page, ', will try again later')
+            #proxy = get_proxies(1)
+            time.sleep(10)
+
+        except Exception as e:
+            print("something went wrong! We're on page: " +
+                str(page) + '\nSaving successfully crawled data')
+            print("Exception: ", e)
+            df.to_csv(crashed_tag + csvfilename, sep=",",
+                    encoding='utf-8', index=False)
+            raise e
+
+
+    failed_games = len(df[df['status'] == 0])
+    print("******Finished scraping games, will try to scrape missing data******")
+    # 36 hours max, should be enough to scrape everything
+    t_end = start_time + 60 * 60 * 36
+    while True:
+        try:
+            df = process_games(df)
+            failed_games = len(df[df['status'] == 0])
+            if failed_games == 0 or time.time() > t_end:
+                break
+            #print('Number of not scraped yet:', failed_games, '\n')
+            time.sleep(10)  # wait for 10 seconds for the server to recover?
+        except Exception as e:
+            print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data')
+            print("Exception: ", e)
+            df.to_csv(crashed_tag + csvfilename, sep=",",
+                    encoding='utf-8', index=False)
+            raise e
+
+    elapsed_time = time.time() - start_time
+    print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.")
+
+    # select only these columns in the final dataset
+    df = df.sort_values(by=['Rank'])
+    df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False)
+    df_final = df[[
+        'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating',
+        'Publisher', 'Developer', 'Critic_Score', 'User_Score',
+        'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']]
+
+    df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False)
+    print("Wrote scraper data to", csvfilename)
diff --git a/proxies_gen.py b/proxies_gen.py
new file mode 100644
index 0000000..6c5a54e
--- /dev/null
+++ b/proxies_gen.py
@@ -0,0 +1,60 @@
+from lxml.html import fromstring
+import requests
+import numpy as np
+from itertools import cycle
+
+
+def get_proxies(num=None):
+    # url = 'https://free-proxy-list.net/'
+    # response = requests.get(url)
+    # parser = fromstring(response.text)
+    # proxies = list(requests.get('https://proxy.rudnkh.me/txt').text.split())
+    # for i in parser.xpath('//tbody/tr'):
+    #     if i.xpath('.//td[7][contains(text(),"yes")]'):
+    #         proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
+    #         proxies.append(proxy)
+
+    link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100"
+    proxies = list(requests.get(link).text.split())
+    np.random.shuffle(proxies)
+    proxies = []
+    if len(proxies) == 0:
+        proxies = list(requests.get(
+            link[:-3]+'99').text.split())  # change uptime to 99
+        np.random.shuffle(proxies)
+    # print('Found', len(proxies), 'proxies, testing them now')
+
+    if num is None:
+        num = len(proxies)
+    tested = test_proxies(proxies, num)
+    return tested
+
+
+def test_proxies(proxies, num):
+    url = 'https://httpbin.org/ip'
+    proxy_pool = cycle(proxies)
+    working_proxies = []
+    for i in range(1, len(proxies)):
+        if num == 0:
+            break
+        # Get a proxy from the pool
+        proxy = next(proxy_pool)
+        # print("Request #%d" % i)
+        try:
+            response = requests.get(
+                url, proxies={"http": proxy, "https": proxy}, timeout=1)
+            # print(response.json())
+            working_proxies.append(proxy)
+            num -= 1
+        except:
+            # Most free proxies will often get connection errors. You will have to retry the entire request using another proxy.
+            # print("Skipping. Connnection error")
+            pass
+    return working_proxies
+
+
+# proxies = get_proxies(5)
+# # with open('proxies.txt') as f:
+# #         proxies = f.read().splitlines()
+# # test_proxies(proxies, 10)
+# print(proxies)
diff --git a/vgchartzfull.py b/vgchartzfull.py
index b1d75a4..7938e72 100644
--- a/vgchartzfull.py
+++ b/vgchartzfull.py
@@ -1,130 +1,270 @@
 from bs4 import BeautifulSoup, element
-import urllib
 import pandas as pd
 import numpy as np
+import requests
+import time
+import unidecode
+from user_agent import generate_user_agent
+from proxies_gen import get_proxies, test_proxies
+from itertools import cycle
+from lxml.html import fromstring
+from multiprocessing import Pool, cpu_count  # Pool runs its workers in separate processes (not threads)
+from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException
+from urllib3.exceptions import ProtocolError
+import sys
+import os
+sys.setrecursionlimit(10000)  # need to optimize code.
+proxy_enabled = True
 
-pages = 19
-rec_count = 0
-rank = []
-gname = []
-platform = []
-year = []
-genre = []
-critic_score = []
-user_score = []
-publisher = []
-developer = []
-sales_na = []
-sales_pal = []
-sales_jp = []
-sales_ot = []
-sales_gl = []
-
-urlhead = 'http://www.vgchartz.com/gamedb/?page='
-urltail = '&console=&region=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'
-urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'
-urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'
-urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'
-urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'
-
-for page in range(1, pages):
-    surl = urlhead + str(page) + urltail
-    r = urllib.request.urlopen(surl).read()
-    soup = BeautifulSoup(r)
-    print(f"Page: {page}")
-
-    # vgchartz website is really weird so we have to search for
-    # <a> tags with game urls
-    game_tags = list(filter(
-        lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),
-        # discard the first 10 elements because those
-        # links are in the navigation bar
-        soup.find_all("a")
-    ))[10:]
 
+def parse_games(game_tags):
+    """
+    parse the games table on current page
+    parameters:
+    game_tags: games tags after reading the html page
+    df: not a parameter; the module-level (global) dataframe the parsed games are appended to
+    """
+    global rec_count
+    global df
     for tag in game_tags:
+        game = {}
+        game["Name"] = " ".join(tag.string.split())
+        print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name']))
 
-        # add name to list
-        gname.append(" ".join(tag.string.split()))
-        print(f"{rec_count + 1} Fetch data for game {gname[-1]}")
-
-        # get different attributes
-        # traverse up the DOM tree
         data = tag.parent.parent.find_all("td")
-        rank.append(np.int32(data[0].string))
-        platform.append(data[3].find('img').attrs['alt'])
-        publisher.append(data[4].string)
-        developer.append(data[5].string)
-        critic_score.append(
-            float(data[6].string) if
-            not data[6].string.startswith("N/A") else np.nan)
-        user_score.append(
-            float(data[7].string) if
-            not data[7].string.startswith("N/A") else np.nan)
-        sales_na.append(
-            float(data[9].string[:-1]) if
-            not data[9].string.startswith("N/A") else np.nan)
-        sales_pal.append(
-            float(data[10].string[:-1]) if
-            not data[10].string.startswith("N/A") else np.nan)
-        sales_jp.append(
-            float(data[11].string[:-1]) if
-            not data[11].string.startswith("N/A") else np.nan)
-        sales_ot.append(
-            float(data[12].string[:-1]) if
-            not data[12].string.startswith("N/A") else np.nan)
-        sales_gl.append(
-            float(data[8].string[:-1]) if
-            not data[8].string.startswith("N/A") else np.nan)
-        release_year = data[13].string.split()[-1]
-        # different format for year
-        if release_year.startswith('N/A'):
-            year.append('N/A')
-        else:
-            if int(release_year) >= 80:
-                year_to_add = np.int32("19" + release_year)
+        if data:
+            game["Rank"] = np.int32(data[0].string)
+            game["img_url"] = data[1].a.img.get('src')
+            game["url"] = data[2].a.get('href')
+            if len(game["Name"].split("/")) > 1:
+                # replace accented chars with ascii
+                game["basename"] = unidecode.unidecode(
+                    game['Name'].strip().split('/')[0].strip().replace(' ', '-'))
             else:
-                year_to_add = np.int32("20" + release_year)
-            year.append(year_to_add)
-
-        # go to every individual website to get genre info
-        url_to_game = tag.attrs['href']
-        site_raw = urllib.request.urlopen(url_to_game).read()
-        sub_soup = BeautifulSoup(site_raw, "html.parser")
-        # again, the info box is inconsistent among games so we
-        # have to find all the h2 and traverse from that to the genre name
-        h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2')
-        # make a temporary tag here to search for the one that contains
-        # the word "Genre"
-        temp_tag = element.Tag
-        for h2 in h2s:
-            if h2.string == 'Genre':
-                temp_tag = h2
-        genre.append(temp_tag.next_sibling.string)
-
+                game["basename"] = game["url"].rsplit('/', 2)[1]
+            game["Platform"] = data[3].img.get('alt')
+            game["Publisher"] = data[4].get_text().strip()
+            game["Developer"] = data[5].get_text().strip()
+            game["Vgchartzscore"] = data[6].get_text().strip()
+            game["Critic_Score"] = float(
+                data[7].string) if not data[7].string.startswith("N/A") else np.nan
+            game["User_Score"] = float(
+                data[8].string) if not data[8].string.startswith("N/A") else np.nan
+            game["Total_Shipped"] = float(
+                data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan
+            game["Global_Sales"] = float(
+                data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan
+            game["NA_Sales"] = float(
+                data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan
+            game["PAL_Sales"] = float(
+                data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan
+            game["JP_Sales"] = float(
+                data[13].string[:-1]) if not data[13].string.startswith("N/A") else np.nan
+            game["Other_Sales"] = float(
+                data[14].string[:-1]) if not data[14].string.startswith("N/A") else np.nan
+            year = data[15].string.split()[-1]
+            if year.startswith('N/A'):
+                game["Year"] = 'N/A'
+            else:
+                if int(year) >= 70:
+                    year_to_add = np.int32("19" + year)
+                else:
+                    year_to_add = np.int32("20" + year)
+                game["Year"] = year_to_add
+            game["Last_Update"] = data[16].get_text().strip()
+            game['Genre'] = 'N/A'
+            game['ESRB_Rating'] = 'N/A'
+            game['status'] = 0
+            df = df.append(game, ignore_index=True)
         rec_count += 1
 
-columns = {
-    'Rank': rank,
-    'Name': gname,
-    'Platform': platform,
-    'Year': year,
-    'Genre': genre,
-    'Critic_Score': critic_score,
-    'User_Score': user_score,
-    'Publisher': publisher,
-    'Developer': developer,
-    'NA_Sales': sales_na,
-    'PAL_Sales': sales_pal,
-    'JP_Sales': sales_jp,
-    'Other_Sales': sales_ot,
-    'Global_Sales': sales_gl
-}
-print(rec_count)
-df = pd.DataFrame(columns)
-print(df.columns)
-df = df[[
-    'Rank', 'Name', 'Platform', 'Year', 'Genre',
-    'Publisher', 'Developer', 'Critic_Score', 'User_Score',
-    'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]
-df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False)
+
+def parse_genre_esrb(df):
+    """loads every game's url to get genre and esrb rating"""
+    headers = {'User-Agent': generate_user_agent(
+        device_type='desktop', os=('mac', 'linux'))}
+    proxy = {}
+    if proxy_enabled:
+        print("\n******getting list of proxies and testing them******'\n")
+        # this is an API call which returns a list of working proxies that get checked every 15 minutes
+        proxies = cycle(get_proxies(5))
+        proxy = next(proxies)
+
+    for index, row in df.iterrows():
+        try:
+            game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}, timeout=5)
+            if game_page.status_code == 200:
+                sub_soup = BeautifulSoup(game_page.text, "lxml")
+                # again, the info box is inconsistent among games so we
+                # have to find all the h2 and traverse from that to the genre
+                gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"})
+                h2s = gamebox.find_all('h2')
+                # make a temporary tag here to search for the one that contains
+                # the word "Genre"
+                temp_tag = element.Tag
+                for h2 in h2s:
+                    if h2.string == 'Genre':
+                        temp_tag = h2
+                df.loc[index, 'Genre'] = temp_tag.next_sibling.string
+
+                # find the ESRB rating
+                game_rating = gamebox.find('img').get('src')
+                if 'esrb' in game_rating:
+                    df.loc[index, 'ESRB_Rating'] = game_rating.split(
+                        '_')[1].split('.')[0].upper()
+                # we successfully got the genre and rating
+                df.loc[index, 'status'] = 1
+                print('Successfully scraped genre and rating for :', df.at[index, 'Name'])
+
+        except(ProxyError):
+            proxy = next(proxies)
+
+        except (ConnectionError, Timeout, ProtocolError):
+            print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later')
+
+        except Exception as e:
+            print('different error occurred while connecting, will pass')
+        # wait for 1 second between every call,
+        # we do not want to get blocked or abuse the server
+        time.sleep(1)
+    return df
+
+
+def retry_game(df):
+    """try to scrape the missing data again"""
+    return parse_genre_esrb(df)
+
+
+if __name__ == "__main__":
+    def process_games(df):
+        failed_games = len(df[df['status'] == 0])
+        NUM_WORKERS = cpu_count() * 2
+        df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS)
+        #update num_workers
+        df_subsets = [i for i in df_subsets if len(i) != 0]
+        NUM_WORKERS = len(df_subsets) # we don't want to have a worker for empty subsets
+        pool = Pool(processes=NUM_WORKERS)
+        results = pool.map(retry_game, df_subsets)
+        try:
+            df_updated = pd.concat(results)
+            df = pd.concat([df[df['status'] == 1], df_updated])
+        except: 
+            print('error occurred while joining dataframe')
+        pool.close()
+        pool.join()
+        return df
+
+    rec_count = 0
+    start_time = time.time()
+    current_time = time.time()
+    crashed_tag = 'before_crashing_'
+    exists = [s for s in os.listdir() if crashed_tag in s]
+    if exists:
+        print("found a data saved from a crash, will continue on it")
+        csvfilename = exists[0].replace(crashed_tag, '')
+        df = pd.read_csv(exists[0])
+        rec_count = df['Rank'].max()
+        page = int(rec_count/1000) + 1 # because we already scraped the current page
+        df = process_games(df)
+    else:
+        csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv"
+
+    # initialize a pandas DataFrame to store all games with the following columns:
+    # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer,
+    # publisher, release year, critic score, user score, na sales, pal sales,
+    # jp sales, other sales, total sales, total shipped, last update, url, status
+    # last two columns for debugging
+    if not exists:
+        df = pd.DataFrame(columns=[
+            'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher',
+            'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score',
+            'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales',
+            'Other_Sales', 'Year', 'Last_Update', 'url', 'status'])
+
+    urlhead = 'http://www.vgchartz.com/games/games.php?page='
+    urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both'
+    urltail += '&banner=Both&showdeleted=&region=All&goty_year=&developer='
+    urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1'
+    urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1'
+    urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1'
+
+    # get the number of pages
+    vglink = requests.get('http://www.vgchartz.com/gamedb/').text
+    x = fromstring(vglink).xpath(
+        "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0]
+    pages = int(x.split(',')[0])
+
+    if not exists: page = 1
+    while True:
+        if page > pages:
+            break
+        try:
+            proxy = get_proxies(1)[0]
+            headers = {'User-Agent': generate_user_agent(
+                device_type='desktop', os=('mac', 'linux'))}
+            surl = urlhead + str(page) + urltail
+            r = requests.get(surl, headers=headers, proxies={
+                            'http': proxy, 'https': proxy}, timeout=10)
+            if r.status_code == 200:
+                soup = BeautifulSoup(r.text, 'lxml')
+                print("******Scraping page " + str(page) + "******'\n")
+
+                # vgchartz website is really weird so we have to search for
+                # <a> tags with game urls
+                game_tags = list(filter(
+                    lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:]
+                # discard the first 10 elements because those
+                # links are in the navigation bar
+
+                parse_games(game_tags)
+                print('\n******begin scraping for Genre and Rating******\n')
+                df = process_games(df)
+                page += 1
+
+        except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError):
+            print('Something went wrong while connecting to page: ',
+                page, ', will try again later')
+            #proxy = get_proxies(1)
+            time.sleep(10)
+
+        except Exception as e:
+            print("something went wrong! We're on page: " +
+                str(page) + '\nSaving successfully crawled data')
+            print("Exception: ", e)
+            df.to_csv(crashed_tag + csvfilename, sep=",",
+                    encoding='utf-8', index=False)
+            raise e
+
+
+    failed_games = len(df[df['status'] == 0])
+    print("******Finished scraping games, will try to scrape missing data******")
+    # 36 hours max, should be enough to scrape everything
+    t_end = start_time + 60 * 60 * 36
+    while True:
+        try:
+            df = process_games(df)
+            failed_games = len(df[df['status'] == 0])
+            if failed_games == 0 or time.time() > t_end:
+                break
+            print('Number of not scraped yet:', failed_games, '\n')
+            time.sleep(10)  # wait for 10 seconds for the server to recover?
+        except Exception as e:
+            print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data')
+            print("Exception: ", e)
+            df.to_csv(crashed_tag + csvfilename, sep=",",
+                    encoding='utf-8', index=False)
+            raise e
+
+    elapsed_time = time.time() - start_time
+    print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.")
+
+    # select only these columns in the final dataset
+    df = df.sort_values(by=['Rank'])
+    df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False)
+    df_final = df[[
+        'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating',
+        'Publisher', 'Developer', 'Critic_Score', 'User_Score',
+        'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']]
+
+    df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False)
+    print("Wrote scraper data to", csvfilename)