Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,12 @@ venv.bak/
# mypy
.mypy_cache/
.vscode/

# csv
*.csv

# ipynb
*.ipynb

# ignore this folder
testing/
42 changes: 35 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,41 @@
vgchartzfull is a python script based on BeautifulSoup.

vgchartzfull is a python script with multiprocessing based on BeautifulSoup.
proxies are implemented in the script, it can be disabled by changing it to False

It creates a dataset based on data from
http://www.vgchartz.com/gamedb/

The dataset is saved as vgsales.csv.
The dataset is saved as vgsales-%Y-%m-%d_%H_%M_%S.csv.

You will need to have the following dependencies installed:
```
BeautifulSoup4
pandas
numpy
requests
unidecode
user_agent
```

Thanks to:
- https://www.kdnuggets.com/2018/02/web-scraping-tutorial-python.html
- http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments
- https://medium.com/datadriveninvestor/speed-up-web-scraping-using-multiprocessing-in-python-af434ff310c5

You will need to have BeautifulSoup added.
It can be installed by pip.

sudo pip install BeautifulSoup
Free proxies:
[1](https://proxyscrape.com/free-proxy-list)
[2](http://multiproxy.org/txt_all/proxy.txt)
[3](https://proxy.rudnkh.me/txt)
[4](https://www.us-proxy.org/)

Thanks to Chris Albon.
http://chrisalbon.com/python/beautiful_soup_scrape_table.html
- [x] added multiprocessing for faster results with a maximum of 24 workers.
- [x] added proxies to avoid being blocked
- [x] handling couple of exceptions
- [x] scraped data gets saved before raising an unexpected error
- [x] add the option to continue where we left off due to an unexpected error
- [x] clean version removes the print statements, should results in better performance!
- [ ] optimize it
- [ ] create a log file
- [ ] convert the script to a class or use scrapy, reference
- https://edmundmartin.com/multi-threaded-crawler-in-python/
273 changes: 273 additions & 0 deletions clean-vgchartzfull.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
from bs4 import BeautifulSoup, element
import pandas as pd
import numpy as np
import requests
import time
import unidecode
from user_agent import generate_user_agent
from proxies_gen import get_proxies, test_proxies
from itertools import cycle
from lxml.html import fromstring
from multiprocessing import Pool, cpu_count # This is a thread-based Pool
from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException
from urllib3.exceptions import ProtocolError
import sys
import os
sys.setrecursionlimit(10000) # need to optimize code.
proxy_enabled = True


def parse_games(game_tags):
"""
parse the games table on current page
parameters:
game_tags: games tags after reading the html page
df: the dataframe where we will store the games
"""
global rec_count
global df
for tag in game_tags:
game = {}
game["Name"] = " ".join(tag.string.split())
#print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name']))

data = tag.parent.parent.find_all("td")
if data:
game["Rank"] = np.int32(data[0].string)
game["img_url"] = data[1].a.img.get('src')
game["url"] = data[2].a.get('href')
if len(game["Name"].split("/")) > 1:
# replace accented chars with ascii
game["basename"] = unidecode.unidecode(
game['Name'].strip().split('/')[0].strip().replace(' ', '-'))
else:
game["basename"] = game["url"].rsplit('/', 2)[1]
game["Platform"] = data[3].img.get('alt')
game["Publisher"] = data[4].get_text().strip()
game["Developer"] = data[5].get_text().strip()
game["Vgchartzscore"] = data[6].get_text().strip()
game["Critic_Score"] = float(
data[7].string) if not data[7].string.startswith("N/A") else np.nan
game["User_Score"] = float(
data[8].string) if not data[8].string.startswith("N/A") else np.nan
game["Total_Shipped"] = float(
data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan
game["Global_Sales"] = float(
data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan
game["NA_Sales"] = float(
data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan
game["PAL_Sales"] = float(
data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan
game["JP_Sales"] = float(
data[13].string[:-1]) if not data[13].string.startswith("N/A") else np.nan
game["Other_Sales"] = float(
data[14].string[:-1]) if not data[14].string.startswith("N/A") else np.nan
year = data[15].string.split()[-1]
if year.startswith('N/A'):
game["Year"] = 'N/A'
else:
if int(year) >= 70:
year_to_add = np.int32("19" + year)
else:
year_to_add = np.int32("20" + year)
game["Year"] = year_to_add
game["Last_Update"] = data[16].get_text().strip()
game['Genre'] = 'N/A'
game['ESRB_Rating'] = 'N/A'
game['status'] = 0
df = df.append(game, ignore_index=True)
rec_count += 1


def parse_genre_esrb(df):
"""loads every game's url to get genre and esrb rating"""
headers = {'User-Agent': generate_user_agent(
device_type='desktop', os=('mac', 'linux'))}
proxy = {}
if proxy_enabled:
#print("\n******getting list of proxies and testing them******'\n")
# this an api call which returns a list of working proxies that get checked evrey 15 minutes
proxies = cycle(get_proxies(5))
proxy = next(proxies)

for index, row in df.iterrows():
try:
game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}, timeout=5)
if game_page.status_code == 200:
sub_soup = BeautifulSoup(game_page.text, "lxml")
# again, the info box is inconsistent among games so we
# have to find all the h2 and traverse from that to the genre
gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"})
h2s = gamebox.find_all('h2')
# make a temporary tag here to search for the one that contains
# the word "Genre"
temp_tag = element.Tag
for h2 in h2s:
if h2.string == 'Genre':
temp_tag = h2
df.loc[index, 'Genre'] = temp_tag.next_sibling.string

# find the ESRB rating
game_rating = gamebox.find('img').get('src')
if 'esrb' in game_rating:
df.loc[index, 'ESRB_Rating'] = game_rating.split(
'_')[1].split('.')[0].upper()
# we successfuly got the genre and rating
df.loc[index, 'status'] = 1
#print('Successfully scraped genre and rating for :', df.at[index, 'Name'])

except(ProxyError):
proxy = next(proxies)

except (ConnectionError, Timeout, ProtocolError, TimeoutError):
#print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later')
continue

except Exception as e:
#print('different error occurred while connecting, will pass')
continue
# wait for 1 seconds between every call,
# we do not want to get blocked or abuse the server
time.sleep(1)
return df


def retry_game(df):
"""try to scrape the missing data again"""
return parse_genre_esrb(df)


if __name__ == "__main__":
def process_games(df):
failed_games = len(df[df['status'] == 0])
NUM_WORKERS = cpu_count() * 2
df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS)
#update num_workers
df_subsets = [i for i in df_subsets if len(i) != 0]
if len(df_subsets) != 0:
NUM_WORKERS = len(df_subsets)# we don't want to have a worker for empty subsets
pool = Pool(processes=NUM_WORKERS)
results = pool.map(retry_game, df_subsets)
try:
df_updated = pd.concat(results)
df = pd.concat([df[df['status'] == 1], df_updated])
except:
print('error occurred while joining dataframe')
pool.close()
pool.join()
return df

rec_count = 0
start_time = time.time()
current_time = time.time()
crashed_tag = 'before_crashing_'
exists = [s for s in os.listdir() if crashed_tag in s]
if exists:
print("found a data saved from a crash, will continue on it")
csvfilename = exists[0].replace(crashed_tag, '')
df = pd.read_csv(exists[0])
rec_count = df['Rank'].max()
page = int(rec_count/1000) + 1 # because we already scraped current
df = process_games(df)
else:
csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv"

# initialize a panda dataframe to store all games with the following columns:
# rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer,
# publisher, release year, critic score, user score, na sales, pal sales,
# jp sales, other sales, total sales, total shipped, last update, url, status
# last two columns for debugging
if not exists:
df = pd.DataFrame(columns=[
'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher',
'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score',
'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales',
'Other_Sales', 'Year', 'Last_Update', 'url', 'status'])

urlhead = 'http://www.vgchartz.com/games/games.php?page='
urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both'
urltail += '&banner=Both&showdeleted=&region=All&goty_year=&developer='
urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1'
urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1'
urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1'

# get the number of pages
vglink = requests.get('http://www.vgchartz.com/gamedb/').text
x = fromstring(vglink).xpath(
"//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0]
pages = int(x.split(',')[0])

if not exists: page = 1
while True:
if page > pages:
break
try:
proxy = get_proxies(1)[0]
headers = {'User-Agent': generate_user_agent(
device_type='desktop', os=('mac', 'linux'))}
surl = urlhead + str(page) + urltail
r = requests.get(surl, headers=headers, proxies={
'http': proxy, 'https': proxy}, timeout=10)
if r.status_code == 200:
soup = BeautifulSoup(r.text, 'lxml')
print("******Scraping page " + str(page) + "******'\n")

# vgchartz website is really weird so we have to search for
# <a> tags with game urls
game_tags = list(filter(
lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:]
# discard the first 10 elements because those
# links are in the navigation bar

parse_games(game_tags)
page += 1
print('\n******begin scraping for Genre and Rating******\n')
df = process_games(df)

except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError, TimeoutError):
print('Something went wrong while connecting to page: ',
page, ', will try again later')
#proxy = get_proxies(1)
time.sleep(10)

except Exception as e:
print("something went wrong! We're on page: " +
str(page) + '\nSaving successfully crawled data')
print("Exception: ", e)
df.to_csv(crashed_tag + csvfilename, sep=",",
encoding='utf-8', index=False)
raise e


failed_games = len(df[df['status'] == 0])
print("******Finished scraping games, will try to scrape missing data******")
# 36 hours max, should be enough to scrape everything
t_end = start_time + 60 * 60 * 36
while True:
try:
df = process_games(df)
failed_games = len(df[df['status'] == 0])
if failed_games == 0 or time.time() > t_end:
break
#print('Number of not scraped yet:', failed_games, '\n')
time.sleep(10) # wait for 10 seconds for the server to recover?
except Exception as e:
print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data')
print("Exception: ", e)
df.to_csv(crashed_tag + csvfilename, sep=",",
encoding='utf-8', index=False)
raise e

elapsed_time = time.time() - start_time
print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.")

# select only these columns in the final dataset
df = df.sort_values(by=['Rank'])
df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False)
df_final = df[[
'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating',
'Publisher', 'Developer', 'Critic_Score', 'User_Score',
'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']]

df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False)
print("Wrote scraper data to", csvfilename)
60 changes: 60 additions & 0 deletions proxies_gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from lxml.html import fromstring
import requests
import numpy as np
from itertools import cycle


def get_proxies(num=None):
# url = 'https://free-proxy-list.net/'
# response = requests.get(url)
# parser = fromstring(response.text)
# proxies = list(requests.get('https://proxy.rudnkh.me/txt').text.split())
# for i in parser.xpath('//tbody/tr'):
# if i.xpath('.//td[7][contains(text(),"yes")]'):
# proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
# proxies.append(proxy)

link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100"
proxies = list(requests.get(link).text.split())
np.random.shuffle(proxies)
proxies = []
if len(proxies) == 0:
proxies = list(requests.get(
link[:-3]+'99').text.split()) # change uptime to 99
np.random.shuffle(proxies)
# print('Found', len(proxies), 'proxies, testing them now')

if num is None:
num = len(proxies)
tested = test_proxies(proxies, num)
return tested


def test_proxies(proxies, num):
url = 'https://httpbin.org/ip'
proxy_pool = cycle(proxies)
working_proxies = []
for i in range(1, len(proxies)):
if num == 0:
break
# Get a proxy from the pool
proxy = next(proxy_pool)
# print("Request #%d" % i)
try:
response = requests.get(
url, proxies={"http": proxy, "https": proxy}, timeout=1)
# print(response.json())
working_proxies.append(proxy)
num -= 1
except:
# Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work.
# print("Skipping. Connnection error")
pass
return working_proxies


# proxies = get_proxies(5)
# # with open('proxies.txt') as f:
# # proxies = f.read().splitlines()
# # test_proxies(proxies, 10)
# print(proxies)
Loading