From 7448af251192663559ee4eec4bad7a5678709795 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Sun, 7 Apr 2019 11:16:43 -0400 Subject: [PATCH 01/18] added headers, sleep, genre, try:except for handling timeout, in process of chaning from urllib to requests --- vgchartzfull.py | 84 ++++++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 32 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index b1d75a4..ea16f00 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -2,8 +2,15 @@ import urllib import pandas as pd import numpy as np +import requests +import time +from user_agent import generate_user_agent -pages = 19 +headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv: 12.0) Gecko/20100101 Firefox/12.0'} +# generate a new header for every new page +headers = {'User-Agent': generate_user_agent(device_type = 'desktop', os=('mac', 'linux'))} + +pages = 56 rec_count = 0 rank = [] gname = [] @@ -19,13 +26,14 @@ sales_jp = [] sales_ot = [] sales_gl = [] +rating = [] -urlhead = 'http://www.vgchartz.com/gamedb/?page=' -urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both' -urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0' -urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1' -urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1' -urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL' +urlhead = 'http://www.vgchartz.com/games/games.php?page=' +urltail = '&results=200&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' +urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' +urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' +urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' +urltail += 
'&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' for page in range(1, pages): surl = urlhead + str(page) + urltail @@ -56,27 +64,27 @@ publisher.append(data[4].string) developer.append(data[5].string) critic_score.append( - float(data[6].string) if - not data[6].string.startswith("N/A") else np.nan) - user_score.append( float(data[7].string) if not data[7].string.startswith("N/A") else np.nan) + user_score.append( + float(data[8].string) if + not data[8].string.startswith("N/A") else np.nan) sales_na.append( - float(data[9].string[:-1]) if - not data[9].string.startswith("N/A") else np.nan) - sales_pal.append( - float(data[10].string[:-1]) if - not data[10].string.startswith("N/A") else np.nan) - sales_jp.append( float(data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan) - sales_ot.append( + sales_pal.append( float(data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan) + sales_jp.append( + float(data[13].string[:-1]) if + not data[13].string.startswith("N/A") else np.nan) + sales_ot.append( + float(data[14].string[:-1]) if + not data[14].string.startswith("N/A") else np.nan) sales_gl.append( - float(data[8].string[:-1]) if - not data[8].string.startswith("N/A") else np.nan) - release_year = data[13].string.split()[-1] + float(data[10].string[:-1]) if + not data[10].string.startswith("N/A") else np.nan) + release_year = data[15].string.split()[-1] # different format for year if release_year.startswith('N/A'): year.append('N/A') @@ -89,19 +97,31 @@ # go to every individual website to get genre info url_to_game = tag.attrs['href'] - site_raw = urllib.request.urlopen(url_to_game).read() - sub_soup = BeautifulSoup(site_raw, "html.parser") - # again, the info box is inconsistent among games so we - # have to find all the h2 and traverse from that to the genre name - h2s = sub_soup.find("div", {"id": "gameGenInfoBox"}).find_all('h2') - # make a temporary tag here to 
search for the one that contains - # the word "Genre" - temp_tag = element.Tag - for h2 in h2s: - if h2.string == 'Genre': - temp_tag = h2 - genre.append(temp_tag.next_sibling.string) + try: + #site_raw = urllib.request.urlopen(url_to_game).read() + site_raw = requests.get(url_to_game, headers=headers) + sub_soup = BeautifulSoup(site_raw.text, "lxml") + # again, the info box is inconsistent among games so we + # have to find all the h2 and traverse from that to the genre name + gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"}) + h2s = gamebox.find_all('h2') + # make a temporary tag here to search for the one that contains + # the word "Genre" + temp_tag = element.Tag + for h2 in h2s: + if h2.string == 'Genre': + temp_tag = h2 + genre.append(temp_tag.next_sibling.string) + #find the ESRB rating + game_rating = gamebox.find('img').get('src') + if 'esrb' in game_rating: + rating.append(game_rating[game_rating.index('esrb'):]) + except: + print('something wrong with game url:', url_to_game, 'code:', site_raw.status_code) + genre.append(np.nan) + rating.append(np.nan) + time.sleep(5) rec_count += 1 columns = { From 68f0e1ee92652f8f30a879b4d4ff3c943e0f63a6 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Sun, 7 Apr 2019 11:19:53 -0400 Subject: [PATCH 02/18] added headers, sleep, genre, try:except for handling timeout, in process of chaning from urllib to requests --- Untitled.ipynb | 519 ++++++++++++++++++++++++++++++++++++++++++++++ bs_test.py | 81 ++++++++ myspider.py | 11 + proxy_list_gen.py | 35 ++++ scraper.py | 14 ++ test.py | 160 ++++++++++++++ untitled | 0 untitled1 | 0 8 files changed, 820 insertions(+) create mode 100644 Untitled.ipynb create mode 100644 bs_test.py create mode 100644 myspider.py create mode 100644 proxy_list_gen.py create mode 100644 scraper.py create mode 100644 test.py create mode 100644 untitled create mode 100644 untitled1 diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..df85878 --- /dev/null 
+++ b/Untitled.ipynb @@ -0,0 +1,519 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup, element\n", + "import urllib\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pages = 56\n", + "rec_count = 0\n", + "rank = []\n", + "gname = []\n", + "platform = []\n", + "year = []\n", + "genre = []\n", + "critic_score = []\n", + "user_score = []\n", + "publisher = []\n", + "developer = []\n", + "sales_na = []\n", + "sales_pal = []\n", + "sales_jp = []\n", + "sales_ot = []\n", + "sales_gl = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "urlhead = 'http://www.vgchartz.com/gamedb/?page='\n", + "urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'\n", + "urltail += '&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'\n", + "urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'\n", + "urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'\n", + "urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "for page in range(1, pages):\n", + " surl = urlhead + str(page) + urltail\n", + " r = urllib.request.urlopen(surl).read()\n", + " soup = BeautifulSoup(r)\n", + " print(f\"Page: {page}\")\n", + "\n", + " # vgchartz website is really weird so we have to search for\n", + " # tags with game urls\n", + " game_tags = list(filter(\n", + " lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),\n", + " # discard the first 10 elements because those\n", + " # links are in the navigation bar\n", + " soup.find_all(\"a\")\n", 
+ " ))[10:]\n", + "\n", + " for tag in game_tags:\n", + "\n", + " # add name to list\n", + " gname.append(\" \".join(tag.string.split()))\n", + " print(f\"{rec_count + 1} Fetch data for game {gname[-1]}\")\n", + "\n", + " # get different attributes\n", + " # traverse up the DOM tree\n", + " data = tag.parent.parent.find_all(\"td\")\n", + " rank.append(np.int32(data[0].string))\n", + " platform.append(data[3].find('img').attrs['alt'])\n", + " publisher.append(data[4].string)\n", + " developer.append(data[5].string)\n", + " critic_score.append(\n", + " float(data[6].string) if\n", + " not data[6].string.startswith(\"N/A\") else np.nan)\n", + " user_score.append(\n", + " float(data[7].string) if\n", + " not data[7].string.startswith(\"N/A\") else np.nan)\n", + " sales_na.append(\n", + " float(data[9].string[:-1]) if\n", + " not data[9].string.startswith(\"N/A\") else np.nan)\n", + " sales_pal.append(\n", + " float(data[10].string[:-1]) if\n", + " not data[10].string.startswith(\"N/A\") else np.nan)\n", + " sales_jp.append(\n", + " float(data[11].string[:-1]) if\n", + " not data[11].string.startswith(\"N/A\") else np.nan)\n", + " sales_ot.append(\n", + " float(data[12].string[:-1]) if\n", + " not data[12].string.startswith(\"N/A\") else np.nan)\n", + " sales_gl.append(\n", + " float(data[8].string[:-1]) if\n", + " not data[8].string.startswith(\"N/A\") else np.nan)\n", + " release_year = data[13].string.split()[-1]\n", + " # different format for year\n", + " if release_year.startswith('N/A'):\n", + " year.append('N/A')\n", + " else:\n", + " if int(release_year) >= 80:\n", + " year_to_add = np.int32(\"19\" + release_year)\n", + " else:\n", + " year_to_add = np.int32(\"20\" + release_year)\n", + " year.append(year_to_add)\n", + "\n", + " # go to every individual website to get genre info\n", + " url_to_game = tag.attrs['href']\n", + " site_raw = urllib.request.urlopen(url_to_game).read()\n", + " sub_soup = BeautifulSoup(site_raw, \"html.parser\")\n", + " # again, the 
info box is inconsistent among games so we\n", + " # have to find all the h2 and traverse from that to the genre name\n", + " h2s = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"}).find_all('h2')\n", + " # make a temporary tag here to search for the one that contains\n", + " # the word \"Genre\"\n", + " temp_tag = element.Tag\n", + " for h2 in h2s:\n", + " if h2.string == 'Genre':\n", + " temp_tag = h2\n", + " genre.append(temp_tag.next_sibling.string)\n", + "\n", + " rec_count += 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "columns = {\n", + " 'Rank': rank,\n", + " 'Name': gname,\n", + " 'Platform': platform,\n", + " 'Year': year,\n", + " 'Genre': genre,\n", + " 'Critic_Score': critic_score,\n", + " 'User_Score': user_score,\n", + " 'Publisher': publisher,\n", + " 'Developer': developer,\n", + " 'NA_Sales': sales_na,\n", + " 'PAL_Sales': sales_pal,\n", + " 'JP_Sales': sales_jp,\n", + " 'Other_Sales': sales_ot,\n", + " 'Global_Sales': sales_gl\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(rec_count)\n", + "df = pd.DataFrame(columns)\n", + "print(df.columns)\n", + "df = df[[\n", + " 'Rank', 'Name', 'Platform', 'Year', 'Genre',\n", + " 'Publisher', 'Developer', 'Critic_Score', 'User_Score',\n", + " 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]\n", + "df.to_csv(\"vgsales.csv\", sep=\",\", encoding='utf-8', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "ename": "RemoteDisconnected", + "evalue": "Remote end closed connection without response", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mRemoteDisconnected\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in 
\u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mgenre\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0murl_to_game\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"http://www.vgchartz.com/game/45608/kinect-adventures/?region=All\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0msite_raw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_to_game\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[0msub_soup\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBeautifulSoup\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msite_raw\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"lxml\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;31m# again, the info box is inconsistent among games so we\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[0;32m 220\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m \u001b[0mopener\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_opener\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 222\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 223\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mopen\u001b[1;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[0;32m 523\u001b[0m \u001b[0mreq\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 524\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 525\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 526\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 527\u001b[0m \u001b[1;31m# post-process response\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36m_open\u001b[1;34m(self, req, data)\u001b[0m\n\u001b[0;32m 541\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 542\u001b[0m result = self._call_chain(self.handle_open, protocol, protocol +\n\u001b[1;32m--> 543\u001b[1;33m '_open', req)\n\u001b[0m\u001b[0;32m 544\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 545\u001b[0m \u001b[1;32mreturn\u001b[0m 
\u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[1;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[0;32m 501\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mhandlers\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 502\u001b[0m \u001b[0mfunc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 503\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 504\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 505\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mhttp_open\u001b[1;34m(self, req)\u001b[0m\n\u001b[0;32m 1343\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1344\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mhttp_open\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1345\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdo_open\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhttp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1346\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1347\u001b[0m \u001b[0mhttp_request\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAbstractHTTPHandler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdo_request_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mdo_open\u001b[1;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[0;32m 1318\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# timeout error\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1319\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mURLError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1320\u001b[1;33m \u001b[0mr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1321\u001b[0m \u001b[1;32mexcept\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1322\u001b[0m \u001b[0mh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\http\\client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1319\u001b[0m 
\u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1320\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1321\u001b[1;33m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1322\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1323\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\http\\client.py\u001b[0m in \u001b[0;36mbegin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 294\u001b[0m \u001b[1;31m# read until we get a non-100 response\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 295\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 296\u001b[1;33m \u001b[0mversion\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 297\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 298\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mc:\\python37\\lib\\http\\client.py\u001b[0m in 
\u001b[0;36m_read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 263\u001b[0m \u001b[1;31m# Presumably, the server closed the connection before\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 264\u001b[0m \u001b[1;31m# sending a valid response.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 265\u001b[1;33m raise RemoteDisconnected(\"Remote end closed connection without\"\n\u001b[0m\u001b[0;32m 266\u001b[0m \" response\")\n\u001b[0;32m 267\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mRemoteDisconnected\u001b[0m: Remote end closed connection without response" + ] + } + ], + "source": [ + "rating = []\n", + "genre = []\n", + "url_to_game = \"http://www.vgchartz.com/game/45608/kinect-adventures/?region=All\"\n", + "site_raw = urllib.request.urlopen(url_to_game).read()\n", + "sub_soup = BeautifulSoup(site_raw, \"lxml\")\n", + "# again, the info box is inconsistent among games so we\n", + "# have to find all the h2 and traverse from that to the genre name\n", + "gamebox = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"})\n", + "h2s = gamebox.find_all('h2')\n", + "# make a temporary tag here to search for the one that contains\n", + "# the word \"Genre\"\n", + "temp_tag = element.Tag\n", + "for h2 in h2s:\n", + " if h2.string == 'Genre':\n", + " temp_tag = h2\n", + "genre.append(temp_tag.next_sibling.string)\n", + "\n", + "#find the ESRB rating\n", + "game_rating = gamebox.find('img').get('src')\n", + "if 'esrb' in game_rating:\n", + " rating.append(game_rating[game_rating.index('esrb'):])\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "rating = []\n", + "links = sub_soup.find_all('img')\n", + "for i in sub_soup.find_all('img'):\n", + " rate = i.get('src')\n", + " try:\n", + " if 'ESRB' in rate:\n", + " 
rating.append(rate[rate.index('esrb'):])\n", + " break\n", + " except:\n", + " pass\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'bytes' object has no attribute 'getcode'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_to_game\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetcode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m: 'bytes' object has no attribute 'getcode'" + ] + } + ], + "source": [ + "urllib.request.urlopen(url_to_game).read().getcode()" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for h2 in h2s:\n", + " if h2.string == 'Ratings':\n", + " temp_tag1 = h2\n", + " if h2.string == 'Genre':\n", + " temp_tag = h2\n", + "genre.append(temp_tag.next_sibling.string)\n", + "rating.append(temp_tag1.nextSibling)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['esrb/ESRB_e.png']" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rating" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Racing']" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genre" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " #go to game page to parse genre and rating\n", + " try:\n", + " site_raw = urllib.request.urlopen(game[\"url\"]).read()\n", + " sub_soup = BeautifulSoup(site_raw, \"lxml\")\n", + " # again, the info box is inconsistent among games so we\n", + " # have to find all the h2 and traverse from that to the genre name\n", + " gamebox = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"})\n", + " h2s = gamebox.find_all('h2')\n", + " # make a temporary tag here to search for the one that contains\n", + " # the word \"Genre\"\n", + " temp_tag = element.Tag\n", + " for h2 in h2s:\n", + " if h2.string == 'Genre':\n", + " temp_tag = h2\n", + " game[\"genre\"] = temp_tag.next_sibling.string\n", + "\n", + " #find the ESRB rating\n", + " game_rating = gamebox.find('img').get('src')\n", + " if 'esrb' in game_rating:\n", + " game[\"vg_rating\"] = game_rating[game_rating.index('esrb'):]\n", + " except: \n", + " game[\"genre\"] = 'N/A'\n", + " game[\"vg_rating\"] = 'N/A'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "from user_agent import generate_user_agent\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " try:\n", + " #site_raw = urllib.request.urlopen(url_to_game).read()\n", + " site_row = requests.get(url_to_game, headers={\n", + " 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv: 12.0) Gecko/20100101 Firefox/12.0'}).text\n", + " sub_soup = BeautifulSoup(site_raw, \"lxml\")\n", + " # again, the info box is inconsistent among games 
so we\n", + " # have to find all the h2 and traverse from that to the genre name\n", + " gamebox = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"})\n", + " h2s = gamebox.find_all('h2')\n", + " # make a temporary tag here to search for the one that contains\n", + " # the word \"Genre\"\n", + " temp_tag = element.Tag\n", + " for h2 in h2s:\n", + " if h2.string == 'Genre':\n", + " temp_tag = h2\n", + " genre.append(temp_tag.next_sibling.string)\n", + " #find the ESRB rating\n", + " game_rating = gamebox.find('img').get('src')\n", + " if 'esrb' in game_rating:\n", + " rating.append(game_rating[game_rating.index('esrb'):])\n", + " except:\n", + " print('something wrong with game url:', url_to_game)\n", + " genre.append(np.nan)\n", + " rating.append(np.nan)\n", + "\n", + " time.sleep(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'103.84.254.190:56505'}\n" + ] + } + ], + "source": [ + "from lxml.html import fromstring\n", + "import requests\n", + "from itertools import cycle\n", + "import traceback\n", + "\n", + "def get_proxies():\n", + " url = 'https://free-proxy-list.net/'\n", + " response = requests.get(url)\n", + " parser = fromstring(response.text)\n", + " proxies = set()\n", + " for i in parser.xpath('//tbody/tr')[:20]:\n", + " if i.xpath('.//td[7][contains(text(),\"yes\")]'):\n", + " proxy = \":\".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])\n", + " proxies.add(proxy)\n", + " return proxies\n", + "\n", + "\n", + "#If you are copy pasting proxy ips, put in the list below\n", + "#proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']\n", + "proxies = get_proxies()\n", + "proxy_pool = cycle(proxies)\n", + "print(proxies)\n", + "\n", + "url = 'https://httpbin.org/ip'\n", + "for i in range(1,len(proxies)):\n", + " 
#Get a proxy from the pool\n", + " proxy = next(proxy_pool)\n", + " print(\"Request #%d\"%i)\n", + " try:\n", + " response = requests.get(url,proxies={\"http\": proxy, \"https\": proxy})\n", + " print(response.json())\n", + " except:\n", + " #Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. \n", + " #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url \n", + " print(\"Skipping. Connnection error\")" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "56.0" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"http://www.vgchartz.com/games/games.php?page=280&results=200&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both&boxart=Both&banner=Both&showdeleted=®ion=All&goty_year=&developer=&direction=DESC&showtotalsales=0&shownasales=0&showpalsales=0&showjapansales=0&showothersales=0&showpublisher=1&showdeveloper=0&showreleasedate=1&showlastupdate=1&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=No\"\n", + "page = requests.get(url).text\n", + "x = fromstring(page).xpath(\"//th[@colspan='3']/text()\")[0].split('(', 1)[1].split(')')[0]\n", + "np.ceil(int(x.replace(',',\"\"))/1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/bs_test.py 
b/bs_test.py new file mode 100644 index 0000000..cc2dfef --- /dev/null +++ b/bs_test.py @@ -0,0 +1,81 @@ +from bs4 import BeautifulSoup, element +#import urllib.request +import requests + + +# link = "https://pythonprogramming.net/parsememcparseface/" +# sauce = urllib.request.urlopen(link).read() +# soup = bs.BeautifulSoup(sauce, 'lxml') + +# #print(soup.title.text) + +# # for paragraph in soup.find_all("p"): +# # print(paragraph.string) + +# pages = 56 +# rec_count = 0 +# rank = [] +# gname = [] +# platform = [] +# year = [] +# genre = [] +# critic_score = [] +# user_score = [] +# publisher = [] +# developer = [] +# sales_na = [] +# sales_pal = [] +# sales_jp = [] +# sales_ot = [] +# sales_gl = [] +# rating = [] + +# urlhead = 'http://www.vgchartz.com/games/games.php?page=' +# urltail = '&results=200&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' +# urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' +# urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' +# urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' +# urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' + +# for page in range(1, 3): +# surl = urlhead + str(page) + urltail +# r = urllib.request.urlopen(surl).read() +# soup = bs.BeautifulSoup(r) +# print(f"Page: {page}") + +# # vgchartz website is really weird so we have to search for +# # tags with game urls +# game_tags = list(filter( +# lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), +# # discard the first 10 elements because those +# # links are in the navigation bar +# soup.find_all("a") +# ))[10:] + +# print(game_tags) + + +rating = [] +genre = [] +url_to_game = "http://www.vgchartz.com/game/6968/mario-kart-wii/?region=All" +site_raw = requests.get(url_to_game, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv: 12.0) 
Gecko/20100101 Firefox/12.0'}).text +sub_soup = BeautifulSoup(site_raw, "lxml") +# again, the info box is inconsistent among games so we +# have to find all the h2 and traverse from that to the genre name +gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"}) +h2s = gamebox.find_all('h2') +# make a temporary tag here to search for the one that contains +# the word "Genre" +temp_tag = element.Tag +for h2 in h2s: + if h2.string == 'Genre': + temp_tag = h2 +genre.append(temp_tag.next_sibling.string) + +#find the ESRB rating +game_rating = gamebox.find('img').get('src') +if 'esrb' in game_rating: + rating.append(game_rating[game_rating.index('esrb'):]) + +print(rating) +print(genre) diff --git a/myspider.py b/myspider.py new file mode 100644 index 0000000..24cd1fd --- /dev/null +++ b/myspider.py @@ -0,0 +1,11 @@ +import scrapy +class BlogSpider(scrapy.Spider): + name = 'blogspider' + start_urls = ['https://blog.scrapinghub.com'] + + def parse(self, response): + for title in response.css('.post-header>h2'): + yield {'title': title.css('a ::text').get()} + + for next_page in response.css('a.next-posts-link'): + yield response.follow(next_page, self.parse) \ No newline at end of file diff --git a/proxy_list_gen.py b/proxy_list_gen.py new file mode 100644 index 0000000..ca03379 --- /dev/null +++ b/proxy_list_gen.py @@ -0,0 +1,35 @@ +from lxml.html import fromstring +import requests +from itertools import cycle +import traceback + +def get_proxies(): + url = 'https://free-proxy-list.net/' + response = requests.get(url) + parser = fromstring(response.text) + proxies = set() + for i in parser.xpath('//tbody/tr'): + if i.xpath('.//td[7][contains(text(),"yes")]'): + proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]]) + proxies.add(proxy) + return proxies + + +#If you are copy pasting proxy ips, put in the list below +#proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', 
'163.172.175.210:3128', '13.92.196.150:8080'] +proxies = get_proxies() +proxy_pool = cycle(proxies) +print(proxies) + +url = 'https://httpbin.org/ip' +for i in range(1,len(proxies)): + #Get a proxy from the pool + proxy = next(proxy_pool) + print("Request #%d"%i) + try: + response = requests.get(url,proxies={"http": proxy, "https": proxy}) + print(response.json()) + except: + #Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. + #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url + print("Skipping. Connnection error") \ No newline at end of file diff --git a/scraper.py b/scraper.py new file mode 100644 index 0000000..e175d5e --- /dev/null +++ b/scraper.py @@ -0,0 +1,14 @@ +import requests +from bs4 import BeautifulSoup +page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html") +#print(page.content) + +soup = BeautifulSoup(page.content, 'html.parser') +#print(soup.prettify()) + +#number of pages +pages = 279 + +page1 = requests.get("http://www.vgchartz.com/gamedb/games.php?name=&keyword=&console=®ion=All&developer=&publisher=&goty_year=&genre=&boxart=Both&banner=Both&ownership=Both&showmultiplat=Yes&results=200&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&showvgchartzscore=1&shownasales=0&shownasales=1&showdeveloper=0&showdeveloper=1&showcriticscore=0&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=0&showreleasedate=1&showuserscore=0&showuserscore=1&showjapansales=0&showjapansales=1&showlastupdate=0&showlastupdate=1&showothersales=0&showothersales=1&showshipped=0&showshipped=1") +soup1 = BeautifulSoup(page1.content, 'html.parser') +print(list(soup1.children)) \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..8eae906 --- /dev/null +++ b/test.py @@ -0,0 +1,160 @@ +from bs4 import BeautifulSoup #for web 
scraping +import urllib #for opening urls +import pandas as pd # for data frames +import time # for time +import datetime # for dates + +#global constants +gc_dflt_max_time = 300 # seconds +gc_output_filename ='vgsales' +gc_output_filetype ='csv' + +#global variables +dflt_max_rec = 10 +data_list = [] +pages = 0 +items_scraped = 0 +data_exists = True + +class NotPositiveError(UserWarning): + pass + +# Define a function row of type game +# Argument passed in must be of type BeautifulSoup row +def vgchartz_parse(row): + game = {} + #get columns from the row we passed in + cols = row.find_all("td") + if cols: + # Start to build our data grid, defining appropriate names + game["rank"] =cols[0].string.strip() + game["gname"] =cols[2].text.strip() + game["platform"] =cols[3].find('img').get('alt') + game["developer"] =cols[4].string.strip() + game["publisher"] =cols[5].string.strip() + game["criticscore"] =cols[6].string.strip() + game["userscore"] =cols[7].string.strip() + game["sales_tot"] =cols[8].string.strip() + game["sales_na"] =cols[9].string.strip() + game["sales_eu"] =cols[10].string.strip() + game["sales_jp"] =cols[11].string.strip() + game["sales_ot"] =cols[12].string.strip() + game["year"] =cols[13].string.strip() + game["lst_update"] =cols[14].string.strip() + + return game + +#Get maximum number of records from the user. 
+#Try-Except conditions to ensure that +# i) Figure is a number +# ii) Figure is Positive +# iii) If no entry is made (use to revert to our default in dflt_max_rec) +while True: + try: + print('Enter the maximum number of records to scrape') + in_max = input() + if (in_max.strip() == ""): + print('Running for maximum '+str(dflt_max_rec)+' records') + break + elif int(in_max) <= 0: + #make sure value is positive, otherwise raise user defined error + raise NotPositiveError + break + elif int(in_max) > 0: + #update variable with new value + dflt_max_rec = in_max + print('Running for '+str(in_max)+' records') + break + + except ValueError: + print("This was not a number, please try again.") + except NotPositiveError: + print("The number was not positive, please try again.") + +start_time = time.time() +print('Starting scrape...') + +while data_exists: + + #This is the page number used in the url query. + pages +=1 + + #concat url to include page number + url = 'http://www.vgchartz.com/gamedb/?page=' + str(pages) + '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both&boxart=Both&showdeleted=®ion=All&developer=&goty_year=&alphasort=&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1&showvgchartzscore=0&showcriticscore=1&showuserscore=1' + #url='file:GameDataScrape/vgchartz.htm'#save the site locally first for development purposes instead of hitting the site + #open url + r = urllib.request.urlopen(url).read() + + #use beautiful soup + soup = BeautifulSoup(r, "html.parser") + + divparent = soup.find('div', id = 'generalBody') + + table = divparent.find('table', width = '968') + + #Skipping to 3rd row from VGChartz to avoid menu rows and increment max_dat + for row in table.find_all('tr')[3:]: + + #create a row of the type we desire + vg_game_info = vgchartz_parse(row) + + #Gets values in named columns + columns = {'Rank': vg_game_info['rank'], 
'Name': vg_game_info['gname'], \ + 'Platform': vg_game_info['platform'], 'Developer': vg_game_info['developer'],\ + 'Publisher': vg_game_info['publisher'], 'Critic_Score': vg_game_info['criticscore'],\ + 'User_Score': vg_game_info['userscore'],'Global_Sales':vg_game_info['sales_tot'],\ + 'NA_Sales':vg_game_info['sales_na'], 'EU_Sales': vg_game_info['sales_eu'],\ + 'JP_Sales': vg_game_info['sales_jp'],'Other_Sales':vg_game_info['sales_ot'], \ + 'Year': vg_game_info['year']} + + items_scraped +=1 + + #What are our end conditions to break out the loop?? + #1. End if the maximum number of records is reached which is determined by + # i) global constant default value in 'dflt_max_rec' + # ii) over-ridden 'gc_default_max' with new max + if (int(dflt_max_rec) == items_scraped): + print('***Reached max data limit, ending scrape process...') + data_exists = False + break + + #2. End of we've been running for too long, arbitrary number stored in max_time_limit + elif (round(time.time() - start_time, 2)>gc_dflt_max_time): + print('***Reached max time limit, ending scrape process...') + data_exists = False + break + + #3. 
End if games do not have any sale data, not interested otherwise + # i) Bespoke to your purpose, update or remove + if (vg_game_info["sales_tot"] == "0.00" or vg_game_info["sales_tot"] == "0.00m"): + print('***No more relvent data available, ending scrape process...') + data_exists = False + break + + #append to a list of data so we can save this row for later + data_list.append(columns) + +print('...Scrape completed') +print() +print('Now writing to file') + +#Use pandas create data frame from our games list +df = pd.DataFrame(data_list) + +#list of columns +df = df[['Rank','Name','Platform','Publisher','Developer','Critic_Score','User_Score','Global_Sales','NA_Sales','EU_Sales','JP_Sales','Other_Sales','Year']] + +del df.index.name + +#write out to file +filename = gc_output_filename+'-' + datetime.datetime.now().strftime("%Y%m%d-%H_%M_%S") + '.'+gc_output_filetype +df.to_csv(filename,sep=",",encoding='utf-8') +print ('Writing scraped data to', filename) + +elapsed_time = time.time() - start_time +print() +print('Filewrite completed') +print() +print('Record Count: '+str(items_scraped)) +print() +print( 'Scraped', items_scraped, 'records over',pages, 'pages in', round(elapsed_time, 2), 'seconds.') \ No newline at end of file diff --git a/untitled b/untitled new file mode 100644 index 0000000..e69de29 diff --git a/untitled1 b/untitled1 new file mode 100644 index 0000000..e69de29 From e81ab331d58ce2839b3bc151974371cbe588c324 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Tue, 9 Apr 2019 06:58:02 -0400 Subject: [PATCH 03/18] making progress --- .gitignore | 3 + proxy_list_gen.py | 35 ------ vgchartzfull.py | 311 ++++++++++++++++++++++++++++------------------ 3 files changed, 196 insertions(+), 153 deletions(-) delete mode 100644 proxy_list_gen.py diff --git a/.gitignore b/.gitignore index be2baa1..e28bf8d 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,6 @@ venv.bak/ # mypy .mypy_cache/ .vscode/ + +# csv +*.csv \ No newline at end of file diff 
--git a/proxy_list_gen.py b/proxy_list_gen.py deleted file mode 100644 index ca03379..0000000 --- a/proxy_list_gen.py +++ /dev/null @@ -1,35 +0,0 @@ -from lxml.html import fromstring -import requests -from itertools import cycle -import traceback - -def get_proxies(): - url = 'https://free-proxy-list.net/' - response = requests.get(url) - parser = fromstring(response.text) - proxies = set() - for i in parser.xpath('//tbody/tr'): - if i.xpath('.//td[7][contains(text(),"yes")]'): - proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]]) - proxies.add(proxy) - return proxies - - -#If you are copy pasting proxy ips, put in the list below -#proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080'] -proxies = get_proxies() -proxy_pool = cycle(proxies) -print(proxies) - -url = 'https://httpbin.org/ip' -for i in range(1,len(proxies)): - #Get a proxy from the pool - proxy = next(proxy_pool) - print("Request #%d"%i) - try: - response = requests.get(url,proxies={"http": proxy, "https": proxy}) - print(response.json()) - except: - #Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. - #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url - print("Skipping. 
Connnection error") \ No newline at end of file diff --git a/vgchartzfull.py b/vgchartzfull.py index ea16f00..d0221f4 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -1,45 +1,187 @@ from bs4 import BeautifulSoup, element -import urllib import pandas as pd import numpy as np import requests -import time +import time +import unidecode from user_agent import generate_user_agent +from proxies_gen import get_proxies +from itertools import cycle +from lxml.html import fromstring +# import threading +from multiprocessing import Pool # This is a thread-based Pool +from multiprocessing import cpu_count -headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv: 12.0) Gecko/20100101 Firefox/12.0'} -# generate a new header for every new page -headers = {'User-Agent': generate_user_agent(device_type = 'desktop', os=('mac', 'linux'))} -pages = 56 rec_count = 0 -rank = [] -gname = [] -platform = [] -year = [] -genre = [] -critic_score = [] -user_score = [] -publisher = [] -developer = [] -sales_na = [] -sales_pal = [] -sales_jp = [] -sales_ot = [] -sales_gl = [] -rating = [] +thread_counter = 0 +start_time = time.time() +current_time = time.time() +csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" + + +# initialize a panda dataframe to store all games with the following columns: +# rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer, +# publisher, release year, critic score, user score, na sales, pal sales, +# jp sales, other sales, total sales, total shipped, last update, url, status +# last two columns for debugging + +df = pd.DataFrame(columns=[ + 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', + 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score', + 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', + 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) urlhead = 'http://www.vgchartz.com/games/games.php?page=' -urltail = 
'&results=200&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' +urltail = '&results=20&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' -for page in range(1, pages): + +def parse_games(game_tags): + """ + parse the games table on current page + parameters: + game_tags: games tags after reading the html page + df: the dataframe where we will store the games + """ + global rec_count + global df + for tag in game_tags: + game = {} + game["Name"] = " ".join(tag.string.split()) + # print(f"{rec_count + 1} Fetch data for game {game['Name']}") + print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name'])) + + data = tag.parent.parent.find_all("td") + if data: + game["Rank"] = np.int32(data[0].string) + game["img_url"] = data[1].a.img.get('src') + game["url"] = data[2].a.get('href') + if len(game["Name"].split("/")) > 1: + # replace accented chars with ascii + game["basename"] = unidecode.unidecode( + game['Name'].strip().split('/')[0].strip().replace(' ', '-')) + else: + game["basename"] = game["url"].rsplit('/', 2)[1] + game["Platform"] = data[3].img.get('alt') + game["Publisher"] = data[4].get_text().strip() + game["Developer"] = data[5].get_text().strip() + game["Vgchartzscore"] = data[6].get_text().strip() + game["Critic_Score"] = float( + data[7].string) if not data[7].string.startswith("N/A") else np.nan + game["User_Score"] = float( + data[8].string) if not data[8].string.startswith("N/A") else np.nan + game["Total_Shipped"] = float( + data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan + game["Global_Sales"] = 
float( + data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan + game["NA_Sales"] = float( + data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan + game["PAL_Sales"] = float( + data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan + game["JP_Sales"] = float( + data[13].string[:-1]) if not data[13].string.startswith("N/A") else np.nan + game["Other_Sales"] = float( + data[14].string[:-1]) if not data[14].string.startswith("N/A") else np.nan + year = data[15].string.split()[-1] + if year.startswith('N/A'): + game["Year"] = 'N/A' + else: + if int(year) >= 80: + year_to_add = np.int32("19" + year) + else: + year_to_add = np.int32("20" + year) + game["Year"] = year_to_add + game["Last_Update"] = data[16].get_text().strip() + game['Genre'] = 'N/A' + game['ESRB_Rating'] = 'N/A' + game['status'] = 0 + df = df.append(game, ignore_index=True) + rec_count += 1 + + +def parse_genre_esrb(df): + """ + loads every game's url to get genre and esrb rating + """ + count = 0 + # global thread_counter + # thread_counter += 1 + + headers = {'User-Agent': generate_user_agent( + device_type='desktop', os=('mac', 'linux'))} + + proxies = get_proxies() + proxy = cycle(proxies) + + for index, row in df.loc[df['status'] == 0].iterrows(): + # we only want to scrape 200 games at a time. 
+ if count == 200: + break + try: + # uses global headers and proxies + game_page = requests.get( + df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}) + if game_page.status_code == 200: + sub_soup = BeautifulSoup(game_page.text, "lxml") + # again, the info box is inconsistent among games so we + # have to find all the h2 and traverse from that to the genre + gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"}) + h2s = gamebox.find_all('h2') + # make a temporary tag here to search for the one that contains + # the word "Genre" + temp_tag = element.Tag + for h2 in h2s: + if h2.string == 'Genre': + temp_tag = h2 + df.at[index, 'Genre'] = temp_tag.next_sibling.string + + # find the ESRB rating + game_rating = gamebox.find('img').get('src') + if 'esrb' in game_rating: + df.at[index, 'ESRB_Rating'] = game_rating.split( + '_')[1].split('.')[0].upper() + + # we successfuly got the genre and rating if available + df.at[index, 'status'] = 1 + print('Successfully scraped genre and rating for :', df.at[index, 'Name']) + except: + print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') + # probably something went wrong the proxy? + proxy = next(proxies) + + # wait for 2 seconds between every call + time.sleep(2) + count += 1 + + +# def retry_games(): +# """try to scrape the missing data again""" +# global df +# # run every 5 minutes +# t = threading.Timer(300.0, retry_games) +# t.start() +# print("Starting to scrape missing data") +# if len(df[df['status'] == 0]) == 0: +# t.cancel() +# else: +# parse_genre_esrb() + + +# get the number of pages +page = requests.get('http://www.vgchartz.com/gamedb/').text +x = fromstring(page).xpath("//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] +pages = np.ceil(int(x.replace(',', ""))/1000) + +pages = 2 +for page in range(1, pages): # pages = 2 for debugging! 
surl = urlhead + str(page) + urltail - r = urllib.request.urlopen(surl).read() - soup = BeautifulSoup(r) - print(f"Page: {page}") + r = requests.get(surl).text + soup = BeautifulSoup(r, 'lxml') + print('Scraping page:', page) # vgchartz website is really weird so we have to search for # tags with game urls @@ -49,102 +191,35 @@ # links are in the navigation bar soup.find_all("a") ))[10:] + parse_games(game_tags) - for tag in game_tags: - # add name to list - gname.append(" ".join(tag.string.split())) - print(f"{rec_count + 1} Fetch data for game {gname[-1]}") +# # this should repeatedly try scrape genre and rating until it reaches this number of calls - # get different attributes - # traverse up the DOM tree - data = tag.parent.parent.find_all("td") - rank.append(np.int32(data[0].string)) - platform.append(data[3].find('img').attrs['alt']) - publisher.append(data[4].string) - developer.append(data[5].string) - critic_score.append( - float(data[7].string) if - not data[7].string.startswith("N/A") else np.nan) - user_score.append( - float(data[8].string) if - not data[8].string.startswith("N/A") else np.nan) - sales_na.append( - float(data[11].string[:-1]) if - not data[11].string.startswith("N/A") else np.nan) - sales_pal.append( - float(data[12].string[:-1]) if - not data[12].string.startswith("N/A") else np.nan) - sales_jp.append( - float(data[13].string[:-1]) if - not data[13].string.startswith("N/A") else np.nan) - sales_ot.append( - float(data[14].string[:-1]) if - not data[14].string.startswith("N/A") else np.nan) - sales_gl.append( - float(data[10].string[:-1]) if - not data[10].string.startswith("N/A") else np.nan) - release_year = data[15].string.split()[-1] - # different format for year - if release_year.startswith('N/A'): - year.append('N/A') - else: - if int(release_year) >= 80: - year_to_add = np.int32("19" + release_year) - else: - year_to_add = np.int32("20" + release_year) - year.append(year_to_add) +NUM_WORKERS = cpu_count() * 2 +while 
len(df.loc[df['status'] == 0]) > 0: + chunks = NUM_WORKERS // len(df.loc[df['status'] == 0]) + df_subsets = np.array_split( + df, chunks) if chunks != 0 else np.array_split(df, 1) + if __name__ == "__main__": + with Pool(NUM_WORKERS) as p: + p.map(parse_genre_esrb, df_subsets) + + ## add sleep, maybe proxies and headers here? better right? - # go to every individual website to get genre info - url_to_game = tag.attrs['href'] - try: - #site_raw = urllib.request.urlopen(url_to_game).read() - site_raw = requests.get(url_to_game, headers=headers) - sub_soup = BeautifulSoup(site_raw.text, "lxml") - # again, the info box is inconsistent among games so we - # have to find all the h2 and traverse from that to the genre name - gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"}) - h2s = gamebox.find_all('h2') - # make a temporary tag here to search for the one that contains - # the word "Genre" - temp_tag = element.Tag - for h2 in h2s: - if h2.string == 'Genre': - temp_tag = h2 - genre.append(temp_tag.next_sibling.string) - #find the ESRB rating - game_rating = gamebox.find('img').get('src') - if 'esrb' in game_rating: - rating.append(game_rating[game_rating.index('esrb'):]) - except: - print('something wrong with game url:', url_to_game, 'code:', site_raw.status_code) - genre.append(np.nan) - rating.append(np.nan) - time.sleep(5) - rec_count += 1 +# chunk_size = int(df.shape[0] / 4) +# for start in range(0, df.shape[0], chunk_size): +# df_subset = df.iloc[start:start + chunk_size] +# process_data(df_subset) -columns = { - 'Rank': rank, - 'Name': gname, - 'Platform': platform, - 'Year': year, - 'Genre': genre, - 'Critic_Score': critic_score, - 'User_Score': user_score, - 'Publisher': publisher, - 'Developer': developer, - 'NA_Sales': sales_na, - 'PAL_Sales': sales_pal, - 'JP_Sales': sales_jp, - 'Other_Sales': sales_ot, - 'Global_Sales': sales_gl -} -print(rec_count) -df = pd.DataFrame(columns) -print(df.columns) -df = df[[ - 'Rank', 'Name', 'Platform', 'Year', 'Genre', 
+# select only these columns in the final dataset +df_final = df[[ + 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', 'Publisher', 'Developer', 'Critic_Score', 'User_Score', - 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] -df.to_csv("vgsales.csv", sep=",", encoding='utf-8', index=False) + 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']] +df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False) + +elapsed_time = time.time() - start_time +print("Scraped", rec_count, "games in", round(elapsed_time, 2), "seconds.") +print("Wrote scraper data to", csvfilename) From 29d6aadcd095a51781a230ecdb6b5db1e6ea0645 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Tue, 9 Apr 2019 10:18:36 -0400 Subject: [PATCH 04/18] multiprocessing works, after second thought will change to threading --- vgchartzfull.py | 138 ++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 75 deletions(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index d0221f4..98f06c0 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -5,16 +5,15 @@ import time import unidecode from user_agent import generate_user_agent -from proxies_gen import get_proxies +from proxies_gen import get_proxies, test_proxies from itertools import cycle from lxml.html import fromstring -# import threading from multiprocessing import Pool # This is a thread-based Pool -from multiprocessing import cpu_count - +from requests.exceptions import ConnectionError, Timeout, ProxyError +import sys +sys.setrecursionlimit(10000) # need to optimize code.
rec_count = 0 -thread_counter = 0 start_time = time.time() current_time = time.time() csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" @@ -25,7 +24,6 @@ # publisher, release year, critic score, user score, na sales, pal sales, # jp sales, other sales, total sales, total shipped, last update, url, status # last two columns for debugging - df = pd.DataFrame(columns=[ 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score', @@ -52,7 +50,6 @@ def parse_games(game_tags): for tag in game_tags: game = {} game["Name"] = " ".join(tag.string.split()) - # print(f"{rec_count + 1} Fetch data for game {game['Name']}") print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name'])) data = tag.parent.parent.find_all("td") @@ -107,24 +104,21 @@ def parse_genre_esrb(df): """ loads every game's url to get genre and esrb rating """ - count = 0 - # global thread_counter - # thread_counter += 1 headers = {'User-Agent': generate_user_agent( - device_type='desktop', os=('mac', 'linux'))} - - proxies = get_proxies() - proxy = cycle(proxies) - - for index, row in df.loc[df['status'] == 0].iterrows(): - # we only want to scrape 200 games at a time. 
- if count == 200: - break + device_type='desktop', os=('mac', 'linux'))} + + print("'\n'******getting list of proxies and testing them******'\n'") + #proxies = set(requests.get('https://proxy.rudnkh.me/txt').text.split()) + #proxies = get_proxies() + # proxies = test_proxies(proxies) + #proxy = cycle(proxies) + print('******begin scraping for Genre and Rating******') + + for index, row in df.iterrows(): try: - # uses global headers and proxies - game_page = requests.get( - df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}) + #game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}) + game_page = requests.get(df.at[index, 'url']) if game_page.status_code == 200: sub_soup = BeautifulSoup(game_page.text, "lxml") # again, the info box is inconsistent among games so we @@ -137,39 +131,31 @@ def parse_genre_esrb(df): for h2 in h2s: if h2.string == 'Genre': temp_tag = h2 - df.at[index, 'Genre'] = temp_tag.next_sibling.string + df.loc[index, 'Genre'] = temp_tag.next_sibling.string # find the ESRB rating game_rating = gamebox.find('img').get('src') if 'esrb' in game_rating: - df.at[index, 'ESRB_Rating'] = game_rating.split( + df.loc[index, 'ESRB_Rating'] = game_rating.split( '_')[1].split('.')[0].upper() - - # we successfuly got the genre and rating if available - df.at[index, 'status'] = 1 + # we successfuly got the genre and rating + df.loc[index, 'status'] = 1 print('Successfully scraped genre and rating for :', df.at[index, 'Name']) - except: - print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') - # probably something went wrong the proxy? 
- proxy = next(proxies) - - # wait for 2 seconds between every call - time.sleep(2) - count += 1 + #else: + #proxies.remove(proxy) + #proxy = next(proxies) + except (ConnectionError, Timeout): + print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') -# def retry_games(): -# """try to scrape the missing data again""" -# global df -# # run every 5 minutes -# t = threading.Timer(300.0, retry_games) -# t.start() -# print("Starting to scrape missing data") -# if len(df[df['status'] == 0]) == 0: -# t.cancel() -# else: -# parse_genre_esrb() + #except(ProxyError): + #proxies.remove(proxy) + #proxy = next(proxies) + # wait for 2 seconds between every call, + # we do not want to get blocked or abuse the server + time.sleep(2) + return df # get the number of pages page = requests.get('http://www.vgchartz.com/gamedb/').text @@ -194,32 +180,34 @@ def parse_genre_esrb(df): parse_games(game_tags) -# # this should repeatedly try scrape genre and rating until it reaches this number of calls - -NUM_WORKERS = cpu_count() * 2 -while len(df.loc[df['status'] == 0]) > 0: - chunks = NUM_WORKERS // len(df.loc[df['status'] == 0]) - df_subsets = np.array_split( - df, chunks) if chunks != 0 else np.array_split(df, 1) - if __name__ == "__main__": - with Pool(NUM_WORKERS) as p: - p.map(parse_genre_esrb, df_subsets) - - ## add sleep, maybe proxies and headers here? better right? 
- - -# chunk_size = int(df.shape[0] / 4) -# for start in range(0, df.shape[0], chunk_size): -# df_subset = df.iloc[start:start + chunk_size] -# process_data(df_subset) - -# select only these columns in the final dataset -df_final = df[[ - 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', - 'Publisher', 'Developer', 'Critic_Score', 'User_Score', - 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']] -df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False) - -elapsed_time = time.time() - start_time -print("Scraped", rec_count, "games in", round(elapsed_time, 2), "seconds.") -print("Wrote scraper data to", csvfilename) +def retry_game(): + """try to scrape the missing data again""" + global df + failed_games = len(df['status'] == 0) + # every worker can have 100 games at max + NUM_WORKERS = int(np.ceil(failed_games/100)) + df_subsets = np.array_split(df, NUM_WORKERS) + print(df_subsets) + pool = Pool(processes=NUM_WORKERS) + result = pool.map(parse_genre_esrb, df_subsets) + updated_df = pd.concat([i for i in result if not i.empty]) + pool.close() + pool.join() + return updated_df if len(updated_df) > 0 else df + + +if __name__ == "__main__": + while len(df['status']) > 0 or time.time() - start_time >= 300: # change to one day + df = retry_game() + #df = retry_game() + elapsed_time = time.time() - start_time + print("Scraped", rec_count, "games in", round(elapsed_time, 2), "seconds.") + + # select only these columns in the final dataset + df_final = df[[ + 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', + 'Publisher', 'Developer', 'Critic_Score', 'User_Score', + 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']] + + df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False) + print("Wrote scraper data to", csvfilename) From dd2df6f71e4365e6444d03f335860c45771b8eb7 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Tue, 9 Apr 2019 12:54:29 -0400 Subject: [PATCH 05/18] fixed couple of 
things --- proxies_gen.py | 45 +++++++++++++++ vgchartzfull.py | 146 +++++++++++++++++++++++++++--------------------- 2 files changed, 127 insertions(+), 64 deletions(-) create mode 100644 proxies_gen.py diff --git a/proxies_gen.py b/proxies_gen.py new file mode 100644 index 0000000..93b45ac --- /dev/null +++ b/proxies_gen.py @@ -0,0 +1,45 @@ +from lxml.html import fromstring +import requests +from itertools import cycle + + +def get_proxies(): + url = 'https://free-proxy-list.net/' + response = requests.get(url) + parser = fromstring(response.text) + proxies = set() + for i in parser.xpath('//tbody/tr'): + if i.xpath('.//td[7][contains(text(),"yes")]'): + proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]]) + proxies.add(proxy) + + tested = test_proxies(proxies) + return tested + + +# proxies = get_proxies() +# proxy_pool = cycle(proxies) +# print(proxies) + + +def test_proxies(proxies): + url = 'https://httpbin.org/ip' + proxy_pool = cycle(proxies) + working_proxies = set() + count = 0 + for i in range(1, len(proxies)): + count += 1 + if count == 10: + break + # Get a proxy from the pool + proxy = next(proxy_pool) + print("Request #%d" % i) + try: + response = requests.get(url, proxies = {"http": proxy, "https": proxy}) + print(response.json()) + working_proxies.add(proxy) + except: + # Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. + # We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url + print("Skipping. Connnection error") + return working_proxies diff --git a/vgchartzfull.py b/vgchartzfull.py index 98f06c0..50907e8 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -13,31 +13,6 @@ import sys sys.setrecursionlimit(10000) # need to optimize code. 
-rec_count = 0 -start_time = time.time() -current_time = time.time() -csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" - - -# initialize a panda dataframe to store all games with the following columns: -# rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer, -# publisher, release year, critic score, user score, na sales, pal sales, -# jp sales, other sales, total sales, total shipped, last update, url, status -# last two columns for debugging -df = pd.DataFrame(columns=[ - 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', - 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score', - 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', - 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) - -urlhead = 'http://www.vgchartz.com/games/games.php?page=' -urltail = '&results=20&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' -urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' -urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' -urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' -urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' - - def parse_games(game_tags): """ parse the games table on current page @@ -157,53 +132,96 @@ def parse_genre_esrb(df): time.sleep(2) return df -# get the number of pages -page = requests.get('http://www.vgchartz.com/gamedb/').text -x = fromstring(page).xpath("//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] -pages = np.ceil(int(x.replace(',', ""))/1000) - -pages = 2 -for page in range(1, pages): # pages = 2 for debugging! 
- surl = urlhead + str(page) + urltail - r = requests.get(surl).text - soup = BeautifulSoup(r, 'lxml') - print('Scraping page:', page) - - # vgchartz website is really weird so we have to search for - # tags with game urls - game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), - # discard the first 10 elements because those - # links are in the navigation bar - soup.find_all("a") - ))[10:] - parse_games(game_tags) - - -def retry_game(): +def retry_game(df): """try to scrape the missing data again""" - global df - failed_games = len(df['status'] == 0) - # every worker can have 100 games at max - NUM_WORKERS = int(np.ceil(failed_games/100)) - df_subsets = np.array_split(df, NUM_WORKERS) - print(df_subsets) - pool = Pool(processes=NUM_WORKERS) - result = pool.map(parse_genre_esrb, df_subsets) - updated_df = pd.concat([i for i in result if not i.empty]) - pool.close() - pool.join() - return updated_df if len(updated_df) > 0 else df + # global df + # failed_games = len(df['status'] == 0) + # # every worker can have 100 games at max + # NUM_WORKERS = int(np.ceil(failed_games/100)) + # df_subsets = np.array_split(df, NUM_WORKERS) + # if df is None: + # return None + # pool = Pool(processes=NUM_WORKERS) + # #result = pool.map(parse_genre_esrb, df_subsets) + # updated_df = pd.concat(pool.map(parse_genre_esrb, df_subsets)) + # pool.close() + # pool.join() + # return updated_df + + return parse_genre_esrb(df) + if __name__ == "__main__": - while len(df['status']) > 0 or time.time() - start_time >= 300: # change to one day - df = retry_game() - #df = retry_game() + rec_count = 0 + start_time = time.time() + current_time = time.time() + csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" + + + # initialize a panda dataframe to store all games with the following columns: + # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer, + # publisher, release year, critic score, user score, na sales, 
pal sales, + # jp sales, other sales, total sales, total shipped, last update, url, status + # last two columns for debugging + df = pd.DataFrame(columns=[ + 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', + 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score', + 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', + 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) + + urlhead = 'http://www.vgchartz.com/games/games.php?page=' + urltail = '&results=10&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' + urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' + urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' + urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' + urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' + + + # get the number of pages + page = requests.get('http://www.vgchartz.com/gamedb/').text + x = fromstring(page).xpath( + "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] + pages = np.ceil(int(x.replace(',', ""))/1000) + + pages = 3 + for page in range(1, pages): # pages = 2 for debugging! + surl = urlhead + str(page) + urltail + r = requests.get(surl).text + soup = BeautifulSoup(r, 'lxml') + print('Scraping page:', page) + + # vgchartz website is really weird so we have to search for + # tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), + # discard the first 10 elements because those + # links are in the navigation bar + soup.find_all("a") + ))[10:] + + parse_games(game_tags) + df = retry_game(df) + + failed_games = len(df[df['status'] == 0]) + while failed_games > 0 :# or 300 - (time.time() - start_time) % 60 == 300: # change to one day timing does not work for some reason! 
+ # every worker can have 100 games at max + NUM_WORKERS = int(np.ceil(failed_games/100)) + df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) + pool = Pool(processes=NUM_WORKERS) + df_updated = pd.concat(pool.map(retry_game, df_subsets)) + df = pd.concat([df[df['status'] == 1], df_updated]) + pool.close() + pool.join() + failed_games = len(df[df['status'] == 0]) + print('Number of not scraped yet:', failed_games) + time.sleep(30) + elapsed_time = time.time() - start_time print("Scraped", rec_count, "games in", round(elapsed_time, 2), "seconds.") # select only these columns in the final dataset + df = df.sort_index() df_final = df[[ 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', 'Publisher', 'Developer', 'Critic_Score', 'User_Score', From cdd36c4f5749f8f7dc75ee15a192750c9d9cc0d9 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 05:08:15 -0400 Subject: [PATCH 06/18] finished --- .gitignore | 8 +- README.md | 21 +- Untitled.ipynb | 519 ------------------------------------------------ bs_test.py | 81 -------- myspider.py | 11 - proxies_gen.py | 42 ++-- scraper.py | 14 -- test.py | 160 --------------- untitled | 0 untitled1 | 0 vgchartzfull.py | 129 ++++++------ 11 files changed, 108 insertions(+), 877 deletions(-) delete mode 100644 Untitled.ipynb delete mode 100644 bs_test.py delete mode 100644 myspider.py delete mode 100644 scraper.py delete mode 100644 test.py delete mode 100644 untitled delete mode 100644 untitled1 diff --git a/.gitignore b/.gitignore index e28bf8d..ffbec06 100644 --- a/.gitignore +++ b/.gitignore @@ -105,4 +105,10 @@ venv.bak/ .vscode/ # csv -*.csv \ No newline at end of file +*.csv + +# ipynb +*.ipynb + +# ignore this folder +testing/ \ No newline at end of file diff --git a/README.md b/README.md index e557119..2020882 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,22 @@ -vgchartzfull is a python script based on BeautifulSoup. 
+ +vgchartzfull is a python script with multiprocessing based on BeautifulSoup. +proxies are implemented in the script but it is disabled due to free proxies are unreliable and could results in running the program longer. It can be enabled by changing it to True + It creates a dataset based on data from http://www.vgchartz.com/gamedb/ -The dataset is saved as vgsales.csv. - -You will need to have BeautifulSoup added. -It can be installed by pip. +The dataset is saved as vgsales-%Y-%m-%d_%H_%M_%S.csv. -sudo pip install BeautifulSoup +You will need to have the following dependencies installed: +``` +BeautifulSoup4 +pandas +numpy +requests +unidecode +user_agent +``` Thanks to Chris Albon. http://chrisalbon.com/python/beautiful_soup_scrape_table.html +https://www.kdnuggets.com/2018/02/web-scraping-tutorial-python.html \ No newline at end of file diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index df85878..0000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,519 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from bs4 import BeautifulSoup, element\n", - "import urllib\n", - "import pandas as pd\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "pages = 56\n", - "rec_count = 0\n", - "rank = []\n", - "gname = []\n", - "platform = []\n", - "year = []\n", - "genre = []\n", - "critic_score = []\n", - "user_score = []\n", - "publisher = []\n", - "developer = []\n", - "sales_na = []\n", - "sales_pal = []\n", - "sales_jp = []\n", - "sales_ot = []\n", - "sales_gl = []" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "urlhead = 'http://www.vgchartz.com/gamedb/?page='\n", - "urltail = '&console=®ion=All&developer=&publisher=&genre=&boxart=Both&ownership=Both'\n", - "urltail += 
'&results=1000&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0'\n", - "urltail += '&showpublisher=1&showvgchartzscore=0&shownasales=1&showdeveloper=1&showcriticscore=1'\n", - "urltail += '&showpalsales=0&showpalsales=1&showreleasedate=1&showuserscore=1&showjapansales=1'\n", - "urltail += '&showlastupdate=0&showothersales=1&showgenre=1&sort=GL'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "for page in range(1, pages):\n", - " surl = urlhead + str(page) + urltail\n", - " r = urllib.request.urlopen(surl).read()\n", - " soup = BeautifulSoup(r)\n", - " print(f\"Page: {page}\")\n", - "\n", - " # vgchartz website is really weird so we have to search for\n", - " # tags with game urls\n", - " game_tags = list(filter(\n", - " lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'),\n", - " # discard the first 10 elements because those\n", - " # links are in the navigation bar\n", - " soup.find_all(\"a\")\n", - " ))[10:]\n", - "\n", - " for tag in game_tags:\n", - "\n", - " # add name to list\n", - " gname.append(\" \".join(tag.string.split()))\n", - " print(f\"{rec_count + 1} Fetch data for game {gname[-1]}\")\n", - "\n", - " # get different attributes\n", - " # traverse up the DOM tree\n", - " data = tag.parent.parent.find_all(\"td\")\n", - " rank.append(np.int32(data[0].string))\n", - " platform.append(data[3].find('img').attrs['alt'])\n", - " publisher.append(data[4].string)\n", - " developer.append(data[5].string)\n", - " critic_score.append(\n", - " float(data[6].string) if\n", - " not data[6].string.startswith(\"N/A\") else np.nan)\n", - " user_score.append(\n", - " float(data[7].string) if\n", - " not data[7].string.startswith(\"N/A\") else np.nan)\n", - " sales_na.append(\n", - " float(data[9].string[:-1]) if\n", - " not data[9].string.startswith(\"N/A\") else np.nan)\n", - " sales_pal.append(\n", - " float(data[10].string[:-1]) if\n", - " not 
data[10].string.startswith(\"N/A\") else np.nan)\n", - " sales_jp.append(\n", - " float(data[11].string[:-1]) if\n", - " not data[11].string.startswith(\"N/A\") else np.nan)\n", - " sales_ot.append(\n", - " float(data[12].string[:-1]) if\n", - " not data[12].string.startswith(\"N/A\") else np.nan)\n", - " sales_gl.append(\n", - " float(data[8].string[:-1]) if\n", - " not data[8].string.startswith(\"N/A\") else np.nan)\n", - " release_year = data[13].string.split()[-1]\n", - " # different format for year\n", - " if release_year.startswith('N/A'):\n", - " year.append('N/A')\n", - " else:\n", - " if int(release_year) >= 80:\n", - " year_to_add = np.int32(\"19\" + release_year)\n", - " else:\n", - " year_to_add = np.int32(\"20\" + release_year)\n", - " year.append(year_to_add)\n", - "\n", - " # go to every individual website to get genre info\n", - " url_to_game = tag.attrs['href']\n", - " site_raw = urllib.request.urlopen(url_to_game).read()\n", - " sub_soup = BeautifulSoup(site_raw, \"html.parser\")\n", - " # again, the info box is inconsistent among games so we\n", - " # have to find all the h2 and traverse from that to the genre name\n", - " h2s = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"}).find_all('h2')\n", - " # make a temporary tag here to search for the one that contains\n", - " # the word \"Genre\"\n", - " temp_tag = element.Tag\n", - " for h2 in h2s:\n", - " if h2.string == 'Genre':\n", - " temp_tag = h2\n", - " genre.append(temp_tag.next_sibling.string)\n", - "\n", - " rec_count += 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns = {\n", - " 'Rank': rank,\n", - " 'Name': gname,\n", - " 'Platform': platform,\n", - " 'Year': year,\n", - " 'Genre': genre,\n", - " 'Critic_Score': critic_score,\n", - " 'User_Score': user_score,\n", - " 'Publisher': publisher,\n", - " 'Developer': developer,\n", - " 'NA_Sales': sales_na,\n", - " 'PAL_Sales': sales_pal,\n", - " 'JP_Sales': 
sales_jp,\n", - " 'Other_Sales': sales_ot,\n", - " 'Global_Sales': sales_gl\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(rec_count)\n", - "df = pd.DataFrame(columns)\n", - "print(df.columns)\n", - "df = df[[\n", - " 'Rank', 'Name', 'Platform', 'Year', 'Genre',\n", - " 'Publisher', 'Developer', 'Critic_Score', 'User_Score',\n", - " 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']]\n", - "df.to_csv(\"vgsales.csv\", sep=\",\", encoding='utf-8', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [ - { - "ename": "RemoteDisconnected", - "evalue": "Remote end closed connection without response", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mRemoteDisconnected\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0mgenre\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0murl_to_game\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"http://www.vgchartz.com/game/45608/kinect-adventures/?region=All\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0msite_raw\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_to_game\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[0msub_soup\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mBeautifulSoup\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msite_raw\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"lxml\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;31m# again, the info box is inconsistent among games so we\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(url, data, timeout, cafile, capath, cadefault, context)\u001b[0m\n\u001b[0;32m 220\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 221\u001b[0m \u001b[0mopener\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_opener\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 222\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mopener\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 223\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 224\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0minstall_opener\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopener\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mopen\u001b[1;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[0;32m 523\u001b[0m \u001b[0mreq\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmeth\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 524\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 525\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_open\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 526\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 527\u001b[0m \u001b[1;31m# post-process response\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36m_open\u001b[1;34m(self, req, data)\u001b[0m\n\u001b[0;32m 541\u001b[0m \u001b[0mprotocol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtype\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 542\u001b[0m result = self._call_chain(self.handle_open, protocol, protocol +\n\u001b[1;32m--> 543\u001b[1;33m '_open', req)\n\u001b[0m\u001b[0;32m 544\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 545\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36m_call_chain\u001b[1;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[0;32m 501\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mhandler\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mhandlers\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 502\u001b[0m \u001b[0mfunc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhandler\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmeth_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 503\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mfunc\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 504\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 505\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mhttp_open\u001b[1;34m(self, req)\u001b[0m\n\u001b[0;32m 1343\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1344\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mhttp_open\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1345\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdo_open\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhttp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclient\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mHTTPConnection\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1346\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1347\u001b[0m \u001b[0mhttp_request\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAbstractHTTPHandler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdo_request_\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\urllib\\request.py\u001b[0m in \u001b[0;36mdo_open\u001b[1;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[0;32m 1318\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;31m# timeout 
error\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1319\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mURLError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1320\u001b[1;33m \u001b[0mr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1321\u001b[0m \u001b[1;32mexcept\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1322\u001b[0m \u001b[0mh\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\http\\client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1319\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1320\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1321\u001b[1;33m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1322\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1323\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\http\\client.py\u001b[0m in \u001b[0;36mbegin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 294\u001b[0m \u001b[1;31m# read until we get a non-100 
response\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 295\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 296\u001b[1;33m \u001b[0mversion\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 297\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 298\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32mc:\\python37\\lib\\http\\client.py\u001b[0m in \u001b[0;36m_read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 263\u001b[0m \u001b[1;31m# Presumably, the server closed the connection before\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 264\u001b[0m \u001b[1;31m# sending a valid response.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 265\u001b[1;33m raise RemoteDisconnected(\"Remote end closed connection without\"\n\u001b[0m\u001b[0;32m 266\u001b[0m \" response\")\n\u001b[0;32m 267\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mRemoteDisconnected\u001b[0m: Remote end closed connection without response" - ] - } - ], - "source": [ - "rating = []\n", - "genre = []\n", - "url_to_game = \"http://www.vgchartz.com/game/45608/kinect-adventures/?region=All\"\n", - "site_raw = urllib.request.urlopen(url_to_game).read()\n", - "sub_soup = BeautifulSoup(site_raw, \"lxml\")\n", - "# again, the 
info box is inconsistent among games so we\n", - "# have to find all the h2 and traverse from that to the genre name\n", - "gamebox = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"})\n", - "h2s = gamebox.find_all('h2')\n", - "# make a temporary tag here to search for the one that contains\n", - "# the word \"Genre\"\n", - "temp_tag = element.Tag\n", - "for h2 in h2s:\n", - " if h2.string == 'Genre':\n", - " temp_tag = h2\n", - "genre.append(temp_tag.next_sibling.string)\n", - "\n", - "#find the ESRB rating\n", - "game_rating = gamebox.find('img').get('src')\n", - "if 'esrb' in game_rating:\n", - " rating.append(game_rating[game_rating.index('esrb'):])\n", - " \n" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "rating = []\n", - "links = sub_soup.find_all('img')\n", - "for i in sub_soup.find_all('img'):\n", - " rate = i.get('src')\n", - " try:\n", - " if 'ESRB' in rate:\n", - " rating.append(rate[rate.index('esrb'):])\n", - " break\n", - " except:\n", - " pass\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'bytes' object has no attribute 'getcode'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0murllib\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0murlopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl_to_game\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetcode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - 
"\u001b[1;31mAttributeError\u001b[0m: 'bytes' object has no attribute 'getcode'" - ] - } - ], - "source": [ - "urllib.request.urlopen(url_to_game).read().getcode()" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 47, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "for h2 in h2s:\n", - " if h2.string == 'Ratings':\n", - " temp_tag1 = h2\n", - " if h2.string == 'Genre':\n", - " temp_tag = h2\n", - "genre.append(temp_tag.next_sibling.string)\n", - "rating.append(temp_tag1.nextSibling)" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['esrb/ESRB_e.png']" - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rating" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Racing']" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "genre" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - " #go to game page to parse genre and rating\n", - " try:\n", - " site_raw = urllib.request.urlopen(game[\"url\"]).read()\n", - " sub_soup = BeautifulSoup(site_raw, \"lxml\")\n", - " # again, the info box is inconsistent among games so we\n", - " # have to find all the h2 and traverse from that to the genre name\n", - " gamebox = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"})\n", - " h2s = gamebox.find_all('h2')\n", - " # make a temporary tag here to search for the one that contains\n", - " # the word \"Genre\"\n", - " temp_tag = element.Tag\n", - " for h2 in h2s:\n", - " if h2.string == 'Genre':\n", - " temp_tag = h2\n", - " game[\"genre\"] = temp_tag.next_sibling.string\n", - "\n", - " #find the ESRB 
rating\n", - " game_rating = gamebox.find('img').get('src')\n", - " if 'esrb' in game_rating:\n", - " game[\"vg_rating\"] = game_rating[game_rating.index('esrb'):]\n", - " except: \n", - " game[\"genre\"] = 'N/A'\n", - " game[\"vg_rating\"] = 'N/A'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [], - "source": [ - "from user_agent import generate_user_agent\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - " try:\n", - " #site_raw = urllib.request.urlopen(url_to_game).read()\n", - " site_row = requests.get(url_to_game, headers={\n", - " 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv: 12.0) Gecko/20100101 Firefox/12.0'}).text\n", - " sub_soup = BeautifulSoup(site_raw, \"lxml\")\n", - " # again, the info box is inconsistent among games so we\n", - " # have to find all the h2 and traverse from that to the genre name\n", - " gamebox = sub_soup.find(\"div\", {\"id\": \"gameGenInfoBox\"})\n", - " h2s = gamebox.find_all('h2')\n", - " # make a temporary tag here to search for the one that contains\n", - " # the word \"Genre\"\n", - " temp_tag = element.Tag\n", - " for h2 in h2s:\n", - " if h2.string == 'Genre':\n", - " temp_tag = h2\n", - " genre.append(temp_tag.next_sibling.string)\n", - " #find the ESRB rating\n", - " game_rating = gamebox.find('img').get('src')\n", - " if 'esrb' in game_rating:\n", - " rating.append(game_rating[game_rating.index('esrb'):])\n", - " except:\n", - " print('something wrong with game url:', url_to_game)\n", - " genre.append(np.nan)\n", - " rating.append(np.nan)\n", - "\n", - " time.sleep(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'103.84.254.190:56505'}\n" - ] - } - ], - "source": [ - "from lxml.html 
import fromstring\n", - "import requests\n", - "from itertools import cycle\n", - "import traceback\n", - "\n", - "def get_proxies():\n", - " url = 'https://free-proxy-list.net/'\n", - " response = requests.get(url)\n", - " parser = fromstring(response.text)\n", - " proxies = set()\n", - " for i in parser.xpath('//tbody/tr')[:20]:\n", - " if i.xpath('.//td[7][contains(text(),\"yes\")]'):\n", - " proxy = \":\".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])\n", - " proxies.add(proxy)\n", - " return proxies\n", - "\n", - "\n", - "#If you are copy pasting proxy ips, put in the list below\n", - "#proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']\n", - "proxies = get_proxies()\n", - "proxy_pool = cycle(proxies)\n", - "print(proxies)\n", - "\n", - "url = 'https://httpbin.org/ip'\n", - "for i in range(1,len(proxies)):\n", - " #Get a proxy from the pool\n", - " proxy = next(proxy_pool)\n", - " print(\"Request #%d\"%i)\n", - " try:\n", - " response = requests.get(url,proxies={\"http\": proxy, \"https\": proxy})\n", - " print(response.json())\n", - " except:\n", - " #Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. \n", - " #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url \n", - " print(\"Skipping. 
Connnection error\")" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "56.0" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "url = \"http://www.vgchartz.com/games/games.php?page=280&results=200&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both&boxart=Both&banner=Both&showdeleted=®ion=All&goty_year=&developer=&direction=DESC&showtotalsales=0&shownasales=0&showpalsales=0&showjapansales=0&showothersales=0&showpublisher=1&showdeveloper=0&showreleasedate=1&showlastupdate=1&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=No\"\n", - "page = requests.get(url).text\n", - "x = fromstring(page).xpath(\"//th[@colspan='3']/text()\")[0].split('(', 1)[1].split(')')[0]\n", - "np.ceil(int(x.replace(',',\"\"))/1000)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/bs_test.py b/bs_test.py deleted file mode 100644 index cc2dfef..0000000 --- a/bs_test.py +++ /dev/null @@ -1,81 +0,0 @@ -from bs4 import BeautifulSoup, element -#import urllib.request -import requests - - -# link = "https://pythonprogramming.net/parsememcparseface/" -# sauce = urllib.request.urlopen(link).read() -# soup = bs.BeautifulSoup(sauce, 'lxml') - -# #print(soup.title.text) - -# # for paragraph in soup.find_all("p"): -# # print(paragraph.string) - -# pages = 56 -# rec_count = 0 -# rank = [] -# gname = [] -# platform 
= [] -# year = [] -# genre = [] -# critic_score = [] -# user_score = [] -# publisher = [] -# developer = [] -# sales_na = [] -# sales_pal = [] -# sales_jp = [] -# sales_ot = [] -# sales_gl = [] -# rating = [] - -# urlhead = 'http://www.vgchartz.com/games/games.php?page=' -# urltail = '&results=200&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' -# urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' -# urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' -# urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' -# urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' - -# for page in range(1, 3): -# surl = urlhead + str(page) + urltail -# r = urllib.request.urlopen(surl).read() -# soup = bs.BeautifulSoup(r) -# print(f"Page: {page}") - -# # vgchartz website is really weird so we have to search for -# # tags with game urls -# game_tags = list(filter( -# lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), -# # discard the first 10 elements because those -# # links are in the navigation bar -# soup.find_all("a") -# ))[10:] - -# print(game_tags) - - -rating = [] -genre = [] -url_to_game = "http://www.vgchartz.com/game/6968/mario-kart-wii/?region=All" -site_raw = requests.get(url_to_game, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv: 12.0) Gecko/20100101 Firefox/12.0'}).text -sub_soup = BeautifulSoup(site_raw, "lxml") -# again, the info box is inconsistent among games so we -# have to find all the h2 and traverse from that to the genre name -gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"}) -h2s = gamebox.find_all('h2') -# make a temporary tag here to search for the one that contains -# the word "Genre" -temp_tag = element.Tag -for h2 in h2s: - if h2.string == 'Genre': - temp_tag = h2 -genre.append(temp_tag.next_sibling.string) - -#find the ESRB 
rating -game_rating = gamebox.find('img').get('src') -if 'esrb' in game_rating: - rating.append(game_rating[game_rating.index('esrb'):]) - -print(rating) -print(genre) diff --git a/myspider.py b/myspider.py deleted file mode 100644 index 24cd1fd..0000000 --- a/myspider.py +++ /dev/null @@ -1,11 +0,0 @@ -import scrapy -class BlogSpider(scrapy.Spider): - name = 'blogspider' - start_urls = ['https://blog.scrapinghub.com'] - - def parse(self, response): - for title in response.css('.post-header>h2'): - yield {'title': title.css('a ::text').get()} - - for next_page in response.css('a.next-posts-link'): - yield response.follow(next_page, self.parse) \ No newline at end of file diff --git a/proxies_gen.py b/proxies_gen.py index 93b45ac..1228ec0 100644 --- a/proxies_gen.py +++ b/proxies_gen.py @@ -1,45 +1,49 @@ from lxml.html import fromstring import requests +import numpy as np from itertools import cycle -def get_proxies(): +def get_proxies(num=None): url = 'https://free-proxy-list.net/' response = requests.get(url) parser = fromstring(response.text) - proxies = set() + proxies = list(requests.get('https://proxy.rudnkh.me/txt').text.split()) for i in parser.xpath('//tbody/tr'): if i.xpath('.//td[7][contains(text(),"yes")]'): proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]]) - proxies.add(proxy) - - tested = test_proxies(proxies) + proxies.append(proxy) + print('Found', len(proxies), 'proxies, testing them now') + # proxies = list(requests.get( + # 'http://multiproxy.org/txt_all/proxy.txt').text.split()) + if num is None: + num = len(proxies) + tested = test_proxies(proxies, num) return tested -# proxies = get_proxies() -# proxy_pool = cycle(proxies) -# print(proxies) - - -def test_proxies(proxies): +def test_proxies(proxies, num): url = 'https://httpbin.org/ip' proxy_pool = cycle(proxies) - working_proxies = set() - count = 0 + working_proxies = [] for i in range(1, len(proxies)): - count += 1 - if count == 10: - break + if num == 0: break 
# Get a proxy from the pool proxy = next(proxy_pool) print("Request #%d" % i) try: - response = requests.get(url, proxies = {"http": proxy, "https": proxy}) + response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout = 1) print(response.json()) - working_proxies.add(proxy) + working_proxies.append(proxy) + num -= 1 except: # Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. - # We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url print("Skipping. Connnection error") return working_proxies + + +#proxies = get_proxies(5) +# with open('proxies.txt') as f: +# proxies = f.read().splitlines() +# test_proxies(proxies, 10) +# print(proxies) diff --git a/scraper.py b/scraper.py deleted file mode 100644 index e175d5e..0000000 --- a/scraper.py +++ /dev/null @@ -1,14 +0,0 @@ -import requests -from bs4 import BeautifulSoup -page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html") -#print(page.content) - -soup = BeautifulSoup(page.content, 'html.parser') -#print(soup.prettify()) - -#number of pages -pages = 279 - -page1 = requests.get("http://www.vgchartz.com/gamedb/games.php?name=&keyword=&console=®ion=All&developer=&publisher=&goty_year=&genre=&boxart=Both&banner=Both&ownership=Both&showmultiplat=Yes&results=200&order=Sales&showtotalsales=0&showtotalsales=1&showpublisher=0&showpublisher=1&showvgchartzscore=0&showvgchartzscore=1&shownasales=0&shownasales=1&showdeveloper=0&showdeveloper=1&showcriticscore=0&showcriticscore=1&showpalsales=0&showpalsales=1&showreleasedate=0&showreleasedate=1&showuserscore=0&showuserscore=1&showjapansales=0&showjapansales=1&showlastupdate=0&showlastupdate=1&showothersales=0&showothersales=1&showshipped=0&showshipped=1") -soup1 = BeautifulSoup(page1.content, 'html.parser') -print(list(soup1.children)) \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 
100644 index 8eae906..0000000 --- a/test.py +++ /dev/null @@ -1,160 +0,0 @@ -from bs4 import BeautifulSoup #for web scraping -import urllib #for opening urls -import pandas as pd # for data frames -import time # for time -import datetime # for dates - -#global constants -gc_dflt_max_time = 300 # seconds -gc_output_filename ='vgsales' -gc_output_filetype ='csv' - -#global variables -dflt_max_rec = 10 -data_list = [] -pages = 0 -items_scraped = 0 -data_exists = True - -class NotPositiveError(UserWarning): - pass - -# Define a function row of type game -# Argument passed in must be of type BeautifulSoup row -def vgchartz_parse(row): - game = {} - #get columns from the row we passed in - cols = row.find_all("td") - if cols: - # Start to build our data grid, defining appropriate names - game["rank"] =cols[0].string.strip() - game["gname"] =cols[2].text.strip() - game["platform"] =cols[3].find('img').get('alt') - game["developer"] =cols[4].string.strip() - game["publisher"] =cols[5].string.strip() - game["criticscore"] =cols[6].string.strip() - game["userscore"] =cols[7].string.strip() - game["sales_tot"] =cols[8].string.strip() - game["sales_na"] =cols[9].string.strip() - game["sales_eu"] =cols[10].string.strip() - game["sales_jp"] =cols[11].string.strip() - game["sales_ot"] =cols[12].string.strip() - game["year"] =cols[13].string.strip() - game["lst_update"] =cols[14].string.strip() - - return game - -#Get maximum number of records from the user. 
-#Try-Except conditions to ensure that -# i) Figure is a number -# ii) Figure is Positive -# iii) If no entry is made (use to revert to our default in dflt_max_rec) -while True: - try: - print('Enter the maximum number of records to scrape') - in_max = input() - if (in_max.strip() == ""): - print('Running for maximum '+str(dflt_max_rec)+' records') - break - elif int(in_max) <= 0: - #make sure value is positive, otherwise raise user defined error - raise NotPositiveError - break - elif int(in_max) > 0: - #update variable with new value - dflt_max_rec = in_max - print('Running for '+str(in_max)+' records') - break - - except ValueError: - print("This was not a number, please try again.") - except NotPositiveError: - print("The number was not positive, please try again.") - -start_time = time.time() -print('Starting scrape...') - -while data_exists: - - #This is the page number used in the url query. - pages +=1 - - #concat url to include page number - url = 'http://www.vgchartz.com/gamedb/?page=' + str(pages) + '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both&boxart=Both&showdeleted=®ion=All&developer=&goty_year=&alphasort=&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1&showvgchartzscore=0&showcriticscore=1&showuserscore=1' - #url='file:GameDataScrape/vgchartz.htm'#save the site locally first for development purposes instead of hitting the site - #open url - r = urllib.request.urlopen(url).read() - - #use beautiful soup - soup = BeautifulSoup(r, "html.parser") - - divparent = soup.find('div', id = 'generalBody') - - table = divparent.find('table', width = '968') - - #Skipping to 3rd row from VGChartz to avoid menu rows and increment max_dat - for row in table.find_all('tr')[3:]: - - #create a row of the type we desire - vg_game_info = vgchartz_parse(row) - - #Gets values in named columns - columns = {'Rank': vg_game_info['rank'], 
'Name': vg_game_info['gname'], \ - 'Platform': vg_game_info['platform'], 'Developer': vg_game_info['developer'],\ - 'Publisher': vg_game_info['publisher'], 'Critic_Score': vg_game_info['criticscore'],\ - 'User_Score': vg_game_info['userscore'],'Global_Sales':vg_game_info['sales_tot'],\ - 'NA_Sales':vg_game_info['sales_na'], 'EU_Sales': vg_game_info['sales_eu'],\ - 'JP_Sales': vg_game_info['sales_jp'],'Other_Sales':vg_game_info['sales_ot'], \ - 'Year': vg_game_info['year']} - - items_scraped +=1 - - #What are our end conditions to break out the loop?? - #1. End if the maximum number of records is reached which is determined by - # i) global constant default value in 'dflt_max_rec' - # ii) over-ridden 'gc_default_max' with new max - if (int(dflt_max_rec) == items_scraped): - print('***Reached max data limit, ending scrape process...') - data_exists = False - break - - #2. End of we've been running for too long, arbitrary number stored in max_time_limit - elif (round(time.time() - start_time, 2)>gc_dflt_max_time): - print('***Reached max time limit, ending scrape process...') - data_exists = False - break - - #3. 
End if games do not have any sale data, not interested otherwise - # i) Bespoke to your purpose, update or remove - if (vg_game_info["sales_tot"] == "0.00" or vg_game_info["sales_tot"] == "0.00m"): - print('***No more relvent data available, ending scrape process...') - data_exists = False - break - - #append to a list of data so we can save this row for later - data_list.append(columns) - -print('...Scrape completed') -print() -print('Now writing to file') - -#Use pandas create data frame from our games list -df = pd.DataFrame(data_list) - -#list of columns -df = df[['Rank','Name','Platform','Publisher','Developer','Critic_Score','User_Score','Global_Sales','NA_Sales','EU_Sales','JP_Sales','Other_Sales','Year']] - -del df.index.name - -#write out to file -filename = gc_output_filename+'-' + datetime.datetime.now().strftime("%Y%m%d-%H_%M_%S") + '.'+gc_output_filetype -df.to_csv(filename,sep=",",encoding='utf-8') -print ('Writing scraped data to', filename) - -elapsed_time = time.time() - start_time -print() -print('Filewrite completed') -print() -print('Record Count: '+str(items_scraped)) -print() -print( 'Scraped', items_scraped, 'records over',pages, 'pages in', round(elapsed_time, 2), 'seconds.') \ No newline at end of file diff --git a/untitled b/untitled deleted file mode 100644 index e69de29..0000000 diff --git a/untitled1 b/untitled1 deleted file mode 100644 index e69de29..0000000 diff --git a/vgchartzfull.py b/vgchartzfull.py index 50907e8..097fa59 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -11,7 +11,9 @@ from multiprocessing import Pool # This is a thread-based Pool from requests.exceptions import ConnectionError, Timeout, ProxyError import sys -sys.setrecursionlimit(10000) # need to optimize code. +sys.setrecursionlimit(10000) # need to optimize code. 
+proxy_enabled = False + def parse_games(game_tags): """ @@ -25,7 +27,8 @@ def parse_games(game_tags): for tag in game_tags: game = {} game["Name"] = " ".join(tag.string.split()) - print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name'])) + print(rec_count+1, 'Fetch Data for game', + unidecode.unidecode(game['Name'])) data = tag.parent.parent.find_all("td") if data: @@ -76,24 +79,24 @@ def parse_games(game_tags): def parse_genre_esrb(df): - """ - loads every game's url to get genre and esrb rating - """ - + """loads every game's url to get genre and esrb rating""" headers = {'User-Agent': generate_user_agent( - device_type='desktop', os=('mac', 'linux'))} - - print("'\n'******getting list of proxies and testing them******'\n'") - #proxies = set(requests.get('https://proxy.rudnkh.me/txt').text.split()) - #proxies = get_proxies() - # proxies = test_proxies(proxies) - #proxy = cycle(proxies) - print('******begin scraping for Genre and Rating******') + device_type='desktop', os=('mac', 'linux'))} + proxy = {} + if proxy_enabled: + print("\n******getting list of proxies and testing them******'\n") + # this an api call which returns a list of working proxies that get checked evrey 15 minutes + link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100" + proxies = list(requests.get(link).text.split()) + # return 5 (at max) working proxies for every worker + proxies = test_proxies(np.random.choice(proxies, 30, replace=False), 5) + proxies = cycle(proxies) + proxy = next(proxies) for index, row in df.iterrows(): try: - #game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}) - game_page = requests.get(df.at[index, 'url']) + game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={ + "http": proxy, "https": proxy}, timeout=5) if game_page.status_code == 200: sub_soup = BeautifulSoup(game_page.text, "lxml") # again, the info 
box is inconsistent among games so we @@ -115,50 +118,47 @@ def parse_genre_esrb(df): '_')[1].split('.')[0].upper() # we successfuly got the genre and rating df.loc[index, 'status'] = 1 - print('Successfully scraped genre and rating for :', df.at[index, 'Name']) - #else: - #proxies.remove(proxy) - #proxy = next(proxies) + print('Successfully scraped genre and rating for :', + df.at[index, 'Name']) - except (ConnectionError, Timeout): - print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') + except(ProxyError): + proxy = next(proxies) - #except(ProxyError): - #proxies.remove(proxy) - #proxy = next(proxies) + except (ConnectionError, Timeout): + print('Something went wrong while connecting to', + df.at[index, 'Name'], 'url, will try again later') # wait for 2 seconds between every call, # we do not want to get blocked or abuse the server time.sleep(2) return df + def retry_game(df): """try to scrape the missing data again""" - # global df - # failed_games = len(df['status'] == 0) - # # every worker can have 100 games at max - # NUM_WORKERS = int(np.ceil(failed_games/100)) - # df_subsets = np.array_split(df, NUM_WORKERS) - # if df is None: - # return None - # pool = Pool(processes=NUM_WORKERS) - # #result = pool.map(parse_genre_esrb, df_subsets) - # updated_df = pd.concat(pool.map(parse_genre_esrb, df_subsets)) - # pool.close() - # pool.join() - # return updated_df - return parse_genre_esrb(df) - if __name__ == "__main__": + def process_games(df): + failed_games = len(df[df['status'] == 0]) + NUM_WORKERS = int(np.ceil(failed_games/100)) + 1 + if NUM_WORKERS > 24: + NUM_WORKERS = 24 # trying to keep it to max 24 workers at every run + df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) + pool = Pool(processes=NUM_WORKERS) + results = pool.map(retry_game, df_subsets) + df_updated = pd.concat(results) + df = pd.concat([df[df['status'] == 1], df_updated]) + pool.close() + pool.join() + return df + rec_count = 0 
start_time = time.time() current_time = time.time() csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" - # initialize a panda dataframe to store all games with the following columns: # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer, # publisher, release year, critic score, user score, na sales, pal sales, @@ -171,57 +171,54 @@ def retry_game(df): 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) urlhead = 'http://www.vgchartz.com/games/games.php?page=' - urltail = '&results=10&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' + urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' - - + # get the number of pages page = requests.get('http://www.vgchartz.com/gamedb/').text x = fromstring(page).xpath( "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] - pages = np.ceil(int(x.replace(',', ""))/1000) + pages = int(np.ceil(int(x.replace(',', ""))/1000)) - pages = 3 for page in range(1, pages): # pages = 2 for debugging! 
surl = urlhead + str(page) + urltail r = requests.get(surl).text soup = BeautifulSoup(r, 'lxml') - print('Scraping page:', page) + print("******Scraping page " + str(page) + "******'\n") # vgchartz website is really weird so we have to search for # tags with game urls game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), - # discard the first 10 elements because those - # links are in the navigation bar - soup.find_all("a") - ))[10:] + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:] + # discard the first 10 elements because those + # links are in the navigation bar parse_games(game_tags) + print('\n******begin scraping for Genre and Rating******\n') df = retry_game(df) + failed_games = len(df[df['status'] == 0]) + print("******Finished scraping games, will try to scrape missing data******") + # 36 hours max, should be enough to scrape everything + t_end = start_time + 60 * 60 * 36 + while True: + if failed_games == 0 or time.time() > t_end: + break + df = process_games(df) failed_games = len(df[df['status'] == 0]) - while failed_games > 0 :# or 300 - (time.time() - start_time) % 60 == 300: # change to one day timing does not work for some reason! - # every worker can have 100 games at max - NUM_WORKERS = int(np.ceil(failed_games/100)) - df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) - pool = Pool(processes=NUM_WORKERS) - df_updated = pd.concat(pool.map(retry_game, df_subsets)) - df = pd.concat([df[df['status'] == 1], df_updated]) - pool.close() - pool.join() - failed_games = len(df[df['status'] == 0]) - print('Number of not scraped yet:', failed_games) - time.sleep(30) - + print('Number of not scraped yet:', failed_games, '\n') + time.sleep(60) # wait for 1 minute for the server to recover? 
+ elapsed_time = time.time() - start_time - print("Scraped", rec_count, "games in", round(elapsed_time, 2), "seconds.") + print("Scraped", rec_count, "games in", + round(elapsed_time/60, 2), "minutes.") # select only these columns in the final dataset df = df.sort_index() + df.to_csv('complete-vgchartz', sep=",", encoding='utf-8', index=False) df_final = df[[ 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', 'Publisher', 'Developer', 'Critic_Score', 'User_Score', From 9e206fcde0bca50af282919e82e88c7a4b0ff6e4 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 05:18:17 -0400 Subject: [PATCH 07/18] README Update! --- README.md | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2020882..003d08d 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,17 @@ unidecode user_agent ``` -Thanks to Chris Albon. -http://chrisalbon.com/python/beautiful_soup_scrape_table.html -https://www.kdnuggets.com/2018/02/web-scraping-tutorial-python.html \ No newline at end of file +Thanks to: +- https://www.kdnuggets.com/2018/02/web-scraping-tutorial-python.html +- http://python.omics.wiki/multiprocessing_map/multiprocessing_partial_function_multiple_arguments +- https://medium.com/datadriveninvestor/speed-up-web-scraping-using-multiprocessing-in-python-af434ff310c5 + + +Free proxies: +[1](https://proxyscrape.com/free-proxy-list) +[2](http://multiproxy.org/txt_all/proxy.txt) +[3](https://proxy.rudnkh.me/txt) +[4](https://www.us-proxy.org/) + +- [ ] convert the script to a class or use scrapy, reference +https://edmundmartin.com/multi-threaded-crawler-in-python/ \ No newline at end of file From 79cba812f747eebc414eadc05d18b04bb90cb18b Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 05:21:25 -0400 Subject: [PATCH 08/18] fixed a small bug that made the script runs longer --- vgchartzfull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgchartzfull.py 
b/vgchartzfull.py index 097fa59..804238e 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -198,7 +198,7 @@ def process_games(df): parse_games(game_tags) print('\n******begin scraping for Genre and Rating******\n') - df = retry_game(df) + df = process_games(df) failed_games = len(df[df['status'] == 0]) print("******Finished scraping games, will try to scrape missing data******") From a672625653e6760ad08d3673d383fe2d8ceeb1a5 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 05:26:56 -0400 Subject: [PATCH 09/18] Re-enabled proxies, very FAST! --- README.md | 2 +- vgchartzfull.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 003d08d..a72d099 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ vgchartzfull is a python script with multiprocessing based on BeautifulSoup. -proxies are implemented in the script but it is disabled due to free proxies are unreliable and could results in running the program longer. It can be enabled by changing it to True +proxies are implemented in the script, it can be disabled by changing it to False It creates a dataset based on data from http://www.vgchartz.com/gamedb/ diff --git a/vgchartzfull.py b/vgchartzfull.py index 804238e..5268bdf 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -12,7 +12,7 @@ from requests.exceptions import ConnectionError, Timeout, ProxyError import sys sys.setrecursionlimit(10000) # need to optimize code. -proxy_enabled = False +proxy_enabled = True def parse_games(game_tags): From 43f5e892b11e507e4e1b026816b912b547336c32 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 12:05:50 -0400 Subject: [PATCH 10/18] added couple of additional exceptions and proxies to parse pages. 
Now the crawled data get saved before raising error --- proxies_gen.py | 30 ++++++++++-------- vgchartzfull.py | 83 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 40 deletions(-) diff --git a/proxies_gen.py b/proxies_gen.py index 1228ec0..925478f 100644 --- a/proxies_gen.py +++ b/proxies_gen.py @@ -5,17 +5,21 @@ def get_proxies(num=None): - url = 'https://free-proxy-list.net/' - response = requests.get(url) - parser = fromstring(response.text) - proxies = list(requests.get('https://proxy.rudnkh.me/txt').text.split()) - for i in parser.xpath('//tbody/tr'): - if i.xpath('.//td[7][contains(text(),"yes")]'): - proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]]) - proxies.append(proxy) - print('Found', len(proxies), 'proxies, testing them now') + # url = 'https://free-proxy-list.net/' + # response = requests.get(url) + # parser = fromstring(response.text) + # proxies = list(requests.get('https://proxy.rudnkh.me/txt').text.split()) + # for i in parser.xpath('//tbody/tr'): + # if i.xpath('.//td[7][contains(text(),"yes")]'): + # proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]]) + # proxies.append(proxy) # proxies = list(requests.get( # 'http://multiproxy.org/txt_all/proxy.txt').text.split()) + link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100" + proxies = list(requests.get(link).text.split()) + np.random.shuffle(proxies) + print('Found', len(proxies), 'proxies, testing them now') + if num is None: num = len(proxies) tested = test_proxies(proxies, num) @@ -42,8 +46,8 @@ def test_proxies(proxies, num): return working_proxies -#proxies = get_proxies(5) -# with open('proxies.txt') as f: -# proxies = f.read().splitlines() -# test_proxies(proxies, 10) +# proxies = get_proxies(5) +# # with open('proxies.txt') as f: +# # proxies = f.read().splitlines() +# # test_proxies(proxies, 10) # print(proxies) diff 
--git a/vgchartzfull.py b/vgchartzfull.py index 5268bdf..94e33dd 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -9,7 +9,8 @@ from itertools import cycle from lxml.html import fromstring from multiprocessing import Pool # This is a thread-based Pool -from requests.exceptions import ConnectionError, Timeout, ProxyError +from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException +from urllib3.exceptions import ProtocolError import sys sys.setrecursionlimit(10000) # need to optimize code. proxy_enabled = True @@ -86,11 +87,7 @@ def parse_genre_esrb(df): if proxy_enabled: print("\n******getting list of proxies and testing them******'\n") # this an api call which returns a list of working proxies that get checked evrey 15 minutes - link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100" - proxies = list(requests.get(link).text.split()) - # return 5 (at max) working proxies for every worker - proxies = test_proxies(np.random.choice(proxies, 30, replace=False), 5) - proxies = cycle(proxies) + proxies = cycle(get_proxies(10)) proxy = next(proxies) for index, row in df.iterrows(): @@ -124,10 +121,12 @@ def parse_genre_esrb(df): except(ProxyError): proxy = next(proxies) - except (ConnectionError, Timeout): + except (ConnectionError, Timeout, ProtocolError): print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') + except Exception as e: + print('different error occurred while connecting, will pass') # wait for 2 seconds between every call, # we do not want to get blocked or abuse the server time.sleep(2) @@ -183,34 +182,64 @@ def process_games(df): "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] pages = int(np.ceil(int(x.replace(',', ""))/1000)) - for page in range(1, pages): # pages = 2 for debugging! 
- surl = urlhead + str(page) + urltail - r = requests.get(surl).text - soup = BeautifulSoup(r, 'lxml') - print("******Scraping page " + str(page) + "******'\n") + page = 1 + proxy = get_proxies(1) + while page <= pages: + try: + headers = {'User-Agent': generate_user_agent( + device_type='desktop', os=('mac', 'linux'))} + surl = urlhead + str(page) + urltail + r = requests.get(surl, headers=headers, proxies={ + 'http': proxy, 'https': proxy}, timeout=10) + if r.status_code == 200: + soup = BeautifulSoup(r.text, 'lxml') + print("******Scraping page " + str(page) + "******'\n") + + # vgchartz website is really weird so we have to search for + # tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:] + # discard the first 10 elements because those + # links are in the navigation bar + + parse_games(game_tags) + print('\n******begin scraping for Genre and Rating******\n') + df = process_games(df) + page += 1 - # vgchartz website is really weird so we have to search for - # tags with game urls - game_tags = list(filter( - lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:] - # discard the first 10 elements because those - # links are in the navigation bar + except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError): + print('Something went wrong while connecting to page: ', + page, ', will try again later') + proxy = get_proxies(1) + time.sleep(60) + + except Exception as e: + print("something went wrong! 
We're on page: " + + str(page) + '\nSaving successfully crawled data') + print("Exception: ", e) + df.to_csv('before_crashing_'+csvfilename, sep=",", + encoding='utf-8', index=False) + raise e - parse_games(game_tags) - print('\n******begin scraping for Genre and Rating******\n') - df = process_games(df) failed_games = len(df[df['status'] == 0]) print("******Finished scraping games, will try to scrape missing data******") # 36 hours max, should be enough to scrape everything t_end = start_time + 60 * 60 * 36 while True: - if failed_games == 0 or time.time() > t_end: - break - df = process_games(df) - failed_games = len(df[df['status'] == 0]) - print('Number of not scraped yet:', failed_games, '\n') - time.sleep(60) # wait for 1 minute for the server to recover? + try: + if failed_games == 0 or time.time() > t_end: + break + df = process_games(df) + failed_games = len(df[df['status'] == 0]) + print('Number of not scraped yet:', failed_games, '\n') + time.sleep(60) # wait for 1 minute for the server to recover? + except Exception as e: + print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data') + print("Exception: ", e) + df.to_csv('before_crashing_'+csvfilename, sep=",", + encoding='utf-8', index=False) + raise e elapsed_time = time.time() - start_time print("Scraped", rec_count, "games in", From c0d4e324cc8ebc04b89abf32f8fdbb168beef96e Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 12:12:39 -0400 Subject: [PATCH 11/18] updated readme --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a72d099..d233efe 100644 --- a/README.md +++ b/README.md @@ -29,5 +29,10 @@ Free proxies: [3](https://proxy.rudnkh.me/txt) [4](https://www.us-proxy.org/) +- [*] added multiprocessing for faster results with a maximum of 24 workers. 
+- [*] added proxies to avoid being blocked +- [*] handling couple of exceptions +- [*] scraped data gets saved before raising an unexpected error +- [ ] add the option to continue where we left off due to an unexpected error - [ ] convert the script to a class or use scrapy, reference -https://edmundmartin.com/multi-threaded-crawler-in-python/ \ No newline at end of file + - https://edmundmartin.com/multi-threaded-crawler-in-python/ \ No newline at end of file From 48060d22447503277e6ba94d608d5a23d14079a2 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 12:13:47 -0400 Subject: [PATCH 12/18] updated readme --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index d233efe..fc6e8ff 100644 --- a/README.md +++ b/README.md @@ -29,10 +29,10 @@ Free proxies: [3](https://proxy.rudnkh.me/txt) [4](https://www.us-proxy.org/) -- [*] added multiprocessing for faster results with a maximum of 24 workers. -- [*] added proxies to avoid being blocked -- [*] handling couple of exceptions -- [*] scraped data gets saved before raising an unexpected error +- [x] added multiprocessing for faster results with a maximum of 24 workers. 
+- [x] added proxies to avoid being blocked +- [x] handling couple of exceptions +- [x] scraped data gets saved before raising an unexpected error - [ ] add the option to continue where we left off due to an unexpected error - [ ] convert the script to a class or use scrapy, reference - https://edmundmartin.com/multi-threaded-crawler-in-python/ \ No newline at end of file From 69076d725b7294c15b32a9426234b0c1d8c4e424 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 16:11:26 -0400 Subject: [PATCH 13/18] added option to continue after error --- README.md | 2 +- proxies_gen.py | 6 ++++-- vgchartzfull.py | 29 ++++++++++++++++++++--------- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index fc6e8ff..eea6706 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,6 @@ Free proxies: - [x] added proxies to avoid being blocked - [x] handling couple of exceptions - [x] scraped data gets saved before raising an unexpected error -- [ ] add the option to continue where we left off due to an unexpected error +- [x] add the option to continue where we left off due to an unexpected error - [ ] convert the script to a class or use scrapy, reference - https://edmundmartin.com/multi-threaded-crawler-in-python/ \ No newline at end of file diff --git a/proxies_gen.py b/proxies_gen.py index 925478f..8ce26e1 100644 --- a/proxies_gen.py +++ b/proxies_gen.py @@ -13,11 +13,13 @@ def get_proxies(num=None): # if i.xpath('.//td[7][contains(text(),"yes")]'): # proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]]) # proxies.append(proxy) - # proxies = list(requests.get( - # 'http://multiproxy.org/txt_all/proxy.txt').text.split()) + link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100" proxies = list(requests.get(link).text.split()) np.random.shuffle(proxies) + if len(proxies) == 0: + proxies = list(requests.get( + 
'http://multiproxy.org/txt_all/proxy.txt').text.split()) print('Found', len(proxies), 'proxies, testing them now') if num is None: diff --git a/vgchartzfull.py b/vgchartzfull.py index 94e33dd..bec5ad6 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -12,6 +12,7 @@ from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException from urllib3.exceptions import ProtocolError import sys +import os sys.setrecursionlimit(10000) # need to optimize code. proxy_enabled = True @@ -147,8 +148,9 @@ def process_games(df): df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) pool = Pool(processes=NUM_WORKERS) results = pool.map(retry_game, df_subsets) - df_updated = pd.concat(results) - df = pd.concat([df[df['status'] == 1], df_updated]) + if None not in results: + df_updated = pd.concat(results) + df = pd.concat([df[df['status'] == 1], df_updated]) pool.close() pool.join() return df @@ -156,18 +158,27 @@ def process_games(df): rec_count = 0 start_time = time.time() current_time = time.time() - csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" + crashed_tag = 'before_crashing_' + exists = [s for s in os.listdir() if crashed_tag in s] + if exists: + csvfilename = exists[0].replace(crashed_tag, '') + df = pd.read_csv(exists[0]) + page = int(len(df)/1000) + df = process_games(df) + else: + csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" # initialize a panda dataframe to store all games with the following columns: # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer, # publisher, release year, critic score, user score, na sales, pal sales, # jp sales, other sales, total sales, total shipped, last update, url, status # last two columns for debugging - df = pd.DataFrame(columns=[ - 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', - 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score', - 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 
'JP_Sales', - 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) + if not exists: + df = pd.DataFrame(columns=[ + 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', + 'Developer', 'VGChartz_Score', 'Critic_Score', 'User_Score', + 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', + 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) urlhead = 'http://www.vgchartz.com/games/games.php?page=' urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' @@ -182,7 +193,7 @@ def process_games(df): "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] pages = int(np.ceil(int(x.replace(',', ""))/1000)) - page = 1 + if not exists: page = 1 proxy = get_proxies(1) while page <= pages: try: From 1b553afbd893415248cf1a8b5e662eb7d67ec4dd Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Wed, 10 Apr 2019 16:28:40 -0400 Subject: [PATCH 14/18] good that I caught it before running the script --- vgchartzfull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgchartzfull.py b/vgchartzfull.py index bec5ad6..f3e23ca 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -163,7 +163,7 @@ def process_games(df): if exists: csvfilename = exists[0].replace(crashed_tag, '') df = pd.read_csv(exists[0]) - page = int(len(df)/1000) + page = int(len(df)/1000) + 1 # because we already scraped current page df = process_games(df) else: csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" From 7c20f3348f1c37144863af5d55bdcb92d373517d Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Thu, 11 Apr 2019 12:21:24 -0400 Subject: [PATCH 15/18] few updates and bug fixes --- README.md | 2 ++ proxies_gen.py | 6 ++++-- vgchartzfull.py | 36 +++++++++++++++++------------------- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index eea6706..e48f286 100644 --- a/README.md +++ b/README.md @@ -35,4 +35,6 @@ Free proxies: - [x] 
scraped data gets saved before raising an unexpected error - [x] add the option to continue where we left off due to an unexpected error - [ ] convert the script to a class or use scrapy, reference +- [ ] optimize it +- [ ] create a log file - https://edmundmartin.com/multi-threaded-crawler-in-python/ \ No newline at end of file diff --git a/proxies_gen.py b/proxies_gen.py index 8ce26e1..020eb15 100644 --- a/proxies_gen.py +++ b/proxies_gen.py @@ -17,9 +17,11 @@ def get_proxies(num=None): link = "https://api.proxyscrape.com/?request=getproxies&proxytype=http&timeout=1000&country=all&ssl=all&anonymity=all&uptime=100" proxies = list(requests.get(link).text.split()) np.random.shuffle(proxies) + proxies = [] if len(proxies) == 0: proxies = list(requests.get( - 'http://multiproxy.org/txt_all/proxy.txt').text.split()) + link[:-3]+'99').text.split()) # change uptime to 99 + np.random.shuffle(proxies) print('Found', len(proxies), 'proxies, testing them now') if num is None: @@ -48,7 +50,7 @@ def test_proxies(proxies, num): return working_proxies -# proxies = get_proxies(5) +proxies = get_proxies(5) # # with open('proxies.txt') as f: # # proxies = f.read().splitlines() # # test_proxies(proxies, 10) diff --git a/vgchartzfull.py b/vgchartzfull.py index f3e23ca..b816f46 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -29,8 +29,7 @@ def parse_games(game_tags): for tag in game_tags: game = {} game["Name"] = " ".join(tag.string.split()) - print(rec_count+1, 'Fetch Data for game', - unidecode.unidecode(game['Name'])) + print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name'])) data = tag.parent.parent.find_all("td") if data: @@ -71,7 +70,7 @@ def parse_games(game_tags): year_to_add = np.int32("19" + year) else: year_to_add = np.int32("20" + year) - game["Year"] = year_to_add + game["Year"] = year_to_add game["Last_Update"] = data[16].get_text().strip() game['Genre'] = 'N/A' game['ESRB_Rating'] = 'N/A' @@ -93,8 +92,7 @@ def parse_genre_esrb(df): for index, 
row in df.iterrows(): try: - game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={ - "http": proxy, "https": proxy}, timeout=5) + game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}, timeout=5) if game_page.status_code == 200: sub_soup = BeautifulSoup(game_page.text, "lxml") # again, the info box is inconsistent among games so we @@ -116,15 +114,13 @@ def parse_genre_esrb(df): '_')[1].split('.')[0].upper() # we successfuly got the genre and rating df.loc[index, 'status'] = 1 - print('Successfully scraped genre and rating for :', - df.at[index, 'Name']) + print('Successfully scraped genre and rating for :', df.at[index, 'Name']) except(ProxyError): proxy = next(proxies) except (ConnectionError, Timeout, ProtocolError): - print('Something went wrong while connecting to', - df.at[index, 'Name'], 'url, will try again later') + print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') except Exception as e: print('different error occurred while connecting, will pass') @@ -148,9 +144,11 @@ def process_games(df): df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) pool = Pool(processes=NUM_WORKERS) results = pool.map(retry_game, df_subsets) - if None not in results: + try: df_updated = pd.concat(results) df = pd.concat([df[df['status'] == 1], df_updated]) + except: + print('error occurred while joining dataframe') pool.close() pool.join() return df @@ -163,7 +161,8 @@ def process_games(df): if exists: csvfilename = exists[0].replace(crashed_tag, '') df = pd.read_csv(exists[0]) - page = int(len(df)/1000) + 1 # because we already scraped current page + rec_count = len(df) + page = int(rec_count/1000) + 1 # because we already scraped current df = process_games(df) else: csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" @@ -188,15 +187,15 @@ def process_games(df): urltail += 
'&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' # get the number of pages - page = requests.get('http://www.vgchartz.com/gamedb/').text - x = fromstring(page).xpath( + vglink = requests.get('http://www.vgchartz.com/gamedb/').text + x = fromstring(vglink).xpath( "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] pages = int(np.ceil(int(x.replace(',', ""))/1000)) if not exists: page = 1 - proxy = get_proxies(1) while page <= pages: try: + proxy = get_proxies(1)[0] headers = {'User-Agent': generate_user_agent( device_type='desktop', os=('mac', 'linux'))} surl = urlhead + str(page) + urltail @@ -221,14 +220,14 @@ def process_games(df): except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError): print('Something went wrong while connecting to page: ', page, ', will try again later') - proxy = get_proxies(1) + #proxy = get_proxies(1) time.sleep(60) except Exception as e: print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data') print("Exception: ", e) - df.to_csv('before_crashing_'+csvfilename, sep=",", + df.to_csv(crashed_tag + csvfilename, sep=",", encoding='utf-8', index=False) raise e @@ -248,13 +247,12 @@ def process_games(df): except Exception as e: print("something went wrong! 
We're on page: " + str(page) + '\nSaving successfully crawled data') print("Exception: ", e) - df.to_csv('before_crashing_'+csvfilename, sep=",", + df.to_csv(crashed_tag + csvfilename, sep=",", encoding='utf-8', index=False) raise e elapsed_time = time.time() - start_time - print("Scraped", rec_count, "games in", - round(elapsed_time/60, 2), "minutes.") + print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.") # select only these columns in the final dataset df = df.sort_index() From 67819a531f60d30a422150962990f7a7e4d56d69 Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Thu, 11 Apr 2019 18:16:31 -0400 Subject: [PATCH 16/18] changed how to calculate num_workers, and minor updates --- README.md | 3 ++- proxies_gen.py | 19 +++++++++++-------- vgchartzfull.py | 23 +++++++++++++---------- 3 files changed, 26 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index e48f286..c7d9d6f 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ Free proxies: - [x] handling couple of exceptions - [x] scraped data gets saved before raising an unexpected error - [x] add the option to continue where we left off due to an unexpected error -- [ ] convert the script to a class or use scrapy, reference +- [x] clean version removes the print statements, should results in better performance! 
- [ ] optimize it - [ ] create a log file +- [ ] convert the script to a class or use scrapy, reference - https://edmundmartin.com/multi-threaded-crawler-in-python/ \ No newline at end of file diff --git a/proxies_gen.py b/proxies_gen.py index 020eb15..6c5a54e 100644 --- a/proxies_gen.py +++ b/proxies_gen.py @@ -20,9 +20,9 @@ def get_proxies(num=None): proxies = [] if len(proxies) == 0: proxies = list(requests.get( - link[:-3]+'99').text.split()) # change uptime to 99 + link[:-3]+'99').text.split()) # change uptime to 99 np.random.shuffle(proxies) - print('Found', len(proxies), 'proxies, testing them now') + # print('Found', len(proxies), 'proxies, testing them now') if num is None: num = len(proxies) @@ -35,22 +35,25 @@ def test_proxies(proxies, num): proxy_pool = cycle(proxies) working_proxies = [] for i in range(1, len(proxies)): - if num == 0: break + if num == 0: + break # Get a proxy from the pool proxy = next(proxy_pool) - print("Request #%d" % i) + # print("Request #%d" % i) try: - response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout = 1) - print(response.json()) + response = requests.get( + url, proxies={"http": proxy, "https": proxy}, timeout=1) + # print(response.json()) working_proxies.append(proxy) num -= 1 except: # Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. - print("Skipping. Connnection error") + # print("Skipping. 
Connnection error") + pass return working_proxies -proxies = get_proxies(5) +# proxies = get_proxies(5) # # with open('proxies.txt') as f: # # proxies = f.read().splitlines() # # test_proxies(proxies, 10) diff --git a/vgchartzfull.py b/vgchartzfull.py index b816f46..07475ca 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -8,7 +8,7 @@ from proxies_gen import get_proxies, test_proxies from itertools import cycle from lxml.html import fromstring -from multiprocessing import Pool # This is a thread-based Pool +from multiprocessing import Pool, cpu_count # This is a thread-based Pool from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException from urllib3.exceptions import ProtocolError import sys @@ -87,7 +87,7 @@ def parse_genre_esrb(df): if proxy_enabled: print("\n******getting list of proxies and testing them******'\n") # this an api call which returns a list of working proxies that get checked evrey 15 minutes - proxies = cycle(get_proxies(10)) + proxies = cycle(get_proxies(5)) proxy = next(proxies) for index, row in df.iterrows(): @@ -138,10 +138,11 @@ def retry_game(df): if __name__ == "__main__": def process_games(df): failed_games = len(df[df['status'] == 0]) - NUM_WORKERS = int(np.ceil(failed_games/100)) + 1 - if NUM_WORKERS > 24: - NUM_WORKERS = 24 # trying to keep it to max 24 workers at every run + NUM_WORKERS = cpu_count() * 2 df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) + #update num_workers + df_subsets = [i for i in df_subsets if len(i) != 0] + NUM_WORKERS = len(df_subsets) # we don't want to have a worker for empty subsets pool = Pool(processes=NUM_WORKERS) results = pool.map(retry_game, df_subsets) try: @@ -190,10 +191,12 @@ def process_games(df): vglink = requests.get('http://www.vgchartz.com/gamedb/').text x = fromstring(vglink).xpath( "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] - pages = int(np.ceil(int(x.replace(',', ""))/1000)) + pages = int(x.split(',')[0]) if not exists: 
page = 1 - while page <= pages: + while True: + if page > pages: + break try: proxy = get_proxies(1)[0] headers = {'User-Agent': generate_user_agent( @@ -238,10 +241,10 @@ def process_games(df): t_end = start_time + 60 * 60 * 36 while True: try: - if failed_games == 0 or time.time() > t_end: - break df = process_games(df) failed_games = len(df[df['status'] == 0]) + if failed_games == 0 or time.time() > t_end: + break print('Number of not scraped yet:', failed_games, '\n') time.sleep(60) # wait for 1 minute for the server to recover? except Exception as e: @@ -256,7 +259,7 @@ def process_games(df): # select only these columns in the final dataset df = df.sort_index() - df.to_csv('complete-vgchartz', sep=",", encoding='utf-8', index=False) + df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False) df_final = df[[ 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', 'Publisher', 'Developer', 'Critic_Score', 'User_Score', From b7c12e429b102ce0c3dc7efa96cda6be78c14efb Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Thu, 11 Apr 2019 21:40:30 -0400 Subject: [PATCH 17/18] added a cleaner version without the print statements --- clean-vgchartzfull.py | 269 ++++++++++++++++++++++++++++++++++++++++++ vgchartzfull.py | 2 +- 2 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 clean-vgchartzfull.py diff --git a/clean-vgchartzfull.py b/clean-vgchartzfull.py new file mode 100644 index 0000000..7715060 --- /dev/null +++ b/clean-vgchartzfull.py @@ -0,0 +1,269 @@ +from bs4 import BeautifulSoup, element +import pandas as pd +import numpy as np +import requests +import time +import unidecode +from user_agent import generate_user_agent +from proxies_gen import get_proxies, test_proxies +from itertools import cycle +from lxml.html import fromstring +from multiprocessing import Pool, cpu_count # This is a thread-based Pool +from requests.exceptions import ConnectionError, Timeout, ProxyError, RequestException +from urllib3.exceptions 
import ProtocolError +import sys +import os +sys.setrecursionlimit(10000) # need to optimize code. +proxy_enabled = True + + +def parse_games(game_tags): + """ + parse the games table on current page + parameters: + game_tags: games tags after reading the html page + df: the dataframe where we will store the games + """ + global rec_count + global df + for tag in game_tags: + game = {} + game["Name"] = " ".join(tag.string.split()) + print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name'])) + + data = tag.parent.parent.find_all("td") + if data: + game["Rank"] = np.int32(data[0].string) + game["img_url"] = data[1].a.img.get('src') + game["url"] = data[2].a.get('href') + if len(game["Name"].split("/")) > 1: + # replace accented chars with ascii + game["basename"] = unidecode.unidecode( + game['Name'].strip().split('/')[0].strip().replace(' ', '-')) + else: + game["basename"] = game["url"].rsplit('/', 2)[1] + game["Platform"] = data[3].img.get('alt') + game["Publisher"] = data[4].get_text().strip() + game["Developer"] = data[5].get_text().strip() + game["Vgchartzscore"] = data[6].get_text().strip() + game["Critic_Score"] = float( + data[7].string) if not data[7].string.startswith("N/A") else np.nan + game["User_Score"] = float( + data[8].string) if not data[8].string.startswith("N/A") else np.nan + game["Total_Shipped"] = float( + data[9].string[:-1]) if not data[9].string.startswith("N/A") else np.nan + game["Global_Sales"] = float( + data[10].string[:-1]) if not data[10].string.startswith("N/A") else np.nan + game["NA_Sales"] = float( + data[11].string[:-1]) if not data[11].string.startswith("N/A") else np.nan + game["PAL_Sales"] = float( + data[12].string[:-1]) if not data[12].string.startswith("N/A") else np.nan + game["JP_Sales"] = float( + data[13].string[:-1]) if not data[13].string.startswith("N/A") else np.nan + game["Other_Sales"] = float( + data[14].string[:-1]) if not data[14].string.startswith("N/A") else np.nan + year = 
data[15].string.split()[-1] + if year.startswith('N/A'): + game["Year"] = 'N/A' + else: + if int(year) >= 80: + year_to_add = np.int32("19" + year) + else: + year_to_add = np.int32("20" + year) + game["Year"] = year_to_add + game["Last_Update"] = data[16].get_text().strip() + game['Genre'] = 'N/A' + game['ESRB_Rating'] = 'N/A' + game['status'] = 0 + df = df.append(game, ignore_index=True) + rec_count += 1 + + +def parse_genre_esrb(df): + """loads every game's url to get genre and esrb rating""" + headers = {'User-Agent': generate_user_agent( + device_type='desktop', os=('mac', 'linux'))} + proxy = {} + if proxy_enabled: + #print("\n******getting list of proxies and testing them******'\n") + # this an api call which returns a list of working proxies that get checked evrey 15 minutes + proxies = cycle(get_proxies(5)) + proxy = next(proxies) + + for index, row in df.iterrows(): + try: + game_page = requests.get(df.at[index, 'url'], headers=headers, proxies={"http": proxy, "https": proxy}, timeout=5) + if game_page.status_code == 200: + sub_soup = BeautifulSoup(game_page.text, "lxml") + # again, the info box is inconsistent among games so we + # have to find all the h2 and traverse from that to the genre + gamebox = sub_soup.find("div", {"id": "gameGenInfoBox"}) + h2s = gamebox.find_all('h2') + # make a temporary tag here to search for the one that contains + # the word "Genre" + temp_tag = element.Tag + for h2 in h2s: + if h2.string == 'Genre': + temp_tag = h2 + df.loc[index, 'Genre'] = temp_tag.next_sibling.string + + # find the ESRB rating + game_rating = gamebox.find('img').get('src') + if 'esrb' in game_rating: + df.loc[index, 'ESRB_Rating'] = game_rating.split( + '_')[1].split('.')[0].upper() + # we successfuly got the genre and rating + df.loc[index, 'status'] = 1 + #print('Successfully scraped genre and rating for :', df.at[index, 'Name']) + + except(ProxyError): + proxy = next(proxies) + + except (ConnectionError, Timeout, ProtocolError): + print('Something 
went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') + + except Exception as e: + print('different error occurred while connecting, will pass') + # wait for 2 seconds between every call, + # we do not want to get blocked or abuse the server + time.sleep(2) + return df + + +def retry_game(df): + """try to scrape the missing data again""" + return parse_genre_esrb(df) + + +if __name__ == "__main__": + def process_games(df): + failed_games = len(df[df['status'] == 0]) + NUM_WORKERS = cpu_count() * 2 + df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) + #update num_workers + df_subsets = [i for i in df_subsets if len(i) != 0] + NUM_WORKERS = len(df_subsets) # we don't want to have a worker for empty subsets + pool = Pool(processes=NUM_WORKERS) + results = pool.map(retry_game, df_subsets) + try: + df_updated = pd.concat(results) + df = pd.concat([df[df['status'] == 1], df_updated]) + except: + print('error occurred while joining dataframe') + pool.close() + pool.join() + return df + + rec_count = 0 + start_time = time.time() + current_time = time.time() + crashed_tag = 'before_crashing_' + exists = [s for s in os.listdir() if crashed_tag in s] + if exists: + csvfilename = exists[0].replace(crashed_tag, '') + df = pd.read_csv(exists[0]) + rec_count = len(df) + page = int(rec_count/1000) + 1 # because we already scraped current + df = process_games(df) + else: + csvfilename = "vgsales-" + time.strftime("%Y-%m-%d_%H_%M_%S") + ".csv" + + # initialize a panda dataframe to store all games with the following columns: + # rank, name, img-url, vgchartz score, genre, ESRB rating, platform, developer, + # publisher, release year, critic score, user score, na sales, pal sales, + # jp sales, other sales, total sales, total shipped, last update, url, status + # last two columns for debugging + if not exists: + df = pd.DataFrame(columns=[ + 'Rank', 'Name', 'basename', 'Genre', 'ESRB_Rating', 'Platform', 'Publisher', + 'Developer', 
'VGChartz_Score', 'Critic_Score', 'User_Score', + 'Total_Shipped', 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', + 'Other_Sales', 'Year', 'Last_Update', 'url', 'status']) + + urlhead = 'http://www.vgchartz.com/games/games.php?page=' + urltail = '&results=1000&name=&console=&keyword=&publisher=&genre=&order=Sales&ownership=Both' + urltail += '&banner=Both&showdeleted=®ion=All&goty_year=&developer=' + urltail += '&direction=DESC&showtotalsales=1&shownasales=1&showpalsales=1&showjapansales=1' + urltail += '&showothersales=1&showpublisher=1&showdeveloper=1&showreleasedate=1&showlastupdate=1' + urltail += '&showvgchartzscore=1&showcriticscore=1&showuserscore=1&showshipped=1&alphasort=&showmultiplat=Yes&showgenre=1' + + # get the number of pages + vglink = requests.get('http://www.vgchartz.com/gamedb/').text + x = fromstring(vglink).xpath( + "//th[@colspan='3']/text()")[0].split('(', 1)[1].split(')')[0] + pages = int(x.split(',')[0]) + + if not exists: page = 1 + while True: + if page > pages: + break + try: + proxy = get_proxies(1)[0] + headers = {'User-Agent': generate_user_agent( + device_type='desktop', os=('mac', 'linux'))} + surl = urlhead + str(page) + urltail + r = requests.get(surl, headers=headers, proxies={ + 'http': proxy, 'https': proxy}, timeout=10) + if r.status_code == 200: + soup = BeautifulSoup(r.text, 'lxml') + #print("******Scraping page " + str(page) + "******'\n") + + # vgchartz website is really weird so we have to search for + # tags with game urls + game_tags = list(filter( + lambda x: x.attrs['href'].startswith('http://www.vgchartz.com/game/'), soup.find_all("a")))[10:] + # discard the first 10 elements because those + # links are in the navigation bar + + parse_games(game_tags) + #print('\n******begin scraping for Genre and Rating******\n') + df = process_games(df) + page += 1 + + except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError): + print('Something went wrong while connecting to page: ', + page, ', will try 
again later') + #proxy = get_proxies(1) + time.sleep(60) + + except Exception as e: + print("something went wrong! We're on page: " + + str(page) + '\nSaving successfully crawled data') + print("Exception: ", e) + df.to_csv(crashed_tag + csvfilename, sep=",", + encoding='utf-8', index=False) + raise e + + + failed_games = len(df[df['status'] == 0]) + print("******Finished scraping games, will try to scrape missing data******") + # 36 hours max, should be enough to scrape everything + t_end = start_time + 60 * 60 * 36 + while True: + try: + df = process_games(df) + failed_games = len(df[df['status'] == 0]) + if failed_games == 0 or time.time() > t_end: + break + #print('Number of not scraped yet:', failed_games, '\n') + time.sleep(60) # wait for 1 minute for the server to recover? + except Exception as e: + print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data') + print("Exception: ", e) + df.to_csv(crashed_tag + csvfilename, sep=",", + encoding='utf-8', index=False) + raise e + + elapsed_time = time.time() - start_time + print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.") + + # select only these columns in the final dataset + df = df.sort_values(by=['Rank']) + df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False) + df_final = df[[ + 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', + 'Publisher', 'Developer', 'Critic_Score', 'User_Score', + 'Global_Sales', 'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales']] + + df_final.to_csv(csvfilename, sep=",", encoding='utf-8', index=False) + print("Wrote scraper data to", csvfilename) diff --git a/vgchartzfull.py b/vgchartzfull.py index 07475ca..e1f6200 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -258,7 +258,7 @@ def process_games(df): print("Scraped", rec_count, "games in", round(elapsed_time/60, 2), "minutes.") # select only these columns in the final dataset - df = df.sort_index() + df = 
df.sort_values(by=['Rank']) df.to_csv('complete-vgchartz.csv', sep=",", encoding='utf-8', index=False) df_final = df[[ 'Rank', 'Name', 'Platform', 'Year', 'Genre', 'ESRB_Rating', From debad0f3b0cdf1bfa9dcf39e47e19bc7d442d98b Mon Sep 17 00:00:00 2001 From: Abdulshaheed Alqunber Date: Sat, 13 Apr 2019 02:13:09 -0400 Subject: [PATCH 18/18] bug fixes and few changes --- clean-vgchartzfull.py | 52 +++++++++++++++++++++++-------------------- vgchartzfull.py | 13 ++++++----- 2 files changed, 35 insertions(+), 30 deletions(-) diff --git a/clean-vgchartzfull.py b/clean-vgchartzfull.py index 7715060..39cf3ed 100644 --- a/clean-vgchartzfull.py +++ b/clean-vgchartzfull.py @@ -29,7 +29,7 @@ def parse_games(game_tags): for tag in game_tags: game = {} game["Name"] = " ".join(tag.string.split()) - print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name'])) + #print(rec_count+1, 'Fetch Data for game', unidecode.unidecode(game['Name'])) data = tag.parent.parent.find_all("td") if data: @@ -66,7 +66,7 @@ def parse_games(game_tags): if year.startswith('N/A'): game["Year"] = 'N/A' else: - if int(year) >= 80: + if int(year) >= 70: year_to_add = np.int32("19" + year) else: year_to_add = np.int32("20" + year) @@ -119,14 +119,16 @@ def parse_genre_esrb(df): except(ProxyError): proxy = next(proxies) - except (ConnectionError, Timeout, ProtocolError): - print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') + except (ConnectionError, Timeout, ProtocolError, TimeoutError): + #print('Something went wrong while connecting to', df.at[index, 'Name'], 'url, will try again later') + continue except Exception as e: - print('different error occurred while connecting, will pass') - # wait for 2 seconds between every call, + #print('different error occurred while connecting, will pass') + continue + # wait for 1 seconds between every call, # we do not want to get blocked or abuse the server - time.sleep(2) + time.sleep(1) return df @@ 
-142,16 +144,17 @@ def process_games(df): df_subsets = np.array_split(df[df['status'] == 0], NUM_WORKERS) #update num_workers df_subsets = [i for i in df_subsets if len(i) != 0] - NUM_WORKERS = len(df_subsets) # we don't want to have a worker for empty subsets - pool = Pool(processes=NUM_WORKERS) - results = pool.map(retry_game, df_subsets) - try: - df_updated = pd.concat(results) - df = pd.concat([df[df['status'] == 1], df_updated]) - except: - print('error occurred while joining dataframe') - pool.close() - pool.join() + if len(df_subsets) != 0: + NUM_WORKERS = len(df_subsets)# we don't want to have a worker for empty subsets + pool = Pool(processes=NUM_WORKERS) + results = pool.map(retry_game, df_subsets) + try: + df_updated = pd.concat(results) + df = pd.concat([df[df['status'] == 1], df_updated]) + except: + print('error occurred while joining dataframe') + pool.close() + pool.join() return df rec_count = 0 @@ -160,9 +163,10 @@ def process_games(df): crashed_tag = 'before_crashing_' exists = [s for s in os.listdir() if crashed_tag in s] if exists: + print("found a data saved from a crash, will continue on it") csvfilename = exists[0].replace(crashed_tag, '') df = pd.read_csv(exists[0]) - rec_count = len(df) + rec_count = df['Rank'].max() page = int(rec_count/1000) + 1 # because we already scraped current df = process_games(df) else: @@ -206,7 +210,7 @@ def process_games(df): 'http': proxy, 'https': proxy}, timeout=10) if r.status_code == 200: soup = BeautifulSoup(r.text, 'lxml') - #print("******Scraping page " + str(page) + "******'\n") + print("******Scraping page " + str(page) + "******'\n") # vgchartz website is really weird so we have to search for # tags with game urls @@ -216,15 +220,15 @@ def process_games(df): # links are in the navigation bar parse_games(game_tags) - #print('\n******begin scraping for Genre and Rating******\n') - df = process_games(df) page += 1 + print('\n******begin scraping for Genre and Rating******\n') + df = process_games(df) - 
except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError): + except (ConnectionError, Timeout, ProxyError, RequestException, ProtocolError, TimeoutError): print('Something went wrong while connecting to page: ', page, ', will try again later') #proxy = get_proxies(1) - time.sleep(60) + time.sleep(10) except Exception as e: print("something went wrong! We're on page: " + @@ -246,7 +250,7 @@ def process_games(df): if failed_games == 0 or time.time() > t_end: break #print('Number of not scraped yet:', failed_games, '\n') - time.sleep(60) # wait for 1 minute for the server to recover? + time.sleep(10) # wait for 10 seconds for the server to recover? except Exception as e: print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data') print("Exception: ", e) diff --git a/vgchartzfull.py b/vgchartzfull.py index e1f6200..7938e72 100644 --- a/vgchartzfull.py +++ b/vgchartzfull.py @@ -66,7 +66,7 @@ def parse_games(game_tags): if year.startswith('N/A'): game["Year"] = 'N/A' else: - if int(year) >= 80: + if int(year) >= 70: year_to_add = np.int32("19" + year) else: year_to_add = np.int32("20" + year) @@ -124,9 +124,9 @@ def parse_genre_esrb(df): except Exception as e: print('different error occurred while connecting, will pass') - # wait for 2 seconds between every call, + # wait for 1 seconds between every call, # we do not want to get blocked or abuse the server - time.sleep(2) + time.sleep(1) return df @@ -160,9 +160,10 @@ def process_games(df): crashed_tag = 'before_crashing_' exists = [s for s in os.listdir() if crashed_tag in s] if exists: + print("found a data saved from a crash, will continue on it") csvfilename = exists[0].replace(crashed_tag, '') df = pd.read_csv(exists[0]) - rec_count = len(df) + rec_count = df['Rank'].max() page = int(rec_count/1000) + 1 # because we already scraped current df = process_games(df) else: @@ -224,7 +225,7 @@ def process_games(df): print('Something went wrong while connecting to 
page: ', page, ', will try again later') #proxy = get_proxies(1) - time.sleep(60) + time.sleep(10) except Exception as e: print("something went wrong! We're on page: " + @@ -246,7 +247,7 @@ def process_games(df): if failed_games == 0 or time.time() > t_end: break print('Number of not scraped yet:', failed_games, '\n') - time.sleep(60) # wait for 1 minute for the server to recover? + time.sleep(10) # wait for 10 seconds for the server to recover? except Exception as e: print("something went wrong! We're on page: " + str(page) + '\nSaving successfully crawled data') print("Exception: ", e)