From 4cd57e93287f76aac86b6302e5388d03f46b70a0 Mon Sep 17 00:00:00 2001
From: 21Vijeth
Date: Sat, 22 Jun 2024 16:29:48 +0530
Subject: [PATCH] Add Webscraper

---
Review notes (git am ignores the text between the three-dash line above and
the first file diff below):
 * Line structure and indentation were restored. The whitespace of removed
   lines is reconstructed by convention and may require
   "git apply --ignore-whitespace" if it does not match the old files.
 * scraper.py was revised: HTTP requests now have a timeout, the fixed
   10 second sleep became a bounded connection retry, and the database
   connection is always closed. Its blob "index" line was dropped and the
   diffstat and hunk header were recounted to match.

 Dockerfile             |  34 ++++++------
 docker-compose.yaml    |  26 ---------
 docker-compose.yml     |  35 ++++++++++++
 requirements.txt       |   6 +--
 scraper.py             | 123 ++++++++++++++++++++++++++++++++++++++++++
 web_scraping_sample.py |  31 -----------
 6 files changed, 177 insertions(+), 78 deletions(-)
 delete mode 100644 docker-compose.yaml
 create mode 100644 docker-compose.yml
 create mode 100644 scraper.py
 delete mode 100644 web_scraping_sample.py

diff --git a/Dockerfile b/Dockerfile
index f07ae2b..ef7054f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,17 @@
-FROM python:3.10.2-alpine3.15
-COPY . .
-# Install Postgres
-RUN apk update
-RUN apk add postgresql
-RUN chown postgres:postgres /run/postgresql/
-# Install requirements
-COPY ./requirements.txt /tmp
-RUN pip install -r /tmp/requirements.txt
-# For psycopg2
-RUN apk add --virtual postgresql-deps libpq-dev
-# Create directories
-RUN mkdir -p /root/workspace/src
-# Mount your local file
-COPY ./web_scraping_sample.py /root/workspace/src
-# Switch to project directory
-WORKDIR /root/workspace/src
\ No newline at end of file
+# Use the official Python image
+FROM python:3.9-slim
+
+# Set the working directory
+WORKDIR /app
+
+# Copy the requirements file
+COPY requirements.txt .
+
+# Install the required Python packages
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the Python script
+COPY scraper.py .
+
+# Run the Python script
+CMD ["python", "scraper.py"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
deleted file mode 100644
index cad1491..0000000
--- a/docker-compose.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-version: "3"
-services:
-  pyhton_service:
-    build:
-      context: ./
-      dockerfile: Dockerfile
-    image: workshop1
-    container_name: workshop_python_container
-    stdin_open: true # docker attach container_id
-    tty: true
-    ports:
-      - "8000:8000"
-    volumes:
-      - .:/app
-    depends_on:
-      - postgres_service
-
-  postgres_service:
-    image: postgres
-    container_name: workshop_postgres_container
-    ports:
-      - "5432:5432"
-    environment:
-      POSTGRES_PASSWORD: admin
-    volumes:
-      - .:/var/lib/postgres
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..f4d3c12
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,35 @@
+version: '3.8'
+
+services:
+  db:
+    image: postgres:13
+    container_name: postgres_container
+    environment:
+      POSTGRES_DB: blogdata
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: password
+    networks:
+      - blog_net
+    volumes:
+      - pgdata:/var/lib/postgresql/data
+
+  scraper:
+    build: .
+    container_name: scraper_container
+    environment:
+      DB_NAME: blogdata
+      DB_USER: postgres
+      DB_PASSWORD: password
+      DB_HOST: db
+      DB_PORT: 5432
+    depends_on:
+      - db
+    networks:
+      - blog_net
+
+networks:
+  blog_net:
+    external: true
+
+volumes:
+  pgdata:
diff --git a/requirements.txt b/requirements.txt
index 34c449f..b897dd6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-psycopg2==2.9.3
-bs4
-urllib2
 requests
-html5lib==1.1
\ No newline at end of file
+beautifulsoup4
+psycopg2-binary
diff --git a/scraper.py b/scraper.py
new file mode 100644
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,123 @@
+"""Scrape every post from blog.python.org and store the results in PostgreSQL."""
+import os
+import time
+from urllib.parse import urljoin
+
+import psycopg2
+import requests
+from bs4 import BeautifulSoup
+
+BASE_URL = "https://blog.python.org/"
+REQUEST_TIMEOUT = 30  # seconds; requests.get() never times out by default
+DB_CONNECT_ATTEMPTS = 10
+DB_CONNECT_DELAY = 3  # seconds to wait between connection attempts
+
+
+def scrape_blog():
+    """Return a list of post dicts collected by following the blog's pager.
+
+    Each dict has the keys 'title', 'date', 'author' and 'content'. Posts
+    lacking a title, date or body are skipped; a missing author becomes
+    'Unknown'. Crawling stops at the first failed request, and the posts
+    gathered up to that point are still returned.
+    """
+    current_url = BASE_URL
+    visited = set()
+    all_posts = []
+
+    # The visited set stops a pager that links back to an earlier page.
+    while current_url not in visited:
+        visited.add(current_url)
+        try:
+            response = requests.get(current_url, timeout=REQUEST_TIMEOUT)
+        except requests.RequestException as exc:
+            print(f"Failed to retrieve the page. Error: {exc}")
+            break
+        if response.status_code != 200:
+            print(f"Failed to retrieve the page. Status code: {response.status_code}")
+            break
+
+        soup = BeautifulSoup(response.content, 'html.parser')
+        for post in soup.find_all('div', class_='date-outer'):
+            title_tag = post.find('h3', class_='post-title')
+            date_tag = post.find('h2', class_='date-header')
+            content_tag = post.find('div', class_='post-body')
+            author_tag = post.find('span', class_='fn')
+            if not (title_tag and date_tag and content_tag):
+                print("Skipping incomplete post")
+                continue
+            all_posts.append({
+                'title': title_tag.get_text(strip=True),
+                'date': date_tag.get_text(strip=True),
+                'author': author_tag.get_text(strip=True) if author_tag else 'Unknown',
+                'content': content_tag.get_text(strip=True),
+            })
+
+        older_posts_link = soup.find('a', {'class': 'blog-pager-older-link'})
+        if older_posts_link is None or not older_posts_link.get('href'):
+            break
+        # urljoin keeps absolute links as they are and resolves relative ones.
+        current_url = urljoin(current_url, older_posts_link['href'])
+    return all_posts
+
+
+def _connect_with_retry(**conn_kwargs):
+    """Open a psycopg2 connection, retrying while the server is unreachable."""
+    for attempt in range(1, DB_CONNECT_ATTEMPTS + 1):
+        try:
+            return psycopg2.connect(**conn_kwargs)
+        except psycopg2.OperationalError:
+            if attempt == DB_CONNECT_ATTEMPTS:
+                raise
+            print(f"Database not ready (attempt {attempt}/{DB_CONNECT_ATTEMPTS}); retrying")
+            time.sleep(DB_CONNECT_DELAY)
+
+
+def save_to_postgres(blog_posts):
+    """Create the blog_posts table if needed and insert *blog_posts* into it.
+
+    Connection settings are read from the DB_NAME, DB_USER, DB_PASSWORD,
+    DB_HOST and DB_PORT environment variables. All rows are written in a
+    single transaction, and the connection is closed even when a statement
+    fails.
+    """
+    conn = _connect_with_retry(
+        dbname=os.getenv("DB_NAME", "blogdata"),
+        user=os.getenv("DB_USER", "postgres"),
+        password=os.getenv("DB_PASSWORD", "password"),
+        host=os.getenv("DB_HOST", "db"),
+        port=os.getenv("DB_PORT", "5432"),
+    )
+    try:
+        # "with conn" commits on success and rolls back on error, but it does
+        # not close the connection; the finally clause does that.
+        with conn, conn.cursor() as cur:
+            cur.execute("""
+                CREATE TABLE IF NOT EXISTS blog_posts (
+                    id SERIAL PRIMARY KEY,
+                    date TEXT,
+                    title TEXT,
+                    author TEXT,
+                    content TEXT
+                );
+            """)
+            cur.executemany(
+                "INSERT INTO blog_posts (date, title, author, content) VALUES (%s, %s, %s, %s)",
+                [
+                    (post['date'], post['title'], post['author'], post['content'])
+                    for post in blog_posts
+                ],
+            )
+    finally:
+        conn.close()
+    print("Data has been successfully written to the PostgreSQL database")
+
+
+def main():
+    """Scrape the blog and persist whatever was collected."""
+    blog_posts = scrape_blog()
+    save_to_postgres(blog_posts)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/web_scraping_sample.py b/web_scraping_sample.py
deleted file mode 100644
index c9ca142..0000000
--- a/web_scraping_sample.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import re
-import psycopg2
-
-# Create connection to database
-conn = psycopg2.connect(
-    host="postgres_service",
-    database="LipsumGenerator",
-    user="postgres",
-    password="admin")
-cursor = conn.cursor()
-
-res = requests.get('https://www.lipsum.com/')
-soup = BeautifulSoup(res.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib
-data = soup.find(re.compile(r'div'), attrs={'id': "Panes"})
-print(data.find("lorem"))
-
-question_list = []
-answer_list = []
-for row in data.findAll("div"):
-    question_list.append(row.h2.text)
-    temp_string = ""
-    counter=0
-    for i in row.findAll("p"):
-        temp_string = temp_string + "\n" + i.text
-    answer_list.append(temp_string)
-file = open("qn_ans_ans", "w")
-
-for i in range(len(question_list)):
-    cursor.execute("insert into qn_ans values(%s,%s)", (question_list[i], answer_list[i]))