From 4cd57e93287f76aac86b6302e5388d03f46b70a0 Mon Sep 17 00:00:00 2001
From: 21Vijeth <vijethfernandes21@gmail.com>
Date: Sat, 22 Jun 2024 16:29:48 +0530
Subject: [PATCH] Add Webscraper

---
 Dockerfile             | 34 +++++++--------
 docker-compose.yaml    | 26 -----------
 docker-compose.yml     | 35 +++++++++++++++
 requirements.txt       |  6 +--
 scraper.py             | 97 ++++++++++++++++++++++++++++++++++++++++++
 web_scraping_sample.py | 31 --------------
 6 files changed, 151 insertions(+), 78 deletions(-)
 delete mode 100644 docker-compose.yaml
 create mode 100644 docker-compose.yml
 create mode 100644 scraper.py
 delete mode 100644 web_scraping_sample.py

diff --git a/Dockerfile b/Dockerfile
index f07ae2b..ef7054f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,17 @@
-FROM python:3.10.2-alpine3.15
-COPY . .
-# Install Postgres
-RUN apk update
-RUN apk add postgresql
-RUN chown postgres:postgres /run/postgresql/
-# Install requirements
-COPY ./requirements.txt /tmp
-RUN pip install -r /tmp/requirements.txt
-# For psycopg2
-RUN apk add --virtual postgresql-deps libpq-dev
-# Create directories
-RUN mkdir -p /root/workspace/src
-# Mount your local file
-COPY ./web_scraping_sample.py /root/workspace/src
-# Switch to project directory
-WORKDIR /root/workspace/src
\ No newline at end of file
+# Use the official slim Python 3.9 image as the base
+FROM python:3.9-slim
+
+# Set the working directory; the relative paths below resolve against /app
+WORKDIR /app
+
+# Copy requirements.txt on its own so edits to scraper.py do not invalidate the install layer
+COPY requirements.txt .
+
+# Install the required Python packages (--no-cache-dir keeps pip's cache out of the image)
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the scraper script into /app
+COPY scraper.py .
+
+# Run the scraper when the container starts
+CMD ["python", "scraper.py"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
deleted file mode 100644
index cad1491..0000000
--- a/docker-compose.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-version: "3"
-services:
-  pyhton_service:
-    build:
-      context: ./
-      dockerfile: Dockerfile
-    image: workshop1
-    container_name: workshop_python_container
-    stdin_open: true #  docker attach container_id
-    tty: true
-    ports:
-     - "8000:8000"
-    volumes:
-     - .:/app
-    depends_on:
-     - postgres_service
-
-  postgres_service:
-    image: postgres
-    container_name: workshop_postgres_container
-    ports:
-     - "5432:5432"
-    environment:
-     POSTGRES_PASSWORD: admin
-    volumes:
-     - .:/var/lib/postgres
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..f4d3c12
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,35 @@
+version: '3.8'
+# Two-service stack: a PostgreSQL database and the scraper container that writes to it.
+services:
+  db:
+    image: postgres:13
+    container_name: postgres_container
+    environment:
+      POSTGRES_DB: blogdata
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: password
+    networks:
+      - blog_net
+    volumes:
+      - pgdata:/var/lib/postgresql/data
+  # NOTE: depends_on below only orders container start; it does not wait for PostgreSQL to be ready.
+  scraper:
+    build: .
+    container_name: scraper_container
+    environment:
+      DB_NAME: blogdata
+      DB_USER: postgres
+      DB_PASSWORD: password
+      DB_HOST: db
+      DB_PORT: 5432
+    depends_on:
+      - db
+    networks:
+      - blog_net
+# blog_net is external: create it first (docker network create blog_net) or "up" will fail.
+networks:
+  blog_net:
+    external: true
+# Named volume mounted at the PostgreSQL data directory of the db service.
+volumes:
+  pgdata:
diff --git a/requirements.txt b/requirements.txt
index 34c449f..b897dd6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,3 @@
-psycopg2==2.9.3
-bs4
-urllib2
 requests
-html5lib==1.1
\ No newline at end of file
+beautifulsoup4
+psycopg2-binary
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..fa0ce73
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,97 @@
+import requests
+from bs4 import BeautifulSoup
+import psycopg2
+import os
+import time
+
+def scrape_blog():
+    """Collect posts from blog.python.org, following the "older posts" pager.
+
+    Returns a list of dicts with 'title', 'date', 'author' and 'content' keys.
+    Paging stops on a failed request, a non-200 status, or a missing/repeated link.
+    """
+    from urllib.parse import urljoin  # local import: resolves relative pager hrefs
+    current_url = "https://blog.python.org/"
+    visited = set()  # guards against a pager link that loops back to a seen page
+    all_posts = []
+    while current_url and current_url not in visited:
+        visited.add(current_url)
+        try:
+            response = requests.get(current_url, timeout=30)  # never hang on a stalled server
+        except requests.RequestException as exc:
+            print(f"Failed to retrieve the page: {exc}")
+            break
+        if response.status_code != 200:
+            print(f"Failed to retrieve the page. Status code: {response.status_code}")
+            break
+        soup = BeautifulSoup(response.content, 'html.parser')
+        for post in soup.find_all('div', class_='date-outer'):
+            title_tag = post.find('h3', class_='post-title')
+            date_tag = post.find('h2', class_='date-header')
+            content_tag = post.find('div', class_='post-body')
+            author_tag = post.find('span', class_='fn')
+            if not (title_tag and date_tag and content_tag):
+                print("Skipping incomplete post")
+                continue
+            all_posts.append({
+                'title': title_tag.get_text(strip=True),
+                'date': date_tag.get_text(strip=True),
+                'author': author_tag.get_text(strip=True) if author_tag else 'Unknown',
+                'content': content_tag.get_text(strip=True),
+            })
+        older_posts_link = soup.find('a', class_='blog-pager-older-link')
+        href = older_posts_link.get('href') if older_posts_link else None
+        current_url = urljoin(current_url, href) if href else None
+    return all_posts
+
+def save_to_postgres(blog_posts):
+    """Insert scraped posts into the blog_posts table, creating it if missing.
+
+    Connection settings come from the DB_NAME, DB_USER, DB_PASSWORD, DB_HOST
+    and DB_PORT environment variables. blog_posts is an iterable of dicts
+    with 'date', 'title', 'author' and 'content' keys.
+
+    Raises psycopg2.OperationalError if the server is still unreachable
+    after every connection attempt.
+    """
+    settings = {
+        "dbname": os.getenv("DB_NAME", "blogdata"),
+        "user": os.getenv("DB_USER", "postgres"),
+        "password": os.getenv("DB_PASSWORD", "password"),
+        "host": os.getenv("DB_HOST", "db"),
+        "port": os.getenv("DB_PORT", "5432"),
+    }
+    # The database container may still be starting, so retry the connection
+    # instead of relying on a single fixed-length sleep.
+    attempts = 10
+    for attempt in range(1, attempts + 1):
+        try:
+            conn = psycopg2.connect(**settings)
+            break
+        except psycopg2.OperationalError:
+            if attempt == attempts:
+                raise
+            time.sleep(3)
+
+    try:
+        # "with conn" commits on success and rolls back on error; it does not
+        # close the connection, hence the finally clause below.
+        with conn, conn.cursor() as cur:
+            cur.execute(
+                "CREATE TABLE IF NOT EXISTS blog_posts ("
+                "id SERIAL PRIMARY KEY, date TEXT, title TEXT, author TEXT, content TEXT)"
+            )
+            cur.executemany(
+                "INSERT INTO blog_posts (date, title, author, content) VALUES (%s, %s, %s, %s)",
+                [(p['date'], p['title'], p['author'], p['content']) for p in blog_posts],
+            )
+    finally:
+        conn.close()
+    print("Data has been successfully written to the PostgreSQL database")
+
+def main():
+    """Scrape the blog, then persist the collected posts."""
+    save_to_postgres(scrape_blog())
+
+if __name__ == "__main__":
+    main()
diff --git a/web_scraping_sample.py b/web_scraping_sample.py
deleted file mode 100644
index c9ca142..0000000
--- a/web_scraping_sample.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import re
-import psycopg2
-
-# Create connection to database
-conn = psycopg2.connect(
-    host="postgres_service",
-    database="LipsumGenerator",
-    user="postgres",
-    password="admin")
-cursor = conn.cursor()
-
-res = requests.get('https://www.lipsum.com/')
-soup = BeautifulSoup(res.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib
-data = soup.find(re.compile(r'div'), attrs={'id': "Panes"})
-print(data.find("lorem"))
-
-question_list = []
-answer_list = []
-for row in data.findAll("div"):
-    question_list.append(row.h2.text)
-    temp_string = ""
-    counter=0
-    for i in row.findAll("p"):
-        temp_string = temp_string + "\n" + i.text
-        answer_list.append(temp_string)
-file = open("qn_ans_ans", "w")
-
-for i in range(len(question_list)):
-    cursor.execute("insert into qn_ans values(%s,%s)", (question_list[i], answer_list[i]))