From 3c23bba65927801a268a0009534287e34516dbdf Mon Sep 17 00:00:00 2001
From: cchaithanya83 <cchaithanya83@gmail.com>
Date: Fri, 21 Jun 2024 20:24:14 +0530
Subject: [PATCH] added two assignments including homework

---
 Web scraping(python blog)/docker-compose.yaml |  27 ++++
 Web scraping(python blog)/dockerfile          |  21 +++
 Web scraping(python blog)/main.py             |  94 +++++++++++++
 Web scraping(python blog)/readme.md           | 130 ++++++++++++++++++
 Web scraping(python blog)/requirements.txt    |   5 +
 web scraping (lipsum)/Dockerfile              |  22 +++
 web scraping (lipsum)/docker-compose.yaml     |  27 ++++
 web scraping (lipsum)/requirements.txt        |   5 +
 web scraping (lipsum)/web_scraping_sample.py  |  34 +++++
 9 files changed, 365 insertions(+)
 create mode 100644 Web scraping(python blog)/docker-compose.yaml
 create mode 100644 Web scraping(python blog)/dockerfile
 create mode 100644 Web scraping(python blog)/main.py
 create mode 100644 Web scraping(python blog)/readme.md
 create mode 100644 Web scraping(python blog)/requirements.txt
 create mode 100644 web scraping (lipsum)/Dockerfile
 create mode 100644 web scraping (lipsum)/docker-compose.yaml
 create mode 100644 web scraping (lipsum)/requirements.txt
 create mode 100644 web scraping (lipsum)/web_scraping_sample.py

diff --git a/Web scraping(python blog)/docker-compose.yaml b/Web scraping(python blog)/docker-compose.yaml
new file mode 100644
index 0000000..0c7dd13
--- /dev/null
+++ b/Web scraping(python blog)/docker-compose.yaml	
@@ -0,0 +1,27 @@
+version: "3"
+services:
+  pyhton_service:
+    build:
+      context: ./
+      dockerfile: dockerfile  # this directory's file is lowercase; the lookup is case-sensitive on Linux
+    image: workshop
+    container_name: scraper_python_container
+    stdin_open: true  # allocate an interactive STDIN (paired with tty below)
+    tty: true
+    ports:
+     - "8000:8000"
+    volumes:
+     - .:/app
+    depends_on:
+     - postgres_service
+
+  postgres_service:  # main.py connects to the database using this service name as host
+    image: postgres
+    container_name: scraper_postgres_container
+    ports:
+     - "5432:5432"
+    environment:
+     POSTGRES_USER : postgres
+     POSTGRES_PASSWORD: admin
+    volumes:
+     - .:/var/lib/postgres  # NOTE(review): official image stores data under /var/lib/postgresql - confirm this path
\ No newline at end of file
diff --git a/Web scraping(python blog)/dockerfile b/Web scraping(python blog)/dockerfile
new file mode 100644
index 0000000..e4ca877
--- /dev/null
+++ b/Web scraping(python blog)/dockerfile	
@@ -0,0 +1,21 @@
+FROM python:3.10.2-alpine3.15
+COPY . .
+# Create the workspace, install the postgresql package, hand /run/postgresql to the postgres user
+RUN mkdir -p /root/workspace/src
+RUN mkdir -p /run/postgresql/
+WORKDIR /run/postgresql/
+RUN apk update
+RUN apk add postgresql 
+RUN chown postgres:postgres /run/postgresql/
+# Install the Python dependencies listed in requirements.txt
+COPY ./requirements.txt /tmp
+RUN pip install -r /tmp/requirements.txt
+# libpq headers for psycopg2 (NOTE(review): installed after pip install - confirm the order is intended)
+RUN apk add --virtual postgresql-deps libpq-dev
+# The project directory /root/workspace/src was already created by the mkdir above
+
+# Copy the scraper script into the image (a build-time copy, not a mount)
+COPY ./main.py /root/workspace/src
+# Switch to project directory
+WORKDIR /root/workspace/src
+
diff --git a/Web scraping(python blog)/main.py b/Web scraping(python blog)/main.py
new file mode 100644
index 0000000..657dbb6
--- /dev/null
+++ b/Web scraping(python blog)/main.py	
@@ -0,0 +1,94 @@
+import requests
+from bs4 import BeautifulSoup
+import re
+import psycopg2
+import os
+
+conn = psycopg2.connect(
+    host="postgres_service",
+    database="pythondata",
+    user="postgres",
+    password="admin")
+cursor = conn.cursor()
+
+## Storing the contents in files 
+def save_post_to_file(post_content, post_index):
+    file_path = f"post_{post_index}.txt"
+    with open(file_path, 'w') as file:
+        file.write(post_content)
+    return file_path
+
+def extract_posts(soup, posts_list, post_index):
+    # date
+    date_header = soup.find('h2', class_='date-header')
+    date = date_header.get_text(strip=True)
+
+    # post-outer div segments each partion
+    posts = soup.find_all('div', class_='post-outer')
+    for post in posts:
+        # Title
+        title_tag = post.find('h3', class_='post-title')
+        title = title_tag.get_text(strip=True) if title_tag else None
+        
+        # Author name
+        author_tag = post.find('span', class_='post-author')
+        author = author_tag.find('span', class_='fn').get_text(strip=True) if author_tag else None
+        
+        # Time
+        time_tag = post.find('span', class_='post-timestamp')
+        time = time_tag.find('abbr').get_text(strip=True) if time_tag else None
+
+
+        post_content = post.get_text(strip=True)
+        file_path = save_post_to_file(post_content, post_index)
+        
+        # storing values in the code
+        posts_list.append({
+            'Title': title,
+            'Date': date,
+            'Author': author,
+            'Time': time,
+            'FilePath': file_path
+        })
+        
+        cursor.execute("""
+            INSERT INTO data (title, date, author, time, file_path) 
+            VALUES (%s, %s, %s, %s, %s)
+        """, (title, date, author, time, file_path))
+        conn.commit()
+
+        post_index += 1
+        
+    return posts_list, post_index
+
+## Function to extract the code of page and call the extract_posts function sending code  of page
+def scrape_blog(url, post_index):       
+    posts_list = []                                         
+    while len(posts_list) < 20:                                                                   
+        response = requests.get(url)       
+        soup = BeautifulSoup(response.content, 'html.parser')
+        posts_list, post_index = extract_posts(soup, posts_list, post_index)
+        if len(posts_list) >= 20:          
+            break                               
+                                                               
+        next_button = soup.find('a', class_='blog-pager-older-link')
+        if next_button:                                 
+            url = next_button['href']                                
+        else:               
+            break                                           
+                                                                                                  
+    return posts_list 
+
+
+
+### starting call 
+posts = scrape_blog('https://blog.python.org/', post_index=1)
+
+## testing part
+for post in posts:
+    print(f"Title: {post['Title']}")
+    print(f"Date: {post['Date']}")
+    print(f"Author: {post['Author']}")
+    print(f"Time: {post['Time']}")
+    print(f"File Path: {post['FilePath']}")
+    print('---')
\ No newline at end of file
diff --git a/Web scraping(python blog)/readme.md b/Web scraping(python blog)/readme.md
new file mode 100644
index 0000000..4e231cf
--- /dev/null
+++ b/Web scraping(python blog)/readme.md	
@@ -0,0 +1,130 @@
+## Running the Application
+
+1. **Start the Docker Containers**
+
+   First, ensure your Docker containers are up and running:
+
+   ```sh
+   docker-compose up
+   ```
+
+2. **Access the PostgreSQL Container**
+
+   Open a new terminal and run the following commands:
+
+   ```sh
+   docker exec -it scraper_postgres_container sh
+   ```
+
+   This command opens an interactive shell inside the PostgreSQL container.
+
+3. **Switch to PostgreSQL User**
+
+   Inside the container, switch to the `postgres` user:
+
+   ```sh
+   su - postgres
+   ```
+
+4. **Access PostgreSQL CLI**
+
+   Start the PostgreSQL command-line interface:
+
+   ```sh
+   psql
+   ```
+
+5. **Create the Database**
+
+   Create a new database named `pythondata`:
+
+   ```sql
+   create database pythondata;
+   ```
+
+6. **Exit the PostgreSQL CLI**
+
+   Exit the PostgreSQL command-line interface:
+
+   ```sh
+   exit
+   ```
+
+7. **Connect to the Newly Created Database**
+
+   Connect to the `pythondata` database:
+
+   ```sh
+   psql -h postgres_service -d pythondata -U postgres
+   ```
+
+8. **Enter the Password**
+
+   When prompted, enter the password:
+
+   ```sh
+   admin
+   ```
+
+9. **Create the Data Table**
+
+   Create a table named `data` to store the scraped blog post details:
+
+   ```sql
+   CREATE TABLE data (
+       id SERIAL PRIMARY KEY,
+       title TEXT,
+       date TEXT,
+       author TEXT,
+       time TEXT,
+       file_path TEXT
+   );
+   ```
+
+10. **Exit the PostgreSQL CLI**
+
+    Exit the PostgreSQL command-line interface:
+
+    ```sh
+    exit
+    ```
+
+11. **Open a Shell in the Python Container**
+
+    Open another terminal and run the following command to open a shell inside the Python container (drop `sudo` if your user can already run Docker):
+
+    ```sh
+    sudo docker exec -it scraper_python_container sh
+    ```
+
+    This command opens an interactive shell inside the Python container.
+
+12. **Execute the Python Script**
+
+    Run the `main.py` script:
+
+    ```sh
+    python main.py
+    ```
+
+    The script will start scraping the blog and storing the data in the PostgreSQL database.
+
+13. **Verify the Stored Data**
+
+    You can check the stored data in the PostgreSQL database by running:
+
+    ```sh
+    sudo docker exec -it scraper_postgres_container sh
+    su - postgres
+    psql -h postgres_service -d pythondata -U postgres
+    ```
+
+    Enter the password `admin` when prompted. Then, execute the following command to see the data:
+
+    ```sql
+    select * from data;
+    ```
+
+    This command retrieves all the records stored in the `data` table.
+
+	
diff --git a/Web scraping(python blog)/requirements.txt b/Web scraping(python blog)/requirements.txt
new file mode 100644
index 0000000..fda7081
--- /dev/null
+++ b/Web scraping(python blog)/requirements.txt	
@@ -0,0 +1,5 @@
+psycopg2-binary
+bs4
+
+requests
+html5lib==1.1
\ No newline at end of file
diff --git a/web scraping (lipsum)/Dockerfile b/web scraping (lipsum)/Dockerfile
new file mode 100644
index 0000000..aba720f
--- /dev/null
+++ b/web scraping (lipsum)/Dockerfile	
@@ -0,0 +1,22 @@
+FROM python:3.10.2-alpine3.15
+COPY . .
+# Create the workspace, install the postgresql package, hand /run/postgresql to the postgres user
+RUN mkdir -p /root/workspace/src
+RUN mkdir -p /run/postgresql/
+WORKDIR /run/postgresql/
+RUN apk update
+RUN apk add postgresql 
+RUN chown postgres:postgres /run/postgresql/
+# Install the Python dependencies listed in requirements.txt
+COPY ./requirements.txt /tmp
+RUN pip install -r /tmp/requirements.txt
+# libpq headers for psycopg2 (NOTE(review): installed after pip install - confirm the order is intended)
+RUN apk add --virtual postgresql-deps libpq-dev
+# The project directory /root/workspace/src was already created by the mkdir above
+
+# Copy the scraper script into the image (a build-time copy, not a mount)
+COPY ./web_scraping_sample.py /root/workspace/src
+# Switch to project directory
+WORKDIR /root/workspace/src
+
+
diff --git a/web scraping (lipsum)/docker-compose.yaml b/web scraping (lipsum)/docker-compose.yaml
new file mode 100644
index 0000000..dc4a98d
--- /dev/null
+++ b/web scraping (lipsum)/docker-compose.yaml	
@@ -0,0 +1,27 @@
+version: "3"
+services:
+  pyhton_service:  # NOTE(review): service name looks like a typo of "python_service" - confirm before renaming
+    build:
+      context: ./
+      dockerfile: Dockerfile
+    image: workshop1
+    container_name: workshop_python_container
+    stdin_open: true # keep STDIN open so "docker attach <container_id>" is interactive
+    tty: true
+    ports:
+     - "8000:8000"
+    volumes:
+     - .:/app
+    depends_on:
+     - postgres_service
+
+  postgres_service:  # web_scraping_sample.py connects to the database using this service name as host
+    image: postgres
+    container_name: workshop_postgres_container
+    ports:
+     - "5432:5432"
+    environment:
+     POSTGRES_USER : postgres
+     POSTGRES_PASSWORD: admin
+    volumes:
+     - .:/var/lib/postgres  # NOTE(review): official image stores data under /var/lib/postgresql - confirm this path
diff --git a/web scraping (lipsum)/requirements.txt b/web scraping (lipsum)/requirements.txt
new file mode 100644
index 0000000..fda7081
--- /dev/null
+++ b/web scraping (lipsum)/requirements.txt	
@@ -0,0 +1,5 @@
+psycopg2-binary
+bs4
+
+requests
+html5lib==1.1
\ No newline at end of file
diff --git a/web scraping (lipsum)/web_scraping_sample.py b/web scraping (lipsum)/web_scraping_sample.py
new file mode 100644
index 0000000..2b388ad
--- /dev/null
+++ b/web scraping (lipsum)/web_scraping_sample.py	
@@ -0,0 +1,34 @@
+import requests
+from bs4 import BeautifulSoup
+import re
+import psycopg2
+
+# Create connection to database
+conn = psycopg2.connect(
+    host="postgres_service",
+    database="lipsumgenerator",
+    user="postgres",
+    password="admin")
+cursor = conn.cursor()
+
+res = requests.get('https://www.lipsum.com/')
+soup = BeautifulSoup(res.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib
+data = soup.find(re.compile(r'div'), attrs={'id': "Panes"})
+print(data.find("div"))
+
+question_list = []
+answer_list = []
+for row in data.findAll("div"):
+    question_list.append(row.h2.text)
+    temp_string = ""
+    counter=0
+    for i in row.findAll("p"):
+        temp_string = temp_string + i.text
+        answer_list.append(temp_string)
+file = open("qn_ans_ans", "w")
+
+for i in range(len(question_list)):
+    print(question_list[i],)
+    cursor.execute("insert into qn_ans values(%s,%s)", (question_list[i], answer_list[i]))
+
+conn.commit()