# Compose stack for the Python-blog scraper: one container built from the
# local dockerfile to run main.py, and one PostgreSQL container it writes to.
version: "3"
services:
  # NOTE(review): "pyhton_service" looks like a typo for "python_service".
  # It is kept as-is because renaming changes the compose service name.
  pyhton_service:
    build:
      context: ./
      # The file added in this directory is named "dockerfile" (lowercase).
      # Referencing "Dockerfile" fails on case-sensitive filesystems.
      dockerfile: dockerfile
    image: workshop
    container_name: scraper_python_container
    # Keep the container alive with an attached TTY so that
    # `docker exec -it scraper_python_container sh` can be used to run main.py.
    stdin_open: true
    tty: true
    ports:
      - "8000:8000"
    volumes:
      - .:/app
    depends_on:
      - postgres_service

  postgres_service:
    image: postgres
    container_name: scraper_postgres_container
    ports:
      - "5432:5432"
    environment:
      # main.py connects with exactly these credentials.
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: admin
    volumes:
      # NOTE(review): this bind-mounts the project directory to
      # /var/lib/postgres, which does not look like the image's data
      # directory, so it presumably does not persist the database.
      # Left unchanged; confirm the intended path for the image version in use.
      - .:/var/lib/postgres
"""Scrape recent posts from the Python blog.

Each post's text is written to ``post_<n>.txt`` in the current directory and
its metadata (title, date, author, time, file path) is inserted into the
``data`` table of the ``pythondata`` PostgreSQL database.
"""
import os
import re
from urllib.parse import urljoin

import psycopg2
import requests
from bs4 import BeautifulSoup

# Seconds to wait for the blog to answer before giving up on a request.
REQUEST_TIMEOUT = 30

conn = psycopg2.connect(
    host="postgres_service",
    database="pythondata",
    user="postgres",
    password="admin")
cursor = conn.cursor()


def save_post_to_file(post_content, post_index):
    """Write one post's text to ``post_<post_index>.txt``.

    Args:
        post_content: Text of the post.
        post_index: Number used to build the file name.

    Returns:
        The path of the file that was written.
    """
    file_path = f"post_{post_index}.txt"
    # Explicit encoding: blog text is not guaranteed to fit the locale default.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(post_content)
    return file_path


def _child_text(parent, name, **attrs):
    """Return the stripped text of the first matching descendant, or None.

    Returns None when ``parent`` itself is None or has no matching descendant.
    """
    if parent is None:
        return None
    child = parent.find(name, **attrs)
    return child.get_text(strip=True) if child is not None else None


def extract_posts(soup, posts_list, post_index, limit=None):
    """Extract every post on one page, saving each to a file and the database.

    Args:
        soup: Parsed page (``BeautifulSoup`` object).
        posts_list: List that receives one dict per post; extended in place.
        post_index: Number to use for the first post found on this page.
        limit: Optional cap on ``len(posts_list)``; extraction stops once it
            is reached. ``None`` (the default) processes the whole page.

    Returns:
        ``(posts_list, post_index)`` where ``post_index`` is the number to use
        for the next post.

    Raises:
        psycopg2.Error: If an insert fails; the transaction is rolled back
            before the error is re-raised.
    """
    # Fallback date for posts that have no date header before them.
    page_date = _child_text(soup, 'h2', class_='date-header')

    # Each post lives in its own "post-outer" div.
    for post in soup.find_all('div', class_='post-outer'):
        if limit is not None and len(posts_list) >= limit:
            break

        title = _child_text(post, 'h3', class_='post-title')
        author = _child_text(
            post.find('span', class_='post-author'), 'span', class_='fn')
        posted_time = _child_text(
            post.find('span', class_='post-timestamp'), 'abbr')

        # A page can hold posts from several days. Use the nearest date
        # header preceding this post rather than the first one on the page.
        date_header = post.find_previous('h2', class_='date-header')
        if date_header is not None:
            date = date_header.get_text(strip=True)
        else:
            date = page_date

        post_content = post.get_text(strip=True)
        file_path = save_post_to_file(post_content, post_index)

        posts_list.append({
            'Title': title,
            'Date': date,
            'Author': author,
            'Time': posted_time,
            'FilePath': file_path
        })

        try:
            cursor.execute("""
                INSERT INTO data (title, date, author, time, file_path)
                VALUES (%s, %s, %s, %s, %s)
            """, (title, date, author, posted_time, file_path))
            conn.commit()
        except psycopg2.Error:
            # Leave the connection usable for the caller, then surface the error.
            conn.rollback()
            raise

        post_index += 1

    return posts_list, post_index


def scrape_blog(url, post_index, max_posts=20):
    """Follow the blog's "older posts" links until ``max_posts`` are collected.

    Args:
        url: Address of the first page to scrape.
        post_index: Number to use for the first post.
        max_posts: Maximum number of posts to collect (default 20).

    Returns:
        List of dicts with the keys ``Title``, ``Date``, ``Author``, ``Time``
        and ``FilePath``; at most ``max_posts`` entries.

    Raises:
        requests.RequestException: On network errors, timeouts or HTTP error
            statuses.
    """
    posts_list = []
    while url and len(posts_list) < max_posts:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        posts_list, post_index = extract_posts(
            soup, posts_list, post_index, limit=max_posts)

        next_button = soup.find('a', class_='blog-pager-older-link')
        href = next_button.get('href') if next_button is not None else None
        # Resolve against the current page in case the link is relative.
        url = urljoin(url, href) if href else None

    return posts_list


def main():
    """Scrape the blog, print what was stored, and close the database."""
    try:
        posts = scrape_blog('https://blog.python.org/', post_index=1)

        for post in posts:
            print(f"Title: {post['Title']}")
            print(f"Date: {post['Date']}")
            print(f"Author: {post['Author']}")
            print(f"Time: {post['Time']}")
            print(f"File Path: {post['FilePath']}")
            print('---')
    finally:
        cursor.close()
        conn.close()


if __name__ == "__main__":
    main()
**Access the PostgreSQL Container**

   Open a new terminal and run the following command:

   ```sh
   docker exec -it scraper_postgres_container sh
   ```

   This command opens an interactive shell inside the PostgreSQL container.

3. **Switch to PostgreSQL User**

   Inside the container, switch to the `postgres` user:

   ```sh
   su - postgres
   ```

4. **Access PostgreSQL CLI**

   Start the PostgreSQL command-line interface:

   ```sh
   psql
   ```

5. **Create the Database**

   At the `psql` prompt, create a new database named `pythondata`:

   ```sql
   create database pythondata;
   ```

6. **Exit the PostgreSQL CLI**

   Exit the PostgreSQL command-line interface:

   ```sh
   exit
   ```

7. **Connect to the Newly Created Database**

   Connect to the `pythondata` database:

   ```sh
   psql -h postgres_service -d pythondata -U postgres
   ```

8. **Enter the Password**

   When prompted, enter the password:

   ```text
   admin
   ```

9. **Create the Data Table**

   Create a table named `data` to store the scraped blog post details:

   ```sql
   CREATE TABLE data (
       id SERIAL PRIMARY KEY,
       title TEXT,
       date TEXT,
       author TEXT,
       time TEXT,
       file_path TEXT
   );
   ```

10. **Exit the PostgreSQL CLI**

    Exit the PostgreSQL command-line interface:

    ```sh
    exit
    ```

11. **Open a Shell in the Python Container**

    Open another terminal and run the following command to open a shell in the Python container, where the scraper script will be run:

    ```sh
    sudo docker exec -it scraper_python_container sh
    ```

    This command opens an interactive shell inside the Python container.

12. **Execute the Python Script**

    Run the `main.py` script:

    ```sh
    python main.py
    ```

    The script will start scraping the blog and storing the data in the PostgreSQL database.

13.
# Image for the lipsum scraper: Python on Alpine with the PostgreSQL packages
# and the Python requirements installed; the script is placed in
# /root/workspace/src.
FROM python:3.10.2-alpine3.15

# System packages come first so this layer is cached independently of source
# changes. libpq-dev is installed BEFORE pip runs, so it is available if
# psycopg2 ever has to be built against it. --no-cache replaces `apk update`
# and keeps the package index out of the image.
# The original `COPY . .` as the first instruction is dropped: it copied the
# whole build context into the image and invalidated every later layer on any
# file change.
RUN apk add --no-cache postgresql libpq-dev \
    && mkdir -p /root/workspace/src /run/postgresql \
    && chown postgres:postgres /run/postgresql

# Install requirements in their own layer; it is rebuilt only when
# requirements.txt changes.
COPY ./requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Copy the scraper last, since it is the file that changes most often.
COPY ./web_scraping_sample.py /root/workspace/src/

# Switch to project directory
WORKDIR /root/workspace/src
"""Scrape the heading/paragraph panes from lipsum.com into PostgreSQL.

Every ``<div>`` under the ``#Panes`` element that has an ``<h2>`` is stored
as one row in the ``qn_ans`` table: the heading text as the question and the
concatenated paragraph text as the answer.
"""
import re

import psycopg2
import requests
from bs4 import BeautifulSoup

LIPSUM_URL = 'https://www.lipsum.com/'
# Seconds to wait for the site to answer before giving up.
REQUEST_TIMEOUT = 30


def fetch_qa_pairs(url=LIPSUM_URL):
    """Download ``url`` and return its ``(question, answer)`` pairs.

    Args:
        url: Page to scrape; defaults to the lipsum.com front page.

    Returns:
        List of ``(heading_text, paragraphs_text)`` tuples, one per ``<div>``
        under ``#Panes`` that contains an ``<h2>``.

    Raises:
        requests.RequestException: On network errors, timeouts or HTTP error
            statuses.
        RuntimeError: If the page has no element with id ``Panes``.
    """
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()
    # html5lib must be installed (it is listed in requirements.txt).
    soup = BeautifulSoup(res.content, 'html5lib')
    data = soup.find(re.compile(r'div'), attrs={'id': "Panes"})
    if data is None:
        raise RuntimeError(f"No element with id 'Panes' found at {url}")
    print(data.find("div"))

    pairs = []
    for row in data.find_all("div"):
        heading = row.find("h2")
        if heading is None:
            # find_all is recursive, so it can return divs without a heading.
            continue
        answer = "".join(p.text for p in row.find_all("p"))
        # Keeping each question with its answer in one tuple means the two
        # can never get out of step.
        pairs.append((heading.text, answer))
    return pairs


def main():
    """Scrape the page and insert every pair into the ``qn_ans`` table."""
    # Create connection to database
    conn = psycopg2.connect(
        host="postgres_service",
        database="lipsumgenerator",
        user="postgres",
        password="admin")
    try:
        with conn.cursor() as cursor:
            pairs = fetch_qa_pairs()

            # The original script created this (empty) file and never closed
            # the handle. It is still created for compatibility, but closed
            # immediately.
            # NOTE(review): nothing is ever written to it — confirm whether
            # the file is needed at all.
            with open("qn_ans_ans", "w"):
                pass

            for question, answer in pairs:
                print(question,)
                cursor.execute(
                    "insert into qn_ans values(%s,%s)", (question, answer))
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    main()