Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions Web scraping(python blog)/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# Compose stack for the Python-blog scraper: one Python container that runs
# main.py and one PostgreSQL container it writes to.
version: "3"
services:
  # The service key keeps its original spelling so existing
  # `docker-compose ... pyhton_service` invocations keep working.
  pyhton_service:
    build:
      context: ./
      # Must match the file name on disk exactly: the file in this directory
      # is named `dockerfile`, and the lookup is case-sensitive on Linux.
      dockerfile: dockerfile
    image: workshop
    container_name: scraper_python_container
    # Keep STDIN open and allocate a TTY (interactive container).
    stdin_open: true
    tty: true
    ports:
      - "8000:8000"
    volumes:
      - .:/app
    depends_on:
      - postgres_service

  postgres_service:
    image: postgres
    container_name: scraper_postgres_container
    ports:
      - "5432:5432"
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: admin
    volumes:
      # NOTE(review): this bind-mounts the project directory into the database
      # container at /var/lib/postgres. The official image appears to keep its
      # data under /var/lib/postgresql, so this mount probably does not persist
      # the database -- confirm the intent before relying on it.
      - .:/var/lib/postgres
21 changes: 21 additions & 0 deletions Web scraping(python blog)/dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
FROM python:3.10.2-alpine3.15

# System packages: PostgreSQL (creates the `postgres` user) and the libpq
# headers kept under the virtual name `postgresql-deps` for psycopg2.
# `--no-cache` fetches the index on the fly, so no separate `apk update`
# layer is needed and the index is not stored in the image.
RUN apk add --no-cache postgresql \
    && apk add --no-cache --virtual postgresql-deps libpq-dev \
    && mkdir -p /run/postgresql /root/workspace/src \
    && chown postgres:postgres /run/postgresql

# Install requirements before copying any source, so editing main.py does not
# invalidate the (slow) dependency layers.
COPY ./requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Copy only the script the container runs. The previous `COPY . .` placed the
# entire build context in the image root and busted the cache on every change.
COPY ./main.py /root/workspace/src/

# Switch to project directory
WORKDIR /root/workspace/src

94 changes: 94 additions & 0 deletions Web scraping(python blog)/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import requests
from bs4 import BeautifulSoup
import re
import psycopg2
import os

# Open the module-level PostgreSQL connection used by extract_posts().
# Every setting defaults to the value used in docker-compose.yaml and can be
# overridden through the environment, so credentials need not be edited here.
conn = psycopg2.connect(
    host=os.environ.get("POSTGRES_HOST", "postgres_service"),
    database=os.environ.get("POSTGRES_DB", "pythondata"),
    user=os.environ.get("POSTGRES_USER", "postgres"),
    password=os.environ.get("POSTGRES_PASSWORD", "admin"),
)
cursor = conn.cursor()

## Storing the contents in files
def save_post_to_file(post_content, post_index):
    """Write one post's text to ``post_<post_index>.txt`` in the working directory.

    Args:
        post_content: Text of the post to store.
        post_index: Number used to build the file name.

    Returns:
        The relative path of the file that was written.
    """
    file_path = f"post_{post_index}.txt"
    # Scraped text routinely contains non-ASCII characters; pin the encoding so
    # the result does not depend on the container's locale.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(post_content)
    return file_path

def _text_of(tag):
    """Return the stripped text of ``tag``, or None when the tag is missing."""
    return tag.get_text(strip=True) if tag else None


def extract_posts(soup, posts_list, post_index):
    """Extract every post on one parsed page, saving each to disk and to the database.

    For each ``div.post-outer`` element the title, author, timestamp and date
    are read, the post text is written via ``save_post_to_file``, a summary
    dict is appended to ``posts_list`` and a row is inserted into the ``data``
    table through the module-level ``cursor``/``conn``.

    Args:
        soup: Parsed page (BeautifulSoup object).
        posts_list: List that receives one dict per post; mutated in place.
        post_index: Index to use for the first post found on this page.

    Returns:
        Tuple ``(posts_list, post_index)`` where ``post_index`` is the next
        unused index.
    """
    # Page-level date, used when a post has no date group of its own.
    # A missing header yields None instead of raising AttributeError.
    page_date = _text_of(soup.find('h2', class_='date-header'))

    # post-outer div segments each post
    for post in soup.find_all('div', class_='post-outer'):
        # Prefer the date header of the group enclosing this post, so a page
        # that lists several days does not stamp every post with the first
        # date. NOTE(review): assumes posts are wrapped in `div.date-outer`;
        # if they are not, this falls back to the page-level date as before.
        date_group = post.find_parent('div', class_='date-outer')
        date = _text_of(date_group.find('h2', class_='date-header')) if date_group else None
        if date is None:
            date = page_date

        # Title
        title = _text_of(post.find('h3', class_='post-title'))

        # Author name (the inner `fn` span may be absent even if the wrapper exists)
        author_tag = post.find('span', class_='post-author')
        author = _text_of(author_tag.find('span', class_='fn')) if author_tag else None

        # Time (same guard for the inner `abbr`)
        time_tag = post.find('span', class_='post-timestamp')
        timestamp = _text_of(time_tag.find('abbr')) if time_tag else None

        post_content = post.get_text(strip=True)
        file_path = save_post_to_file(post_content, post_index)

        # Keep an in-memory record of the post
        posts_list.append({
            'Title': title,
            'Date': date,
            'Author': author,
            'Time': timestamp,
            'FilePath': file_path
        })

        cursor.execute("""
            INSERT INTO data (title, date, author, time, file_path)
            VALUES (%s, %s, %s, %s, %s)
        """, (title, date, author, timestamp, file_path))
        # Commit per post so rows already scraped survive a later failure.
        conn.commit()

        post_index += 1

    return posts_list, post_index

## Fetch blog pages one after another and hand each parsed page to extract_posts
def scrape_blog(url, post_index, max_posts=20, timeout=30):
    """Scrape blog pages, following the "older posts" link, until enough posts are collected.

    Args:
        url: Address of the first page to fetch.
        post_index: Index assigned to the first post that is stored.
        max_posts: Stop requesting further pages once at least this many posts
            have been collected. The last page is always processed in full, so
            the result may hold more than ``max_posts`` entries.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        List of post dicts as built by ``extract_posts``.

    Raises:
        requests.HTTPError: If a page responds with an error status.
    """
    from urllib.parse import urljoin  # local import: resolves relative pager links

    posts_list = []
    while len(posts_list) < max_posts:
        # Without a timeout a stalled server would hang the scraper forever.
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx rather than parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        posts_list, post_index = extract_posts(soup, posts_list, post_index)

        next_button = soup.find('a', class_='blog-pager-older-link')
        if not next_button or not next_button.get('href'):
            break
        url = urljoin(url, next_button['href'])

    return posts_list



### starting call
try:
    posts = scrape_blog('https://blog.python.org/', post_index=1)
finally:
    # Release the module-level database handles whether or not scraping succeeded.
    cursor.close()
    conn.close()

## testing part: print a summary of every scraped post
for post in posts:
    print(f"Title: {post['Title']}")
    print(f"Date: {post['Date']}")
    print(f"Author: {post['Author']}")
    print(f"Time: {post['Time']}")
    print(f"File Path: {post['FilePath']}")
    print('---')
130 changes: 130 additions & 0 deletions Web scraping(python blog)/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
## Running the Application

1. **Start the Docker Containers**

First, ensure your Docker containers are up and running:

```sh
docker-compose up
```

2. **Access the PostgreSQL Container**

Open a new terminal and run the following command:

```sh
docker exec -it scraper_postgres_container sh
```

This command opens an interactive shell inside the PostgreSQL container.

3. **Switch to PostgreSQL User**

Inside the container, switch to the `postgres` user:

```sh
su - postgres
```

4. **Access PostgreSQL CLI**

Start the PostgreSQL command-line interface:

```sh
psql
```

5. **Create the Database**

Create a new database named `pythondata`:

```sql
create database pythondata;
```

6. **Exit the PostgreSQL CLI**

Exit the PostgreSQL command-line interface:

```sh
exit
```

7. **Connect to the Newly Created Database**

Connect to the `pythondata` database:

```sh
psql -h postgres_service -d pythondata -U postgres
```

8. **Enter the Password**

When prompted, enter the password:

```sh
admin
```

9. **Create the Data Table**

Create a table named `data` to store the scraped blog post details:

```sql
CREATE TABLE data (
id SERIAL PRIMARY KEY,
title TEXT,
date TEXT,
author TEXT,
time TEXT,
file_path TEXT
);
```

10. **Exit the PostgreSQL CLI**

Exit the PostgreSQL command-line interface:

```sh
exit
```

11. **Run the Scraper Script**

Open another terminal and run the following command to open a shell inside the Python container (the scraper itself is started in the next step):

```sh
sudo docker exec -it scraper_python_container sh
```

This command opens an interactive shell inside the Python container.

12. **Execute the Python Script**

Run the `main.py` script:

```sh
python main.py
```

The script will start scraping the blog and storing the data in the PostgreSQL database.

13. **Verify the Stored Data**

You can check the stored data in the PostgreSQL database by running:

```sh
sudo docker exec -it scraper_postgres_container sh
su - postgres
psql -h postgres_service -d pythondata -U postgres
```

Enter the password `admin` when prompted. Then, execute the following command to see the data:

```sql
select * from data;
```

This command retrieves all the records stored in the `data` table.


5 changes: 5 additions & 0 deletions Web scraping(python blog)/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
psycopg2-binary
# `bs4` on PyPI is only a placeholder; the real distribution is beautifulsoup4
# (it still provides the `bs4` import name).
beautifulsoup4

requests
html5lib==1.1
22 changes: 22 additions & 0 deletions web scraping (lipsum)/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
FROM python:3.10.2-alpine3.15

# System packages: PostgreSQL (creates the `postgres` user) and the libpq
# headers kept under the virtual name `postgresql-deps` for psycopg2.
# `--no-cache` fetches the index on the fly, so no separate `apk update`
# layer is needed and the index is not stored in the image.
RUN apk add --no-cache postgresql \
    && apk add --no-cache --virtual postgresql-deps libpq-dev \
    && mkdir -p /run/postgresql /root/workspace/src \
    && chown postgres:postgres /run/postgresql

# Install requirements before copying any source, so editing the script does
# not invalidate the (slow) dependency layers.
COPY ./requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Copy only the script the container runs. The previous `COPY . .` placed the
# entire build context in the image root and busted the cache on every change.
COPY ./web_scraping_sample.py /root/workspace/src/

# Switch to project directory
WORKDIR /root/workspace/src


27 changes: 27 additions & 0 deletions web scraping (lipsum)/docker-compose.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# Compose stack for the lipsum scraper: one Python container that runs
# web_scraping_sample.py and one PostgreSQL container it writes to.
version: "3"
services:
  # The service key keeps its original spelling so existing
  # `docker-compose ... pyhton_service` invocations keep working.
  pyhton_service:
    build:
      context: ./
      dockerfile: Dockerfile
    image: workshop1
    container_name: workshop_python_container
    stdin_open: true  # docker attach container_id
    tty: true
    ports:
      - "8000:8000"
    volumes:
      - .:/app
    depends_on:
      - postgres_service

  postgres_service:
    image: postgres
    container_name: workshop_postgres_container
    ports:
      - "5432:5432"
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: admin
    volumes:
      # NOTE(review): this bind-mounts the project directory into the database
      # container at /var/lib/postgres. The official image appears to keep its
      # data under /var/lib/postgresql, so this mount probably does not persist
      # the database -- confirm the intent before relying on it.
      - .:/var/lib/postgres
5 changes: 5 additions & 0 deletions web scraping (lipsum)/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
psycopg2-binary
# `bs4` on PyPI is only a placeholder; the real distribution is beautifulsoup4
# (it still provides the `bs4` import name).
beautifulsoup4

requests
html5lib==1.1
34 changes: 34 additions & 0 deletions web scraping (lipsum)/web_scraping_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import requests
from bs4 import BeautifulSoup
import re
import psycopg2

# Database connection: settings are collected first, then expanded into the
# psycopg2 call; the cursor is shared by the insert loop further down.
_db_settings = {
    "host": "postgres_service",
    "database": "lipsumgenerator",
    "user": "postgres",
    "password": "admin",
}
conn = psycopg2.connect(**_db_settings)
cursor = conn.cursor()

# Download the page; fail fast on a stalled connection or an HTTP error status.
res = requests.get('https://www.lipsum.com/', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html5lib')  # If this line causes an error, run 'pip install html5lib' or install html5lib
data = soup.find(re.compile(r'div'), attrs={'id': "Panes"})
if data is None:
    # Without the container element there is nothing to scrape.
    raise RuntimeError('element with id "Panes" not found in the downloaded page')
print(data.find("div"))

# Each <div> under #Panes contributes one question (its <h2>) and one answer
# (the concatenated text of its <p> elements).
question_list = []
answer_list = []
for row in data.find_all("div"):
    if row.h2 is None:
        # A div without a heading carries no question; skip it instead of crashing.
        continue
    question_list.append(row.h2.text)
    answer_list.append("".join(paragraph.text for paragraph in row.find_all("p")))

try:
    for question, answer in zip(question_list, answer_list):
        print(question)
        cursor.execute("insert into qn_ans values(%s,%s)", (question, answer))
    conn.commit()
finally:
    # Release the database handles whether or not the inserts succeeded.
    cursor.close()
    conn.close()