Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions Homework/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
FROM python:3.10.2-alpine3.15
# Install required packages before copying the script so that editing the
# script does not invalidate the cached dependency layer. --no-cache-dir keeps
# pip's download cache out of the image.
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir requests bs4 html5lib psycopg2-binary
# Switch to project directory (WORKDIR creates it when it does not exist)
WORKDIR /root/workspace/src
COPY ./python_web_scrape.py ./
# Runs "python python_web_scrape.py"; arguments given to `docker run` replace
# CMD only, so another script can be started with the same interpreter.
ENTRYPOINT ["python"]
CMD ["python_web_scrape.py"]


9 changes: 9 additions & 0 deletions Homework/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# PostgreSQL service used by python_web_scrape.py.
psql-db:
  image: "postgres:14"
  container_name: psql-db
  environment:
    PGPASSWORD: "123456"
    POSTGRES_USER: "postgres"
    POSTGRES_PASSWORD: "123456"
  ports:
    # Host port 5434 is the port python_web_scrape.py connects to.
    - "5434:5432"
142 changes: 142 additions & 0 deletions Homework/python_web_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import requests
from bs4 import BeautifulSoup
import re
import psycopg2
from psycopg2 import Error

# First page to scrape; main() fetches it and then follows the "Older Posts"
# links to reach earlier pages.
url = 'https://blog.python.org/'

# main() parses pages with the 'html5lib' parser. If parsing fails because the
# parser is missing, run 'pip install html5lib'.

def create_connection(db_name, db_user, db_password, db_host, db_port):
    """Open a PostgreSQL connection, or return None when connecting fails.

    The outcome is reported on stdout in both cases, so callers only need to
    check the return value for None before using it.
    """
    settings = {
        "database": db_name,
        "user": db_user,
        "password": db_password,
        "host": db_host,
        "port": db_port,
    }
    try:
        connection = psycopg2.connect(**settings)
    except Error as e:
        print(f"The error '{e}' occurred")
        return None

    print("Connection to PostgreSQL DB successful")
    return connection

# Function to execute insert queries
def execute_query(connection, data):
    """Insert one scraped article into the python_blog_articles table.

    Args:
        connection: Open psycopg2 connection.
        data: (date, title, body, author) sequence bound to the four query
            placeholders.

    Database errors are printed instead of raised. After a failure the
    transaction is rolled back so later inserts on the same connection can
    still succeed.
    """
    query = """
        INSERT INTO python_blog_articles (date, title, body, author)
        VALUES (%s, %s, %s, %s)
    """
    # Using the cursor as a context manager closes it when the block exits.
    with connection.cursor() as cursor:
        try:
            cursor.execute(query, data)
            connection.commit()
            print("Query executed successfully")
        except Error as e:
            print(f"The error '{e}' occurred")
            # A failed statement leaves the transaction aborted; PostgreSQL
            # rejects every further command until it is rolled back. Skip the
            # rollback when the connection itself is already closed or broken.
            if not connection.closed:
                connection.rollback()

def create_table(connection):
    """Create the python_blog_articles table unless it already exists.

    Args:
        connection: Open psycopg2 connection.

    Database errors are printed instead of raised. After a failure the
    transaction is rolled back so the connection stays usable.
    """
    # SQL statement to create table if not exists
    create_table_query = """
        CREATE TABLE IF NOT EXISTS python_blog_articles (
            id SERIAL PRIMARY KEY,
            date VARCHAR(100),
            title TEXT,
            body TEXT,
            author VARCHAR(100)
        );
    """
    try:
        # Using the cursor as a context manager closes it when the block exits.
        with connection.cursor() as cursor:
            cursor.execute(create_table_query)
        connection.commit()
        print("Table created successfully or already exists")
    except Error as e:
        print(f"The error '{e}' occurred")
        # Clear the aborted transaction so later statements are accepted.
        # Skip the rollback when the connection is already closed or broken.
        if not connection.closed:
            connection.rollback()

# Parallel result lists filled by process_page(): index i in each list
# describes the same scraped article.
date, titletext, bodytext, author = [], [], [], []

# Find all <div> elements with class="date-outer"
def process_page(soup):
    """Append every post found on one blog page to the module-level lists.

    Posts are grouped on the page inside ``div.date-outer`` elements that
    share a single date header. For each ``div.post-outer`` in a group exactly
    one entry is appended to each of ``date``, ``titletext``, ``bodytext`` and
    ``author``, so the four lists always stay the same length. A piece that
    is missing from the markup is stored as an empty string.

    Args:
        soup: Parsed page (BeautifulSoup object).
    """
    for date_div in soup.find_all('div', class_='date-outer'):
        # The date header is shared by all posts of this group.
        date_header = date_div.find('h2', class_='date-header')
        date_span = date_header.find('span') if date_header else None
        date_text = date_span.get_text(strip=True) if date_span else ''

        for post in date_div.find_all('div', class_='post-outer'):
            title_head = post.find('h3', class_='post-title entry-title')
            content_div = post.find('div', class_='post-body entry-content')
            # Search inside the post itself so each article gets its own
            # author instead of the first footer of the date group.
            footer = post.find('div', class_='post-footer')
            author_span = (
                footer.find('span', class_='post-author vcard')
                if footer else None
            )

            title_text = title_head.text.strip() if title_head else ''
            # Collapse runs of newlines so each field is stored as one line.
            body_text = (
                re.sub(r'\n+', ' ', content_div.text.strip())
                if content_div else ''
            )
            author_text = (
                re.sub(r'\n+', ' ', author_span.text.strip())
                if author_span else ''
            )

            date.append(date_text)
            titletext.append(title_text)
            bodytext.append(body_text)
            author.append(author_text)




def main():
    """Scrape blog posts and store them in PostgreSQL.

    Pages are fetched starting at ``url`` and following the "Older Posts"
    link until at least 50 titles have been collected or no older page
    exists. All collected articles are then inserted into the database.
    HTTP and database errors are printed, and the connection is always closed.
    """
    # PostgreSQL database connection settings
    db_name = 'webdemo'
    db_user = 'postgres'
    db_password = '123456'
    db_host = 'localhost'  # or your host
    db_port = '5434'  # or your port

    # Establish connection to PostgreSQL
    connection = create_connection(db_name, db_user, db_password, db_host, db_port)
    if not connection:
        # create_connection() has already printed the reason.
        return

    try:
        older_posts_pattern = re.compile(r'Older Posts', re.IGNORECASE)
        page_url = url
        while page_url:
            # Without a timeout a stalled server would hang the script forever.
            res = requests.get(page_url, timeout=30)
            # Do not parse an HTTP error page as if it were blog content.
            res.raise_for_status()
            soup = BeautifulSoup(res.content, 'html5lib')
            process_page(soup)

            # Stop once we have 50 articles
            if len(titletext) >= 50:
                break
            older_posts_link = soup.find('a', string=older_posts_pattern)
            page_url = older_posts_link.get('href') if older_posts_link else None

        create_table(connection)
        # zip() pairs the parallel lists and stops at the shortest one, so a
        # length mismatch cannot raise IndexError.
        for data in zip(date, titletext, bodytext, author):
            execute_query(connection, data)

    except requests.RequestException as e:
        print(f"Error: {e}")

    except Error as e:
        print(f"Error: {e}")

    finally:
        connection.close()
        print("PostgreSQL connection is closed")


if __name__ == "__main__":
    main()
45 changes: 26 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,29 @@ One Day workshop on understanding Docker, Web Scrapping, Regular Expressions, Po
docker-compose --version
```
##### **_docker-compose version 1.25.0, build 0a186604_**

## What will you learn by the end of this workshop?
- By the end of this workshop you will know how to build a Docker image and how to use it.
- You will learn how to scrape a website using urllib/requests and BeautifulSoup.
- You will learn Regular Expressions and how to work with them.
- You will learn key features of PostgreSQL.
- You will learn how to dockerize your project.

## Schedule
| Time | Topics
|---------------|-------
| 09:00 - 11:00 | [`Introduction to Docker`](/docs/introduction_to_docker.md)
| 11:00 - 01:00 | [`Introduction to Web Scraping`](/docs/introduction_to_webscraping.md)
| 01:00 - 02:00 | `Break`
| 02:00 - 03:00 | [`Dockerizing a project`](/docs/working_with_docker_container.md)
| 03:00 - 04:00 | [`Introduction to PostgreSQL`](/docs/introduction_to_postgresql.md)
| 04:00 - 04:30 | [`Introduction to Github`](/docs/introduction_to_git_commands.md)
| 04:30 - 04:45 | `Q & A`
| 04:45 - 05:00 | [`Wrapping Up`](/docs/workshop1_home_work.md)

### Homework
- Start the PostgreSQL server defined in docker-compose.yml, then open a psql shell inside the container:
```
docker-compose up -d
```
```
docker exec -it psql-db bash
```
```
psql -U postgres
```
- Create a database to store the scraped content (the script connects to a database named `webdemo`)

- Build the Docker image from the Dockerfile and run it using these commands:
```
docker build --no-cache --network=host ./ -t simple_python
```
```
docker run --network=host simple_python
```

- The scraped content will be stored in a table with the following columns:
- Date | Title | Content/BodyText | Author


4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
psycopg2==2.9.3
psycopg2-binary==2.9.3
bs4
urllib2
requests
html5lib==1.1
html5lib==1.1