diff --git a/Homework/Dockerfile b/Homework/Dockerfile
new file mode 100644
index 0000000..602fb81
--- /dev/null
+++ b/Homework/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.10.2-alpine3.15
+# Create directories
+RUN mkdir -p /root/workspace/src
+COPY ./python_web_scrape.py /root/workspace/src
+# Switch to project directory
+WORKDIR /root/workspace/src
+# Install required packages
+RUN pip install --upgrade pip
+RUN pip install requests bs4 html5lib psycopg2-binary
+CMD ["python_web_scrape.py"]
+ENTRYPOINT ["python"]
+
+
diff --git a/Homework/docker-compose.yml b/Homework/docker-compose.yml
new file mode 100644
index 0000000..698411c
--- /dev/null
+++ b/Homework/docker-compose.yml
@@ -0,0 +1,9 @@
+psql-db:
+  image: 'postgres:14'
+  container_name: psql-db
+  environment:
+    - PGPASSWORD=123456
+    - POSTGRES_USER=postgres
+    - POSTGRES_PASSWORD=123456
+  ports:
+    - '5434:5432'
diff --git a/Homework/python_web_scrape.py b/Homework/python_web_scrape.py
new file mode 100644
index 0000000..04c89ec
--- /dev/null
+++ b/Homework/python_web_scrape.py
@@ -0,0 +1,185 @@
+"""Scrape articles from the Python blog and store them in PostgreSQL."""
+import os
+import re
+from urllib.parse import urljoin
+
+import psycopg2
+import requests
+from bs4 import BeautifulSoup
+from psycopg2 import Error
+
+url = 'https://blog.python.org/'
+
+# Stop following "Older Posts" links once this many articles are collected.
+MAX_ARTICLES = 50
+# Seconds to wait for the blog to answer before a request is abandoned.
+REQUEST_TIMEOUT = 30
+
+# Parallel lists filled by process_page(): index i of every list describes
+# the same article.
+date = []
+titletext = []
+bodytext = []
+author = []
+
+
+def create_connection(db_name, db_user, db_password, db_host, db_port):
+    """Open a PostgreSQL connection.
+
+    Returns the psycopg2 connection, or None when connecting fails (the
+    error is printed instead of being raised).
+    """
+    connection = None
+    try:
+        connection = psycopg2.connect(
+            database=db_name,
+            user=db_user,
+            password=db_password,
+            host=db_host,
+            port=db_port,
+        )
+        print("Connection to PostgreSQL DB successful")
+    except Error as e:
+        print(f"The error '{e}' occurred")
+
+    return connection
+
+
+def execute_query(connection, data):
+    """Insert one article row and commit it.
+
+    data is a (date, title, body, author) tuple.  On a database error the
+    transaction is rolled back, so later statements on the same connection
+    are not rejected because of the aborted transaction.
+    """
+    query = """
+        INSERT INTO python_blog_articles (date, title, body, author)
+        VALUES (%s, %s, %s, %s)
+    """
+    try:
+        with connection.cursor() as cursor:
+            cursor.execute(query, data)
+        connection.commit()
+        print("Query executed successfully")
+    except Error as e:
+        connection.rollback()
+        print(f"The error '{e}' occurred")
+
+
+def create_table(connection):
+    """Create the python_blog_articles table unless it already exists."""
+    create_table_query = """
+        CREATE TABLE IF NOT EXISTS python_blog_articles (
+            id SERIAL PRIMARY KEY,
+            date VARCHAR(100),
+            title TEXT,
+            body TEXT,
+            author VARCHAR(100)
+        );
+    """
+    try:
+        with connection.cursor() as cursor:
+            cursor.execute(create_table_query)
+        connection.commit()
+        print("Table created successfully or already exists")
+    except Error as e:
+        connection.rollback()
+        print(f"The error '{e}' occurred")
+
+
+def _clean_text(tag):
+    """Return the stripped text of tag with newline runs collapsed to a space.
+
+    Returns '' when tag is None, so a missing element still yields a value.
+    """
+    if tag is None:
+        return ''
+    return re.sub(r'\n+', ' ', tag.get_text().strip())
+
+
+def process_page(soup):
+    """Append every article found in soup to the module-level lists.
+
+    Each post contributes exactly one entry to date, titletext, bodytext
+    and author (an empty string when a piece is missing), which keeps the
+    four lists aligned.
+    """
+    # Every div with class "date-outer" groups the posts sharing one date.
+    for date_block in soup.find_all('div', class_='date-outer'):
+        date_header = date_block.find('h2', class_='date-header')
+        date_span = date_header.find('span') if date_header else None
+        date_text = date_span.get_text(strip=True) if date_span else ''
+
+        for post in date_block.find_all('div', class_='post-outer'):
+            title_tag = post.find('h3', class_='post-title entry-title')
+            body_tag = post.find('div', class_='post-body entry-content')
+            # Look the author up inside this post's own footer, not the
+            # first footer of the whole date group.
+            footer = post.find('div', class_='post-footer')
+            author_tag = footer.find('span', class_='post-author vcard') if footer else None
+
+            date.append(date_text)
+            titletext.append(title_tag.get_text().strip() if title_tag else '')
+            bodytext.append(_clean_text(body_tag))
+            author.append(_clean_text(author_tag))
+
+
+def _fetch_soup(page_url):
+    """Download page_url and parse it with html5lib.
+
+    Raises requests.RequestException on network errors, timeouts and HTTP
+    error statuses.
+    """
+    res = requests.get(page_url, timeout=REQUEST_TIMEOUT)
+    res.raise_for_status()
+    return BeautifulSoup(res.content, 'html5lib')
+
+
+def main():
+    """Scrape the blog and store the collected articles in PostgreSQL."""
+    # Connection settings: the defaults target the psql-db service from
+    # docker-compose.yml and can be overridden with environment variables.
+    db_name = os.environ.get('DB_NAME', 'webdemo')
+    db_user = os.environ.get('DB_USER', 'postgres')
+    db_password = os.environ.get('DB_PASSWORD', '123456')
+    db_host = os.environ.get('DB_HOST', 'localhost')
+    db_port = os.environ.get('DB_PORT', '5434')
+
+    connection = create_connection(db_name, db_user, db_password, db_host, db_port)
+    if not connection:
+        return
+
+    try:
+        page_url = url
+        soup = _fetch_soup(page_url)
+        process_page(soup)
+
+        # Follow the "Older Posts" links until enough articles are collected.
+        while len(titletext) < MAX_ARTICLES:
+            older_posts_link = soup.find('a', string=re.compile(r'Older Posts', re.IGNORECASE))
+            if not older_posts_link or not older_posts_link.get('href'):
+                break
+            # The href may be relative, so resolve it against the current page.
+            page_url = urljoin(page_url, older_posts_link['href'])
+            collected = len(titletext)
+            soup = _fetch_soup(page_url)
+            process_page(soup)
+            if len(titletext) == collected:
+                # No new articles: stop instead of paging on without progress.
+                break
+
+        create_table(connection)
+        for data in zip(date, titletext, bodytext, author):
+            execute_query(connection, data)
+    except requests.RequestException as e:
+        # psycopg2's Error does not cover download failures.
+        print(f"Error: {e}")
+    except Error as e:
+        print(f"Error: {e}")
+    finally:
+        connection.close()
+        print("PostgreSQL connection is closed")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/README.md b/README.md
index 9dd0a0e..bd0b751 100644
--- a/README.md
+++ b/README.md
@@ -46,22 +46,29 @@ One Day workshop on understanding Docker, Web Scrapping, Regular Expressions, Po
  docker-compose --version
  ```
 ##### **_docker-compose version 1.25.0, build 0a186604_**
-
-## What will you learn by the end of this workshop?
-- By the end of this workshop you will learn how to build docker image and it's usage.
-- You will learn how to scrape a website using urllib/requests and Beautifulsoup.
-- You will learn Regular Expressions and how to work with it.
-- You will learn key features of PostgreSQL.
-- You will learn how to dockerize your project.
-
-## Schedule
-| Time | Topics
-|---------------|-------
-| 09:00 - 11:00 | [`Introduction to Docker`](/docs/introduction_to_docker.md)
-| 11:00 - 01:00 | [`Introduction to Webscrapping.`](/docs/introduction_to_webscraping.md)
-| 01:00 - 02:00 | `Break`
-| 02:00 - 03:00 | [`Dockerizing a project`](/docs/working_with_docker_container.md)
-| 03:00 - 04:00 | [`Introduction to PostgreSQL`](/docs/introduction_to_postgresql.md)
-| 04:00 - 04:30 | [`Introduction to Github`](/docs/introduction_to_git_commands.md)
-| 04:30 - 04:45 | `Q & A`
-| 04:45 - 05:00 | [`Wrapping Up`](/docs/workshop1_home_work.md)
+
+ ### Homework
+ - Run docker-compose.yml with postgresql commands to start server
+ ```
+ docker-compose up -d
+ ```
+ ```
+ docker exec -it psql-db bash
+ ```
+ ```
+ psql -U postgres
+ ```
+ - create a database to store the scraped content
+
+ - Run Dockerfile using commands
+ ```
+ docker build --no-cache --network=host ./ -t simple_python
+ ```
+ ```
+ docker run --network=host simple_python
+ ```
+
+ - The scraped content will be stored in a table format
+ - Date | Title | Content/BodyText | Author
+
+
diff --git a/requirements.txt b/requirements.txt
index 34c449f..6e1cbed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-psycopg2==2.9.3
+psycopg2-binary==2.9.3
 bs4
 urllib2
 requests
-html5lib==1.1
\ No newline at end of file
+html5lib==1.1