From 0eb4903965de7da55fdc0192ecec3a4430eb05c1 Mon Sep 17 00:00:00 2001 From: Preethesh Lewis Date: Mon, 24 Jun 2024 22:33:51 +0530 Subject: [PATCH 1/7] Code for scraping Python blog page --- SJEC_CS116_workshop1 | 1 + 1 file changed, 1 insertion(+) create mode 160000 SJEC_CS116_workshop1 diff --git a/SJEC_CS116_workshop1 b/SJEC_CS116_workshop1 new file mode 160000 index 0000000..e31bc4b --- /dev/null +++ b/SJEC_CS116_workshop1 @@ -0,0 +1 @@ +Subproject commit e31bc4b8e989d5c6ba18e1e6ae69fa6bacb390e8 From c0bc90af7a4503ef389ebbb8d20a6214b9e393bf Mon Sep 17 00:00:00 2001 From: Preethesh Lewis Date: Tue, 2 Jul 2024 23:11:29 +0530 Subject: [PATCH 2/7] Python scrape --- SJEC_CS116_workshop1 | 1 - docs/Dockerfile | 13 ++++ docs/docker-compose.yml | 9 +++ docs/python_web_scrape.py | 142 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 164 insertions(+), 1 deletion(-) delete mode 160000 SJEC_CS116_workshop1 create mode 100644 docs/Dockerfile create mode 100644 docs/docker-compose.yml create mode 100644 docs/python_web_scrape.py diff --git a/SJEC_CS116_workshop1 b/SJEC_CS116_workshop1 deleted file mode 160000 index e31bc4b..0000000 --- a/SJEC_CS116_workshop1 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e31bc4b8e989d5c6ba18e1e6ae69fa6bacb390e8 diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 0000000..602fb81 --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.10.2-alpine3.15 +# Create directories +RUN mkdir -p /root/workspace/src +COPY ./python_web_scrape.py /root/workspace/src +# Switch to project directory +WORKDIR /root/workspace/src +# Install required packages +RUN pip install --upgrade pip +RUN pip install requests bs4 html5lib psycopg2-binary +CMD ["python_web_scrape.py"] +ENTRYPOINT ["python"] + + diff --git a/docs/docker-compose.yml b/docs/docker-compose.yml new file mode 100644 index 0000000..698411c --- /dev/null +++ b/docs/docker-compose.yml @@ -0,0 +1,9 @@ +psql-db: + image: 'postgres:14' + container_name: psql-db + environment: + - PGPASSWORD=123456 + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=123456 + ports: + - '5434:5432' diff --git a/docs/python_web_scrape.py b/docs/python_web_scrape.py new file mode 100644 index 0000000..04c89ec --- /dev/null +++ b/docs/python_web_scrape.py @@ -0,0 +1,142 @@ +import requests +from bs4 import BeautifulSoup +import re +import psycopg2 +from psycopg2 import Error + +url = 'https://blog.python.org/' + +# If this line causes an error, run 'pip install html5lib' or install html5lib + +def create_connection(db_name, db_user, db_password, db_host, db_port): + connection = None + try: + connection = psycopg2.connect( + database=db_name, + user=db_user, + password=db_password, + host=db_host, + port=db_port + ) + print("Connection to PostgreSQL DB successful") + except Error as e: + print(f"The error '{e}' occurred") + + return connection + +# Function to execute insert queries +def execute_query(connection,data): + cursor = connection.cursor() + try: + query = """ + INSERT INTO python_blog_articles (date, title, body, author) + VALUES (%s, %s, %s, %s) + """ + cursor.execute(query, data) + connection.commit() + print("Query executed successfully") + except Error as e: + print(f"The error '{e}' occurred") + +def create_table(connection): + try: + cursor = connection.cursor() + # SQL statement to create table if not exists + create_table_query = """ + CREATE TABLE IF NOT EXISTS python_blog_articles ( + id SERIAL PRIMARY KEY, + date VARCHAR(100), + title TEXT, + body TEXT, + author VARCHAR(100) + ); + """ + # Execute the SQL query + cursor.execute(create_table_query) + connection.commit() + print("Table created successfully or already exists") + except Error as e: + print(f"The error '{e}' occurred") + +date=[] +titletext=[] +bodytext=[] +author=[] + +# Find all
elements with class="date-outer" +def process_page(soup): + for div in soup.find_all('div', class_='date-outer'): + hd = div.find_all('div', 'post-outer') + for i in hd: + date_header = div.find('h2', class_='date-header') + if date_header: + date_text = date_header.find('span') + dt = date_text.get_text(strip=True) + date.append(dt) + tdiv = div.find('div', class_='date-posts') + for div1 in tdiv.find_all('div', class_='post-outer'): + title_head = div1.find('h3', class_='post-title entry-title') + if title_head: + title_text = title_head.text.strip() + titletext.append(title_text) + content_div = div1.find('div', class_='post-body entry-content') + if content_div: + for p_tag in content_div.find_all('p'): + paragraph_text = content_div.text.strip() + cleaned_content = re.sub(r'\n+', ' ', paragraph_text) + bodytext.append(cleaned_content) + foot = div.find_all('div', class_='post-outer') + for i in foot: + footer_head = div.find('div', class_='post-footer') + footer_text = footer_head.find('span', class_='post-author vcard').text.strip() + cleanedf_content = re.sub(r'\n+', ' ', footer_text) + author.append(cleanedf_content) + + + + +def main(): + # PostgreSQL database connection settings + db_name = 'webdemo' + db_user = 'postgres' + db_password = '123456' + db_host = 'localhost' # or your host + db_port = '5434' # or your port + + # Establish connection to PostgreSQL + connection = create_connection(db_name, db_user, db_password, db_host, db_port) + + if connection: + try: + res = requests.get(url) + soup = BeautifulSoup(res.content, 'html5lib') + process_page(soup + ) + + # Scraping subsequent pages until we have 50 articles + while len(titletext) < 50: + older_posts_link = soup.find('a', string=re.compile(r'Older Posts', re.IGNORECASE)) + if older_posts_link: + next_page_url = older_posts_link['href'] + res = requests.get(next_page_url) + soup = BeautifulSoup(res.content, 'html5lib') + process_page(soup) + else: + break + + create_table(connection) + for i in range(len(titletext)): + data = (date[i], titletext[i], bodytext[i], author[i]) + execute_query(connection,data) + + + except Error as e: + print(f"Error: {e}") + + finally: + if connection: + connection.close() + print("PostgreSQL connection is closed") + +if __name__ == "__main__": + main() \ No newline at end of file From fe1d412f2571c014457165a6e3acc4c8586f837a Mon Sep 17 00:00:00 2001 From: Preethesh Lewis Date: Tue, 2 Jul 2024 23:13:17 +0530 Subject: [PATCH 3/7] Python scrape --- {docs => Homework}/Dockerfile | 0 {docs => Homework}/docker-compose.yml | 0 {docs => Homework}/python_web_scrape.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {docs => Homework}/Dockerfile (100%) rename {docs => Homework}/docker-compose.yml (100%) rename {docs => Homework}/python_web_scrape.py (100%) diff --git a/docs/Dockerfile b/Homework/Dockerfile similarity index 100% rename from docs/Dockerfile rename to Homework/Dockerfile diff --git a/docs/docker-compose.yml b/Homework/docker-compose.yml similarity index 100% rename from docs/docker-compose.yml rename to Homework/docker-compose.yml diff --git a/docs/python_web_scrape.py b/Homework/python_web_scrape.py similarity index 100% rename from docs/python_web_scrape.py rename to Homework/python_web_scrape.py From 12eb12d46913393a4e10cf8a3ea06464f6f3c5bf Mon Sep 17 00:00:00 2001 From: Preethesh Lewis Date: Tue, 2 Jul 2024 23:22:48 +0530 Subject: [PATCH 4/7] Update README.md --- README.md | 35 ++++++++++++++++------------------- requirements.txt | 4 ++-- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 9dd0a0e..b9383ca 100644 --- a/README.md +++ b/README.md @@ -46,22 +46,19 @@ One Day workshop on understanding Docker, Web Scrapping, Regular Expressions, Po docker-compose --version ``` ##### **_docker-compose version 1.25.0, build 0a186604_** - -## What will you learn by the end of this workshop? -- By the end of this workshop you will learn how to build docker image and it's usage. -- You will learn how to scrape a website using urllib/requests and Beautifulsoup. -- You will learn Regular Expressions and how to work with it. -- You will learn key features of PostgreSQL. -- You will learn how to dockerize your project. - -## Schedule -| Time | Topics -|---------------|------- -| 09:00 - 11:00 | [`Introduction to Docker`](/docs/introduction_to_docker.md) -| 11:00 - 01:00 | [`Introduction to Webscrapping.`](/docs/introduction_to_webscraping.md) -| 01:00 - 02:00 | `Break` -| 02:00 - 03:00 | [`Dockerizing a project`](/docs/working_with_docker_container.md) -| 03:00 - 04:00 | [`Introduction to PostgreSQL`](/docs/introduction_to_postgresql.md) -| 04:00 - 04:30 | [`Introduction to Github`](/docs/introduction_to_git_commands.md) -| 04:30 - 04:45 | `Q & A` -| 04:45 - 05:00 | [`Wrapping Up`](/docs/workshop1_home_work.md) + +###Homework + - Run docker-compose.yml with posgresql commands to run server + - docker-compose up -d + - docker exec -it psql-db bash + - psql -U postgres + - create a database to store the scraped content + + - Run Dockerfile using commands + - docker build --no-cache --network=host ./ -t simple_python + - docker run --network=host simple_python + + - The scraped content will be stored in a table format + Date | Title | Content/BodyText | Author + + diff --git a/requirements.txt b/requirements.txt index 34c449f..6e1cbed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -psycopg2==2.9.3 +psycopg2-binary==2.9.3 bs4 urllib2 requests -html5lib==1.1 \ No newline at end of file +html5lib==1.1 From 86920746af16e554155159d6ad1666e676fdf657 Mon Sep 17 00:00:00 2001 From: Preethesh Lewis Date: Tue, 2 Jul 2024 23:29:01 +0530 Subject: [PATCH 5/7] Update README.md --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b9383ca..51b03db 100644 --- a/README.md +++ b/README.md @@ -47,18 +47,19 @@ One Day workshop on understanding Docker, Web Scrapping, Regular Expressions, Po ``` ##### **_docker-compose version 1.25.0, build 0a186604_** -###Homework + ### Homework - Run docker-compose.yml with posgresql commands to run server - docker-compose up -d - docker exec -it psql-db bash - psql -U postgres - create a database to store the scraped content - - Run Dockerfile using commands - - docker build --no-cache --network=host ./ -t simple_python + - Run Dockerfile using commands + + - docker build --no-cache --network=host ./ -t simple_python - docker run --network=host simple_python - The scraped content will be stored in a table format - Date | Title | Content/BodyText | Author + - Date | Title | Content/BodyText | Author From 8afd862f6d3e87d3d592320d2dbf79d64f48b789 Mon Sep 17 00:00:00 2001 From: Preethesh Lewis Date: Tue, 2 Jul 2024 23:29:54 +0530 Subject: [PATCH 6/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 51b03db..4efd094 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ One Day workshop on understanding Docker, Web Scrapping, Regular Expressions, Po ##### **_docker-compose version 1.25.0, build 0a186604_** ### Homework - - Run docker-compose.yml with posgresql commands to run server + - Run docker-compose.yml with posgresql commands to start server - docker-compose up -d - docker exec -it psql-db bash - psql -U postgres From 0d55982ea2fc527cc00963359602c669cc925618 Mon Sep 17 00:00:00 2001 From: Preethesh Lewis Date: Tue, 2 Jul 2024 23:32:46 +0530 Subject: [PATCH 7/7] Update README.md --- README.md | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4efd094..bd0b751 100644 --- a/README.md +++ b/README.md @@ -49,15 +49,24 @@ One Day workshop on understanding Docker, Web Scrapping, Regular Expressions, Po ### Homework - Run docker-compose.yml with posgresql commands to start server - - docker-compose up -d - - docker exec -it psql-db bash - - psql -U postgres + ``` + docker-compose up -d + ``` + ``` + docker exec -it psql-db bash + ``` + ``` + psql -U postgres + ``` - create a database to store the scraped content - Run Dockerfile using commands - - - docker build --no-cache --network=host ./ -t simple_python - - docker run --network=host simple_python + ``` + docker build --no-cache --network=host ./ -t simple_python + ``` + ``` + docker run --network=host simple_python + ``` - The scraped content will be stored in a table format - Date | Title | Content/BodyText | Author