From e06e8fcf373c6ed21fa9bc1a99ca6784097b63c2 Mon Sep 17 00:00:00 2001
From: Raghugowd <84554619+Raghugowd@users.noreply.github.com>
Date: Tue, 25 Apr 2023 01:42:58 +0530
Subject: [PATCH 1/2] Homework

Write a Python program to Scrape the pages from Python Blogs and save the data into the DB. and dockerize.
---
 Dockerfile         | 18 +++++-------------
 docker-compose.yml | 11 +++++++++++
 webScraping.py     | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 13 deletions(-)
 create mode 100644 docker-compose.yml
 create mode 100644 webScraping.py

diff --git a/Dockerfile b/Dockerfile
index f07ae2b..84f0e65 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,17 +1,9 @@
 FROM python:3.10.2-alpine3.15
-COPY . .
-# Install Postgres
-RUN apk update
-RUN apk add postgresql
-RUN chown postgres:postgres /run/postgresql/
-# Install requirements
-COPY ./requirements.txt /tmp
-RUN pip install -r /tmp/requirements.txt
-# For psycopg2
-RUN apk add --virtual postgresql-deps libpq-dev
 # Create directories
 RUN mkdir -p /root/workspace/src
-# Mount your local file
-COPY ./web_scraping_sample.py /root/workspace/src
+COPY ./webScraping.py /root/workspace/src
 # Switch to project directory
-WORKDIR /root/workspace/src
\ No newline at end of file
+WORKDIR /root/workspace/src
+# Install required packages
+RUN pip install --upgrade pip
+RUN pip install requests bs4 html5lib psycopg2-binary
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..e3bff76
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,11 @@
+
+psql-db:
+  image: 'postgres:14'
+  container_name: psql-db
+  environment:
+    - PGPASSWORD=123456
+    - POSTGRES_USER=postgres
+    - POSTGRES_PASSWORD=123456
+    - POSTGRES_DB=pybd
+  ports:
+    - '5434:5432'
diff --git a/webScraping.py b/webScraping.py
new file mode 100644
index 0000000..160f7b9
--- /dev/null
+++ b/webScraping.py
@@ -0,0 +1,46 @@
+"""Scrape post titles and authors from blog.python.org into PostgreSQL."""
+import requests
+from bs4 import BeautifulSoup
+import re
+import psycopg2
+
+# Upper bound on stored rows; fewer are written if the page has fewer posts.
+MAX_POSTS = 4
+
+conn = psycopg2.connect(
+    host="172.17.0.2",
+    port="5432",
+    database="pybd",
+    user="postgres",
+    password="123456",
+)
+print("Connection Successful")
+try:
+    # A timeout keeps the script from hanging forever on a stalled request.
+    res = requests.get('https://blog.python.org/', timeout=30)
+    res.raise_for_status()
+    soup = BeautifulSoup(res.content, 'html5lib')
+    titles = [
+        heading.find('a').getText().strip()
+        for heading in soup.find_all('h3', class_='entry-title')
+    ]
+    authors = [
+        tag.getText().strip() for tag in soup.find_all('span', class_='fn')
+    ]
+    # zip() stops at the shorter list, so a short page cannot raise IndexError.
+    posts = list(zip(titles, authors))[:MAX_POSTS]
+    with conn.cursor() as cur:
+        # Create the table once, before the inserts, and only if it is missing.
+        cur.execute(
+            "CREATE TABLE IF NOT EXISTS py1("
+            "no INT, title VARCHAR(100), author VARCHAR(100))"
+        )
+        for no, (title, author) in enumerate(posts, start=1):
+            cur.execute(
+                "INSERT INTO py1(no,title,author) VALUES(%s,%s,%s)",
+                (no, title, author),
+            )
+    conn.commit()
+finally:
+    # Runs on failure too; closing without commit discards partial inserts.
+    conn.close()

From cdd6dfb15ae583f0cbf816506f66f8421faa70c1 Mon Sep 17 00:00:00 2001
From: Raghugowd <84554619+Raghugowd@users.noreply.github.com>
Date: Tue, 25 Apr 2023 09:55:55 +0530
Subject: [PATCH 2/2] homework

---
 HOMEWORK/Dockerfile         |  9 +++++++++
 HOMEWORK/docker-compose.yml | 11 +++++++++++
 HOMEWORK/webScraping.py     | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 HOMEWORK/Dockerfile
 create mode 100644 HOMEWORK/docker-compose.yml
 create mode 100644 HOMEWORK/webScraping.py

diff --git a/HOMEWORK/Dockerfile b/HOMEWORK/Dockerfile
new file mode 100644
index 0000000..84f0e65
--- /dev/null
+++ b/HOMEWORK/Dockerfile
@@ -0,0 +1,9 @@
+FROM python:3.10.2-alpine3.15
+# Create directories
+RUN mkdir -p /root/workspace/src
+COPY ./webScraping.py /root/workspace/src
+# Switch to project directory
+WORKDIR /root/workspace/src
+# Install required packages
+RUN pip install --upgrade pip
+RUN pip install requests bs4 html5lib psycopg2-binary
diff --git a/HOMEWORK/docker-compose.yml b/HOMEWORK/docker-compose.yml
new file mode 100644
index 0000000..e3bff76
--- /dev/null
+++ b/HOMEWORK/docker-compose.yml
@@ -0,0 +1,11 @@
+
+psql-db:
+  image: 'postgres:14'
+  container_name: psql-db
+  environment:
+    - PGPASSWORD=123456
+    - POSTGRES_USER=postgres
+    - POSTGRES_PASSWORD=123456
+    - POSTGRES_DB=pybd
+  ports:
+    - '5434:5432'
diff --git a/HOMEWORK/webScraping.py b/HOMEWORK/webScraping.py
new file mode 100644
index 0000000..670e58a
--- /dev/null
+++ b/HOMEWORK/webScraping.py
@@ -0,0 +1,46 @@
+"""Scrape post titles and authors from blog.python.org into PostgreSQL."""
+import requests
+from bs4 import BeautifulSoup
+import re
+import psycopg2
+
+# Upper bound on stored rows; fewer are written if the page has fewer posts.
+MAX_POSTS = 4
+
+conn = psycopg2.connect(
+    host="172.17.0.2",
+    port="5432",
+    database="pybd",
+    user="postgres",
+    password="123456",
+)
+print("Connection Successful")
+try:
+    # A timeout keeps the script from hanging forever on a stalled request.
+    res = requests.get('https://blog.python.org/', timeout=30)
+    res.raise_for_status()
+    soup = BeautifulSoup(res.content, 'html5lib')
+    titles = [
+        heading.find('a').getText().strip()
+        for heading in soup.find_all('h3', class_='entry-title')
+    ]
+    authors = [
+        tag.getText().strip() for tag in soup.find_all('span', class_='fn')
+    ]
+    # zip() stops at the shorter list, so a short page cannot raise IndexError.
+    posts = list(zip(titles, authors))[:MAX_POSTS]
+    with conn.cursor() as cur:
+        # Create the table once, before the inserts, and only if it is missing.
+        cur.execute(
+            "CREATE TABLE IF NOT EXISTS py1("
+            "no INT, title VARCHAR(100), author VARCHAR(100))"
+        )
+        for no, (title, author) in enumerate(posts, start=1):
+            cur.execute(
+                "INSERT INTO py1(no,title,author) VALUES(%s,%s,%s)",
+                (no, title, author),
+            )
+    conn.commit()
+finally:
+    # Runs on failure too; closing without commit discards partial inserts.
+    conn.close()