diff --git a/Homework/Dockerfile b/Homework/Dockerfile
new file mode 100644
index 0000000..7b1479e
--- /dev/null
+++ b/Homework/Dockerfile
@@ -0,0 +1,12 @@
+FROM python:3.10.2-alpine3.15
+# Create directories
+RUN mkdir -p /root/workspace/src
+COPY ./python_web_scrape.py /root/workspace/src
+# Switch to project directory
+WORKDIR /root/workspace/src
+# Install required packages
+RUN pip install --upgrade pip
+RUN pip install requests bs4 html5lib
+RUN pip install psycopg2-binary
+CMD ["python_web_scrape.py"]
+ENTRYPOINT ["python"]
\ No newline at end of file
diff --git a/Homework/docker-compose.yml b/Homework/docker-compose.yml
new file mode 100644
index 0000000..0876790
--- /dev/null
+++ b/Homework/docker-compose.yml
@@ -0,0 +1,10 @@
+psql-db:
+  image: 'postgres:14'
+  container_name: psql-db
+  environment:
+    - PGPASSWORD=123456
+    - POSTGRES_USER=postgres
+    - POSTGRES_PASSWORD=123456
+    - POSTGRES_DB=webdemo
+  ports:
+    - '5434:5432'
\ No newline at end of file
diff --git a/Homework/python_web_scrape.py b/Homework/python_web_scrape.py
new file mode 100644
index 0000000..5316336
--- /dev/null
+++ b/Homework/python_web_scrape.py
@@ -0,0 +1,159 @@
+"""Scrape articles from the Python blog and store them in PostgreSQL."""
+import os
+import requests
+from bs4 import BeautifulSoup
+import re
+import psycopg2
+from psycopg2 import Error
+
+url = 'https://blog.python.org/'
+# Seconds to wait for an HTTP response before giving up.
+REQUEST_TIMEOUT = 30
+
+def create_connection(db_name, db_user, db_password, db_host, db_port):
+    """Open a connection to a PostgreSQL database.
+
+    Returns the psycopg2 connection, or None when connecting fails (the
+    error is printed instead of raised).
+    """
+    try:
+        connection = psycopg2.connect(
+            database=db_name,
+            user=db_user,
+            password=db_password,
+            host=db_host,
+            port=db_port
+        )
+        print("Connection to PostgreSQL DB successful")
+        return connection
+    except Error as e:
+        print(f"The error '{e}' occurred")
+        return None
+
+def execute_query(connection, data):
+    """Insert one article row and commit it.
+
+    data is a (date, title, body, author) tuple. Database errors are
+    printed instead of raised.
+    """
+    query = """
+    INSERT INTO python_blog_articles (date, title, body, author)
+    VALUES (%s, %s, %s, %s)
+    """
+    try:
+        # The with-block closes the cursor even when execute() fails.
+        with connection.cursor() as cursor:
+            cursor.execute(query, data)
+        connection.commit()
+        print("Query executed successfully")
+    except Error as e:
+        # Roll back so the aborted transaction does not block later inserts.
+        connection.rollback()
+        print(f"The error '{e}' occurred")
+
+def create_table(connection):
+    """Create the python_blog_articles table if it does not exist yet.
+
+    Database errors are printed instead of raised.
+    """
+    create_table_query = """
+    CREATE TABLE IF NOT EXISTS python_blog_articles (
+        id SERIAL PRIMARY KEY,
+        date VARCHAR(100),
+        title TEXT,
+        body TEXT,
+        author VARCHAR(100)
+    );
+    """
+    try:
+        # The with-block closes the cursor even when execute() fails.
+        with connection.cursor() as cursor:
+            cursor.execute(create_table_query)
+        connection.commit()
+        print("Table created successfully or already exists")
+    except Error as e:
+        # Roll back so the aborted transaction does not block later queries.
+        connection.rollback()
+        print(f"The error '{e}' occurred")
+
+def process_page(soup, date, titletext, bodytext, author):
+    """Append every post found in soup to the four result lists.
+
+    Each post adds exactly one entry to each of date, titletext, bodytext
+    and author, so index i refers to the same post in all four lists. A
+    piece that is missing from the page is stored as an empty string.
+    """
+    for div in soup.find_all('div', class_='date-outer'):
+        date_header = div.find('h2', class_='date-header')
+        date_span = date_header.find('span') if date_header is not None else None
+        # A date-outer block may hold several posts; each one gets its date.
+        date_text = date_span.get_text(strip=True) if date_span is not None else ''
+
+        for post in div.find_all('div', class_='post-outer'):
+            title_head = post.find('h3', class_='post-title entry-title')
+            content_div = post.find('div', class_='post-body entry-content')
+            footer_head = post.find('div', class_='post-footer')
+            author_span = None
+            if footer_head is not None:
+                author_span = footer_head.find('span', class_='post-author vcard')
+
+            paragraph_text = ''
+            if content_div is not None:
+                paragraph_text = ' '.join(p.text.strip() for p in content_div.find_all('p'))
+
+            date.append(date_text)
+            titletext.append(title_head.text.strip() if title_head is not None else '')
+            bodytext.append(paragraph_text)
+            author.append(author_span.text.strip() if author_span is not None else '')
+
+def main():
+    """Scrape blog posts (aiming for at least 50) and store them in the DB."""
+    # Defaults match docker-compose.yml; environment variables override them
+    # so the script can also reach a database that is not on localhost.
+    db_name = os.environ.get('DB_NAME', 'webdemo')
+    db_user = os.environ.get('DB_USER', 'postgres')
+    db_password = os.environ.get('DB_PASSWORD', '123456')
+    db_host = os.environ.get('DB_HOST', 'localhost')
+    db_port = os.environ.get('DB_PORT', '5434')
+
+    connection = create_connection(db_name, db_user, db_password, db_host, db_port)
+    if connection is None:
+        return
+
+    try:
+        date = []
+        titletext = []
+        bodytext = []
+        author = []
+
+        # Follow the "Older Posts" links until enough posts are collected,
+        # the links run out, or a link leads back to a page already seen.
+        visited = set()
+        next_page_url = url
+        while next_page_url and len(titletext) < 50:
+            if next_page_url in visited:
+                break
+            visited.add(next_page_url)
+            res = requests.get(next_page_url, timeout=REQUEST_TIMEOUT)
+            res.raise_for_status()
+            soup = BeautifulSoup(res.content, 'html5lib')
+            process_page(soup, date, titletext, bodytext, author)
+
+            older_posts_link = soup.find('a', string=re.compile(r'Older Posts', re.IGNORECASE))
+            if older_posts_link is None:
+                break
+            next_page_url = older_posts_link.get('href')
+
+        create_table(connection)
+        for data in zip(date, titletext, bodytext, author):
+            execute_query(connection, data)
+
+    except (Error, requests.RequestException) as e:
+        print(f"Error: {e}")
+
+    finally:
+        connection.close()
+        print("PostgreSQL connection is closed")
+
+if __name__ == "__main__":
+    main()
diff --git a/Homework/requirements.txt b/Homework/requirements.txt
new file mode 100644
index 0000000..02a89c4
--- /dev/null
+++ b/Homework/requirements.txt
@@ -0,0 +1,4 @@
+psycopg2-binary==2.9.3
+beautifulsoup4==4.11.1
+requests==2.27.1
+html5lib==1.1