"""Store scraped blog metadata in the ``blog_data`` PostgreSQL table."""
import os

CREATE_TABLE_QUERY = """
    CREATE TABLE IF NOT EXISTS blog_data (
        id SERIAL PRIMARY KEY,
        date TEXT,
        blog_title TEXT,
        blog_link TEXT,
        version_links TEXT,
        authors TEXT,
        posted_by TEXT
    );
"""

INSERT_QUERY = """
    INSERT INTO blog_data (date, blog_title, blog_link, version_links, authors, posted_by)
    VALUES (%s, %s, %s, %s, %s, %s)
"""


def _connection_params():
    """Return the keyword arguments passed to ``psycopg2.connect``.

    Every value can be overridden with the matching ``PG*`` environment
    variable; the fallbacks are the values this module has always used.
    """
    return {
        'dbname': os.environ.get('PGDATABASE', 'postgres'),
        'user': os.environ.get('PGUSER', 'postgres'),
        'password': os.environ.get('PGPASSWORD', '123456'),
        'host': os.environ.get('PGHOST', 'psql-db'),
        'port': os.environ.get('PGPORT', '5432'),
    }


def insert_data_into_postgres(date_list, pythonVersionsBlogs, pythonVersionsBlogLinks,
                              version_href_list, author_list, posted_by_list):
    """Create ``blog_data`` if needed and insert one row per entry of ``date_list``.

    The six arguments are parallel lists: element ``i`` of each list becomes
    one column of row ``i``.  Lists longer than ``date_list`` are truncated to
    its length.  If any list is shorter than ``date_list`` nothing is written
    and an error is printed, because the rows could not be completed.

    Database errors are reported on stdout rather than raised.  The
    transaction is rolled back on failure and the connection is always
    closed.

    Returns:
        None
    """
    columns = (
        date_list,
        pythonVersionsBlogs,
        pythonVersionsBlogLinks,
        version_href_list,
        author_list,
        posted_by_list,
    )
    expected = len(date_list)
    lengths = [len(column) for column in columns]
    if any(length < expected for length in lengths):
        # Fail before touching the database instead of hitting an IndexError
        # halfway through the inserts.
        print(
            f"An error occurred: every list needs at least {expected} items, "
            f"got lengths {lengths}"
        )
        return

    # date_list is now guaranteed to be the shortest column, so zip() yields
    # exactly len(date_list) rows.
    rows = list(zip(*columns))

    # Imported at call time so importing this module does not require the
    # database driver to be installed.
    import psycopg2

    conn = None
    try:
        conn = psycopg2.connect(**_connection_params())
        # "with conn" commits on success and rolls back on error, but it does
        # not close the connection -- hence the finally clause below.
        with conn:
            with conn.cursor() as cur:
                cur.execute(CREATE_TABLE_QUERY)
                cur.executemany(INSERT_QUERY, rows)
    except psycopg2.Error as e:
        print(f"An error occurred: {e}")
    finally:
        if conn is not None:
            conn.close()


def main():
    """Scrape the blog and load the extracted columns into PostgreSQL."""
    # Imported here so that importing this module does not pull in the
    # scraper and its dependencies.
    from postedBy import your_function_that_returns_data

    (
        date_list,
        pythonVersionsBlogs,
        pythonVersionsBlogLinks,
        version_href_list,
        author_list,
        posted_by_list,
    ) = your_function_that_returns_data()

    insert_data_into_postgres(
        date_list,
        pythonVersionsBlogs,
        pythonVersionsBlogLinks,
        version_href_list,
        author_list,
        posted_by_list,
    )


if __name__ == "__main__":
    main()
# postedBy.py
"""Scrape post metadata (date, title, links, authors, poster) from blog.python.org."""

BASE_URL = 'https://blog.python.org/'
RELEASE_LINK_PREFIX = 'https://www.python.org/downloads/release/python'
RELEASE_TEAM_MARKER = 'release team,'

# Placeholders keep every column at one entry per post so that the lists
# returned by your_function_that_returns_data() stay aligned by index.
NO_DATE = "No date"
NO_TITLE = "No title"
NO_LINK = "No link found"
NO_VERSION_LINKS = "No links found"
NO_AUTHORS = "No authors found"
NO_POSTER = "No poster found"


def get_content(url, timeout=30):
    """Download ``url`` and return it parsed with the ``html.parser`` backend.

    ``timeout`` (seconds) is passed to ``requests.get`` so that a stalled
    server cannot block the scraper forever.
    """
    # Imported at call time so the text helpers in this module can be
    # imported without the scraping dependencies being installed.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')


def extract_posted_by(soup, placeholder=None):
    """Return the poster names found in the first footer line of each post.

    One ``div`` with class ``post-footer-line post-footer-line-1`` is examined
    per entry.  When it has no ``span.fn`` the footer is skipped, unless
    ``placeholder`` is given, in which case ``placeholder`` is recorded in its
    place so the result keeps one entry per footer line.
    """
    posted_by = []
    for footer in soup.find_all('div', class_='post-footer-line post-footer-line-1'):
        name_tag = footer.find('span', class_='fn')
        if name_tag is not None:
            posted_by.append(name_tag.text.strip())
        elif placeholder is not None:
            posted_by.append(placeholder)
    return posted_by


def extract_authors_from_content(content):
    """Return the signature lines that follow the last "release team," marker.

    The marker is matched case-insensitively.  Every non-blank line after its
    last occurrence is stripped and the lines are joined with ``", "``.
    Returns ``None`` when the marker is absent or nothing follows it.
    """
    # A single rfind replaces scanning from every character position, which
    # lower-cased and searched the whole text once per character.
    marker_pos = content.lower().rfind(RELEASE_TEAM_MARKER)
    if marker_pos == -1:
        return None

    tail = content[marker_pos + len(RELEASE_TEAM_MARKER):].strip()
    phrases = [line.strip() for line in tail.split('\n') if line.strip()]
    return ', '.join(phrases) if phrases else None


def extract_authors(soup, placeholder=None):
    """Return the release-team signature of each post body in ``soup``.

    Bodies are the ``div`` elements with class ``post-body entry-content``.
    A body without a signature is skipped, unless ``placeholder`` is given,
    in which case ``placeholder`` is recorded so the result keeps one entry
    per post body.
    """
    authors_list = []
    for body in soup.find_all('div', class_='post-body entry-content'):
        authors = extract_authors_from_content(body.get_text(separator='\n', strip=True))
        if authors:
            authors_list.append(authors)
        elif placeholder is not None:
            authors_list.append(placeholder)
    return authors_list


def get_older_posts_url(soup):
    """Return the ``href`` of the "older posts" pager link, or ``None`` if absent."""
    older_posts_link = soup.find('a', class_='blog-pager-older-link')
    if older_posts_link is None:
        return None
    return older_posts_link.get('href')


def _release_links(body):
    """Return the comma-joined python.org release links inside one post body."""
    hrefs = [
        link['href']
        for link in body.find_all('a', href=True)
        if link['href'].startswith(RELEASE_LINK_PREFIX)
    ]
    return ','.join(hrefs) if hrefs else NO_VERSION_LINKS


def your_function_that_returns_data(max_blogs=50, base_url=BASE_URL):
    """Scrape up to ``max_blogs`` posts, following the "older posts" links.

    Pages are fetched starting at ``base_url`` until every column holds at
    least ``max_blogs`` entries or there is no older page.  All columns are
    then cut to the same length, so element ``i`` of each list describes the
    same row.

    NOTE(review): index alignment assumes each post contributes exactly one
    title, one body and one footer line, in the same order -- confirm against
    the live markup.

    Returns:
        A tuple ``(date_list, pythonVersionsBlogs, pythonVersionsBlogLinks,
        version_href_list, author_list, posted_by_list)`` of equally long lists.
    """
    posted_by_list = []
    author_list = []
    date_list = []
    pythonVersionsBlogs = []
    pythonVersionsBlogLinks = []
    version_href_list = []
    columns = (
        date_list,
        pythonVersionsBlogs,
        pythonVersionsBlogLinks,
        version_href_list,
        author_list,
        posted_by_list,
    )

    current_url = base_url
    # Progress is the number of complete rows collected so far, i.e. the
    # length of the shortest column.
    while current_url and min(map(len, columns)) < max_blogs:
        soup = get_content(current_url)

        posted_by_list.extend(extract_posted_by(soup, placeholder=NO_POSTER))
        author_list.extend(extract_authors(soup, placeholder=NO_AUTHORS))

        for section in soup.find_all('div', attrs={'class': 'date-outer'}):
            date_header = section.find('h2', class_='date-header')
            date = date_header.text.strip() if date_header is not None else NO_DATE

            for post in section.find_all('div', class_='post hentry'):
                title_tag = post.find('h3', class_='post-title entry-title')
                link_tag = title_tag.find('a', href=True) if title_tag is not None else None

                date_list.append(date)
                pythonVersionsBlogs.append(
                    title_tag.text.strip() if title_tag is not None else NO_TITLE
                )
                pythonVersionsBlogLinks.append(
                    link_tag['href'] if link_tag is not None else NO_LINK
                )

        for body in soup.find_all(class_='post-body entry-content'):
            version_href_list.append(_release_links(body))

        current_url = get_older_posts_url(soup)

    # Cut every column to the same length so callers always get complete rows,
    # even when the blog ran out of pages before max_blogs was reached.
    limit = min(max(max_blogs, 0), min(map(len, columns)))
    return tuple(column[:limit] for column in columns)