Image_Text_Comparaison_AI/htmlExtractor.py at main · shadlia/Image_Text_Comparaison_AI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import shutil
from PIL import Image
from io import BytesIO
from modelExtraction import ArticleExtractor


class ImageDownloader:
    def __init__(self):
        self.driver = None

    def setup_driver(self, url):
        # Set up Selenium WebDriver
        self.driver = webdriver.Chrome()
        self.driver.get(url)

    def create_folder(self, folder_name):
        # Clear the folder if it already exists
        if os.path.exists(folder_name):
            for file_name in os.listdir(folder_name):
                file_path = os.path.join(folder_name, file_name)
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
        else:
            os.makedirs(folder_name)

    def extract_images(self, exclude_keywords=("icon",)):
        """Extract all image URLs from the HTML content using BeautifulSoup, with filters."""
        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")
        images = soup.find_all("img")
        img_urls = [
            img["src"]
            for img in images
            if "src" in img.attrs
            and not any(keyword in img["src"] for keyword in exclude_keywords)
        ]
        return img_urls

    def filter_image_by_size(self, img_data, min_size=(0, 0)):
        """Filter images based on size using Pillow."""
        try:
            img = Image.open(BytesIO(img_data))
            width, height = img.size
            print(f"Image size: {width}x{height}")  # Debugging info
            return width >= min_size[0] and height >= min_size[1]
        except Exception as e:
            print(f"Failed to check image size: {e}")
            return False

    def extract_text(self):
        """Extract all text content from the HTML and return it as a string."""
        page_source = self.driver.page_source
        soup = BeautifulSoup(page_source, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        return text

    def save_text_to_file(self, text, file_name="article.txt"):
        """Save the extracted text to a .txt file."""
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(text)
        print(f"Text saved to {file_name}")

    def extract_html(self):
        """Extract the HTML content from the current page."""
        html_content = self.driver.page_source
        return html_content

    def save_html_to_file(self, html_content, file_name="html.txt"):
        """Save the extracted HTML to a .txt file."""
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(html_content)
        print(f"HTML content saved to {file_name}")

    def image_download(
        self,
        url,
        folder_name="imagesCache",
        valid_extensions=(".png", ".jpeg"),
        exclude_keywords=("icon",),
        min_size=(0, 0),  # Min width and height
    ):
        # Set up the driver and navigate to the URL
        self.setup_driver(url)

        # Create the folder to save images
        self.create_folder(folder_name)

        # Extract all image URLs from the page
        img_urls = self.extract_images(exclude_keywords=exclude_keywords)

        # Download and save each image
        for i, img_url in enumerate(img_urls):
            # Handle cases where src might be empty or None
            if not img_url:
                continue

            # Only download images that end with the specified extensions
            if img_url.endswith(valid_extensions):
                try:
                    # Get the image content
                    response = requests.get(img_url, stream=True)
                    img_data = response.content
                    print(f"Downloading image from {img_url}")  # Debugging info

                    # Check the image size
                    if not self.filter_image_by_size(img_data, min_size):
                        print(f"Skipped image due to size: {img_url}")
                        continue

                    img_extension = img_url.split(".")[-1].split("?")[
                        0
                    ]  # Remove query parameters
                    img_name = os.path.join(folder_name, f"image_{i+1}.{img_extension}")

                    # Save the image
                    with open(img_name, "wb") as img_file:
                        img_file.write(img_data)
                    print(f"Downloaded: {img_name}")
                except Exception as e:
                    print(f"Failed to download {img_url}: {e}")

        # Save the extracted text to a file
        text = self.extract_text()
        self.save_text_to_file(text)

        # Extract and save the HTML content to a file
        html_content = self.extract_html()
        self.save_html_to_file(html_content)

        # Close the browser
        self.driver.quit()