From 7df99725bdfc1c920e3058250d7970c31f834a1e Mon Sep 17 00:00:00 2001 From: Tanuj Jain Date: Tue, 29 Jul 2025 13:52:09 +0200 Subject: [PATCH 1/5] Address several documentation related issues as documented in issue-126. --- CONTRIBUTING.md | 4 ++++ imagededup/methods/cnn.py | 4 ++-- imagededup/methods/hashing.py | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 592a3ab2..fc5de30b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,6 +23,10 @@ You may look through the [GitHub issues](https://github.com/idealo/image-dedup/i 2. For bug fixes, add new tests and for new features, please add changes to the documentation. 3. Do a PR from your new branch to our `dev` branch of the original Imagededup repo. +## Style +1. Use single quotes for strings instead of double quotes. +2. Keep import order and groups as specified by [PEP-0008](https://www.python.org/dev/peps/pep-0008/#imports). + ## Documentation - Make sure any new function or class you introduce has proper docstrings. diff --git a/imagededup/methods/cnn.py b/imagededup/methods/cnn.py index 1b096fd9..1f88a85e 100644 --- a/imagededup/methods/cnn.py +++ b/imagededup/methods/cnn.py @@ -412,7 +412,7 @@ def find_duplicates( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file. encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding CNN encodings. + corresponding CNN encodings. An encoding is a numpy array with shape (features,). min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9 scores: Optional, boolean indicating whether similarity scores are to be returned along with retrieved duplicates. 
@@ -495,7 +495,7 @@ def find_duplicates_to_remove( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file. encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding CNN encodings. + corresponding CNN encodings. An encoding is a numpy array with shape (features,). min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9 outfile: Optional, name of the file to save the results, must be a json. Default is None. recursive: Optional, find images recursively in a nested image directory structure, set to False by default. diff --git a/imagededup/methods/hashing.py b/imagededup/methods/hashing.py index 2bea299d..98346a90 100644 --- a/imagededup/methods/hashing.py +++ b/imagededup/methods/hashing.py @@ -267,7 +267,7 @@ def find_duplicates( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file. encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding hashes. + corresponding hashes. The hashes are 16 character hexadecimal strings. max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). Default is 10. scores: Optional, boolean indicating whether Hamming distances are to be returned along with retrieved duplicates. @@ -389,7 +389,7 @@ def find_duplicates_to_remove( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file. encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding hashes. + corresponding hashes. The hashes are 16 character hexadecimal strings. 
max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). Default is 10. outfile: Optional, name of the file to save the results, must be a json. Default is None. From fda057e6e97a964d878d940e698ca9f43a321099 Mon Sep 17 00:00:00 2001 From: Tanuj Jain Date: Tue, 29 Jul 2025 13:53:04 +0200 Subject: [PATCH 2/5] Address documentation related issues as mentioned in issue 126. --- mkdocs/docs/user_guide/windows.md | 61 +++++++++++++++++++++++++++++++ mkdocs/mkdocs.yml | 1 + 2 files changed, 62 insertions(+) create mode 100644 mkdocs/docs/user_guide/windows.md diff --git a/mkdocs/docs/user_guide/windows.md b/mkdocs/docs/user_guide/windows.md new file mode 100644 index 00000000..0df4a995 --- /dev/null +++ b/mkdocs/docs/user_guide/windows.md @@ -0,0 +1,61 @@ +# Windows +The following code snippet on Windows is likely to break with a `RuntimeError`: + +```python +from imagededup.methods import PHash +phasher = PHash() + +# Generate encodings for all images in an image directory +encodings = phasher.encode_images(image_dir='path/to/image/directory') + +# Find duplicates using the generated encodings +duplicates = phasher.find_duplicates(encoding_map=encodings) + +# plot duplicates obtained for a given file using the duplicates dictionary +from imagededup.utils import plot_duplicates +plot_duplicates(image_dir='path/to/image/directory', + duplicate_map=duplicates, + filename='ukbench00120.jpg') +``` + +``` +RuntimeError: + An attempt has been made to start a new process before the + current process has finished its bootstrapping phase. + + This probably means that you are not using fork to start your + child processes and you have forgotten to use the proper idiom + in the main module: + + if __name__ == '__main__': + freeze_support() + ... + + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce an executable. 
+``` +The error occurs due to the underlying dependency `multiprocessing`, which observes certain restrictions when working on Windows, as explained [here](https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods). + +## The Fix + +Just enclose the logic in an `if __name__ == '__main__':` entry point: + +```python +from imagededup.methods import PHash + +if __name__ == '__main__': + phasher = PHash() + + # Generate encodings for all images in an image directory + encodings = phasher.encode_images(image_dir='path/to/image/directory') + + # Find duplicates using the generated encodings + duplicates = phasher.find_duplicates(encoding_map=encodings) + + # plot duplicates obtained for a given file using the duplicates dictionary + from imagededup.utils import plot_duplicates + plot_duplicates(image_dir='path/to/image/directory', + duplicate_map=duplicates, + filename='ukbench00120.jpg') +``` +Also see issues [194](https://github.com/idealo/imagededup/issues/194) and [214](https://github.com/idealo/imagededup/issues/214). \ No newline at end of file diff --git a/mkdocs/mkdocs.yml b/mkdocs/mkdocs.yml index 26fb480c..2f2d31db 100644 --- a/mkdocs/mkdocs.yml +++ b/mkdocs/mkdocs.yml @@ -10,6 +10,7 @@ nav: - Plotting duplicates: user_guide/plotting_duplicates.md - Evaluating performance: user_guide/evaluating_performance.md - Benchmarks: user_guide/benchmarks.md + - Windows: user_guide/windows.md - API reference: - Methods: - CNN: methods/cnn.md From 0a012802b15cb635a6c9afae08a0cf872bd6b7c6 Mon Sep 17 00:00:00 2001 From: Tanuj Jain Date: Tue, 29 Jul 2025 13:56:20 +0200 Subject: [PATCH 3/5] Add a pointer to make the snippet run on Windows in Readme. 
--- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 6054515a..c457bd72 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ plot_duplicates(image_dir='path/to/image/directory', duplicate_map=duplicates, filename='ukbench00120.jpg') ``` +To run the above snippet on Windows, have a look [here](https://idealo.github.io/imagededup/user_guide/windows/). It is also possible to use your own custom models for finding duplicates using the CNN method. For examples, refer [this](https://github.com/idealo/imagededup/tree/master/examples) part of the From cafad22747bf081a93e87921cd2b1790e127c935 Mon Sep 17 00:00:00 2001 From: Tanuj Jain Date: Tue, 29 Jul 2025 15:18:45 +0200 Subject: [PATCH 4/5] Remove module deletion from import test --- tests/test_cnn.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index f50909b8..eb9dcc0f 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -7,6 +7,7 @@ import json import numpy as np import torch +from torch.hub import get_dir from PIL import Image import pytest from torchvision.transforms import transforms @@ -48,8 +49,6 @@ def mocker_save_json(mocker): def test_import_defaults(): """Ensure that MobileNet does not get downloaded on import""" - from torch.hub import get_dir - from pathlib import Path checkpoint_dir = Path(get_dir()) / "checkpoints" @@ -64,10 +63,7 @@ def test_import_defaults(): ] # Re-import cnn and assert model is not downloaded - import sys - - del sys.modules["imagededup.methods.cnn"] - from imagededup.methods.cnn import CNN + from imagededup.methods import CNN assert not [ m_path From d26a6a1556c7303edbdf9b8f9d29bcfc57f75c74 Mon Sep 17 00:00:00 2001 From: Tanuj Jain Date: Tue, 29 Jul 2025 15:28:43 +0200 Subject: [PATCH 5/5] Ensure checkpoint dir exists to make tests pass on CI --- tests/test_cnn.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_cnn.py b/tests/test_cnn.py index 
eb9dcc0f..e36f2b62 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -52,6 +52,9 @@ def test_import_defaults(): checkpoint_dir = Path(get_dir()) / "checkpoints" + # Ensure the directory exists + checkpoint_dir.mkdir(parents=True, exist_ok=True) + # Clear cached MobileNet model for model_path in checkpoint_dir.iterdir(): if model_path.name.startswith("mobilenet_v3_small"):