diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 592a3ab2..fc5de30b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -23,6 +23,10 @@ You may look through the [GitHub issues](https://github.com/idealo/image-dedup/i 2. For bug fixes, add new tests and for new features, please add changes to the documentation. 3. Do a PR from your new branch to our `dev` branch of the original Imagededup repo. +## Style +1. Use single quotes for strings instead of double quotes. +2. Keep import order and groups as specified by [PEP-0008](https://www.python.org/dev/peps/pep-0008/#imports). + ## Documentation - Make sure any new function or class you introduce has proper docstrings. diff --git a/README.md b/README.md index 6054515a..c457bd72 100644 --- a/README.md +++ b/README.md @@ -117,6 +117,7 @@ plot_duplicates(image_dir='path/to/image/directory', duplicate_map=duplicates, filename='ukbench00120.jpg') ``` +To run the above snippet on Windows, have a look [here](https://idealo.github.io/imagededup/user_guide/windows/). It is also possible to use your own custom models for finding duplicates using the CNN method. For examples, refer [this](https://github.com/idealo/imagededup/tree/master/examples) part of the diff --git a/imagededup/methods/cnn.py b/imagededup/methods/cnn.py index 1b096fd9..1f88a85e 100644 --- a/imagededup/methods/cnn.py +++ b/imagededup/methods/cnn.py @@ -412,7 +412,7 @@ def find_duplicates( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file. encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding CNN encodings. + corresponding CNN encodings. An encoding is a numpy array with shape (features,). min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). 
Default is 0.9 scores: Optional, boolean indicating whether similarity scores are to be returned along with retrieved duplicates. @@ -495,7 +495,7 @@ def find_duplicates_to_remove( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file. encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding CNN encodings. + corresponding CNN encodings. An encoding is a numpy array with shape (features,). min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9 outfile: Optional, name of the file to save the results, must be a json. Default is None. recursive: Optional, find images recursively in a nested image directory structure, set to False by default. diff --git a/imagededup/methods/hashing.py b/imagededup/methods/hashing.py index 2bea299d..98346a90 100644 --- a/imagededup/methods/hashing.py +++ b/imagededup/methods/hashing.py @@ -267,7 +267,7 @@ def find_duplicates( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file. encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding hashes. + corresponding hashes. The hashes are 16 character hexadecimal strings. max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). Default is 10. scores: Optional, boolean indicating whether Hamming distances are to be returned along with retrieved duplicates. @@ -389,7 +389,7 @@ def find_duplicates_to_remove( image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as hash strings for the key image file. 
 encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and - corresponding hashes. + corresponding hashes. The hashes are 16 character hexadecimal strings. max_distance_threshold: Optional, hamming distance between two images below which retrieved duplicates are valid. (must be an int between 0 and 64). Default is 10. outfile: Optional, name of the file to save the results, must be a json. Default is None. diff --git a/mkdocs/docs/user_guide/windows.md b/mkdocs/docs/user_guide/windows.md new file mode 100644 index 00000000..0df4a995 --- /dev/null +++ b/mkdocs/docs/user_guide/windows.md @@ -0,0 +1,61 @@ +# Windows +The following code snippet on Windows is likely to break with a `RuntimeError`: + +```python +from imagededup.methods import PHash +phasher = PHash() + +# Generate encodings for all images in an image directory +encodings = phasher.encode_images(image_dir='path/to/image/directory') + +# Find duplicates using the generated encodings +duplicates = phasher.find_duplicates(encoding_map=encodings) + +# plot duplicates obtained for a given file using the duplicates dictionary +from imagededup.utils import plot_duplicates +plot_duplicates(image_dir='path/to/image/directory', + duplicate_map=duplicates, + filename='ukbench00120.jpg') +``` + +``` +RuntimeError: + An attempt has been made to start a new process before the + current process has finished its bootstrapping phase. + + This probably means that you are not using fork to start your + child processes and you have forgotten to use the proper idiom + in the main module: + + if __name__ == '__main__': + freeze_support() + ... + + The "freeze_support()" line can be omitted if the program + is not going to be frozen to produce an executable. +``` +The error occurs due to the underlying dependency `multiprocessing`, which observes certain restrictions when working on Windows, as explained [here](https://docs.python.org/2/library/multiprocessing.html#windows). 
+ +## The Fix + +Just enclose the logic in an `if __name__ == '__main__':` entry point: + +```python +from imagededup.methods import PHash + +if __name__ == '__main__': + phasher = PHash() + + # Generate encodings for all images in an image directory + encodings = phasher.encode_images(image_dir='path/to/image/directory') + + # Find duplicates using the generated encodings + duplicates = phasher.find_duplicates(encoding_map=encodings) + + # plot duplicates obtained for a given file using the duplicates dictionary + from imagededup.utils import plot_duplicates + plot_duplicates(image_dir='path/to/image/directory', + duplicate_map=duplicates, + filename='ukbench00120.jpg') +``` +Also see issues [194](https://github.com/idealo/imagededup/issues/194) and [214](https://github.com/idealo/imagededup/issues/214). \ No newline at end of file diff --git a/mkdocs/mkdocs.yml b/mkdocs/mkdocs.yml index 26fb480c..2f2d31db 100644 --- a/mkdocs/mkdocs.yml +++ b/mkdocs/mkdocs.yml @@ -10,6 +10,7 @@ nav: - Plotting duplicates: user_guide/plotting_duplicates.md - Evaluating performance: user_guide/evaluating_performance.md - Benchmarks: user_guide/benchmarks.md + - Windows: user_guide/windows.md - API reference: - Methods: - CNN: methods/cnn.md diff --git a/tests/test_cnn.py b/tests/test_cnn.py index f50909b8..e36f2b62 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -7,6 +7,7 @@ import json import numpy as np import torch +from torch.hub import get_dir from PIL import Image import pytest from torchvision.transforms import transforms @@ -48,11 +49,12 @@ def mocker_save_json(mocker): def test_import_defaults(): """Ensure that MobileNet does not get downloaded on import""" - from torch.hub import get_dir - from pathlib import Path checkpoint_dir = Path(get_dir()) / "checkpoints" + # Ensure the directory exists + checkpoint_dir.mkdir(parents=True, exist_ok=True) + # Clear cached MobileNet model for model_path in checkpoint_dir.iterdir(): if 
model_path.name.startswith("mobilenet_v3_small"): @@ -64,10 +66,7 @@ def test_import_defaults(): ] # Re-import cnn and assert model is not downloaded - import sys - - del sys.modules["imagededup.methods.cnn"] - from imagededup.methods.cnn import CNN + from imagededup.methods import CNN assert not [ m_path