diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml
index 6522f8d..39e1d90 100644
--- a/.pre-commit-hooks.yaml
+++ b/.pre-commit-hooks.yaml
@@ -1,7 +1,7 @@
- id: clear-metadata
- name: dmeta — clear metadata from Office files
+ name: dmeta — clear metadata from Office and image files
description: |
- Recursively strip metadata from Microsoft Office files
+ Recursively strip metadata from Microsoft Office and image files
in-place before each commit.
entry: dmeta --clear-all --inplace
language: python
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a1f0a9b..7607658 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,12 +12,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- `clear_jpeg_metadata` function in `functions.py`
- `clear_png_metadata` function in `functions.py`
- `extract_metadata` function in `functions.py`
+- `SUPPORTED_IMAGE_FORMATS` and `SUPPORTED_FORMATS` in `params.py`
+- `get_file_format` function in `util.py`
+- `CLEAR_HANDLERS` dict in `functions.py`
+- `clear_file` function in `functions.py`
### Changed
- `test.yml`
- `clear` function in `functions.py`
- `update` function in `functions.py`
+- `clear_all` function in `functions.py`
+- `update_all` function in `functions.py`
+- `run_dmeta` function in `functions.py`
+- CLI help text in `__main__.py`
+- `.pre-commit-hooks.yaml` updated
- Test system modified
- `README.md` updated
+### Removed
+- `get_microsoft_format` function in `util.py`
## [0.4] - 2025-06-16
### Added
- `Acknowledgments` in `README.md`
diff --git a/README.md b/README.md
index fa13daa..a0ed66a 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@
## Overview
-DMeta is an open source Python package that removes metadata of Microsoft Office files.
+DMeta is an open source Python package that removes metadata from Microsoft Office files and image files.
@@ -81,7 +81,15 @@ from dmeta.functions import clear
DOCX_FILE_PATH = os.path.join(os.getcwd(), "sample.docx")
clear(DOCX_FILE_PATH, in_place=True)
```
-#### Clear metadata for all existing microsoft files (.docx|.pptx|.xlsx) in the current directory
+#### Clear metadata for any supported file in place
+```python
+import os
+from dmeta.functions import clear_file
+
+FILE_PATH = os.path.join(os.getcwd(), "photo.png")
+clear_file(FILE_PATH, in_place=True)
+```
+#### Clear metadata for all existing supported files (.docx|.pptx|.xlsx|.png|.jpg|.jpeg|.gif) in the current directory
```python
from dmeta.functions import clear_all
clear_all()
@@ -114,7 +122,11 @@ update_all(CONFIG_FILE_PATH)
```console
dmeta --clear "./test_a.docx" --inplace
```
-#### Clear metadata for all existing microsoft files (.docx|.pptx|.xlsx) in the current directory
+#### Clear metadata for a .png file in place
+```console
+dmeta --clear "./photo.png" --inplace
+```
+#### Clear metadata for all existing supported files (.docx|.pptx|.xlsx|.png|.jpg|.jpeg|.gif) in the current directory
```console
dmeta --clear-all
```
@@ -162,7 +174,7 @@ repos:
pre-commit install # or pre_commit install (in windows)
```
-Now, every time you `git commit`, Dmeta will automatically clear metadata from any Microsoft files in-place.
+Now, every time you `git commit`, DMeta will automatically clear metadata from any supported file in place.
#### ⚠️ Important: Clean Before You Commit
@@ -186,6 +198,9 @@ If you run `git add` on Office files that still contain embedded metadata, the p
| Microsoft Word (.docx) | ✅ |
| Microsoft PowerPoint (.pptx) | ✅ |
| Microsoft Excel (.xlsx) | ✅ |
+| PNG (.png) | ✅ |
+| JPEG (.jpg, .jpeg) | ✅ |
+| GIF (.gif) | ✅ |
## Issues & bug reports
diff --git a/dmeta/__main__.py b/dmeta/__main__.py
index 83af790..143b6ae 100644
--- a/dmeta/__main__.py
+++ b/dmeta/__main__.py
@@ -15,15 +15,15 @@ def main():
parser.add_argument(
'--clear',
nargs=1,
- metavar=".docx file",
+ metavar="file",
type=str,
- help="the `clear` command clears all metadata in the given `.docx` file.",
+ help="the `clear` command clears all metadata in the given file (.docx, .pptx, .xlsx, .png, .jpg, .jpeg, .gif).",
)
parser.add_argument(
'--clear-all',
action="store_true",
default=False,
- help='the `clear-all` command clears all metadata in any `.docx` file in the current directory.',
+ help='the `clear-all` command clears all metadata in any supported file in the current directory.',
)
parser.add_argument(
'--update',
diff --git a/dmeta/functions.py b/dmeta/functions.py
index bd2927a..c7e008a 100644
--- a/dmeta/functions.py
+++ b/dmeta/functions.py
@@ -7,9 +7,10 @@
from art import tprint
import defusedxml.lxml as lxml
from .errors import DMetaBaseError
-from .util import get_microsoft_format, extract, read_json
+from .util import get_file_format, extract, read_json
from .params import CORE_XML_MAP, APP_XML_MAP, OVERVIEW, DMETA_VERSION, \
- UPDATE_COMMAND_WITH_NO_CONFIG_FILE_ERROR, SUPPORTED_MICROSOFT_FORMATS, \
+ UPDATE_COMMAND_WITH_NO_CONFIG_FILE_ERROR, \
+ SUPPORTED_MICROSOFT_FORMATS, SUPPORTED_FORMATS, \
JPEG_MARKER_PREFIX, JPEG_SOI, JPEG_EOI, JPEG_SOS, JPEG_COM, \
JPEG_APP_FIRST, JPEG_APP_LAST, JPEG_STANDALONE_MARKERS, \
GIF_TRAILER, GIF_EXTENSION_INTRODUCER, GIF_IMAGE_DESCRIPTOR, \
@@ -56,8 +57,8 @@ def clear(microsoft_file_name, in_place=False, verbose=False):
:type verbose: bool
:return: None
"""
- microsoft_format = get_microsoft_format(microsoft_file_name)
- if microsoft_format is None:
+ microsoft_format = get_file_format(microsoft_file_name)
+ if microsoft_format is None or microsoft_format not in SUPPORTED_MICROSOFT_FORMATS:
return
unzipped_dir, source_file = extract(microsoft_file_name)
doc_props_dir = os.path.join(unzipped_dir, "docProps")
@@ -107,7 +108,7 @@ def is_metadata_cleared(xml_path, is_core=True):
def clear_all(in_place=False, verbose=False):
"""
- Clear all the editable metadata in any Microsoft file in the current directory and its subdirectories.
+ Clear all the editable metadata in any supported file in the current directory and its subdirectories.
:param in_place: the `in_place` flag applies the changes directly to the original file
:type in_place: bool
@@ -117,20 +118,20 @@ def clear_all(in_place=False, verbose=False):
"""
path = os.getcwd()
counter = {
- format: 0 for format in SUPPORTED_MICROSOFT_FORMATS
+ fmt: 0 for fmt in SUPPORTED_FORMATS
}
for root, _, files in os.walk(path):
for file in files:
- format = get_microsoft_format(file)
- if format is None:
+ fmt = get_file_format(file)
+ if fmt is None:
continue
- clear(os.path.join(root, file), in_place, verbose)
- counter[format] += 1
+ clear_file(os.path.join(root, file), in_place, verbose)
+ counter[fmt] += 1
if verbose:
- for format in counter.keys():
- print("Metadata of {} files with the format of {} has been cleared.".format(counter[format], format))
+ for fmt in counter.keys():
+ print("Metadata of {} files with the format of {} has been cleared.".format(counter[fmt], fmt))
def update(config_file_name, microsoft_file_name, in_place=False, verbose=False):
@@ -158,8 +159,8 @@ def update(config_file_name, microsoft_file_name, in_place=False, verbose=False)
print("There isn't any chosen personal field to remove.")
return
- microsoft_format = get_microsoft_format(microsoft_file_name)
- if microsoft_format is None:
+ microsoft_format = get_file_format(microsoft_file_name)
+ if microsoft_format is None or microsoft_format not in SUPPORTED_MICROSOFT_FORMATS:
return
unzipped_dir, source_file = extract(microsoft_file_name)
@@ -230,11 +231,11 @@ def update_all(config_file_name, in_place=False, verbose=False):
for root, _, files in os.walk(path):
for file in files:
- format = get_microsoft_format(file)
- if format is None:
- return
+ fmt = get_file_format(file)
+ if fmt is None or fmt not in SUPPORTED_MICROSOFT_FORMATS:
+ continue
update(config_file_name, os.path.join(root, file), in_place, verbose)
- counter[format] += 1
+ counter[fmt] += 1
if verbose:
for format in counter.keys():
@@ -430,6 +431,39 @@ def skip_sub_blocks(start):
return output_path
+CLEAR_HANDLERS = {
+ "docx": clear,
+ "pptx": clear,
+ "xlsx": clear,
+ "png": clear_png_metadata,
+ "jpg": clear_jpeg_metadata,
+ "jpeg": clear_jpeg_metadata,
+ "gif": clear_gif_metadata,
+}
+
+
+def clear_file(file_name, in_place=False, verbose=False):
+ """
+ Clear all metadata from the given file based on its format.
+
+ :param file_name: path to the file
+ :type file_name: str
+ :param in_place: applies changes directly to the original file
+ :type in_place: bool
+ :param verbose: enables detailed output
+ :type verbose: bool
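+ :return: whatever the format's handler returns (the cleared file's path for image handlers, None for Office files handled by `clear`), or None if format is unsupported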
+ :rtype: str or None
+ """
+ fmt = get_file_format(file_name)
+ if fmt is None:
+ return None
+ handler = CLEAR_HANDLERS.get(fmt)
+ if handler is None:
+ return None
+ return handler(file_name, in_place, verbose)
+
+
def extract_metadata(microsoft_file_name):
"""
Extract all the editable metadata from the given Microsoft file.
@@ -483,7 +517,7 @@ def run_dmeta(args):
"""
verbose = args.verbose
if args.clear:
- clear(args.clear[0], args.inplace, verbose)
+ clear_file(args.clear[0], args.inplace, verbose)
elif args.clear_all:
clear_all(args.inplace, verbose)
elif args.update:
diff --git a/dmeta/params.py b/dmeta/params.py
index c3a1ba0..c305029 100644
--- a/dmeta/params.py
+++ b/dmeta/params.py
@@ -2,7 +2,7 @@
"""DMeta parameters and constants."""
DMETA_VERSION = "0.4"
OVERVIEW = """
-A Python library for removing personal metadata in Microsoft files(.docx, .pptx, .xlsx).
+A Python library for removing personal metadata in Microsoft files(.docx, .pptx, .xlsx) and image files(.png, .jpg, .jpeg, .gif).
"""
CORE_XML_MAP = {
@@ -36,6 +36,13 @@
"pptx",
"xlsx"
]
+SUPPORTED_IMAGE_FORMATS = [
+ "png",
+ "jpg",
+ "jpeg",
+ "gif"
+]
+SUPPORTED_FORMATS = SUPPORTED_MICROSOFT_FORMATS + SUPPORTED_IMAGE_FORMATS
# JPEG marker codes per ITU-T T.81.
JPEG_MARKER_PREFIX = 0xFF
JPEG_SOI = 0xD8 # Start Of Image
diff --git a/dmeta/util.py b/dmeta/util.py
index 26135dc..abf9f0b 100644
--- a/dmeta/util.py
+++ b/dmeta/util.py
@@ -4,27 +4,28 @@
import json
from shutil import rmtree
from zipfile import ZipFile
-from .params import SUPPORTED_MICROSOFT_FORMATS, INVALID_CONFIG_FILE_NAME_ERROR, CONFIG_FILE_DOES_NOT_EXIST_ERROR
+from .params import SUPPORTED_FORMATS, INVALID_CONFIG_FILE_NAME_ERROR, CONFIG_FILE_DOES_NOT_EXIST_ERROR
from .errors import DMetaBaseError
-def get_microsoft_format(file_name):
+def get_file_format(file_name):
"""
- Extract format from the end of the given microsoft file name.
+ Extract format from the end of the given file name.
- :param file_name: name of the microsoft file name
+ :param file_name: name of the file
:type file_name: str
- :return: str
+ :return: lower-cased format string if supported (matching is case-insensitive), None otherwise
+ :rtype: str or None
"""
if not isinstance(file_name, str):
return None
last_dot_index = file_name.rfind('.')
- if (last_dot_index == -1):
+ if last_dot_index == -1:
return None
- format = file_name[last_dot_index + 1:]
- if format not in SUPPORTED_MICROSOFT_FORMATS:
+ fmt = file_name[last_dot_index + 1:].lower()
+ if fmt not in SUPPORTED_FORMATS:
return None
- return format
+ return fmt
def extract(file_name):