Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-hooks.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
- id: clear-metadata
name: dmeta — clear metadata from Office files
name: dmeta — clear metadata from Office and image files
description: |
Recursively strip metadata from Microsoft Office files
Recursively strip metadata from Microsoft Office and image files
in-place before each commit.
entry: dmeta --clear-all --inplace
language: python
Expand Down
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,23 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- `clear_jpeg_metadata` function in `functions.py`
- `clear_png_metadata` function in `functions.py`
- `extract_metadata` function in `functions.py`
- `SUPPORTED_IMAGE_FORMATS` and `SUPPORTED_FORMATS` in `params.py`
- `get_file_format` function in `util.py`
- `CLEAR_HANDLERS` dict in `functions.py`
- `clear_file` function in `functions.py`
### Changed
- `test.yml`
- `clear` function in `functions.py`
- `update` function in `functions.py`
- `clear_all` function in `functions.py`
- `update_all` function in `functions.py`
- `run_dmeta` function in `functions.py`
- CLI help text in `__main__.py`
- `.pre-commit-hooks.yaml` updated
- Test system modified
- `README.md` updated
### Removed
- `get_microsoft_format` function in `util.py`
## [0.4] - 2025-06-16
### Added
- `Acknowledgments` in `README.md`
Expand Down
23 changes: 19 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

## Overview
<p align="justify">
DMeta is an open source Python package that removes metadata of Microsoft Office files.
DMeta is an open-source Python package that removes metadata from Microsoft Office files and image files.
</p>
<table>
<tr>
Expand Down Expand Up @@ -81,7 +81,15 @@ from dmeta.functions import clear
DOCX_FILE_PATH = os.path.join(os.getcwd(), "sample.docx")
clear(DOCX_FILE_PATH, in_place=True)
```
#### Clear metadata for all existing microsoft files (.docx|.pptx|.xlsx) in the current directory
#### Clear metadata for any supported file in place
```python
import os
from dmeta.functions import clear_file

FILE_PATH = os.path.join(os.getcwd(), "photo.png")
clear_file(FILE_PATH, in_place=True)
```
#### Clear metadata for all existing supported files (.docx|.pptx|.xlsx|.png|.jpg|.jpeg|.gif) in the current directory
```python
from dmeta.functions import clear_all
clear_all()
Expand Down Expand Up @@ -114,7 +122,11 @@ update_all(CONFIG_FILE_PATH)
```console
dmeta --clear "./test_a.docx" --inplace
```
#### Clear metadata for all existing microsoft files (.docx|.pptx|.xlsx) in the current directory
#### Clear metadata for a .png file in place
```console
dmeta --clear "./photo.png" --inplace
```
#### Clear metadata for all existing supported files (.docx|.pptx|.xlsx|.png|.jpg|.jpeg|.gif) in the current directory
```console
dmeta --clear-all
```
Expand Down Expand Up @@ -162,7 +174,7 @@ repos:
pre-commit install # or pre_commit install (in windows)
```

Now, every time you `git commit`, Dmeta will automatically clear metadata from any Microsoft files in-place.
Now, every time you `git commit`, Dmeta will automatically clear metadata from any supported files in-place.

#### ⚠️ Important: Clean Before You Commit

Expand All @@ -186,6 +198,9 @@ If you run `git add` on Office files that still contain embedded metadata, the p
| Microsoft Word (.docx) | &#x2705; |
| Microsoft PowerPoint (.pptx) | &#x2705; |
| Microsoft Excel (.xlsx) | &#x2705; |
| PNG (.png) | &#x2705; |
| JPEG (.jpg, .jpeg) | &#x2705; |
| GIF (.gif) | &#x2705; |


## Issues & bug reports
Expand Down
6 changes: 3 additions & 3 deletions dmeta/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ def main():
parser.add_argument(
'--clear',
nargs=1,
metavar=".docx file",
metavar="file",
type=str,
help="the `clear` command clears all metadata in the given `.docx` file.",
help="the `clear` command clears all metadata in the given file (.docx, .pptx, .xlsx, .png, .jpg, .jpeg, .gif).",
)
parser.add_argument(
'--clear-all',
action="store_true",
default=False,
help='the `clear-all` command clears all metadata in any `.docx` file in the current directory.',
help='the `clear-all` command clears all metadata in any supported file in the current directory.',
)
parser.add_argument(
'--update',
Expand Down
72 changes: 53 additions & 19 deletions dmeta/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
from art import tprint
import defusedxml.lxml as lxml
from .errors import DMetaBaseError
from .util import get_microsoft_format, extract, read_json
from .util import get_file_format, extract, read_json
from .params import CORE_XML_MAP, APP_XML_MAP, OVERVIEW, DMETA_VERSION, \
UPDATE_COMMAND_WITH_NO_CONFIG_FILE_ERROR, SUPPORTED_MICROSOFT_FORMATS, \
UPDATE_COMMAND_WITH_NO_CONFIG_FILE_ERROR, \
SUPPORTED_MICROSOFT_FORMATS, SUPPORTED_FORMATS, \
JPEG_MARKER_PREFIX, JPEG_SOI, JPEG_EOI, JPEG_SOS, JPEG_COM, \
JPEG_APP_FIRST, JPEG_APP_LAST, JPEG_STANDALONE_MARKERS, \
GIF_TRAILER, GIF_EXTENSION_INTRODUCER, GIF_IMAGE_DESCRIPTOR, \
Expand Down Expand Up @@ -56,8 +57,8 @@ def clear(microsoft_file_name, in_place=False, verbose=False):
:type verbose: bool
:return: None
"""
microsoft_format = get_microsoft_format(microsoft_file_name)
if microsoft_format is None:
microsoft_format = get_file_format(microsoft_file_name)
if microsoft_format is None or microsoft_format not in SUPPORTED_MICROSOFT_FORMATS:
return
unzipped_dir, source_file = extract(microsoft_file_name)
doc_props_dir = os.path.join(unzipped_dir, "docProps")
Expand Down Expand Up @@ -107,7 +108,7 @@ def is_metadata_cleared(xml_path, is_core=True):

def clear_all(in_place=False, verbose=False):
"""
Clear all the editable metadata in any Microsoft file in the current directory and its subdirectories.
Clear all the editable metadata in any supported file in the current directory and its subdirectories.

:param in_place: the `in_place` flag applies the changes directly to the original file
:type in_place: bool
Expand All @@ -117,20 +118,20 @@ def clear_all(in_place=False, verbose=False):
"""
path = os.getcwd()
counter = {
format: 0 for format in SUPPORTED_MICROSOFT_FORMATS
fmt: 0 for fmt in SUPPORTED_FORMATS
}

for root, _, files in os.walk(path):
for file in files:
format = get_microsoft_format(file)
if format is None:
fmt = get_file_format(file)
if fmt is None:
continue
clear(os.path.join(root, file), in_place, verbose)
counter[format] += 1
clear_file(os.path.join(root, file), in_place, verbose)
counter[fmt] += 1

if verbose:
for format in counter.keys():
print("Metadata of {} files with the format of {} has been cleared.".format(counter[format], format))
for fmt in counter.keys():
print("Metadata of {} files with the format of {} has been cleared.".format(counter[fmt], fmt))


def update(config_file_name, microsoft_file_name, in_place=False, verbose=False):
Expand Down Expand Up @@ -158,8 +159,8 @@ def update(config_file_name, microsoft_file_name, in_place=False, verbose=False)
print("There isn't any chosen personal field to remove.")
return

microsoft_format = get_microsoft_format(microsoft_file_name)
if microsoft_format is None:
microsoft_format = get_file_format(microsoft_file_name)
if microsoft_format is None or microsoft_format not in SUPPORTED_MICROSOFT_FORMATS:
return

unzipped_dir, source_file = extract(microsoft_file_name)
Expand Down Expand Up @@ -230,11 +231,11 @@ def update_all(config_file_name, in_place=False, verbose=False):

for root, _, files in os.walk(path):
for file in files:
format = get_microsoft_format(file)
if format is None:
return
fmt = get_file_format(file)
if fmt is None or fmt not in SUPPORTED_MICROSOFT_FORMATS:
continue
update(config_file_name, os.path.join(root, file), in_place, verbose)
counter[format] += 1
counter[fmt] += 1

if verbose:
for format in counter.keys():
Expand Down Expand Up @@ -430,6 +431,39 @@ def skip_sub_blocks(start):
return output_path


# Dispatch table used by `clear_file`: maps a format string (as returned by
# `get_file_format`) to the function that clears metadata for that format.
# Every handler is invoked as `handler(file_name, in_place, verbose)`.
CLEAR_HANDLERS = {
# The three Office formats all go through the same `clear` routine.
"docx": clear,
"pptx": clear,
"xlsx": clear,
"png": clear_png_metadata,
# `jpg` and `jpeg` are two spellings handled by the same JPEG routine.
"jpg": clear_jpeg_metadata,
"jpeg": clear_jpeg_metadata,
"gif": clear_gif_metadata,
}


def clear_file(file_name, in_place=False, verbose=False):
    """
    Dispatch metadata clearing to the handler registered for the file's format.

    The format is detected with `get_file_format` and looked up in
    `CLEAR_HANDLERS`; the matching handler is called with the same arguments.

    :param file_name: path to the file whose metadata should be cleared
    :type file_name: str
    :param in_place: forwarded to the handler; applies changes directly to the original file
    :type in_place: bool
    :param verbose: forwarded to the handler; enables detailed output
    :type verbose: bool
    :return: the handler's return value, or None if the format is unsupported
    :rtype: str or None
    """
    # An undetected format comes back as None, which is not a key of the
    # table, so a single lookup covers both "no format" and "no handler".
    handler = CLEAR_HANDLERS.get(get_file_format(file_name))
    # NOTE(review): `clear` documents `:return: None`, so Office formats
    # presumably yield None here rather than an output path — confirm.
    return None if handler is None else handler(file_name, in_place, verbose)


def extract_metadata(microsoft_file_name):
"""
Extract all the editable metadata from the given Microsoft file.
Expand Down Expand Up @@ -483,7 +517,7 @@ def run_dmeta(args):
"""
verbose = args.verbose
if args.clear:
clear(args.clear[0], args.inplace, verbose)
clear_file(args.clear[0], args.inplace, verbose)
elif args.clear_all:
clear_all(args.inplace, verbose)
elif args.update:
Expand Down
9 changes: 8 additions & 1 deletion dmeta/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""DMeta parameters and constants."""
DMETA_VERSION = "0.4"
OVERVIEW = """
A Python library for removing personal metadata in Microsoft files(.docx, .pptx, .xlsx).
A Python library for removing personal metadata in Microsoft files(.docx, .pptx, .xlsx) and image files(.png, .jpg, .jpeg, .gif).

"""
CORE_XML_MAP = {
Expand Down Expand Up @@ -36,6 +36,13 @@
"pptx",
"xlsx"
]
SUPPORTED_IMAGE_FORMATS = [
"png",
"jpg",
"jpeg",
"gif"
]
SUPPORTED_FORMATS = SUPPORTED_MICROSOFT_FORMATS + SUPPORTED_IMAGE_FORMATS
# JPEG marker codes per ITU-T T.81.
JPEG_MARKER_PREFIX = 0xFF
JPEG_SOI = 0xD8 # Start Of Image
Expand Down
19 changes: 10 additions & 9 deletions dmeta/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,28 @@
import json
from shutil import rmtree
from zipfile import ZipFile
from .params import SUPPORTED_MICROSOFT_FORMATS, INVALID_CONFIG_FILE_NAME_ERROR, CONFIG_FILE_DOES_NOT_EXIST_ERROR
from .params import SUPPORTED_FORMATS, INVALID_CONFIG_FILE_NAME_ERROR, CONFIG_FILE_DOES_NOT_EXIST_ERROR
from .errors import DMetaBaseError


def get_microsoft_format(file_name):
def get_file_format(file_name):
"""
Extract format from the end of the given microsoft file name.
Extract format from the end of the given file name.

:param file_name: name of the microsoft file name
:param file_name: name of the file
:type file_name: str
:return: str
:return: format string if supported, None otherwise
:rtype: str or None
"""
if not isinstance(file_name, str):
return None
last_dot_index = file_name.rfind('.')
if (last_dot_index == -1):
if last_dot_index == -1:
return None
format = file_name[last_dot_index + 1:]
if format not in SUPPORTED_MICROSOFT_FORMATS:
fmt = file_name[last_dot_index + 1:].lower()
if fmt not in SUPPORTED_FORMATS:
return None
return format
return fmt


def extract(file_name):
Expand Down
Loading