Skip to content

Commit 6d15762

Browse files
authored
feat(download_lt/safe_zip): new secure way to extract ZIPs (jxmorris12#165)
1 parent 673af1a commit 6d15762

6 files changed

Lines changed: 1568 additions & 30 deletions

File tree

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,20 @@ Exit codes:
315315
- `LTP_DOWNLOAD_SHA256_<VERSION>`: version-specific expected SHA-256 for the downloaded LanguageTool archive, for example `LTP_DOWNLOAD_SHA256_6_9_SNAPSHOT`.
316316
- `LTP_DOWNLOAD_SHA256`: fallback expected SHA-256 for the downloaded LanguageTool archive.
317317
- `LTP_BYPASS_VERIFIED_DOWNLOADS`: set to `true` to skip SHA-256 verification.
318+
- `LTP_MAX_DOWNLOAD_BYTES`: maximum downloaded ZIP size in bytes.
319+
- default: `536870912` (512 MiB)
320+
- `LTP_SAFE_ZIP_MAX_ARCHIVE_BYTES`: maximum total compressed member size in bytes.
321+
- default: `536870912` (512 MiB)
322+
- `LTP_SAFE_ZIP_MAX_EXTRACTED_BYTES`: maximum total extracted size in bytes.
323+
- default: `805306368` (768 MiB)
324+
- `LTP_SAFE_ZIP_MAX_MEMBERS`: maximum ZIP member count.
325+
- default: `5000`
326+
- `LTP_SAFE_ZIP_MAX_MEMBER_EXTRACTED_BYTES`: maximum extracted size for a single ZIP member in bytes.
327+
- default: `134217728` (128 MiB)
328+
- `LTP_SAFE_ZIP_MAX_MEMBER_COMPRESSION_RATIO`: maximum compression ratio for a single ZIP member.
329+
- default: `100.0`
330+
- `LTP_SAFE_ZIP_MAX_TOTAL_COMPRESSION_RATIO`: maximum compression ratio for the whole ZIP archive.
331+
- default: `10.0`
318332

319333
Downloaded zips are verified with SHA-256 when a checksum is available. Checksums are resolved in this order:
320334
1. `LTP_DOWNLOAD_SHA256_<VERSION>`, where non-alphanumeric characters in the version are replaced with `_` and the name is uppercased.

language_tool_python/download_lt.py

Lines changed: 101 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@
2727
from ._deprecated import deprecated
2828
from .config_file import LanguageToolConfig
2929
from .exceptions import JavaError, PathError
30+
from .safe_zip import SafeZipExtractor
3031
from .utils import (
3132
LTP_JAR_DIR_PATH_ENV_VAR,
33+
get_env_int,
3234
get_language_tool_download_path,
3335
)
3436

@@ -55,6 +57,9 @@
5557
LT_SNAPSHOT_CURRENT_VERSION = "6.9-SNAPSHOT"
5658
LTP_DOWNLOAD_SHA256_ENV_VAR = "LTP_DOWNLOAD_SHA256"
5759
LTP_BYPASS_VERIFIED_DOWNLOADS_ENV_VAR = "LTP_BYPASS_VERIFIED_DOWNLOADS"
60+
LTP_MAX_DOWNLOAD_BYTES_ENV_VAR = "LTP_MAX_DOWNLOAD_BYTES"
61+
DOWNLOAD_CHUNK_BYTES = 1024 * 1024
62+
_SAFE_ZIP_EXTRACTOR = SafeZipExtractor()
5863

5964
with (
6065
importlib.resources.as_file(
@@ -76,6 +81,12 @@
7681
)
7782

7883

84+
MAX_DOWNLOAD_BYTES = get_env_int(
85+
LTP_MAX_DOWNLOAD_BYTES_ENV_VAR,
86+
512 * 1024 * 1024,
87+
) # 512 MiB, latest snapshot: 246.58 MiB archive
88+
89+
7990
def _get_zip_hash(version_name: str) -> Optional[str]:
8091
"""Get the expected SHA-256 hash for a given version of LanguageTool.
8192
This function checks for environment variables that may specify the expected hash for the given version. It normalizes the version name to construct the environment variable name. If no specific environment variable is found for the version, it falls back to a general environment variable or a manifest lookup. If the bypass environment variable is set, it will skip verification and return None.
@@ -109,6 +120,39 @@ def _get_zip_hash(version_name: str) -> Optional[str]:
109120
return None
110121

111122

123+
def _validate_download_size(content_length: Optional[str]) -> Optional[int]:
    """
    Check the size announced by the HTTP Content-Length header of a ZIP download.

    The header value is parsed as an integer and compared against
    ``MAX_DOWNLOAD_BYTES`` so that an oversized archive can be refused
    before it is downloaded.

    :param content_length: Raw Content-Length header value, or None if the
        header is absent.
    :type content_length: Optional[str]
    :return: The header value as an integer, or None when there is no header.
    :rtype: Optional[int]
    :raises PathError: If the value cannot be parsed as an integer, is
        negative, or is greater than ``MAX_DOWNLOAD_BYTES``.
    """
    if content_length is None:
        # Without a header there is no announced size to validate.
        return None

    # Unparsable and negative values are reported with the same message.
    invalid_header = f"Invalid Content-Length header: {content_length!r}."

    try:
        announced_size = int(content_length)
    except ValueError as parse_error:
        raise PathError(invalid_header) from parse_error

    if announced_size < 0:
        raise PathError(invalid_header)

    if announced_size <= MAX_DOWNLOAD_BYTES:
        return announced_size

    too_large = (
        f"Refusing to download {announced_size} bytes. "
        f"Maximum allowed download size is {MAX_DOWNLOAD_BYTES} bytes."
    )
    raise PathError(too_large)
154+
155+
112156
def parse_java_version(version_text: str) -> Tuple[int, int]:
113157
"""
114158
Parse the Java version from a given version text.
@@ -261,8 +305,15 @@ def unzip_file(temp_file_name: str, directory_to_extract_to: Path) -> None:
261305
"""
262306

263307
logger.info("Unzipping %s to %s", temp_file_name, directory_to_extract_to)
264-
with zipfile.ZipFile(temp_file_name, "r") as zip_ref:
265-
zip_ref.extractall(directory_to_extract_to)
308+
with (
309+
tempfile.TemporaryDirectory(dir=directory_to_extract_to.parent) as temp_dir,
310+
zipfile.ZipFile(temp_file_name, "r") as zip_ref,
311+
):
312+
_SAFE_ZIP_EXTRACTOR.extractall(
313+
zip_ref,
314+
directory_to_extract_to,
315+
work_dir=Path(temp_dir),
316+
)
266317

267318

268319
@deprecated(
@@ -419,8 +470,6 @@ def _get_remote_zip(
419470
except requests.exceptions.Timeout as e:
420471
err = f"Request to {self.download_url} timed out."
421472
raise TimeoutError(err) from e
422-
content_length = req.headers.get("Content-Length")
423-
total = int(content_length) if content_length is not None else None
424473
if req.status_code == 404:
425474
err = f"Could not find at URL {self.download_url}. The given version may not exist or is no longer available."
426475
raise PathError(err)
@@ -430,14 +479,25 @@ def _get_remote_zip(
430479
if req.status_code != 200:
431480
err = f"Failed to download from {self.download_url}. HTTP status code: {req.status_code}."
432481
raise PathError(err)
482+
content_length = req.headers.get("Content-Length")
483+
total = _validate_download_size(content_length)
433484
progress = tqdm.tqdm(
434485
unit="B",
435486
unit_scale=True,
436487
total=total,
437488
desc=f"Downloading LanguageTool {self.version_name}",
438489
)
439-
for chunk in req.iter_content(chunk_size=1024):
490+
downloaded_bytes = 0
491+
for chunk in req.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
440492
if chunk: # filter out keep-alive new chunks
493+
downloaded_bytes += len(chunk)
494+
if downloaded_bytes > MAX_DOWNLOAD_BYTES:
495+
progress.close()
496+
err = (
497+
f"Refusing to download more than {MAX_DOWNLOAD_BYTES} bytes "
498+
f"from {self.download_url}."
499+
)
500+
raise PathError(err)
441501
sha256.update(chunk)
442502
progress.update(len(chunk))
443503
downloaded_file.write(chunk)
@@ -708,13 +768,17 @@ def download(self) -> None:
708768

709769
if self not in self.get_installed_versions():
710770
with (
711-
tempfile.TemporaryDirectory() as temp_dir,
771+
tempfile.TemporaryDirectory(dir=download_folder) as temp_dir,
712772
tempfile.NamedTemporaryFile(
713773
suffix=".zip", dir=temp_dir
714774
) as downloaded_file,
715775
self._get_remote_zip(downloaded_file) as zip_file,
716776
):
717-
zip_file.extractall(download_folder)
777+
_SAFE_ZIP_EXTRACTOR.extractall(
778+
zip_file,
779+
download_folder,
780+
work_dir=Path(temp_dir),
781+
)
718782

719783
@property
720784
def version_name(self) -> str:
@@ -790,8 +854,7 @@ def download(self) -> None:
790854
Download and install this snapshot version of LanguageTool.
791855
792856
This method checks Java compatibility, downloads the snapshot ZIP file,
793-
and extracts it to the download folder. For snapshots, the extracted
794-
directory is renamed to match the expected version name if necessary.
857+
and extracts it to the download folder using the requested snapshot name.
795858
"""
796859
confirm_java_compatibility(self._version_name)
797860

@@ -803,33 +866,41 @@ def download(self) -> None:
803866
return
804867

805868
if self not in self.get_installed_versions():
806-
# For snapshots, pass expected_dirname to rename the extracted folder
807869
with (
808-
tempfile.TemporaryDirectory() as temp_dir,
870+
tempfile.TemporaryDirectory(dir=download_folder) as temp_dir,
809871
tempfile.NamedTemporaryFile(
810872
suffix=".zip", dir=temp_dir
811873
) as downloaded_file,
812874
self._get_remote_zip(downloaded_file) as zip_file,
813875
):
814-
lt_dir = zip_file.infolist()[0].filename
815-
expected_dirname = f"LanguageTool-{self.version_name}/"
816-
if lt_dir != expected_dirname:
817-
with (
818-
tempfile.NamedTemporaryFile(
819-
suffix=".zip", dir=temp_dir
820-
) as temp_file,
821-
zipfile.ZipFile(temp_file, "w") as renamed_zip,
822-
):
823-
for item in zip_file.infolist():
824-
buffer = zip_file.read(item.filename)
825-
new_name = item.filename.replace(
826-
lt_dir, expected_dirname, 1
827-
)
828-
renamed_zip.writestr(new_name, buffer)
829-
temp_file.seek(0)
830-
renamed_zip.extractall(download_folder)
831-
else:
832-
zip_file.extractall(download_folder)
876+
snapshot_extract_dir = Path(temp_dir) / "snapshot"
877+
_SAFE_ZIP_EXTRACTOR.extractall(
878+
zip_file,
879+
snapshot_extract_dir,
880+
work_dir=Path(temp_dir),
881+
)
882+
extracted_roots = list(snapshot_extract_dir.iterdir())
883+
if len(extracted_roots) != 1 or not extracted_roots[0].is_dir():
884+
err = (
885+
"Expected snapshot archive to contain exactly one "
886+
"root directory."
887+
)
888+
raise PathError(err)
889+
890+
expected_dir = download_folder / f"LanguageTool-{self.version_name}"
891+
if expected_dir.exists() or expected_dir.is_symlink():
892+
err = (
893+
"Refusing to overwrite existing LanguageTool snapshot "
894+
f"directory: {expected_dir}."
895+
)
896+
raise PathError(err)
897+
898+
logger.debug(
899+
"Renaming extracted snapshot directory %s to %s",
900+
extracted_roots[0],
901+
expected_dir,
902+
)
903+
extracted_roots[0].rename(expected_dir)
833904

834905
@property
835906
def version_name(self) -> str:

0 commit comments

Comments
 (0)