From 96356882ef122fec5d70b94b34a581ed0a586fc4 Mon Sep 17 00:00:00 2001 From: Rodrigo Barbosa Date: Thu, 16 Apr 2026 14:56:15 -0300 Subject: [PATCH 1/3] Add async zip upload flow to Workspace.upload_dataset (DATAMAN-240) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Route .zip paths and opt-in (use_zip_upload=True / --zip-upload) directory uploads through the new server-side async zip endpoint. Directory inputs keep the legacy per-image flow by default — no behavior change for existing callers. - rfapi: init_zip_upload, upload_zip_to_signed_url, get_zip_upload_status - workspace: dispatch, _zip_directory, _poll_zip_status helpers - CLI: --zip-upload and --no-wait flags, server-result passthrough - Tests: zip flow, dispatch rules, is_prediction guard, _zip_directory filter - tests/manual/demo_zip_upload.py: manual validation scenarios --- roboflow/adapters/rfapi.py | 43 +++++++ roboflow/cli/handlers/image.py | 36 +++++- roboflow/core/workspace.py | 123 ++++++++++++++++-- tests/cli/test_image_handler.py | 160 ++++++++++++++++++++++++ tests/manual/demo_zip_upload.py | 122 ++++++++++++++++++ tests/test_project.py | 212 ++++++++++++++++++++++++++++++++ tests/test_workspace.py | 43 +++++++ 7 files changed, 728 insertions(+), 11 deletions(-) create mode 100644 tests/manual/demo_zip_upload.py create mode 100644 tests/test_workspace.py diff --git a/roboflow/adapters/rfapi.py b/roboflow/adapters/rfapi.py index 5e224c02..f08043e2 100644 --- a/roboflow/adapters/rfapi.py +++ b/roboflow/adapters/rfapi.py @@ -482,6 +482,49 @@ def _save_annotation_error(response): return AnnotationSaveError(str(responsejson), status_code=response.status_code) +# --------------------------------------------------------------------------- +# Zip upload endpoints +# --------------------------------------------------------------------------- + + +def init_zip_upload(api_key, workspace_url, project_url, split=None, tags=None, batch_name=None) -> dict: + """POST /{ws}/{proj}/upload/zip — initialize a zip upload and get a signed URL.""" + url = f"{API_URL}/{workspace_url}/{project_url}/upload/zip" + body: Dict[str, Union[str, List[str]]] = {} + if split is not None: + body["split"] = split + if tags is not None: + body["tags"] = tags + if batch_name is not None: + body["batchName"] = batch_name + response = requests.post(url, params={"api_key": api_key}, json=body) + if response.status_code not in (200, 201): + raise RoboflowError(response.text) + return response.json() + + +def upload_zip_to_signed_url(signed_url, zip_path) -> None: + """PUT the zip file to the GCS signed URL returned by init_zip_upload.""" + with open(zip_path, "rb") as fh: + response = requests.put( + signed_url, + data=fh, + headers={"Content-Type": "application/zip"}, + timeout=(60, 3600), + ) + if not response.ok: + raise RoboflowError(f"Zip upload to signed URL failed ({response.status_code}): {response.text}") + + +def get_zip_upload_status(api_key, workspace_url, task_id) -> dict: + """GET /{ws}/upload/zip/{task_id} — poll status of an async zip upload.""" + url = f"{API_URL}/{workspace_url}/upload/zip/{task_id}" + response = requests.get(url, params={"api_key": api_key}) + if response.status_code != 200: + raise RoboflowError(response.text) + return response.json() + + # --------------------------------------------------------------------------- # Phase 2: Annotation batch & job endpoints # --------------------------------------------------------------------------- diff --git a/roboflow/cli/handlers/image.py b/roboflow/cli/handlers/image.py index 633ac6a8..a4e60c9c 100644 --- a/roboflow/cli/handlers/image.py +++ b/roboflow/cli/handlers/image.py @@ -29,6 +29,14 @@ def upload_image( retries: Annotated[int, typer.Option("-r", "--retries", help="Retry failed uploads N times")] = 0, labelmap: Annotated[Optional[str], typer.Option(help="Path to labelmap file")] = None, is_prediction: Annotated[bool, typer.Option("--is-prediction", help="Mark upload as prediction")] = False, + zip_upload: Annotated[ + bool, + typer.Option("--zip-upload", help="Zip the directory client-side and use the async zip upload flow"), + ] = False, + no_wait: Annotated[ + bool, + typer.Option("--no-wait", help="Zip flow: return immediately with task_id instead of polling"), + ] = False, ) -> None: """Upload an image file or import a directory.""" args = ctx_to_args( @@ -44,6 +52,8 @@ def upload_image( retries=retries, labelmap=labelmap, is_prediction=is_prediction, + zip_upload=zip_upload, + no_wait=no_wait, ) _handle_upload(args) @@ -191,7 +201,7 @@ def _handle_upload(args): # noqa: ANN001 return path = args.path - if os.path.isdir(path): + if os.path.isdir(path) or (os.path.isfile(path) and path.lower().endswith(".zip")): _handle_upload_directory(args, api_key, path) elif os.path.isfile(path): _handle_upload_single(args, api_key, path) @@ -262,20 +272,40 @@ def _handle_upload_directory(args, api_key: str, path: str) -> None: # noqa: AN return retries = getattr(args, "retries", None) or getattr(args, "num_retries", 0) or 0 + tag_raw = getattr(args, "tag", None) + tags = [t.strip() for t in tag_raw.split(",") if t.strip()] if tag_raw else None + wait = not getattr(args, "no_wait", False) try: - workspace.upload_dataset( + result = workspace.upload_dataset( dataset_path=path, project_name=args.project, num_workers=args.concurrency, batch_name=getattr(args, "batch", None), num_retries=retries, + is_prediction=getattr(args, "is_prediction", False), + use_zip_upload=getattr(args, "zip_upload", False), + split=getattr(args, "split", None), + tags=tags, + wait=wait, ) except Exception as exc: output_error(args, str(exc)) return - # Count files uploaded (approximate via image extensions) + if isinstance(result, dict): + status = result.get("status", "unknown") + data = { + "status": status, + "task_id": result.get("task_id") or result.get("taskId"), + "path": path, + "project": args.project, + "result": result, + } + output(args, data, text=f"Imported {path} to {args.project} (zip upload, status={status})") + return + + # Per-image fallback — count files via image extensions count = 0 image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"} for root, _dirs, files in os.walk(path): diff --git a/roboflow/core/workspace.py b/roboflow/core/workspace.py index 6790a20b..4bc974a8 100644 --- a/roboflow/core/workspace.py +++ b/roboflow/core/workspace.py @@ -5,7 +5,9 @@ import json import os import sys +import tempfile import time +import zipfile from typing import Any, Dict, Generator, List, Optional import requests @@ -298,35 +300,94 @@ def upload_dataset( batch_name=None, num_retries=0, is_prediction=False, - ): + *, + use_zip_upload: bool = False, + tags: Optional[List[str]] = None, + split: Optional[str] = None, + wait: bool = True, + poll_interval: float = 5.0, + poll_timeout: float = 3600.0, + ) -> Optional[dict]: """ Upload a dataset to Roboflow. + A `.zip` ``dataset_path`` or ``use_zip_upload=True`` routes to the + server's async zip upload flow. Everything else (directory inputs by + default) keeps the legacy per-image flow. + Args: - dataset_path (str): path to the dataset + dataset_path (str): path to the dataset directory or a `.zip` file. project_name (str): name of the project - num_workers (int): number of workers to use for parallel uploads + num_workers (int): number of workers to use for parallel uploads (per-image flow only) dataset_format (str): format of the dataset (`voc`, `yolov8`, `yolov5`) project_license (str): license of the project (set to `private` for private projects, only available for paid customers) project_type (str): type of the project (only `object-detection` is supported) batch_name (str, optional): name of the batch to upload the images to. Defaults to an automatically generated value. num_retries (int, optional): number of times to retry uploading an image if the upload fails. Defaults to 0. is_prediction (bool, optional): whether the annotations provided in the dataset are predictions and not ground truth. Defaults to False. - """ # noqa: E501 // docs - from roboflow.util import folderparser - from roboflow.util.image_utils import load_labelmap + use_zip_upload (bool, optional): opt-in to the zip flow for a directory input (the SDK zips it client-side). Ignored when dataset_path is already a `.zip`. + tags (list[str], optional): zip flow only — tags to apply to the uploaded batch. + split (str, optional): zip flow only — dataset split for the uploaded batch. + wait (bool, optional): zip flow only — poll for processing completion. Defaults to True. + poll_interval (float, optional): zip flow only — seconds between status polls. + poll_timeout (float, optional): zip flow only — total seconds to wait before timing out. + Returns: + dict | None: zip flow returns the final/pending status dict; per-image flow returns None. + """ # noqa: E501 // docs if dataset_format != "NOT_USED": print("Warning: parameter 'dataset_format' is deprecated and will be removed in a future release") project, created = self._get_or_create_project( project_id=project_name, license=project_license, type=project_type ) - is_classification = project.type == "classification" - parsed_dataset = folderparser.parsefolder(dataset_path, is_classification=is_classification) if created: print(f"Created project {project.id}") else: print(f"Uploading to existing project {project.id}") + + is_zip_file = dataset_path.lower().endswith(".zip") and os.path.isfile(dataset_path) + use_zip_flow = is_zip_file or use_zip_upload + if use_zip_flow and is_prediction: + raise RoboflowError( + "Zip upload flow does not support is_prediction=True. " + "Call upload_dataset without use_zip_upload for prediction uploads." + ) + + if use_zip_flow: + project_slug = project.id.rsplit("/")[1] + temp_zip = None + try: + if dataset_path.lower().endswith(".zip") and os.path.isfile(dataset_path): + zip_path = dataset_path + else: + zip_path = temp_zip = _zip_directory(dataset_path) + print(f"Zipped {dataset_path} -> {zip_path}") + + init = rfapi.init_zip_upload( + self.__api_key, + self.url, + project_slug, + split=split, + tags=tags, + batch_name=batch_name, + ) + print(f"Uploading zip to Roboflow (task_id={init['taskId']})...") + rfapi.upload_zip_to_signed_url(init["signedUrl"], zip_path) + + if not wait: + print(f"Zip uploaded; not waiting for processing. task_id={init['taskId']}") + return {"task_id": init["taskId"], "status": "pending"} + + return _poll_zip_status(self.__api_key, self.url, init["taskId"], poll_interval, poll_timeout) + finally: + if temp_zip and os.path.exists(temp_zip): + os.unlink(temp_zip) + + from roboflow.util import folderparser + from roboflow.util.image_utils import load_labelmap + + is_classification = project.type == "classification" + parsed_dataset = folderparser.parsefolder(dataset_path, is_classification=is_classification) images = parsed_dataset["images"] location = parsed_dataset["location"] @@ -434,6 +495,8 @@ def _upload(imagedesc): with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: list(executor.map(_upload, images)) + return None + def _get_or_create_project(self, project_id, license: str = "MIT", type: str = "object-detection"): try: existing_project = self.project(project_id) @@ -1271,3 +1334,47 @@ def __str__(self): json_value = {"name": self.name, "url": self.url, "projects": projects} return json.dumps(json_value, indent=2) + + +def _zip_directory(src_dir: str) -> str: + """Zip src_dir into a temp file, skipping hidden and macOS-junk entries.""" + fd, zip_path = tempfile.mkstemp(suffix=".zip", prefix="roboflow-upload-") + os.close(fd) + with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf: + for root, dirs, files in os.walk(src_dir): + dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__MACOSX"] + for name in files: + if name.startswith(".") or name == "Thumbs.db": + continue + abs_path = os.path.join(root, name) + rel = os.path.relpath(abs_path, src_dir) + zf.write(abs_path, arcname=rel) + return zip_path + + +def _poll_zip_status( + api_key: str, + workspace_url: str, + task_id: str, + poll_interval: float, + poll_timeout: float, +) -> dict: + deadline = time.monotonic() + poll_timeout + last_progress = None + while True: + status = rfapi.get_zip_upload_status(api_key, workspace_url, task_id) + state = status.get("status") + progress = (status.get("progress") or {}).get("current") + if progress is not None and progress != last_progress: + print(f" zip-upload progress: {progress}") + last_progress = progress + if state in {"completed", "failed"}: + return status + if time.monotonic() >= deadline: + raise RoboflowError( + f"Zip upload polling timed out after {poll_timeout}s " + f"(task_id={task_id}, last_status={state}). " + f"Call Workspace.upload_dataset(..., wait=False) and poll with " + f"rfapi.get_zip_upload_status to check later." + ) + time.sleep(poll_interval) diff --git a/tests/cli/test_image_handler.py b/tests/cli/test_image_handler.py index ec5fe231..982c4255 100644 --- a/tests/cli/test_image_handler.py +++ b/tests/cli/test_image_handler.py @@ -185,6 +185,166 @@ def test_upload_directory(self, mock_rf_cls): self.assertEqual(result["status"], "imported") self.assertEqual(result["count"], 2) # .jpg and .png only + @patch("roboflow.Roboflow") + def test_upload_zip_file_routes_to_directory_handler(self, mock_rf_cls): + from roboflow.cli.handlers.image import _handle_upload + + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as f: + f.write(b"fake zip") + zip_path = f.name + + try: + mock_ws = MagicMock() + mock_ws.upload_dataset.return_value = {"status": "completed", "task_id": "t1"} + mock_project = MagicMock() + mock_rf_cls.return_value.workspace.return_value = mock_ws + mock_ws.project.return_value = mock_project + + args = _make_args( + json=True, + path=zip_path, + project="proj", + annotation=None, + split="train", + batch=None, + tag=None, + metadata=None, + concurrency=10, + retries=0, + labelmap=None, + is_prediction=False, + no_wait=False, + ) + + buf = io.StringIO() + old = sys.stdout + sys.stdout = buf + try: + _handle_upload(args) + finally: + sys.stdout = old + + mock_ws.upload_dataset.assert_called_once() + mock_project.single_upload.assert_not_called() + finally: + os.unlink(zip_path) + + @patch("roboflow.Roboflow") + def test_no_wait_forwarded(self, mock_rf_cls): + from roboflow.cli.handlers.image import _handle_upload + + with tempfile.TemporaryDirectory() as tmpdir: + mock_ws = MagicMock() + mock_ws.upload_dataset.return_value = {"status": "pending", "task_id": "t9"} + mock_rf_cls.return_value.workspace.return_value = mock_ws + + args = _make_args( + json=True, + path=tmpdir, + project="proj", + annotation=None, + split="train", + batch=None, + tag=None, + metadata=None, + concurrency=10, + retries=0, + labelmap=None, + is_prediction=False, + zip_upload=True, + no_wait=True, + ) + + buf = io.StringIO() + old = sys.stdout + sys.stdout = buf + try: + _handle_upload(args) + finally: + sys.stdout = old + + _, kwargs = mock_ws.upload_dataset.call_args + self.assertEqual(kwargs.get("wait"), False) + self.assertEqual(kwargs.get("use_zip_upload"), True) + + @patch("roboflow.Roboflow") + def test_zip_flow_uses_server_result_in_output(self, mock_rf_cls): + from roboflow.cli.handlers.image import _handle_upload + + with tempfile.TemporaryDirectory() as tmpdir: + mock_ws = MagicMock() + mock_ws.upload_dataset.return_value = {"status": "completed", "task_id": "t1"} + mock_rf_cls.return_value.workspace.return_value = mock_ws + + args = _make_args( + json=True, + path=tmpdir, + project="proj", + annotation=None, + split="train", + batch=None, + tag="foo,bar", + metadata=None, + concurrency=10, + retries=0, + labelmap=None, + is_prediction=False, + zip_upload=True, + no_wait=False, + ) + + buf = io.StringIO() + old = sys.stdout + sys.stdout = buf + try: + _handle_upload(args) + finally: + sys.stdout = old + + result = json.loads(buf.getvalue()) + self.assertEqual(result["task_id"], "t1") + self.assertEqual(result["status"], "completed") + + _, kwargs = mock_ws.upload_dataset.call_args + self.assertEqual(kwargs.get("tags"), ["foo", "bar"]) + self.assertEqual(kwargs.get("use_zip_upload"), True) + + @patch("roboflow.Roboflow") + def test_zip_upload_flag_defaults_false(self, mock_rf_cls): + from roboflow.cli.handlers.image import _handle_upload + + with tempfile.TemporaryDirectory() as tmpdir: + mock_ws = MagicMock() + # MagicMock return → not a dict → per-image output branch + mock_ws.upload_dataset.return_value = None + mock_rf_cls.return_value.workspace.return_value = mock_ws + + args = _make_args( + json=True, + path=tmpdir, + project="proj", + annotation=None, + split="train", + batch=None, + tag=None, + metadata=None, + concurrency=10, + retries=0, + labelmap=None, + is_prediction=False, + ) + + buf = io.StringIO() + old = sys.stdout + sys.stdout = buf + try: + _handle_upload(args) + finally: + sys.stdout = old + + _, kwargs = mock_ws.upload_dataset.call_args + self.assertEqual(kwargs.get("use_zip_upload"), False) + class TestImageDelete(unittest.TestCase): """Test the delete handler.""" diff --git a/tests/manual/demo_zip_upload.py b/tests/manual/demo_zip_upload.py new file mode 100644 index 00000000..2d4e6d5b --- /dev/null +++ b/tests/manual/demo_zip_upload.py @@ -0,0 +1,122 @@ +"""Manual validation for the zip upload flow on Workspace.upload_dataset. + +Edit the constants below, then uncomment the scenario you want to run. +""" + +from __future__ import annotations + +import os +import sys +import time + +thisdir = os.path.dirname(os.path.abspath(__file__)) +rootdir = os.path.abspath(f"{thisdir}/../..") +sys.path.insert(0, rootdir) + +from roboflow import Roboflow # noqa: E402 +from roboflow.adapters import rfapi # noqa: E402 + +# ---- edit these ----------------------------------------------------------- +# Reads from env by default; set directly if you prefer. +API_KEY = os.environ.get("ROBOFLOW_API_KEY", "YOUR_API_KEY") +WORKSPACE = os.environ.get("ROBOFLOW_WORKSPACE", "your-workspace") +PROJECT = os.environ.get("ROBOFLOW_PROJECT", "your-project") + +ZIP_PATH = os.path.expanduser("~/Downloads/COCO Dataset.v50i.coco.zip") +DIR_PATH = os.path.expanduser("~/Downloads/some-dataset-dir") +# For the `status` scenario, paste the task_id returned by the `no_wait` run +TASK_ID = "" +# --------------------------------------------------------------------------- + + +def _batch(tag: str) -> str: + return f"zip-demo-{tag}-{int(time.time())}" + + +def scenario_zip_path(workspace) -> None: + print(f"\n=== scenario: zip_path (file={ZIP_PATH}) ===") + result = workspace.upload_dataset( + dataset_path=ZIP_PATH, + project_name=PROJECT, + batch_name=_batch("zip"), + ) + print(f"result: {result}") + + +def scenario_dir_default(workspace) -> None: + """Directory without use_zip_upload — legacy per-image flow (returns None).""" + print(f"\n=== scenario: dir_default (dir={DIR_PATH}) ===") + result = workspace.upload_dataset( + dataset_path=DIR_PATH, + project_name=PROJECT, + batch_name=_batch("dir-peritem"), + ) + print(f"result: {result} (expected: None -- per-image flow)") + + +def scenario_dir_zip_opt_in(workspace) -> None: + """Directory with use_zip_upload=True — SDK zips client-side.""" + print(f"\n=== scenario: dir_zip_opt_in (dir={DIR_PATH}) ===") + result = workspace.upload_dataset( + dataset_path=DIR_PATH, + project_name=PROJECT, + batch_name=_batch("dir-zip"), + use_zip_upload=True, + ) + print(f"result: {result}") + + +def scenario_no_wait(workspace) -> None: + print(f"\n=== scenario: no_wait (file={ZIP_PATH}) ===") + result = workspace.upload_dataset( + dataset_path=ZIP_PATH, + project_name=PROJECT, + batch_name=_batch("nowait"), + wait=False, + ) + print(f"result: {result}") + print(f"-> paste this task_id into TASK_ID and run scenario_status: {result['task_id']}") + + +def scenario_status(workspace) -> None: + print(f"\n=== scenario: status (task_id={TASK_ID}) ===") + status = rfapi.get_zip_upload_status(API_KEY, workspace.url, TASK_ID) + print(f"status: {status}") + + +def scenario_with_tags_and_split(workspace) -> None: + print(f"\n=== scenario: tags + split (file={ZIP_PATH}) ===") + result = workspace.upload_dataset( + dataset_path=ZIP_PATH, + project_name=PROJECT, + batch_name=_batch("tagged"), + split="train", + tags=["reviewed", "batch-q4"], + ) + print(f"result: {result}") + + +def scenario_prediction_per_image(workspace) -> None: + """Prediction upload always uses per-image flow (zip flow doesn't support it).""" + print(f"\n=== scenario: prediction_per_image (dir={DIR_PATH}) ===") + result = workspace.upload_dataset( + dataset_path=DIR_PATH, + project_name=PROJECT, + batch_name=_batch("pred"), + is_prediction=True, + ) + print(f"result: {result} (expected: None -- per-image flow)") + + +if __name__ == "__main__": + rf = Roboflow(api_key=API_KEY) + workspace = rf.workspace(WORKSPACE) + + # Uncomment the scenario you want to run: + scenario_zip_path(workspace) + # scenario_dir_default(workspace) + # scenario_dir_zip_opt_in(workspace) + # scenario_no_wait(workspace) + # scenario_status(workspace) + # scenario_with_tags_and_split(workspace) + # scenario_prediction_per_image(workspace) diff --git a/tests/test_project.py b/tests/test_project.py index 87dbde92..79580725 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -865,3 +865,215 @@ def test_search_with_annotation_job_params(self): # specified keys match, it doesn't fail if additional keys are missing results = self.project.search() self.assertEqual(len(results), 2) + + +class TestZipUpload(RoboflowTest): + def _rfapi_mocks(self, get_status_side_effect=None, get_status_return=None): + import_target = "roboflow.core.workspace.rfapi" + init_mock = patch( + f"{import_target}.init_zip_upload", + return_value={"signedUrl": "https://signed.example/upload", "taskId": "task-123"}, + ) + put_mock = patch(f"{import_target}.upload_zip_to_signed_url", return_value=None) + if get_status_side_effect is not None: + status_mock = patch(f"{import_target}.get_zip_upload_status", side_effect=get_status_side_effect) + else: + status_mock = patch( + f"{import_target}.get_zip_upload_status", + return_value=get_status_return or {"status": "completed", "result": {"ok": True}}, + ) + project_mock = patch( + "roboflow.core.workspace.Workspace._get_or_create_project", + return_value=(self.project, False), + ) + return {"init": init_mock, "put": put_mock, "status": status_mock, "project": project_mock} + + def test_zip_path_passthrough(self): + import tempfile + + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as fh: + fh.write(b"fake zip") + zip_path = fh.name + + mocks = self._rfapi_mocks() + zip_dir_mock = patch("roboflow.core.workspace._zip_directory") + started = {name: m.start() for name, m in mocks.items()} + started["zip_dir"] = zip_dir_mock.start() + try: + result = self.workspace.upload_dataset(dataset_path=zip_path, project_name=PROJECT_NAME) + self.assertEqual(result, {"status": "completed", "result": {"ok": True}}) + started["init"].assert_called_once() + started["put"].assert_called_once() + started["zip_dir"].assert_not_called() + put_args, _ = started["put"].call_args + self.assertEqual(put_args[0], "https://signed.example/upload") + self.assertEqual(put_args[1], zip_path) + finally: + for m in list(mocks.values()) + [zip_dir_mock]: + m.stop() + import os as _os + + if _os.path.exists(zip_path): + _os.unlink(zip_path) + + def test_directory_with_use_zip_upload_zips_and_cleans_up(self): + import os as _os + import tempfile + + # Pre-create a temp zip path to be returned by _zip_directory + fd, fake_zip = tempfile.mkstemp(suffix=".zip", prefix="roboflow-upload-") + _os.close(fd) + with open(fake_zip, "wb") as fh: + fh.write(b"fake zip payload") + + src_dir = tempfile.mkdtemp() + try: + mocks = self._rfapi_mocks() + zip_dir_mock = patch("roboflow.core.workspace._zip_directory", return_value=fake_zip) + started = {name: m.start() for name, m in mocks.items()} + started["zip_dir"] = zip_dir_mock.start() + try: + self.workspace.upload_dataset( + dataset_path=src_dir, project_name=PROJECT_NAME, use_zip_upload=True + ) + started["zip_dir"].assert_called_once_with(src_dir) + started["init"].assert_called_once() + self.assertFalse(_os.path.exists(fake_zip), "temp zip was not cleaned up") + finally: + for m in list(mocks.values()) + [zip_dir_mock]: + m.stop() + finally: + if _os.path.exists(fake_zip): + _os.unlink(fake_zip) + if _os.path.isdir(src_dir): + _os.rmdir(src_dir) + + def test_directory_default_stays_on_per_image(self): + import tempfile + + src_dir = tempfile.mkdtemp() + try: + rfapi_mocks = self._rfapi_mocks() + per_image = { + "parser": patch( + "roboflow.util.folderparser.parsefolder", + return_value={"location": "/tmp/", "images": []}, + ), + } + started = {name: m.start() for name, m in {**rfapi_mocks, **per_image}.items()} + try: + result = self.workspace.upload_dataset(dataset_path=src_dir, project_name=PROJECT_NAME) + self.assertIsNone(result) + started["init"].assert_not_called() + started["parser"].assert_called_once() + finally: + for m in list(rfapi_mocks.values()) + list(per_image.values()): + m.stop() + finally: + import os as _os + + _os.rmdir(src_dir) + + def test_use_zip_upload_with_is_prediction_raises(self): + import tempfile + + from roboflow.adapters.rfapi import RoboflowError + + src_dir = tempfile.mkdtemp() + try: + mocks = self._rfapi_mocks() + started = {name: m.start() for name, m in mocks.items()} + try: + with self.assertRaises(RoboflowError): + self.workspace.upload_dataset( + dataset_path=src_dir, + project_name=PROJECT_NAME, + use_zip_upload=True, + is_prediction=True, + ) + started["init"].assert_not_called() + finally: + for m in mocks.values(): + m.stop() + finally: + import os as _os + + _os.rmdir(src_dir) + + def test_wait_false_returns_task_id_without_polling(self): + import tempfile + + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as fh: + fh.write(b"fake") + zip_path = fh.name + + mocks = self._rfapi_mocks() + started = {name: m.start() for name, m in mocks.items()} + try: + result = self.workspace.upload_dataset(dataset_path=zip_path, project_name=PROJECT_NAME, wait=False) + self.assertEqual(result, {"task_id": "task-123", "status": "pending"}) + started["status"].assert_not_called() + finally: + for m in mocks.values(): + m.stop() + import os as _os + + if _os.path.exists(zip_path): + _os.unlink(zip_path) + + def test_poll_loop_completes(self): + import tempfile + + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as fh: + fh.write(b"fake") + zip_path = fh.name + + responses_seq = [ + {"status": "running", "progress": {"current": "10%"}}, + {"status": "completed", "result": {"imageCount": 5}}, + ] + mocks = self._rfapi_mocks(get_status_side_effect=responses_seq) + sleep_mock = patch("roboflow.core.workspace.time.sleep", return_value=None) + started = {name: m.start() for name, m in mocks.items()} + started["sleep"] = sleep_mock.start() + try: + result = self.workspace.upload_dataset(dataset_path=zip_path, project_name=PROJECT_NAME, poll_interval=0.0) + self.assertEqual(result, {"status": "completed", "result": {"imageCount": 5}}) + self.assertEqual(started["status"].call_count, 2) + finally: + for m in list(mocks.values()) + [sleep_mock]: + m.stop() + import os as _os + + if _os.path.exists(zip_path): + _os.unlink(zip_path) + + def test_poll_loop_timeout(self): + import tempfile + + from roboflow.adapters.rfapi import RoboflowError + + with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as fh: + fh.write(b"fake") + zip_path = fh.name + + mocks = self._rfapi_mocks(get_status_return={"status": "running"}) + # Make time.monotonic advance past the deadline on the second call. + monotonic_values = iter([1000.0, 1000.0, 9999.0]) + monotonic_mock = patch("roboflow.core.workspace.time.monotonic", side_effect=lambda: next(monotonic_values)) + sleep_mock = patch("roboflow.core.workspace.time.sleep", return_value=None) + started = {name: m.start() for name, m in mocks.items()} + started["monotonic"] = monotonic_mock.start() + started["sleep"] = sleep_mock.start() + try: + with self.assertRaises(RoboflowError): + self.workspace.upload_dataset( + dataset_path=zip_path, project_name=PROJECT_NAME, poll_timeout=1.0, poll_interval=0.0 + ) + finally: + for m in list(mocks.values()) + [monotonic_mock, sleep_mock]: + m.stop() + import os as _os + + if _os.path.exists(zip_path): + _os.unlink(zip_path) diff --git a/tests/test_workspace.py b/tests/test_workspace.py new file mode 100644 index 00000000..19d3c78b --- /dev/null +++ b/tests/test_workspace.py @@ -0,0 +1,43 @@ +"""Unit tests for module-level helpers in roboflow.core.workspace.""" + +import os +import tempfile +import unittest +import zipfile + +from roboflow.core.workspace import _zip_directory + + +class TestZipDirectory(unittest.TestCase): + def test_filters_hidden_and_junk_entries(self): + with tempfile.TemporaryDirectory() as src: + # Real content + with open(os.path.join(src, "sample.jpg"), "wb") as fh: + fh.write(b"jpg bytes") + # Hidden / junk files at the top level + with open(os.path.join(src, ".DS_Store"), "wb") as fh: + fh.write(b"x") + with open(os.path.join(src, "Thumbs.db"), "wb") as fh: + fh.write(b"x") + # macOS junk directory + mac_dir = os.path.join(src, "__MACOSX") + os.mkdir(mac_dir) + with open(os.path.join(mac_dir, "whatever.txt"), "wb") as fh: + fh.write(b"x") + # Hidden directory + hidden_dir = os.path.join(src, ".hidden") + os.mkdir(hidden_dir) + with open(os.path.join(hidden_dir, "inside.txt"), "wb") as fh: + fh.write(b"x") + + zip_path = _zip_directory(src) + try: + with zipfile.ZipFile(zip_path) as zf: + names = set(zf.namelist()) + self.assertEqual(names, {"sample.jpg"}) + finally: + os.unlink(zip_path) + + +if __name__ == "__main__": + unittest.main() From 429834fb758b1618707d1025b7ade3344a505ed2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Apr 2026 17:57:05 +0000 Subject: [PATCH 2/3] =?UTF-8?q?fix(pre=5Fcommit):=20=F0=9F=8E=A8=20auto=20?= =?UTF-8?q?format=20pre-commit=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_project.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_project.py b/tests/test_project.py index 79580725..7a883232 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -933,9 +933,7 @@ def test_directory_with_use_zip_upload_zips_and_cleans_up(self): started = {name: m.start() for name, m in mocks.items()} started["zip_dir"] = zip_dir_mock.start() try: - self.workspace.upload_dataset( - dataset_path=src_dir, project_name=PROJECT_NAME, use_zip_upload=True - ) + self.workspace.upload_dataset(dataset_path=src_dir, project_name=PROJECT_NAME, use_zip_upload=True) started["zip_dir"].assert_called_once_with(src_dir) started["init"].assert_called_once() self.assertFalse(_os.path.exists(fake_zip), "temp zip was not cleaned up") From 577046d39090b96ebda287246697a94e9f058d20 Mon Sep 17 00:00:00 2001 From: Rodrigo Barbosa Date: Thu, 16 Apr 2026 16:33:02 -0300 Subject: [PATCH 3/3] demo zip --- tests/manual/demo_zip_upload.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/manual/demo_zip_upload.py b/tests/manual/demo_zip_upload.py index 2d4e6d5b..19ee09ac 100644 --- a/tests/manual/demo_zip_upload.py +++ b/tests/manual/demo_zip_upload.py @@ -18,12 +18,12 @@ # ---- edit these ----------------------------------------------------------- # Reads from env by default; set directly if you prefer. -API_KEY = os.environ.get("ROBOFLOW_API_KEY", "YOUR_API_KEY") -WORKSPACE = os.environ.get("ROBOFLOW_WORKSPACE", "your-workspace") -PROJECT = os.environ.get("ROBOFLOW_PROJECT", "your-project") +API_KEY = os.environ.get("ROBOFLOW_API_KEY", "") +WORKSPACE = os.environ.get("ROBOFLOW_WORKSPACE", "rodrigo-xn5xn") +PROJECT = os.environ.get("ROBOFLOW_PROJECT", "small-od") -ZIP_PATH = os.path.expanduser("~/Downloads/COCO Dataset.v50i.coco.zip") -DIR_PATH = os.path.expanduser("~/Downloads/some-dataset-dir") +ZIP_PATH = os.path.expanduser("~/Downloads/instance-seg.coco-segmentation.zip") +DIR_PATH = os.path.expanduser("~/Downloads/instance-seg.coco-segmentation") # For the `status` scenario, paste the task_id returned by the `no_wait` run TASK_ID = "" # --------------------------------------------------------------------------- @@ -113,9 +113,9 @@ def scenario_prediction_per_image(workspace) -> None: workspace = rf.workspace(WORKSPACE) # Uncomment the scenario you want to run: - scenario_zip_path(workspace) + # scenario_zip_path(workspace) # scenario_dir_default(workspace) - # scenario_dir_zip_opt_in(workspace) + scenario_dir_zip_opt_in(workspace) # scenario_no_wait(workspace) # scenario_status(workspace) # scenario_with_tags_and_split(workspace)