Skip to content

Commit 3b8b885

Browse files
authored
Handle tar file extraction errors (#58)
Closes #33. Closes #40.
1 parent 09ff1c5 commit 3b8b885

File tree

5 files changed

+116
-41
lines changed

5 files changed

+116
-41
lines changed

sagemaker_shim/app.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,13 @@
3636
@asynccontextmanager
3737
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
3838
async with get_s3_resources() as s3_resources:
39-
async with AuxiliaryData(s3_resources=s3_resources):
39+
auxiliary_data = AuxiliaryData(s3_resources=s3_resources)
40+
await auxiliary_data.setup()
41+
42+
try:
4043
yield
44+
finally:
45+
await auxiliary_data.teardown()
4146

4247

4348
app = FastAPI(lifespan=lifespan)

sagemaker_shim/cli.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from pydantic import ValidationError
1515

1616
from sagemaker_shim.app import app
17+
from sagemaker_shim.exceptions import UserSafeError
1718
from sagemaker_shim.logging import LOGGING_CONFIG
1819
from sagemaker_shim.models import (
1920
AuxiliaryData,
@@ -86,7 +87,16 @@ async def invoke(tasks: str, file: str) -> None:
8687
tasks=tasks, file=file, s3_resources=s3_resources
8788
)
8889

89-
async with AuxiliaryData(s3_resources=s3_resources):
90+
auxiliary_data = AuxiliaryData(s3_resources=s3_resources)
91+
92+
try:
93+
try:
94+
await auxiliary_data.setup()
95+
except* UserSafeError as exception_group:
96+
for exception in exception_group.exceptions:
97+
logger.error(msg=str(exception), extra={"internal": False})
98+
raise SystemExit(1) from exception_group
99+
90100
for task in parsed_tasks.root:
91101
# Only run one task at a time
92102
result = await task.invoke(s3_resources=s3_resources)
@@ -96,9 +106,11 @@ async def invoke(tasks: str, file: str) -> None:
96106
logger.error(
97107
f"Stopping due to failure of task {result.pk}"
98108
)
99-
raise SystemExit(0)
109+
raise SystemExit(result.return_code)
100110

101-
logger.info("Model invocation complete")
111+
logger.info("Model invocation complete")
112+
finally:
113+
await auxiliary_data.teardown()
102114

103115

104116
async def _parse_tasks(

sagemaker_shim/models.py

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
from importlib.metadata import version
2323
from pathlib import Path
2424
from tempfile import SpooledTemporaryFile, TemporaryDirectory
25-
from types import TracebackType
2625
from typing import TYPE_CHECKING, Any, NamedTuple
2726
from zipfile import BadZipFile
2827

@@ -309,8 +308,14 @@ async def download_and_extract_tarball(
309308

310309
f.seek(0)
311310

312-
with ProcUserTarfile.open(fileobj=f, mode="r") as tar:
313-
tar.extractall(path=dest, filter="data")
311+
try:
312+
with ProcUserTarfile.open(fileobj=f, mode="r") as tar:
313+
tar.extractall(path=dest, filter="data")
314+
except (tarfile.TarError, FileNotFoundError) as error:
315+
logger.error(
316+
f"Tarfile could not be extracted: {error}", exc_info=error
317+
)
318+
raise UserSafeError("Tarfile could not be extracted") from error
314319

315320

316321
class AuxiliaryData:
@@ -378,23 +383,24 @@ def post_clean_directories(self) -> list[Path]:
378383
logger.debug(f"{post_clean_directories=}")
379384
return post_clean_directories
380385

381-
async def __aenter__(self) -> "AuxiliaryData":
386+
async def setup(self) -> None:
382387
logger.info("Setting up Auxiliary Data")
383388

384389
self.ensure_directories_are_writable()
385390

386-
async with asyncio.TaskGroup() as task_group:
387-
task_group.create_task(self.download_model())
388-
task_group.create_task(self.download_ground_truth())
391+
try:
392+
await self.download_model()
393+
except UserSafeError as error:
394+
raise UserSafeError(f"Could not setup model: {error}") from error
389395

390-
return self
396+
try:
397+
await self.download_ground_truth()
398+
except UserSafeError as error:
399+
raise UserSafeError(
400+
f"Could not setup ground truth: {error}"
401+
) from error
391402

392-
async def __aexit__(
393-
self,
394-
exc_type: type[BaseException] | None,
395-
exc_val: BaseException | None,
396-
exc_tb: TracebackType | None,
397-
) -> None:
403+
async def teardown(self) -> None:
398404
logger.info("Cleaning up Auxiliary Data")
399405
for p in self.post_clean_directories:
400406
logger.info(f"Cleaning {p=}")

tests/test_cli.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import io
22
import json
3+
import os
34
import resource
45
from unittest.mock import patch
56
from uuid import uuid4
@@ -192,7 +193,7 @@ def test_bad_command_inference_from_task_list(minio, monkeypatch):
192193
f'{{"log": "Stopping due to failure of task {pk1}", "level": "ERROR", '
193194
'"source": "stderr", "internal": true, "task": null}' in result.output
194195
)
195-
assert result.exit_code == 0
196+
assert result.exit_code == 1
196197

197198

198199
def test_good_command_inference_from_s3_uri(minio, monkeypatch):
@@ -317,7 +318,7 @@ def test_bad_command_inference_from_s3_uri(minio, monkeypatch):
317318
f'{{"log": "Stopping due to failure of task {pk1}", "level": "ERROR", '
318319
'"source": "stderr", "internal": true, "task": null}' in result.output
319320
)
320-
assert result.exit_code == 0
321+
assert result.exit_code == 1
321322

322323

323324
def test_logging_setup(minio, monkeypatch):
@@ -440,3 +441,50 @@ def test_memory_limit_defined(minio, monkeypatch):
440441
'{"log": "Setting memory limit to 1337 MB", "level": "INFO", '
441442
'"source": "stdout", "internal": true, "task": null}'
442443
) in result.output
444+
445+
446+
def test_aux_data_failure(minio, monkeypatch, tmp_path):
447+
pk = str(uuid4())
448+
prefix = f"tasks/{pk}"
449+
model_key = f"{prefix}/sub/dodgy.tar"
450+
model_destination = tmp_path / "model"
451+
tasks = [
452+
{
453+
"pk": pk,
454+
"inputs": [],
455+
"output_bucket_name": minio.output_bucket_name,
456+
"output_prefix": prefix,
457+
"timeout": "PT10S",
458+
}
459+
]
460+
461+
monkeypatch.setenv(
462+
"GRAND_CHALLENGE_COMPONENT_CMD_B64J",
463+
encode_b64j(val=["echo", "hello"]),
464+
)
465+
monkeypatch.setenv("GRAND_CHALLENGE_COMPONENT_SET_EXTRA_GROUPS", "False")
466+
monkeypatch.setenv("GRAND_CHALLENGE_COMPONENT_USE_LINKED_INPUT", "False")
467+
monkeypatch.setenv("GRAND_CHALLENGE_COMPONENT_WRITABLE_DIRECTORIES", "")
468+
monkeypatch.setenv(
469+
"GRAND_CHALLENGE_COMPONENT_MODEL",
470+
f"s3://{minio.input_bucket_name}/{model_key}",
471+
)
472+
monkeypatch.setenv(
473+
"GRAND_CHALLENGE_COMPONENT_MODEL_DEST", str(model_destination)
474+
)
475+
476+
sync_s3_operation(
477+
method=s3_upload_fileobj,
478+
Fileobj=io.BytesIO(os.urandom(8)),
479+
Bucket=minio.input_bucket_name,
480+
Key=model_key,
481+
)
482+
483+
runner = CliRunner()
484+
result = runner.invoke(cli, ["invoke", "-t", json.dumps(tasks)])
485+
486+
assert result.exit_code == 1
487+
assert result.stderr.splitlines()[-1] == (
488+
'{"log": "Could not setup model: Tarfile could not be extracted", '
489+
'"level": "ERROR", "source": "stderr", "internal": false, "task": null}'
490+
)

tests/test_models.py

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -395,12 +395,14 @@ async def test_model_and_ground_truth_extraction(
395395
f"{ground_truth_pk}/ground_truth.tar.gz",
396396
)
397397

398-
async with AuxiliaryData(s3_resources=s3_resources):
399-
downloaded_files = {
400-
str(f.relative_to(tmp_path))
401-
for f in tmp_path.rglob("**/*")
402-
if f.is_file()
403-
}
398+
auxiliary_data = AuxiliaryData(s3_resources=s3_resources)
399+
await auxiliary_data.setup()
400+
downloaded_files = {
401+
str(f.relative_to(tmp_path))
402+
for f in tmp_path.rglob("**/*")
403+
if f.is_file()
404+
}
405+
await auxiliary_data.teardown()
404406

405407
assert downloaded_files == {
406408
"model/model-file1.txt",
@@ -425,17 +427,18 @@ async def test_ensure_directories_are_writable_unset(monkeypatch):
425427
monkeypatch.setenv("GRAND_CHALLENGE_COMPONENT_POST_CLEAN_DIRECTORIES", "")
426428

427429
async with get_s3_resources() as s3_resources:
428-
async with AuxiliaryData(s3_resources=s3_resources) as d:
429-
assert d.writable_directories == []
430-
assert d.post_clean_directories == []
431-
assert d.model_source is None
432-
assert d.model_dest == Path("/opt/ml/model")
433-
assert not d.model_dest.exists()
434-
assert d.ground_truth_source is None
435-
assert d.ground_truth_dest == Path(
436-
"/opt/ml/input/data/ground_truth"
437-
)
438-
assert not d.ground_truth_dest.exists()
430+
auxiliary_data = AuxiliaryData(s3_resources=s3_resources)
431+
432+
assert auxiliary_data.writable_directories == []
433+
assert auxiliary_data.post_clean_directories == []
434+
assert auxiliary_data.model_source is None
435+
assert auxiliary_data.model_dest == Path("/opt/ml/model")
436+
assert not auxiliary_data.model_dest.exists()
437+
assert auxiliary_data.ground_truth_source is None
438+
assert auxiliary_data.ground_truth_dest == Path(
439+
"/opt/ml/input/data/ground_truth"
440+
)
441+
assert not auxiliary_data.ground_truth_dest.exists()
439442

440443

441444
@pytest.mark.asyncio
@@ -456,8 +459,8 @@ async def test_ensure_directories_are_writable_set(
456459
)
457460

458461
async with get_s3_resources() as s3_resources:
459-
async with AuxiliaryData(s3_resources=s3_resources) as d:
460-
assert d.writable_directories == expected
462+
auxiliary_data = AuxiliaryData(s3_resources=s3_resources)
463+
assert auxiliary_data.writable_directories == expected
461464

462465

463466
@pytest.mark.asyncio
@@ -480,8 +483,9 @@ async def test_ensure_directories_are_writable(tmp_path, monkeypatch):
480483
)
481484

482485
async with get_s3_resources() as s3_resources:
483-
async with AuxiliaryData(s3_resources=s3_resources):
484-
pass
486+
auxiliary_data = AuxiliaryData(s3_resources=s3_resources)
487+
await auxiliary_data.setup()
488+
await auxiliary_data.teardown()
485489

486490
assert data.stat().st_mode == 0o40777
487491
assert model.stat().st_mode == 0o40777

0 commit comments

Comments
 (0)