diff --git a/rock/actions/sandbox/response.py b/rock/actions/sandbox/response.py index 1e44d26017..abb41a9566 100644 --- a/rock/actions/sandbox/response.py +++ b/rock/actions/sandbox/response.py @@ -54,6 +54,7 @@ class SandboxStatusResponse(BaseModel): start_time: str | None = None stop_time: str | None = None create_time: str | None = None + state_history: list[dict[str, str]] = [] class CommandResponse(BaseModel): diff --git a/rock/actions/sandbox/sandbox_info.py b/rock/actions/sandbox/sandbox_info.py index 33ee9ab1d5..3721d84ad2 100644 --- a/rock/actions/sandbox/sandbox_info.py +++ b/rock/actions/sandbox/sandbox_info.py @@ -27,6 +27,7 @@ class SandboxInfo(TypedDict, total=False): stop_time: str delete_time: str extended_params: dict[str, str] + state_history: list[dict[str, str]] _SANDBOX_INFO_KEYS = frozenset(SandboxInfo.__annotations__.keys()) diff --git a/rock/admin/proto/response.py b/rock/admin/proto/response.py index 41aca0e7fe..fb8f140046 100644 --- a/rock/admin/proto/response.py +++ b/rock/admin/proto/response.py @@ -15,6 +15,13 @@ class SandboxStartResponse(SandboxResponse): disk_limit_rootfs: str | None = None +class StateTransitionRecord(BaseModel): + from_state: str + to_state: str + event: str + timestamp: str + + # TODO: inherit from SandboxStartResponse class SandboxStatusResponse(BaseModel): sandbox_id: str = None @@ -36,6 +43,7 @@ class SandboxStatusResponse(BaseModel): start_time: str | None = None stop_time: str | None = None create_time: str | None = None + state_history: list[StateTransitionRecord] = [] @classmethod def from_sandbox_info(cls, sandbox_info: "SandboxInfo") -> "SandboxStatusResponse": @@ -53,6 +61,7 @@ def from_sandbox_info(cls, sandbox_info: "SandboxInfo") -> "SandboxStatusRespons cpus=sandbox_info.get("cpus"), memory=sandbox_info.get("memory"), disk_limit_rootfs=sandbox_info.get("disk_limit_rootfs"), + state_history=sandbox_info.get("state_history", []), ) diff --git a/rock/deployments/docker.py b/rock/deployments/docker.py index 971130eb3a..b6f06fde42 100644 --- a/rock/deployments/docker.py +++ b/rock/deployments/docker.py @@ -964,6 +964,10 @@ async def restart(self): logger.info(f"Restarting container {self._container_name} with docker start") + self._service_status.update_status( + phase_name="image_pull", status=Status.SUCCESS, message="skip image pull on restart" + ) + # Reuse the same Popen-based attached start used by start(), so the # restart path also produces a valid self._container_process. Without # this, _stop() would skip its `if self._container_process is not None` diff --git a/rock/sandbox/sandbox_manager.py b/rock/sandbox/sandbox_manager.py index 1c1ff3070a..5f55a16fbf 100644 --- a/rock/sandbox/sandbox_manager.py +++ b/rock/sandbox/sandbox_manager.py @@ -319,6 +319,7 @@ async def get_status(self, sandbox_id, include_all_states: bool = False) -> Sand start_time=sandbox_info.get("start_time"), stop_time=sandbox_info.get("stop_time"), create_time=sandbox_info.get("create_time"), + state_history=sm.sandbox_info.get("state_history", []) if sm.sandbox_info else [], ) async def build_sandbox_info_from_redis(self, sandbox_id: str, deployment_info: SandboxInfo) -> SandboxInfo | None: diff --git a/rock/sandbox/sandbox_statemachine.py b/rock/sandbox/sandbox_statemachine.py index 105b2423d9..e421767b99 100644 --- a/rock/sandbox/sandbox_statemachine.py +++ b/rock/sandbox/sandbox_statemachine.py @@ -62,6 +62,24 @@ def __init__(self, **kwargs): # Callbacks + def before_transition(self, event, source, target): + if source.value == target.value: + return + if self.sandbox_info is None: + self.sandbox_info = {} + history = self.sandbox_info.setdefault("state_history", []) + history.append( + { + "from_state": source.value.value, + "to_state": target.value.value, + "event": str(event), + "timestamp": get_iso8601_timestamp(), + } + ) + # Cap history to avoid unbounded growth in long-lived sandboxes + if len(history) > 100: + del history[:-100] + async def on_stop(self, sandbox_id: str, operator, meta_store, reason: StopReason = StopReason.MANUAL) -> None: logger.info(f"stop sandbox {sandbox_id} (reason={reason.value})") sandbox_info = self.sandbox_info or {} @@ -98,6 +116,8 @@ async def on_alive(self, sandbox_id: str, meta_store, sandbox_info: SandboxInfo) sandbox_info["state"] = RockState.RUNNING if not sandbox_info.get("start_time"): sandbox_info["start_time"] = get_iso8601_timestamp() + if self.sandbox_info and "state_history" in self.sandbox_info: + sandbox_info["state_history"] = self.sandbox_info["state_history"] await meta_store.update(sandbox_id, sandbox_info) async def on_restart(self, sandbox_id: str, operator, meta_store) -> None: @@ -136,6 +156,7 @@ async def on_restart(self, sandbox_id: str, operator, meta_store) -> None: new_info = dict(info) new_info["state"] = RockState.PENDING new_info.pop("stop_time", None) + new_info.pop("phases", None) await meta_store.update(sandbox_id, new_info) await meta_store.update_timeout(sandbox_id, timeout_info) diff --git a/tests/unit/deployments/test_docker_restart.py b/tests/unit/deployments/test_docker_restart.py new file mode 100644 index 0000000000..93e15a44a8 --- /dev/null +++ b/tests/unit/deployments/test_docker_restart.py @@ -0,0 +1,52 @@ +"""Tests for DockerDeployment.restart() phase handling.""" + +import json +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from rock.deployments.constants import Status +from rock.deployments.docker import DockerDeployment + + +@pytest.fixture +def deployment(): + with patch("rock.deployments.docker.env_vars") as mock_env: + mock_env.ROCK_WORKER_ENV_TYPE = "docker" + mock_env.ROCK_SERVICE_STATUS_DIR = "/tmp/test_status" + d = DockerDeployment(container_name="sb-1", image="python:3.11", port=22555) + return d + + +class TestRestartPhases: + @pytest.mark.asyncio + async def test_image_pull_is_success_after_restart(self, deployment): + """Restart reuses existing container — image_pull should be marked SUCCESS.""" + status_data = { + "phases": { + "image_pull": {"status": "success", "message": "image pull success"}, + "docker_run": {"status": "success", "message": "docker run success"}, + }, + "port_mapping": {"22555": 22555}, + } + + mock_popen = MagicMock() + with ( + patch.object(deployment, "_docker_start", return_value=mock_popen), + patch("os.path.exists", return_value=True), + patch("builtins.open", create=True) as mock_open, + patch.object(deployment, "_wait_until_alive", new_callable=AsyncMock), + patch("rock.deployments.docker.RemoteSandboxRuntime") as mock_runtime_cls, + ): + mock_open.return_value.__enter__ = lambda s: s + mock_open.return_value.__exit__ = MagicMock(return_value=False) + mock_open.return_value.read = MagicMock(return_value=json.dumps(status_data)) + mock_runtime = MagicMock() + mock_runtime_cls.from_config.return_value = mock_runtime + + # Patch json.load to return status_data + with patch("json.load", return_value=status_data): + await deployment.restart() + + phase = deployment._service_status.get_phase("image_pull") + assert phase.status == Status.SUCCESS, f"Expected image_pull to be SUCCESS after restart, got {phase.status}" diff --git a/tests/unit/sandbox/test_sandbox_statemachine.py b/tests/unit/sandbox/test_sandbox_statemachine.py index 54f93b4952..31e765320a 100644 --- a/tests/unit/sandbox/test_sandbox_statemachine.py +++ b/tests/unit/sandbox/test_sandbox_statemachine.py @@ -298,6 +298,16 @@ async def test_updates_state_to_pending(self, mock_meta_store): updated_info = mock_meta_store.update.call_args[0][1] assert updated_info["state"] == State.PENDING + @pytest.mark.asyncio + async def test_clears_stale_phases_on_restart(self, mock_meta_store): + info = dict(_VALID_RESTART_INFO) + info["phases"] = {"image_pull": {"status": "done", "message": "ok"}} + info["stop_time"] = "2026-01-01T01:00:00+08:00" + await self._send_restart(mock_meta_store, sandbox_info=info) + updated_info = mock_meta_store.update.call_args[0][1] + assert "phases" not in updated_info + assert "stop_time" not in updated_info + @pytest.mark.asyncio async def test_writes_timeout_built_from_spec(self, mock_meta_store): # auto_clear_time_minutes=30 in spec → make_timeout_info uses 30 @@ -411,3 +421,98 @@ async def test_missing_spec_skips_operator_but_still_archives(self, mock_meta_st async def test_restores_deleted_from_state_value(self): sm = await SandboxStateMachine.from_state_value(State.DELETED, sandbox_info={}) assert sm.deleted.is_active + + +# --------------------------------------------------------------------------- +# state_history recording +# --------------------------------------------------------------------------- + + +class TestStateHistory: + @pytest.mark.asyncio + async def test_alive_records_history(self): + sm = SandboxStateMachine(sandbox_info={"sandbox_id": "sb-1"}) + await sm.activate_initial_state() + await sm.send("alive", sandbox_id="sb-1", meta_store=AsyncMock(), sandbox_info={}) + history = sm.sandbox_info["state_history"] + assert len(history) == 1 + assert history[0]["from_state"] == "pending" + assert history[0]["to_state"] == "running" + assert history[0]["event"] == "alive" + assert "timestamp" in history[0] + + @pytest.mark.asyncio + async def test_stop_records_history(self): + sm = await SandboxStateMachine.from_state_value(State.RUNNING, sandbox_info={"sandbox_id": "sb-1"}) + await sm.send("stop", sandbox_id="sb-1", operator=AsyncMock(), meta_store=AsyncMock()) + history = sm.sandbox_info["state_history"] + assert len(history) == 1 + assert history[0]["from_state"] == "running" + assert history[0]["to_state"] == "stopped" + assert history[0]["event"] == "stop" + + @pytest.mark.asyncio + async def test_full_lifecycle_accumulates_history(self): + sm = SandboxStateMachine(sandbox_info={"sandbox_id": "sb-1"}) + await sm.activate_initial_state() + await sm.send("alive", sandbox_id="sb-1", meta_store=AsyncMock(), sandbox_info={}) + await sm.send("stop", sandbox_id="sb-1", operator=AsyncMock(), meta_store=AsyncMock()) + history = sm.sandbox_info["state_history"] + assert len(history) == 2 + assert history[0]["from_state"] == "pending" + assert history[0]["to_state"] == "running" + assert history[1]["from_state"] == "running" + assert history[1]["to_state"] == "stopped" + + @pytest.mark.asyncio + async def test_stop_noop_skips_history(self): + sm = await SandboxStateMachine.from_state_value(State.STOPPED, sandbox_info={"sandbox_id": "sb-1"}) + await sm.send("stop_noop", sandbox_id="sb-1") + history = sm.sandbox_info.get("state_history", []) + assert len(history) == 0 + + @pytest.mark.asyncio + async def test_restart_records_history(self): + sm = await SandboxStateMachine.from_state_value(State.STOPPED, sandbox_info=dict(_VALID_RESTART_INFO)) + await sm.send("restart", sandbox_id="sb-1", operator=AsyncMock(), meta_store=AsyncMock()) + history = sm.sandbox_info["state_history"] + assert len(history) == 1 + assert history[0]["from_state"] == "stopped" + assert history[0]["to_state"] == "pending" + assert history[0]["event"] == "restart" + + @pytest.mark.asyncio + async def test_delete_records_history(self): + sm = await SandboxStateMachine.from_state_value(State.STOPPED, sandbox_info=dict(_VALID_DELETE_INFO)) + await sm.send("delete", sandbox_id="sb-1", operator=AsyncMock(), meta_store=AsyncMock()) + history = sm.sandbox_info["state_history"] + assert len(history) == 1 + assert history[0]["from_state"] == "stopped" + assert history[0]["to_state"] == "deleted" + assert history[0]["event"] == "delete" + + @pytest.mark.asyncio + async def test_history_preserved_from_state_value(self): + existing_history = [ + {"from_state": "pending", "to_state": "running", "event": "alive", "timestamp": "2026-01-01T00:00:00+08:00"} + ] + sm = await SandboxStateMachine.from_state_value( + State.RUNNING, sandbox_info={"sandbox_id": "sb-1", "state_history": existing_history} + ) + await sm.send("stop", sandbox_id="sb-1", operator=AsyncMock(), meta_store=AsyncMock()) + history = sm.sandbox_info["state_history"] + assert len(history) == 2 + assert history[0]["event"] == "alive" + assert history[1]["event"] == "stop" + + @pytest.mark.asyncio + async def test_alive_copies_history_to_operator_sandbox_info(self): + sm = SandboxStateMachine(sandbox_info={"sandbox_id": "sb-1"}) + await sm.activate_initial_state() + operator_info = {"sandbox_id": "sb-1"} + mock_meta_store = AsyncMock() + await sm.send("alive", sandbox_id="sb-1", meta_store=mock_meta_store, sandbox_info=operator_info) + updated_info = mock_meta_store.update.call_args[0][1] + assert "state_history" in updated_info + assert len(updated_info["state_history"]) == 1 + assert updated_info["state_history"][0]["event"] == "alive"