Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a1d8b06
feat(sandbox): record state machine transition history in status
zhangjaycee Jun 22, 2026
3df690f
fix(sandbox): clear stale phases on restart to reflect actual contain…
zhangjaycee Jun 25, 2026
7fd5ba4
feat(archive): integrate archive lifecycle into sandbox state machine…
zhangjaycee Jun 10, 2026
75e64f4
feat(archive): add configurable size limits for image push and dir up…
zhangjaycee Jun 10, 2026
64ee310
feat(scripts): add --table filter and --alter-from to gen_ddl.py
zhangjaycee Jun 10, 2026
ac06d50
chore(sql): regenerate sandbox_record.sql with new time columns
zhangjaycee Jun 10, 2026
d7c638f
feat(archive): add archive API endpoint and SDK client method
zhangjaycee Jun 10, 2026
7da1aed
test(archive): add unit and integration tests for archive lifecycle
zhangjaycee Jun 10, 2026
399a196
test(archive): add interactive manual E2E script for archive lifecycle
zhangjaycee Jun 10, 2026
e0b0bf3
chore(diagram): highlight intermediate states with colored fill
zhangjaycee Jun 10, 2026
9f1020f
refactor(sandbox): make BaseManager an ABC with abstract _check_job_b…
zhangjaycee Jun 17, 2026
8b0dd52
refactor(sandbox): rename state_enter_time to intermediate_state_star…
zhangjaycee Jun 17, 2026
a93e184
refactor(schema): group time columns together in SandboxRecord
zhangjaycee Jun 17, 2026
b3fa48b
refactor(archive): encapsulate key helpers into ArchiveKeys class, re…
zhangjaycee Jun 17, 2026
cd3fb84
fix(test): update archive e2e test for ArchiveKeys rename and mock pr…
zhangjaycee Jun 17, 2026
e712129
refactor(config): rename lifecycle *_sec fields to *_seconds
zhangjaycee Jun 17, 2026
9c1a017
refactor(archive): extract storage init into factory + SandboxManager…
zhangjaycee Jun 17, 2026
71833cd
feat(admin): add POST /sandboxes/{sandbox_id}/archive endpoint
zhangjaycee Jun 17, 2026
a46fcca
refactor(archive): move factory functions to abstract base class from…
zhangjaycee Jun 22, 2026
73c15fd
refactor(sandbox): replace intermediate_state_started_at with state_h…
zhangjaycee Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions rock/actions/sandbox/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class State(str, Enum):
PENDING = "pending"
RUNNING = "running"
STOPPED = "stopped"
ARCHIVING = "archiving"
ARCHIVED = "archived"
DELETED = "deleted"


Expand Down Expand Up @@ -54,6 +56,7 @@ class SandboxStatusResponse(BaseModel):
start_time: str | None = None
stop_time: str | None = None
create_time: str | None = None
state_history: list[dict[str, str]] = []


class CommandResponse(BaseModel):
Expand Down
2 changes: 2 additions & 0 deletions rock/actions/sandbox/sandbox_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ class SandboxInfo(TypedDict, total=False):
start_time: str
stop_time: str
delete_time: str
archive_time: str
extended_params: dict[str, str]
state_history: list[dict[str, str]]


_SANDBOX_INFO_KEYS = frozenset(SandboxInfo.__annotations__.keys())
Expand Down
2 changes: 2 additions & 0 deletions rock/admin/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class SandboxRecord(Base):
create_time = Column(String(64), nullable=False, default="")
start_time = Column(String(64), nullable=True)
stop_time = Column(String(64), nullable=True)
archive_time = Column(String(64), nullable=True)
delete_time = Column(String(64), nullable=True)
host_name = Column(String(255), nullable=True)
auth_token = Column(String(512), nullable=True)
rock_authorization_encrypted = Column(String(1024), nullable=True)
Expand Down
17 changes: 15 additions & 2 deletions rock/admin/entrypoints/sandbox_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from typing import Annotated, Any

import httpx
from fastapi import APIRouter, Body, Depends, File, Form, UploadFile
from fastapi import APIRouter, Body, Depends, File, Form, Header, UploadFile

from rock.actions import (
BashObservation,
Expand Down Expand Up @@ -327,7 +327,7 @@ async def _apply_accelerator_type_validation(config: DockerDeploymentConfig) ->

if config.accelerator_type not in allowed:
raise BadRequestRockError(
f"Invalid accelerator_type {config.accelerator_type!r}. " f"Allowed values: {sorted(allowed)}"
f"Invalid accelerator_type {config.accelerator_type!r}. Allowed values: {sorted(allowed)}"
)


Expand Down Expand Up @@ -512,6 +512,19 @@ async def delete(sandbox_id: str = Body(..., embed=True)) -> RockResponse:
return RockResponse(result=f"{sandbox_id} deleted")


@sandbox_router.post("/sandboxes/{sandbox_id}/archive")
@handle_exceptions(error_message="archive sandbox failed")
async def archive(
sandbox_id: NonBlankStr,
rock_authorization: str | None = Header(default=None, alias="X-Key"),
) -> RockResponse:
archive_cfg = sandbox_manager.rock_config.lifecycle.archive
if not archive_cfg.is_allowed(rock_authorization):
raise BadRequestRockError("archive not allowed for this authorization key")
await sandbox_manager.archive_sandbox(sandbox_id)
return RockResponse(result=f"{sandbox_id} archiving")


@sandbox_router.post("/restart")
@handle_exceptions(error_message="restart sandbox failed")
async def restart(sandbox_id: str = Body(..., embed=True)) -> RockResponse[SandboxStartResponse]:
Expand Down
1 change: 1 addition & 0 deletions rock/admin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ async def lifespan(app: FastAPI):
meta_store=meta_store,
)
set_sandbox_manager(sandbox_manager)

warmup_service = WarmupService(rock_config.warmup)
await warmup_service.init()
set_warmup_service(warmup_service)
Expand Down
9 changes: 9 additions & 0 deletions rock/admin/proto/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ class SandboxStartResponse(SandboxResponse):
disk_limit_rootfs: str | None = None


class StateTransitionRecord(BaseModel):
from_state: str
to_state: str
event: str
timestamp: str


# TODO: inherit from SandboxStartResponse
class SandboxStatusResponse(BaseModel):
sandbox_id: str = None
Expand All @@ -36,6 +43,7 @@ class SandboxStatusResponse(BaseModel):
start_time: str | None = None
stop_time: str | None = None
create_time: str | None = None
state_history: list[StateTransitionRecord] = []

@classmethod
def from_sandbox_info(cls, sandbox_info: "SandboxInfo") -> "SandboxStatusResponse":
Expand All @@ -53,6 +61,7 @@ def from_sandbox_info(cls, sandbox_info: "SandboxInfo") -> "SandboxStatusRespons
cpus=sandbox_info.get("cpus"),
memory=sandbox_info.get("memory"),
disk_limit_rootfs=sandbox_info.get("disk_limit_rootfs"),
state_history=sandbox_info.get("state_history", []),
)


Expand Down
18 changes: 16 additions & 2 deletions rock/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,22 @@ class ArchiveAcrConfig:

@dataclass
class ArchiveConfig:
enabled: bool = False
allowed_keys: list[str] | None = None
dir_storage: ArchiveDirStorageConfig = field(default_factory=ArchiveDirStorageConfig)
acr: ArchiveAcrConfig = field(default_factory=ArchiveAcrConfig)
scan_interval_sec: int = 30
timeout_sec: int = 1800
max_retries: int = 3
prefix: str = "rock-archives/"
max_image_push_size: str = "16g"
max_dir_upload_size: str = "16g"

def is_allowed(self, rock_authorization: str | None) -> bool:
"""Check if the caller is allowed to use archive. None = open to all."""
if self.allowed_keys is None:
return True
if not isinstance(self.allowed_keys, list):
return False
return rock_authorization in self.allowed_keys

def __post_init__(self):
if isinstance(self.dir_storage, dict):
Expand All @@ -188,6 +198,9 @@ def __post_init__(self):

@dataclass
class SandboxLifecycleConfig:
reconcile_interval_seconds: int = 30
archive_timeout_seconds: int = 1800
restore_timeout_seconds: int = 1800
archive: ArchiveConfig = field(default_factory=ArchiveConfig)

default_startup_timeout_seconds: float = 600
Expand Down Expand Up @@ -595,4 +608,5 @@ async def update(self):
f", image_registry_mirrors={self.image_registry_mirrors}"
f", image_mirror_lookup_allowlist={self.image_mirror_lookup_allowlist}"
f", instance_registry_mirrors={self.runtime.instance_registry_mirrors}"
f", lifecycle={self.lifecycle}"
)
4 changes: 3 additions & 1 deletion rock/deployments/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ class DockerDeploymentConfig(DeploymentConfig):
"""Custom name for the container. If None, a random name will be generated."""

auto_delete_seconds: int | None = None
"""If set, the container will be automatically deleted after container stopped."""
"""Per-sandbox auto-delete policy. 0 = --rm (immediate delete on stop);
>0 = delete after this many seconds idle; None = fall back to global
lifecycle.auto_delete_after_sec."""

type: Literal["docker"] = "docker"
"""Deployment type discriminator for serialization/deserialization and CLI parsing. Should not be modified."""
Expand Down
116 changes: 116 additions & 0 deletions rock/deployments/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,10 @@ async def restart(self):

logger.info(f"Restarting container {self._container_name} with docker start")

self._service_status.update_status(
phase_name="image_pull", status=Status.SUCCESS, message="skip image pull on restart"
)

# Reuse the same Popen-based attached start used by start(), so the
# restart path also produces a valid self._container_process. Without
# this, _stop() would skip its `if self._container_process is not None`
Expand Down Expand Up @@ -1003,6 +1007,118 @@ async def restart(self):

logger.info(f"Container {self._container_name} restarted successfully")

def _container_exists(self) -> bool:
result = subprocess.run(
["docker", "inspect", self._container_name],
capture_output=True,
timeout=5,
)
return result.returncode == 0

async def restart_from_image(self, image_ref: str):
"""Restart a container, recreating it from a committed image if it no longer exists.

Used by the archive-restore flow: the image was previously committed
and pushed to a registry. If the original container still exists,
behaves like restart(). If not, creates a new container from
*image_ref* with the same config (ports, memory, cpus, volumes).
"""
if self._container_exists():
await self.restart()
return

logger.info(f"Container {self._container_name} not found, recreating from {image_ref}")

executor = get_executor()
loop = asyncio.get_running_loop()

# Recover port mappings from persisted file, or allocate new ones.
status_path = PersistedServiceStatus.gen_service_status_path(self._container_name)
self._service_status.set_sandbox_id(self._container_name)
if os.path.exists(status_path):
with open(status_path) as f:
data = json.load(f)
for port_value, mapping in data.get("port_mapping", {}).items():
self._service_status.add_port_mapping(int(port_value), mapping)
if self._config.port is None:
self._config.port = self._service_status.port_mapping.get(Port.PROXY)

if self._config.port is None:
await self.do_port_mapping()

# Build docker create command from archived image.
env_arg = ["-e", f"ROCK_TIME_ZONE={env_vars.ROCK_TIME_ZONE}"]
volume_args = self._prepare_volume_mounts()
if env_vars.ROCK_LOGGING_PATH:
log_file_path = f"{env_vars.ROCK_LOGGING_PATH}/{self._container_name}"
os.makedirs(log_file_path, exist_ok=True)
os.chmod(log_file_path, 0o777)
volume_args.extend(["-v", f"{log_file_path}:{env_vars.ROCK_LOGGING_PATH}"])
env_arg.extend(
[
"-e",
f"ROCK_LOGGING_PATH={env_vars.ROCK_LOGGING_PATH}",
"-e",
f"ROCK_LOGGING_LEVEL={env_vars.ROCK_LOGGING_LEVEL}",
]
)
volume_args.extend(self._prepare_timezone_mount())
runtime_args = self._build_runtime_args()

cmds = [
"docker",
"create",
"--entrypoint",
"",
*env_arg,
*volume_args,
*runtime_args,
"-p",
f"{self._config.port}:{Port.PROXY}",
"-p",
f"{self._service_status.get_mapped_port(Port.SERVER)}:8080",
"-p",
f"{self._service_status.get_mapped_port(Port.SSH)}:22",
*self._memory(),
*self._cpus(),
*self._storage_opts(),
*self._config.docker_args,
"--name",
self._container_name,
image_ref,
*self._get_rocklet_start_cmd(),
]

cmd_str = shlex.join(cmds)
logger.info(f"Recreating container {self._container_name} from archived image {image_ref}")
logger.info(f"Command: {cmd_str!r}")

await loop.run_in_executor(executor, self._docker_create, cmds)
try:
self._container_process = await loop.run_in_executor(executor, self._docker_start)
except Exception:
DockerUtil.remove_container_force(self._container_name)
raise

if self._config.port is None:
self._config.port = self._service_status.port_mapping.get(Port.PROXY)
if self._config.port is None:
raise Exception(f"Cannot determine rocklet port for container {self._container_name}")

logger.info(f"Starting runtime at {self._config.port}")
self._runtime = RemoteSandboxRuntime.from_config(
RemoteSandboxRuntimeConfig(port=self._config.port, timeout=self._runtime_timeout)
)
self._runtime.set_executor(executor)

with StageTimer("startup_timing", f"[{self._container_name}] Wait until alive", logger):
await self._wait_until_alive(timeout=self._config.startup_timeout)

if self._config.enable_auto_clear:
self._check_stop_task = asyncio.create_task(self._check_stop())

logger.info(f"Container {self._container_name} recreated from {image_ref} successfully")

async def delete(self) -> None:
"""Remove the container via ``docker rm -f``.

Expand Down
32 changes: 32 additions & 0 deletions rock/sandbox/archive/abstract.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from __future__ import annotations

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from rock.config import ArchiveAcrConfig, ArchiveDirStorageConfig


class AbstractDirStorage(ABC):
Expand All @@ -8,6 +14,22 @@ class AbstractDirStorage(ABC):
decides compression format and upload strategy internally.
"""

@classmethod
def from_config(cls, cfg: ArchiveDirStorageConfig) -> AbstractDirStorage:
from rock.sandbox.archive.oss_storage import OssDirStorage
from rock.sandbox.archive.s3_storage import S3DirStorage

kwargs = dict(
endpoint=cfg.endpoint,
bucket=cfg.bucket,
access_key_id=cfg.access_key_id,
access_key_secret=cfg.access_key_secret,
region=cfg.region,
)
if cfg.type == "s3":
return S3DirStorage(**kwargs)
return OssDirStorage(**kwargs)

@abstractmethod
async def upload_dir(self, local_dir: str, key: str) -> None:
"""Pack and upload local_dir to key. Raises FileNotFoundError if local_dir missing."""
Expand Down Expand Up @@ -35,6 +57,16 @@ class AbstractImageStorage(ABC):
(node-local docker daemon). exists / delete are registry HTTP API calls.
"""

@classmethod
def from_config(cls, cfg: ArchiveAcrConfig) -> AbstractImageStorage:
from rock.sandbox.archive.registry_v2 import DockerRegistryV2ImageStorage

return DockerRegistryV2ImageStorage(
registry_url=cfg.registry_url,
username=cfg.username,
password=cfg.password,
)

@property
@abstractmethod
def registry_url(self) -> str:
Expand Down
8 changes: 8 additions & 0 deletions rock/sandbox/archive/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
class ArchiveKeys:
@staticmethod
def dir_key(sandbox_id: str, prefix: str) -> str:
return f"{prefix}{sandbox_id}.tar.gz"

@staticmethod
def image_ref(sandbox_id: str, registry_url: str, namespace: str) -> str:
return f"{registry_url}/{namespace}/sandbox_archived:{sandbox_id}"
Loading
Loading