diff --git a/docs/_specs/start-from-dockerfile/01_requirement.md b/docs/_specs/start-from-dockerfile/01_requirement.md new file mode 100644 index 0000000000..dd816e6d5c --- /dev/null +++ b/docs/_specs/start-from-dockerfile/01_requirement.md @@ -0,0 +1,37 @@ +# Start from Dockerfile — Requirement Spec + +## Background + +ROCK SDK 目前只支持通过预构建镜像(`SandboxConfig.image`)启动沙箱。调用方必须事先准备好镜像并推送到 registry,再将镜像名传入 `Sandbox.start()`。 + +在实际使用中,Harbor 的任务通常只提供一个包含 Dockerfile 的目录(`environment_dir`),而非预构建镜像,这类任务目前无法直接通过 ROCK SDK 启动沙箱。 + +本次需求:ROCK SDK 支持接收一个包含 Dockerfile 的目录(`environment_dir`)启动沙箱。 + +--- + +## Scope + +输入 `environment_dir`(本地目录,包含 Dockerfile),启动沙箱。 + +--- + +## Acceptance Criteria + +- **AC1**: 给定 `environment_dir`,成功启动沙箱,沙箱内可访问 Dockerfile 中 COPY 的文件 +- **AC2**: 镜像已存在时,跳过构建直接启动 + +--- + +## Constraints + +- 不引入新的 Python 依赖 +- 不新增 Admin API 接口 + +--- + +## Risks + +| 风险 | 影响 | 缓解 | +|------|------|------| +| 大构建上下文传输慢 | 启动延迟增加 | 利用 OSS 中转加速 | diff --git a/docs/_specs/start-from-dockerfile/02_investigation.md b/docs/_specs/start-from-dockerfile/02_investigation.md new file mode 100644 index 0000000000..ed2a3a47d5 --- /dev/null +++ b/docs/_specs/start-from-dockerfile/02_investigation.md @@ -0,0 +1,752 @@ +# Start from Dockerfile — 调研:各 Sandbox 平台如何支持从 Dockerfile 启动 + +## 概述 + +调研 Daytona、E2B、Modal、Runloop、GKE、Docker 六个 Sandbox 平台如何实现从 Dockerfile 启动沙箱,为 Rock 的实现提供参考。 + +--- + +## 各平台接口定义 + +### Daytona + +Daytona 暴露给用户的核心类型有两个:`Image`(客户端构建定义)和 `Snapshot`(服务端持久快照),二者位于不同抽象层。 + +#### `Image` — 客户端声明对象 + +`Image` 是 Pydantic BaseModel,**不直接构造**,通过静态工厂方法创建。它仅描述"如何构建",不持有任何服务端 ID,本身**从不在 Daytona 服务端存在**。 + +```python +class Image(BaseModel): + """不直接构造,通过 from_dockerfile / base / debian_slim 等工厂方法创建。""" + _dockerfile: str = PrivateAttr(default="") # 生成或读取的 Dockerfile 内容 + _context_list: list[Context] = PrivateAttr(default_factory=list) # COPY 依赖的本地上下文文件 + + @staticmethod + def from_dockerfile(path: str | Path) -> "Image": + """读取 Dockerfile,自动提取 COPY 指令依赖的上下文文件。""" + 
@staticmethod + def base(image: str) -> "Image": + """从已有镜像 tag 构造,等价于 `FROM {image}`。""" + @staticmethod + def debian_slim(python_version) -> "Image": ... + + # 链式调用追加 Dockerfile 指令 + def pip_install(self, *packages) -> "Image": ... + def run_commands(self, *commands) -> "Image": ... + def add_local_file(self, local_path, remote_path) -> "Image": ... + def env(self, vars: dict) -> "Image": ... +``` + +#### `Snapshot` — 服务端持久对象 + +`Snapshot` 继承自 OpenAPI 生成的 `SnapshotDto`,是 **Daytona 服务端的预配置沙箱快照**,在服务端**永久存在直到手动删除**。 + +```python +class Snapshot(SnapshotDto): + id: str + name: str + image_name: str + state: SnapshotState # PENDING / BUILDING / ACTIVE / ERROR / BUILD_FAILED + size: float | None + cpu: int; gpu: int; mem: int; disk: int # GiB + entrypoint: list[str] | None + created_at: str; updated_at: str; last_used_at: str + +class CreateSnapshotParams(BaseModel): + name: str + image: str | Image # str=已有镜像名,Image=声明式构建 + resources: Resources | None = None + entrypoint: list[str] | None = None + region_id: str | None = None + +class AsyncSnapshotService: + async def list() -> PaginatedSnapshots + async def get(name: str) -> Snapshot + async def create(params: CreateSnapshotParams, *, on_logs=None, timeout=0) -> Snapshot + async def delete(snapshot: Snapshot) -> None + async def activate(snapshot: Snapshot) -> Snapshot +``` + +#### Image 与 Snapshot 的关系 + +`Image` 是**输入**(构建定义),`Snapshot` 是**输出**(命名持久快照)。一个 Image 可以传入 `snapshot.create()` 产出一个 Snapshot;也可以直接传入 `daytona.create()` 触发一次性构建(不产出命名 Snapshot)。 + +``` +Image (客户端声明) + ├─→ snapshot.create(CreateSnapshotParams(name=..., image=Image)) ─→ 命名 Snapshot (服务端永久持有) + │ │ + │ ▼ + │ daytona.create(CreateSandboxFromSnapshotParams(snapshot=name)) + │ + └─→ daytona.create(CreateSandboxFromImageParams(image=Image)) ─→ 内部临时构建(24h 隐式缓存,无命名 Snapshot) +``` + +#### 启动接口 + +```python +class CreateSandboxFromImageParams(BaseModel): + image: str | Image # 必填,str 或 Image 声明 + resources: Resources | None = None + env_vars: dict[str, 
str] | None = None + auto_stop_interval: int | None = None # 分钟 + auto_delete_interval: int | None = None + network_block_all: bool | None = None + # ... 其他可选字段 + +class CreateSandboxFromSnapshotParams(BaseModel): + snapshot: str # 已存在的 Snapshot 名称 + auto_stop_interval: int | None = None + auto_delete_interval: int | None = None + network_block_all: bool | None = None + # ... 其他可选字段(不含 image / resources,资源由 Snapshot 决定) + +class AsyncDaytona: + async def create( + self, + params: CreateSandboxFromImageParams | CreateSandboxFromSnapshotParams | None = None, + *, + timeout: float = 60, + on_snapshot_create_logs: Callable[[str], None] | None = None, + ) -> AsyncSandbox: ... +``` + +#### 关键观察:两条路径在服务端是同一构建流程 + +从 SDK 源码 (`daytona/_async/daytona.py` 第 474-489 行) 可见,即使用户传 `CreateSandboxFromImageParams`,SDK 也会把 `Image` 序列化为 `CreateBuildInfo(dockerfile_content=..., context_hashes=...)` 发给服务端,服务端的处理流程(`PENDING_BUILD` 状态、流式 build_logs)与 `snapshot.create()` 完全相同。 + +```python +# AsyncDaytona._create() 内部 +if isinstance(params, CreateSandboxFromImageParams) and params.image: + if isinstance(params.image, str): + sandbox_data.build_info = CreateBuildInfo( + dockerfile_content=Image.base(params.image).dockerfile(), + ) + else: + context_hashes = await AsyncSnapshotService.process_image_context(...) 
+ sandbox_data.build_info = CreateBuildInfo( + context_hashes=context_hashes, + dockerfile_content=params.image.dockerfile(), + ) +``` + +两条路径的差异仅在产物归属与生命周期: + +| 路径 | 调用 | 产物 | 生命周期 | +|------|------|------|---------| +| Image → 一次性构建 | `daytona.create(CreateSandboxFromImageParams(image=Image))` | 匿名构建产物 | 平台侧 24 小时隐式缓存,过期自动清理 | +| Image → 命名 Snapshot | `daytona.snapshot.create(CreateSnapshotParams(name=..., image=Image))` 然后 `daytona.create(CreateSandboxFromSnapshotParams(snapshot=name))` | 命名 Snapshot | 永久持有,需 `snapshot.delete()` 显式清理 | + +#### Harbor 的实际使用模式 + +Harbor 在 [harbor/src/harbor/environments/daytona.py](file:///root/harbor/src/harbor/environments/daytona.py) 第 165-217 行采取**外部预置 Snapshot + 客户端动态构建**的混合策略,**不在客户端代码内调用 `snapshot.create()`**: + +```python +# 1. 检查外部预置的命名 Snapshot 是否已 ACTIVE +snapshot_name = snapshot_template_name.format(name=environment_name) +try: + snapshot = await daytona.snapshot.get(snapshot_name) + snapshot_exists = (snapshot.state == SnapshotState.ACTIVE) +except Exception: + snapshot_exists = False + +if snapshot_exists: + # 热路径:复用命名 Snapshot + params = CreateSandboxFromSnapshotParams(snapshot=snapshot_name, ...) +elif force_build or not docker_image: + # 冷路径:从 Dockerfile 一次性构建(仅 24h 隐式缓存) + image = Image.from_dockerfile(dockerfile_path) + params = CreateSandboxFromImageParams(image=image, ...) +else: + # 备用路径:直接用 prebuilt image tag + image = Image.base(docker_image) + params = CreateSandboxFromImageParams(image=image, ...) + +await daytona.create(params=params) +``` + +命名 Snapshot 的生命周期完全由运维通过 Daytona Dashboard / CLI 管理。Harbor 客户端代码只负责"先查 Snapshot,命中就走快路径,否则走 Image 一次性构建"。 + +--- + +### E2B + +**核心类型:** + +```python +class TemplateBase: + def from_dockerfile(self, dockerfile_content_or_path: str) -> TemplateBuilder: ... + def from_image(self, image: str, username: str | None = None, password: str | None = None) -> TemplateBuilder: ... 
+``` + +`from_dockerfile()` 返回 `TemplateBuilder`,支持链式调用追加指令: + +```python +class TemplateBuilder: + def run_cmd(self, command: str | list[str]) -> TemplateBuilder: ... + def copy(self, src, dest) -> TemplateBuilder: ... + def set_envs(self, envs: dict[str, str]) -> TemplateBuilder: ... + def apt_install(self, packages) -> TemplateBuilder: ... + def pip_install(self, packages) -> TemplateBuilder: ... + # ... 其他 builder 方法 +``` + +**构建接口:** + +```python +class AsyncTemplate(TemplateBase): + @staticmethod + async def build( + template: TemplateBuilder, + name: str | None = None, + *, + alias: str | None = None, + cpu_count: int = 2, + memory_mb: int = 1024, + skip_cache: bool = False, + ) -> BuildInfo: ... + + @staticmethod + async def alias_exists(alias: str) -> bool: ... +``` + +**启动接口:** + +```python +class AsyncSandbox: + @classmethod + async def create( + cls, + template: str | None = None, # template name 或 ID + timeout: int | None = None, + envs: dict[str, str] | None = None, + allow_internet_access: bool = True, + ) -> Self: ... +``` + +- 两步模型:先 `build()` Template,再从 Template `create()` Sandbox +- Template 按 alias 缓存,内容哈希作为 alias 一部分 + +--- + +### Modal + +**核心类型:** + +```python +class Image(_Object): + """不直接构造,通过静态工厂方法创建。""" + + @staticmethod + def from_dockerfile( + path: str | Path, + *, + force_build: bool = False, + context_dir: Path | str | None = None, + build_args: dict[str, str] = {}, + secrets: Collection[Secret] | None = None, + gpu: GPU_T = None, + add_python: str | None = None, + ) -> "Image": ... + + @staticmethod + def from_registry( + tag: str, + secret: Secret | None = None, + *, + force_build: bool = False, + add_python: str | None = None, + ) -> "Image": ... 
+``` + +**启动接口:** + +```python +class Sandbox(_Object): + @staticmethod + async def create( + *args: str, + app: App | None = None, + image: Image | None = None, + cpu: float | tuple[float, float] | None = None, + memory: int | tuple[int, int] | None = None, # MiB + gpu: GPU_T = None, + timeout: int = 300, + block_network: bool = False, + volumes: dict[str | PathLike, Volume | CloudBucketMount] = {}, + env: dict[str, str | None] | None = None, + ) -> "Sandbox": ... +``` + +- `Image` 是惰性声明,实际构建在 `Sandbox.create()` 时由平台触发 +- 平台内部按内容哈希缓存 + +--- + +### Runloop + +**核心类型:** + +```python +class BlueprintCreateParams(TypedDict, total=False): + name: Required[str] + dockerfile: str | None # Dockerfile 内容(原始文本) + build_context: BuildContext | None # 构建上下文 + build_args: dict[str, str] | None + launch_parameters: LaunchParameters | None + # ... 其他可选字段 + +class BuildContext(TypedDict, total=False): + object_id: Required[str] # storage object ID + type: Required[Literal["object"]] + +class LaunchParameters(BaseModel): + architecture: Literal["x86_64", "arm64"] | None = None + custom_cpu_cores: int | None = None + custom_gb_memory: int | None = None # GiB + custom_disk_size: int | None = None # GiB + keep_alive_time_seconds: int | None = None + # ... 其他字段 + +class BlueprintView(BaseModel): + id: str + name: str + status: Literal["queued", "provisioning", "building", "failed", "build_complete"] + # ... 其他字段 +``` + +**构建接口:** + +```python +class AsyncRunloopSDK: + storage_object: AsyncStorageObjectOps + blueprint: AsyncBlueprintOps + devbox: AsyncDevboxOps + +# 上传构建上下文 +storage_object = await sdk.storage_object.upload_from_dir( + dir_path: Path, name: str, ttl: timedelta, +) -> StorageObject + +# 创建 Blueprint +blueprint = await sdk.blueprint.create( + name: str, dockerfile: str, build_context: BuildContext, ... +) -> AsyncBlueprint +``` + +**启动接口:** + +```python +devbox = await sdk.devbox.create_from_blueprint_id( + blueprint_id: str, name: str | None = None, ... 
+) -> AsyncDevbox +``` + +- 三步模型:上传上下文 → 创建 Blueprint → 从 Blueprint 创建 Devbox +- Blueprint 按名称缓存 + +--- + +### GKE + +无平台 SDK,通过 `gcloud` CLI 和 Kubernetes Python SDK 组合实现。 + +**构建:** + +```bash +gcloud builds submit \ + --tag <registry>/<environment_name>:latest \ + --timeout 2400 \ + --machine-type E2_HIGHCPU_8 \ + <environment_dir> +``` + +**镜像检查:** + +```bash +gcloud artifacts docker images describe <image_url> +``` + +**启动:** + +```python +from kubernetes import client as k8s_client + +core_api = k8s_client.CoreV1Api() +core_api.create_namespaced_pod(namespace=..., body=pod) +# pod spec 中引用 Cloud Build 产出的镜像 +``` + +- 构建和启动分离:Cloud Build 产出镜像 → Kubernetes 从镜像创建 Pod +- 按 `{environment_name}:latest` 检查 Artifact Registry 中镜像是否存在 + +--- + +### Docker + +无平台 SDK,直接通过 `docker compose` CLI 操作。 + +```bash +# 构建 +docker compose -f base.yaml -f build.yaml build + +# 启动 +docker compose ... up --detach --wait +``` + +- 构建和启动由 compose 统一管理 +- 依赖本地 Docker daemon,Docker layer cache 天然缓存 + +--- + +## 缓存机制 + +### Daytona — 双层缓存:命名 Snapshot(显式)+ 平台 24h 隐式缓存 + +Daytona 的缓存有两层: + +**第一层:调用方显式管理的命名 Snapshot**(热缓存) + +调用方按命名约定(如 `harbor__{name}__snapshot`)查找预创建的 Snapshot,命中即走快路径: + +```python +snapshot_name = snapshot_template_name.format(name=environment_name) + +# 检查 Snapshot 是否存在且可用 +snapshot = await daytona.snapshot.get(snapshot_name) # REST GET,不存在则抛异常 +if snapshot.state == SnapshotState.ACTIVE: + # 从 Snapshot 启动,跳过构建 + params = CreateSandboxFromSnapshotParams(snapshot=snapshot_name, ...) 
+``` + +- 缓存 key:调用方约定的 Snapshot 名称 +- 内容变更检测:无,Snapshot 必须由运维(Dashboard/CLI/`snapshot.create()`)外部预创建和更新 +- `force_build` 无法绕过 Snapshot(如果存在则始终使用) + +**第二层:Image 路径下平台侧 24 小时隐式缓存**(温缓存) + +当 Snapshot 不存在,且 `force_build=True` 或未提供预构建镜像(`docker_image`)时,调用方走 `CreateSandboxFromImageParams(image=Image.from_dockerfile(...))`。SDK 把 Image 转为 `CreateBuildInfo(dockerfile_content, context_hashes)` 发给服务端,服务端按内容哈希自动缓存构建产物 24 小时(过期清理)。 + +- 缓存 key:服务端按 `dockerfile_content` + `context_hashes` 计算 +- 内容变更检测:自动,但只在 24h 窗口内有效 +- 不产生命名 Snapshot,即不会进入第一层缓存 + +### E2B — Template 内容哈希 + +缓存基于 `environment_dir` 目录内容的 SHA-256 哈希,嵌入 Template alias。 + +```python +# alias 格式:{environment_name}__{sha256[:8]} +template_name = f"{environment_name}__{dirhash(environment_dir, 'sha256')[:8]}".replace(".", "-") + +# 检查 Template 是否已存在 +exists = await AsyncTemplate.alias_exists(template_name) # REST GET /templates/aliases/{alias} + +if not force_build and exists: + pass # 跳过构建,直接用已有 Template 启动 +else: + await AsyncTemplate.build(template=..., alias=template_name, ...) +``` + +- 缓存 key:`environment_name` + 目录内容哈希 +- 内容变更检测:自动,任何文件变化产生新哈希 → 新 alias → 触发重建 +- 旧 Template 不会自动清理 + +### Modal — 平台侧隐式缓存 + +调用方无需管理缓存。`Image` 对象在 `Sandbox.create()` 时发送给 Modal 服务端,服务端根据完整的镜像定义(Dockerfile 内容、上下文文件、构建参数等)计算缓存 key。 + +```python +# 调用方代码中无任何缓存逻辑 +image = Image.from_dockerfile(path, context_dir=environment_dir) +sandbox = await Sandbox.create(image=image, ...) + +# SDK 内部:将完整镜像定义序列化为 protobuf,发送 ImageGetOrCreate 请求 +# 服务端判断是否命中缓存,命中则直接返回已有镜像 +req = api_pb2.ImageGetOrCreateRequest(image=image_definition, force_build=force_build, ...) 
+resp = await client.stub.ImageGetOrCreate(req) +``` + +- 缓存 key:服务端根据镜像定义 protobuf 计算(包含 Dockerfile 内容、上下文文件哈希) +- 内容变更检测:自动,服务端按内容哈希判断 +- `force_build` 通过 `Image.from_dockerfile(force_build=True)` 传递 + +### Runloop — Blueprint 名称查找 + +缓存基于 Blueprint 名称查找,无内容哈希。 + +```python +blueprint_name = f"harbor_{environment_name}_blueprint" + +# 查找已有 Blueprint:查私有 + 公有列表,取最新的 build_complete 状态 +private_page = await client.api.blueprints.list(name=blueprint_name) +public_page = await client.api.blueprints.list_public(name=blueprint_name) +candidates = [bp for bp in all_blueprints if bp.name == blueprint_name and bp.status == "build_complete"] +candidates.sort(key=lambda bp: bp.create_time_ms, reverse=True) +blueprint_id = candidates[0].id if candidates else None + +if not force_build and blueprint_id: + pass # 复用已有 Blueprint +else: + blueprint_id = await client.blueprint.create(name=blueprint_name, dockerfile=..., ...) +``` + +- 缓存 key:`harbor_{environment_name}_blueprint`(仅名称) +- 内容变更检测:无,`environment_dir` 内容变化但名称不变时,静默复用旧 Blueprint +- 同名 Blueprint 可共存多个,取最新的 `build_complete` + +### GKE — Registry 镜像检查 + +缓存基于 Artifact Registry 中镜像是否存在。 + +```python +image_url = f"{registry_location}-docker.pkg.dev/{project_id}/{registry_name}/{environment_name}:latest" + +# 检查镜像是否存在 +check_cmd = ["gcloud", "artifacts", "docker", "images", "describe", image_url, "--project", project_id] +result = await asyncio.create_subprocess_exec(*check_cmd, stdout=DEVNULL, stderr=DEVNULL) +exists = (result.returncode == 0) + +if not force_build and exists: + pass # 使用已有镜像 +else: + await _build_and_push_image() # gcloud builds submit,覆盖 :latest +``` + +- 缓存 key:`{environment_name}:latest`(固定 tag) +- 内容变更检测:无,`environment_dir` 内容变化但名称不变时,静默复用旧镜像 +- `force_build=True` 重新构建并覆盖 `:latest` + +### Docker — Layer Cache + 进程内锁 + +缓存依赖 Docker daemon 自身的 layer cache,进程内通过 `asyncio.Lock` 去重并发构建。 + +```python +# 类级别锁字典 +_image_build_locks: dict[str, asyncio.Lock] = {} + +# 构建时按 environment_name 加锁 +lock = 
_image_build_locks.setdefault(environment_name, asyncio.Lock()) +async with lock: + await docker_compose(["build"]) # Docker layer cache 处理增量构建 +``` + +- 缓存 key:Docker layer cache(按 Dockerfile 指令 + 文件内容) +- 内容变更检测:自动,Docker 逐层比对,变化的层及后续层重建 +- 进程内锁保证同一 `environment_name` 不并发构建,但不跨进程 + +--- + +## 构建产物存储 + +> 本节统一从五个维度描述每个平台:**产物类型 / 存储位置 / 用户可见的管理 API / 生命周期 / 用户控制粒度**。E2B 的服务端实现(Firecracker pipeline、SHA-256 层哈希链等)放在小节末尾的"补充"作为深入参考。 + +### Daytona — 两层产物:匿名构建产物 + 命名 Snapshot + +Daytona 同一个底层存储承载两种命名的产物,调用方需明确选哪一种: + +#### A. 匿名构建产物(`Image` 直走 `daytona.create()`) + +- **产物类型**:服务端按 Dockerfile 内容 + 上下文哈希计算的匿名快照(无名字、无 `id` 暴露给调用方) +- **存储位置**:Daytona 平台内部 Object Storage(S3 兼容),调用方不可直达底层 +- **管理 API**:**无**。调用方拿不到 ID,也不能 list/delete 这一层产物 +- **生命周期**:服务端自动缓存 **24 小时**,过期清理 +- **用户控制**:`Image.from_dockerfile(force_build=True)` 强制重建当次 + +#### B. 命名 Snapshot(`AsyncSnapshotService.create()`) + +- **产物类型**:注册到 Daytona 数据库的 Snapshot 对象(`id` / `name` / `state` / `image_name` / `size` / `cpu/gpu/mem/disk` 等字段)。**Snapshot 不是标准 Docker 镜像**,是平台专有快照格式 +- **存储位置**:同上,但产物在数据库中有名字、有状态、可查询 +- **管理 API**:完整的 CRUD 接口 + + ```python + class AsyncSnapshotService: + async def list(page=None, limit=None) -> PaginatedSnapshots + async def get(name: str) -> Snapshot + async def create(params: CreateSnapshotParams, *, on_logs=None, timeout=0) -> Snapshot + async def delete(snapshot: Snapshot) -> None + async def activate(snapshot: Snapshot) -> Snapshot # 激活归档态的 Snapshot + ``` +- **生命周期**:永久持有,需手动删除 +- **用户控制**:`snapshot.delete()` / Dashboard / CLI + +#### 构建上下文传输 + +`Image` 对象的 `_context_list`(`COPY` 引用的本地文件)通过 `AsyncObjectStorage.upload()` 上传,bucket 由服务端 `get_push_access()` 动态下发(SDK 的默认 fallback bucket 是 `daytona-volume-builds`,但生产环境通常不用 fallback)。上传产生 content hash 数组随 `CreateBuildInfo(context_hashes=..., dockerfile_content=...)` 提交给服务端。 + +--- + +### E2B — 命名 Template + +- **产物类型**:注册到 E2B 后端的 Template(暴露给调用方的标识是 `template_id` 或 `alias`)。底层是 Firecracker microVM 
快照(rootfs/memfile/snapfile),但调用方不直接接触这一层 +- **存储位置**:E2B 平台云对象存储,元数据存数据库 +- **管理 API**: + + ```python + class AsyncTemplate: + @staticmethod + async def build(template, name=None, *, alias=None, cpu_count=2, memory_mb=1024, skip_cache=False) -> BuildInfo + @staticmethod + async def alias_exists(alias: str) -> bool # REST GET /templates/aliases/{alias} + # 删除走 CLI: `e2b template delete ` + ``` +- **生命周期**:永久保留,无自动清理;构建失败时服务端自动回收已上传对象 +- **用户控制**: + - 缓存复用:alias 相同则复用(Harbor 把 `dirhash[:8]` 嵌入 alias 实现内容寻址) + - 强制重建:`AsyncTemplate.build(skip_cache=True)` + - 删除:`e2b template delete` CLI / API + +#### 补充:服务端实现细节(如不关心可跳过) + +E2B 后端把 Dockerfile 拆成阶段流水线 `BaseBuilder → UserBuilder → StepBuilders(每条指令) → PostProcessing → Optimize`,每阶段计算 SHA-256 哈希作为缓存 key(输入含 `provision_version`、`disk_size`、`from_image`、`step_args`、`files_hash` 等),命中即跳过该阶段。每阶段产出 dirty-block 差异层(`rootfs.ext4.header`、`memfile.header`)。最终产物按 `buildID` 组织在 GCS/S3 (`TEMPLATE_BUCKET_NAME`),构建缓存索引在另一个 bucket (`BUILD_CACHE_BUCKET_NAME`)。这部分对调用方完全不可见,仅决定缓存命中率。 + +--- + +### Modal — 隐式哈希缓存(无显式产物) + +- **产物类型**:文件系统快照,**调用方完全无法引用**——SDK 不返回 `image_id` 给用户代码持有,下次调用时按内容重新计算哈希查找缓存 +- **存储位置**:Modal 平台内部,完全抽象 +- **管理 API**:**无**列表 / 查询 / 删除 API。Image 只是一个声明式 `_Image` 对象,调用 `Sandbox.create(image=image)` 时通过 gRPC `ImageGetOrCreate(image_definition_pb, force_build=...)` 提交给服务端,服务端按内容哈希返回已有或触发新构建 +- **生命周期**:随镜像定义自动缓存;定义变化(Dockerfile 内容、build_args、context_files、`force_build`)即触发重建 +- **用户控制**: + - 强制重建:`Image.from_dockerfile(force_build=True)` 或 `MODAL_FORCE_BUILD=1` 环境变量 + - 无手动删除入口(旧产物由平台按使用情况和容量策略自行回收) + +--- + +### Runloop — 命名 Blueprint + 独立的 build context 对象 + +- **产物类型**:Blueprint(平台托管的容器镜像),独立有 `id` / `name` / `status`(`queued`/`provisioning`/`building`/`failed`/`build_complete`) / `create_time_ms`。同名 Blueprint 可共存多个版本 +- **存储位置**:Runloop 平台内部 +- **管理 API**: + + ```python + client.api.blueprints.list(name=...) # 私有列表 + client.api.blueprints.list_public(name=...) 
# 公开列表 + client.blueprint.create(name=..., dockerfile=..., build_context=BuildContext(object_id=...)) + client.blueprint.delete(blueprint_id) + ``` +- **特殊:构建上下文是独立托管对象** + + ```python + storage_object = await sdk.storage_object.upload_from_dir( + dir_path=Path, name=str, ttl=timedelta, # 上下文有自己的 TTL + ) -> StorageObject + ``` + Blueprint 创建请求引用 `BuildContext(object_id=storage_object.id, type="object")`,因此构建上下文与 Blueprint 解耦:上下文短命(TTL 1h 即可),Blueprint 永久。 +- **生命周期**:Blueprint **永久保留并持续计费**(官方文档明确提醒);StorageObject 按 TTL 自动过期 +- **用户控制**: + - 缓存复用:按 `name` 查 list,取最新 `build_complete`;**无内容哈希**,同名同 dockerfile 改了内容也不会触发重建 + - 强制重建:调用 `blueprint.create()` 不复用旧 ID 即产生新 Blueprint + - 删除:`blueprint.delete()`(官方建议主动清理旧版本控制成本) + +--- + +### GKE — 用户自管 Artifact Registry + +- **产物类型**:标准 OCI/Docker 镜像(这是六个平台中唯一让用户拿到原生 Docker 镜像的) +- **存储位置**:**用户自有的** Google Artifact Registry,按 region 存储。Daytona/E2B/Modal/Runloop 都是平台托管,GKE 是用户托管 +- **管理 API**: + + ```bash + gcloud builds submit --tag /:latest # 构建并推送 + gcloud artifacts docker images describe # 检查存在 + gcloud artifacts docker images delete # 删除 + ``` + Repository 也支持 cleanup policy(按 tag 状态、版本数、镜像年龄自动清理) +- **生命周期**:用户完全自管,Cloud Build 缓存层由 GCP 自动管理 +- **用户控制**: + - 缓存复用:tag 固定为 `{environment_name}:latest`,**无内容哈希**,内容变化但 tag 不变会静默复用旧镜像 + - 强制重建:`force_build=True` 走 `gcloud builds submit` 覆盖 `:latest` + - 删除:CLI / Console / cleanup policy +- **费用**:用户 Artifact Registry 按 GB/月计费 + 跨 region 拉取的出网费用 + +--- + +### Docker — 本地 Docker daemon(无远端存储) + +- **产物类型**:标准 Docker 镜像 +- **存储位置**:**宿主机本地磁盘**(无 push 到 registry) +- **管理 API**:原生 Docker CLI + + ```bash + docker images # 列表 + docker rmi # 删除 + docker image prune # 清理悬挂镜像 + docker compose down --rmi all # 一并删除 compose 镜像 + ``` +- **生命周期**:持久存在直到显式 `docker rmi` 或 `docuum` 等清理工具 +- **用户控制**: + - 缓存复用:Docker daemon 自动按 layer cache,Dockerfile 指令或文件内容变化即触发对应层及其后所有层重建(**自动内容感知**) + - 进程内并发去重:Harbor 通过类级别 `_image_build_locks: dict[name, asyncio.Lock]` 串行化同名镜像的并发构建,跨进程不生效 + - 强制重建:`docker 
compose build --no-cache` + +--- + +## 对比 + +### 接口与缓存 + +| 平台 | 接口模式 | 缓存 key | 内容变更检测 | +|------|---------|---------|------------| +| Daytona | 热路径 `snapshot.get(name)` → `create(FromSnapshot)`;冷路径 `Image.from_dockerfile()` → `create(FromImage)` | 命名 Snapshot 名称(显式)+ 服务端构建定义哈希(24h 隐式) | 仅冷路径自动(24h 内) | +| E2B | `from_dockerfile()` → `build()` → `create()` | `name__sha256[:8]` | 自动(目录哈希) | +| Modal | `Image.from_dockerfile()` → `Sandbox.create()` | 平台侧计算(镜像定义哈希) | 自动(平台侧) | +| Runloop | `upload` → `blueprint.create()` → `devbox.create()` | `harbor_{name}_blueprint` | 无 | +| GKE | `gcloud builds submit` → `create_pod()` | `{name}:latest` | 无 | +| Docker | `docker compose build` → `up` | Docker layer cache | 自动(逐层比对) | + +### 构建产物与存储 + +| 平台 | 产物可见性 | 暴露给用户的标识 | 存储位置 | 默认生命周期 | 显式删除 API | +|------|----------|---------------|---------|------------|------------| +| Daytona(Image 路径) | 不可见 | 无 | 平台 S3 兼容存储 | 24h 自动过期 | 无(不可主动删) | +| Daytona(Snapshot 路径) | 可见,平台专有快照 | `name` / `id` / `state` | 同上 | 永久 | `snapshot.delete()` | +| E2B | 可见,命名 Template | `template_id` / `alias` | GCS/S3(平台托管) | 永久 | `e2b template delete` CLI | +| Modal | 不可见 | 无(无 `image_id` 句柄) | 平台内部抽象 | 平台自行回收 | 无(仅 `force_build`) | +| Runloop | 可见,命名 Blueprint | `id` / `name` / `status` | 平台内部 | 永久且持续计费 | `blueprint.delete()` | +| GKE | 可见,标准 OCI 镜像 | 镜像 URL `repo/name:tag` | **用户自有** Artifact Registry | 永久(cleanup policy 可选) | `gcloud artifacts docker images delete` | +| Docker | 可见,标准 Docker 镜像 | 本地 image name/id | **本地** Docker daemon | 永久直到 `docker rmi` | `docker rmi` / `docker image prune` | + +> **观察一**:六个平台只有 GKE 和 Docker 让用户拿到原生 OCI/Docker 镜像;其余四个均为平台专有的不透明产物。 +> +> **观察二**:仅有 Daytona(Image 路径)和 Modal 不暴露产物 ID,其它都暴露命名标识,可以查、可以删。 +> +> **观察三**:除 GKE 和 Docker 外,存储位置都在平台侧;Runloop 还会持续计费,意味着调用方需要主动管理生命周期。 + +### 用户可见标识与 Hash 编码 + +聚焦"调用方在自己的代码里实际持有/打印的标识"以及"hash 是否进入这个标识"。Rock 选 tag 方案时这是最直接的对照面。 + +| 平台 | 用户可见标识 | 是否原生 OCI tag | hash 进入标识 | hash 长度 | 
+|------|------------|--------------|-------------|---------| +| **GKE** | 镜像 URL `repo/{env_name}:latest` | ✅ | ❌ | — | +| **Docker** | `docker-compose.yaml` 中写死的镜像名 | ✅ | ❌ | — | +| Runloop | Blueprint name(`harbor_{name}_blueprint`) | ❌(平台 ID) | ❌ | — | +| Daytona | Snapshot name(`harbor__{name}__snapshot`) | ❌(平台 ID) | ❌ | — | +| **E2B** | Template alias(`{env_name}__{sha256[:8]}`) | ❌(平台 alias) | ✅ | **8 hex / 32 bit** | +| Modal | 无(SDK 不返回 image_id) | ❌(不暴露) | — | — | + +**观察四**:让用户拿到原生 docker tag 的两个平台(GKE / Docker)都不在 tag 里编码 hash,缓存逻辑要么靠固定 `:latest` + 平台/Daemon 自身的 layer cache,要么靠调用方手动管理命名约定。 + +**观察五**:在六个平台里只有 **E2B** 把 hash 嵌入到用户可见的标识,长度仅 **8 hex(32 bit)**。E2B 选 8 hex 的关键前提是 alias 还有 `env_name` 前缀做隔离 —— 碰撞只在"同名 env"内才发生。Rock "用 `user_id` 作为 repository" 的隔离思路与之同构。 + +**观察六**:服务端**不可见**的内部 hash(Daytona 的 `dockerfile_content + context_hashes`、Modal 的 image_definition protobuf、E2B 服务端每构建阶段的 SHA-256)普遍取**全长**或**长哈希**,因为服务端无人眼读。用户可见 hash 才会牺牲部分熵换可读。 + +**Rock 选择空间(同一 repository 内、按 birthday bound `n²/(2·2^bits)` 估,n=10⁶)**: + +| 长度 | 例 | 同 repo 内 1M 镜像碰撞概率 | +|------|------|-----------| +| 8 hex(E2B 同款) | `3a7bd3e2` | ≈100%(界值 ≈116 > 1,已饱和) | +| 16 hex | `3a7bd3e2360a3d29` | 3×10⁻⁸ | +| 20 hex | `3a7bd3e2360a3d29eea4` | 4×10⁻¹³ | +| 32 hex | `3a7bd3e2360a3d29eea436fcfb7e44c7` | 1×10⁻²⁷ | +| 64 hex | 完整 SHA-256 | 4×10⁻⁶⁶(可视为 0) | + +实际单个 user_id 下的环境数远小于 1M(通常 <1k),按同一公式 n=10³ 时各行界值再低 6 个数量级(8 hex 约 1×10⁻⁴)。对照下,E2B 的 8 hex 在这一规模下碰撞风险已经很低;Rock 取 16 hex 以上即可在所有合理场景获得碰撞 negligible。 + +--- + +### Harbor 使用方式参考 + +Harbor 的 `BaseEnvironment` 通过 `start(force_build: bool)` 统一入口,各环境在 `start()` 内部完成从 Dockerfile 到沙箱运行的完整流程。构建上下文统一为 `environment_dir`,Dockerfile 位于 `environment_dir / "Dockerfile"`。 diff --git a/docs/_specs/start-from-dockerfile/03_implementation.md b/docs/_specs/start-from-dockerfile/03_implementation.md new file mode 100644 index 0000000000..4b6bb13762 --- /dev/null +++ b/docs/_specs/start-from-dockerfile/03_implementation.md @@ -0,0 +1,600 @@ +# Start from Dockerfile — Implementation Plan + +## 背景 + +ROCK SDK 
目前只支持通过预构建镜像名(`SandboxConfig.image: str`)启动沙箱。调用方必须在 SDK 外部完成 `docker build` + `docker push`,再将镜像 tag 传入 `Sandbox.start()`。 + +本次实现引入 `Image` 一等类型,支持 `Image.from_dockerfile(path)` 声明式接口。调用方将 `Image` 对象赋给 `SandboxConfig.image`,SDK 在 `Sandbox.start()` 内部透明地完成 DinD 构建、推送和缓存检查,最终将 `Image` 解析为字符串 image_name 后发送给 Admin API。方案参考 Daytona(`Image.from_dockerfile(path)` → `create()`)和 Modal(`Image.from_dockerfile(path)` → `Sandbox.create(image=image)`)的设计。 + +关键约束:Admin API(`SandboxStartRequest.image: str`)、DB schema(`SandboxRecord.image = Column(String(512))`)不做任何修改。`Image` 类型在 HTTP 边界之前完全解析为字符串。 + +--- + +## File Changes + +| 文件 | 修改类型 | 说明 | +|------|------|------| +| `rock/sdk/sandbox/image.py` | 新增 | `Image` 类 — 含 `base()` 和 `from_dockerfile()` 工厂方法,纯声明类型;命名走 4 段拼接(`registry_url/namespace/repository:tag`) | +| `rock/sdk/sandbox/image_resolver.py` | 新增 | `_ImageResolver` — DinD 构建编排:缓存检查 + docker build + push | +| `rock/sdk/sandbox/config.py` | 修改 | `SandboxConfig.image` 类型从 `str` 改为 `str \| Image`,添加 Pydantic validator | +| `rock/sdk/sandbox/client.py` | 修改 | `Sandbox.start()` 中增加 Image 解析逻辑(含 `repository` 注入);`__str__` 兼容 Image 类型 | +| `rock/env_vars.py` | 修改 | 新增 `ROCK_IMAGE_NAMESPACE`,默认 `"rock"` | +| `tests/unit/sdk/sandbox/test_image.py` | 新增 | `Image` 类单元测试(含 4 段拼接 + repository 注入 + tag 长度/格式断言) | +| `tests/unit/sdk/sandbox/test_image_resolver.py` | 新增 | `_ImageResolver` 单元测试(mock sandbox) | +| `tests/integration/sdk/sandbox/test_image_build.py` | 新增 | 端到端集成测试:from_dockerfile → start → 验证 COPY 文件 | + +--- + +## 核心逻辑 + +### 变更 1:Image 类(纯声明类型) + +文件:`rock/sdk/sandbox/image.py`(新增) + +`Image` 是一个 Pydantic BaseModel,提供两个静态工厂方法创建实例。`Image` 仅描述"从哪里构建、目标镜像名是什么",不包含任何构建执行逻辑。 + +```python +from __future__ import annotations + +from pathlib import Path + +from pydantic import BaseModel, Field, model_serializer, model_validator + + +class Image(BaseModel): + """镜像声明,不直接构造,通过静态工厂方法创建。 + + 示例: + Image.base("python:3.11") + Image.from_dockerfile("/path/to/env_dir") + 
Image.from_dockerfile( + "/path/to/env_dir", + registry_url="reg.io", + namespace="rock", + repository="my-env", + registry_username="user", + registry_password="pass", + ) + """ + + # ── base() 路径 ── + image_name: str | None = None # 仅 Image.base() 使用;from_dockerfile 不写 + + # ── from_dockerfile() 路径,4 段拼接 ── + dockerfile_path: str | None = None + registry_url: str | None = None # 默认 env_vars.ROCK_IMAGE_REGISTRY + namespace: str | None = None # 默认 env_vars.ROCK_IMAGE_NAMESPACE("rock") + repository: str | None = None # 默认 SandboxConfig.user_id(Sandbox.start() 注入) + # tag = content_hash()(完整 64 hex SHA-256),不暴露字段,build 时计算 + + # ── 通用 ── + force_build: bool = False + build_args: dict[str, str] = Field(default_factory=dict) + registry_username: str | None = None + registry_password: str | None = None + + @staticmethod + def base(image: str) -> Image: + """从已有镜像创建。等价于直接使用字符串。""" + return Image(image_name=image) + + @staticmethod + def from_dockerfile( + path: str | Path, + *, + registry_url: str | None = None, + namespace: str | None = None, + repository: str | None = None, + registry_username: str | None = None, + registry_password: str | None = None, + force_build: bool = False, + build_args: dict[str, str] | None = None, + ) -> Image: + """从包含 Dockerfile 的本地目录创建。 + + 镜像名按 4 段拼接:`{registry_url}/{namespace}/{repository}:{tag}`, + 其中 tag = build context 的完整 SHA-256(64 hex)。 + + Args: + path: 本地目录,包含 Dockerfile 和构建上下文文件。 + registry_url: registry host。不传则使用 ROCK_IMAGE_REGISTRY。 + namespace: 命名空间。不传则使用 ROCK_IMAGE_NAMESPACE(默认 "rock")。 + repository: 仓库名。不传则在 Sandbox.start() 时使用 SandboxConfig.user_id + (都缺失则退化为 "default")。 + registry_username: 镜像仓库用户名。不传则使用 ROCK_IMAGE_REGISTRY_USERNAME。 + registry_password: 镜像仓库密码。不传则使用 ROCK_IMAGE_REGISTRY_PASSWORD。 + force_build: 强制重新构建,即使镜像已存在。 + build_args: Docker build 参数(--build-arg)。 + + Note: + 不接受 image_name= 参数。如需完整字符串入口,使用 Image.base()。 + """ + return Image( + dockerfile_path=str(Path(path).resolve()), + registry_url=registry_url, 
+ namespace=namespace, + repository=repository, + registry_username=registry_username, + registry_password=registry_password, + force_build=force_build, + build_args=build_args or {}, + ) + + @model_validator(mode="after") + def _validate(self) -> Image: + # 二选一:image_name (base) 与 dockerfile_path (from_dockerfile) 有且仅有一个 + if self.image_name is None and self.dockerfile_path is None: + raise ValueError("Image must have either 'image_name' or 'dockerfile_path'") + if self.dockerfile_path is not None: + p = Path(self.dockerfile_path) + if not p.is_dir(): + raise ValueError(f"dockerfile_path is not a directory: {self.dockerfile_path}") + if not (p / "Dockerfile").exists(): + raise ValueError(f"No Dockerfile found in: {self.dockerfile_path}") + # 自动填充默认凭证(不在此处拼 image_name,推迟到 _resolve_full_name) + if self.registry_username is None or self.registry_password is None: + from rock import env_vars + + if self.registry_username is None: + self.registry_username = env_vars.ROCK_IMAGE_REGISTRY_USERNAME + if self.registry_password is None: + self.registry_password = env_vars.ROCK_IMAGE_REGISTRY_PASSWORD + return self + + @property + def needs_build(self) -> bool: + return self.dockerfile_path is not None + + def content_hash(self) -> str: + """计算 dockerfile_path 目录的内容哈希(SHA-256, 64 hex)。 + 用作镜像 tag,内容变化即触发新 tag → 自动重建。 + """ + import hashlib + + if not self.dockerfile_path: + return "" + h = hashlib.sha256() + env_dir = Path(self.dockerfile_path) + for f in sorted(env_dir.rglob("*")): + if f.is_file() and ".git" not in f.parts: + h.update(str(f.relative_to(env_dir)).encode()) + h.update(f.read_bytes()) + return h.hexdigest() + + def _resolve_full_name(self) -> str: + """拼接 registry_url/namespace/repository:tag。 + 由 Sandbox.start() 在注入 repository 之后调用。 + """ + from rock import env_vars + + registry_url = self.registry_url or env_vars.ROCK_IMAGE_REGISTRY + namespace = self.namespace or env_vars.ROCK_IMAGE_NAMESPACE + repository = self.repository # 由 Sandbox.start() 注入 + if not 
(registry_url and namespace and repository): + missing = [k for k, v in [("registry_url", registry_url), + ("namespace", namespace), + ("repository", repository)] if not v] + raise ValueError(f"Cannot resolve image name, missing: {missing}") + tag = self.content_hash() # 完整 64 hex SHA-256 + return f"{registry_url.rstrip('/')}/{namespace}/{repository}:{tag}" + + @model_serializer(mode="wrap") + def _serialize(self, handler): + """model_dump(mode='json') 时输出 tag 字符串。 + 兼容 Harbor YAML 序列化路径,同时避免凭证泄漏到序列化输出。 + """ + if self.image_name is not None: + return self.image_name + return handler(self) +``` + +### 变更 2:_ImageResolver 类(DinD 构建编排) + +文件:`rock/sdk/sandbox/image_resolver.py`(新增) + +`_ImageResolver` 是内部执行类,承担 DinD 构建编排。接收 `Image` 对象,使用其自身的凭证和参数完成构建流程。 + +```python +from __future__ import annotations + +import io +import logging +import os +import shlex +import tarfile +import tempfile +from pathlib import Path + +from rock.sdk.sandbox.image import Image + +logger = logging.getLogger(__name__) + + +class _ImageResolver: + """将 Image 声明解析为镜像 tag 字符串。 + + 对于 base image 直接返回 tag。 + 对于 dockerfile image,启动一个 builder sandbox 完成 DinD 构建和推送。 + """ + + def __init__( + self, + *, + base_url: str, + cluster: str, + extra_headers: dict[str, str] | None = None, + builder_image: str | None = None, + _sandbox_factory=None, + ): + self._base_url = base_url + self._cluster = cluster + self._extra_headers = extra_headers or {} + self._builder_image = builder_image + self._sandbox_factory = _sandbox_factory + + async def resolve(self, image: Image) -> str: + """解析 Image 为镜像 tag 字符串。 + + 对于 base image 直接返回 tag。 + 对于 dockerfile image,启动一个 builder sandbox 完成: + 1. docker manifest inspect 检查镜像是否已存在 + 2. 若不存在(或 force_build),执行 docker build + push + 3. 
返回 image_name + """ + if not image.needs_build: + return image.image_name + + from rock import env_vars + from rock.actions import CreateBashSessionRequest + from rock.sdk.sandbox.client import Sandbox + from rock.sdk.sandbox.config import SandboxConfig + from rock.utils import ImageUtil + + # 默认使用 Docker 官方 DinD 镜像(与 Daytona 一致) + builder_image = self._builder_image or env_vars.ROCK_IMAGE_BUILDER_IMAGE or "docker:28.3.3-dind" + builder_cfg = SandboxConfig( + image=builder_image, + base_url=self._base_url, + cluster=self._cluster, + extra_headers=self._extra_headers, + registry_username=image.registry_username, + registry_password=image.registry_password, + startup_timeout=600.0, + auto_clear_seconds=60 * 30, + ) + factory = self._sandbox_factory or Sandbox + builder = factory(builder_cfg) + session = "build" + try: + await builder.start() + await builder.create_session(CreateBashSessionRequest(session=session)) + + # ── Registry login ── + if image.registry_username and image.registry_password: + registry, _ = ImageUtil.parse_registry_and_others(image.image_name) + if not registry: + registry = "docker.io" + await builder.arun( + cmd=f"echo {shlex.quote(image.registry_password)} | docker login {shlex.quote(registry)} " + f"-u {shlex.quote(image.registry_username)} --password-stdin", + session=session, + ) + + # ── 计算 env_dir 内容哈希 ── + content_hash = image.content_hash() + + # ── 缓存检查:docker manifest inspect + 内容哈希对比 ── + if not image.force_build: + check = await builder.arun( + cmd=f"docker manifest inspect {shlex.quote(image.image_name)} > /dev/null 2>&1" + f" && echo EXISTS || echo MISSING", + session=session, + ) + if "EXISTS" in (check.output or ""): + # 镜像存在,进一步检查内容哈希是否匹配 + inspect_cmd = ( + f"docker pull {shlex.quote(image.image_name)} > /dev/null 2>&1 && " + f"docker inspect --format='{{{{index .Config.Labels \"rock.content_hash\"}}}}' {shlex.quote(image.image_name)}" + ) + result = await builder.arun(cmd=inspect_cmd, session=session) + remote_hash = 
(result.output or "").strip() + if remote_hash == content_hash: + logger.info("Image %s exists and content unchanged, skipping build", image.image_name) + return image.image_name + else: + logger.info("Image %s exists but content changed (remote=%s, local=%s), rebuilding", + image.image_name, remote_hash[:12], content_hash[:12]) + + # ── 启动 dockerd ── + await builder.arun(cmd="service docker start", session=session) + + # ── 上传构建上下文 ── + context_path = await self._upload_context(builder, session, image) + + # ── docker build(将内容哈希写入镜像 label)── + build_arg_flags = " ".join( + f"--build-arg {shlex.quote(f'{k}={v}')}" for k, v in image.build_args.items() + ) + label_flag = f"--label rock.content_hash={shlex.quote(content_hash)}" + build_cmd = f"docker build {build_arg_flags} {label_flag} -t {shlex.quote(image.image_name)} {shlex.quote(context_path)}".strip() + obs = await builder.arun(cmd=build_cmd, session=session, wait_timeout=600, mode="nohup") + if obs.exit_code != 0: + raise RuntimeError(f"docker build failed: {obs.failure_reason or obs.output}") + + # ── docker push ── + obs = await builder.arun(cmd=f"docker push {shlex.quote(image.image_name)}", session=session, wait_timeout=300, mode="nohup") + if obs.exit_code != 0: + raise RuntimeError(f"docker push failed: {obs.failure_reason or obs.output}") + + logger.info("Successfully built and pushed image %s", image.image_name) + return image.image_name + finally: + try: + await builder.stop() + except Exception: + logger.warning("Failed to stop builder sandbox: %s", builder.sandbox_id, exc_info=True) + + async def _upload_context(self, builder, session: str, image: Image) -> str: + """将 environment_dir 打包为 tar.gz 上传到 builder sandbox,返回解压后的远程路径。""" + remote_tar = "/tmp/rock_env_dir.tar.gz" + remote_ctx = "/tmp/rock_env_dir_ctx" + + env_dir = Path(image.dockerfile_path) + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tar: + tar.add(env_dir, arcname=".", filter=lambda ti: None if ti.name == ".git" 
else ti) + tar_bytes = buf.getvalue() + + with tempfile.NamedTemporaryFile(prefix="rock_env_dir_", suffix=".tar.gz", delete=False) as f: + f.write(tar_bytes) + local_tar_path = f.name + try: + upload_resp = await builder.upload_by_path(file_path=local_tar_path, target_path=remote_tar) + if not upload_resp.success: + raise RuntimeError(f"Failed to upload build context: {upload_resp.message}") + finally: + try: + os.remove(local_tar_path) + except OSError: + pass + + await builder.arun(cmd=f"mkdir -p {remote_ctx}", session=session) + await builder.arun(cmd=f"tar -xzf {remote_tar} -C {remote_ctx}", session=session) + return remote_ctx +``` + +### 变更 3:SandboxConfig.image 类型扩展 + +文件:`rock/sdk/sandbox/config.py` + +`image` 字段类型从 `str` 改为 `str | Image`,通过 `field_validator` 保持向后兼容: + +```python +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from rock.sdk.sandbox.image import Image + + +class SandboxConfig(BaseConfig): + image: str | Image = "python:3.11" # 扩展类型 + # ... 
其他字段不变 + + @field_validator("image", mode="before") + @classmethod + def _coerce_image(cls, v): + from rock.sdk.sandbox.image import Image + + if isinstance(v, (str, Image)): + return v + if isinstance(v, dict): + try: + return Image(**v) + except Exception: + pass + return v +``` + +### 变更 4:Sandbox.start() Image 解析 + +文件:`rock/sdk/sandbox/client.py`,`start` 方法 + +在构建 POST body 之前,检测 `self.config.image` 是否为 `Image` 对象,若是则通过 `_ImageResolver` 解析为字符串 tag 并回写: + +```python +async def start(self): + # ── Image 解析 ── + from rock.sdk.sandbox.image import Image + + if isinstance(self.config.image, Image): + image_obj = self.config.image + if image_obj.needs_build: + # 注入 repository = config.user_id(缺失则用 "default") + # SandboxConfig.user_id 默认值不动(保持 None 以维持 X-User-Id header 现状), + # 仅在 Image 4 段拼接需要 repository 时本地 fallback。 + if image_obj.repository is None: + image_obj.repository = self.config.user_id or "default" + + from rock.sdk.sandbox.image_resolver import _ImageResolver + + resolver = _ImageResolver( + base_url=self.config.base_url, + cluster=self.config.cluster, + extra_headers=self.config.extra_headers, + ) + resolved_name = await resolver.resolve(image_obj) + self.config.image = resolved_name + # 同步凭证到 SandboxConfig,供 Admin 拉取镜像 + if image_obj.registry_username and not self.config.registry_username: + self.config.registry_username = image_obj.registry_username + self.config.registry_password = image_obj.registry_password + else: + self.config.image = image_obj.image_name + # ── 此时 self.config.image 必定为 str ── + + url = f"{self._url}/start_async" + # ... 
原有逻辑不变 +``` + +`__str__` 兼容处理: + +```python +def __str__(self): + from rock.sdk.sandbox.image import Image + + image_display = self.config.image + if isinstance(image_display, Image): + image_display = f"Image(image_name={image_display.image_name}, dockerfile={image_display.dockerfile_path})" + + return ( + f"Sandbox(sandbox_id={self._sandbox_id}, " + f"host_name={self._host_name!r}, " + f"host_ip={self._host_ip}, " + f"image={image_display}, " + f"cluster={self._cluster})" + ) +``` + +### 设计要点 + +1. **Image 位于 `rock/sdk/sandbox/image.py`**:`Image` 是 `SandboxConfig.image` 字段的类型,与 `SandboxConfig` 紧密耦合,放在同一模块下。 + +2. **声明与执行分离**:`Image` 是纯声明类型,仅描述"从哪里构建、目标镜像名是什么";`_ImageResolver` 承担 DinD 构建编排,是内部实现类(前导下划线)。调用方只需构造 `Image` 对象,`Sandbox.start()` 内部自动使用 `_ImageResolver` 完成解析。两个类职责清晰:`Image` 负责验证和序列化,`_ImageResolver` 负责构建执行。 + +3. **resolve-and-replace**:在 `Sandbox.start()` 最顶部将 `Image` 解析为 `str` 并回写 `self.config.image`。此后所有下游代码(POST body、`__str__`、`SandboxGroup` 日志)自动看到纯字符串,无需逐一修改。 + +4. **Image 自包含凭证**:`Image` 自带 `registry_username` / `registry_password`,`_ImageResolver` 直接从 `Image` 对象读取凭证完成 registry login、push。解析完成后,`Sandbox.start()` 将凭证同步到 `SandboxConfig`(供 Admin 拉取镜像)。`Image` 作为一等类型,不依赖 `SandboxConfig` 即可独立完成构建。镜像仓库地址从 `image_name` 中解析(`ImageUtil.parse_registry_and_others()`)。 + +5. **缓存检查与构建合并**:在同一个 builder sandbox 会话中完成缓存检查和构建。缓存检查分两层:①`docker manifest inspect` 检查 image_name 是否存在于 registry;②若存在,`docker pull` + `docker inspect` 对比镜像 label 中的 `rock.content_hash` 与当前 env_dir 的内容哈希。两者都匹配才跳过构建,内容变化即使 tag 不变也会触发重建。构建时通过 `--label rock.content_hash=` 将哈希写入镜像。 + +6. **Admin 侧零改动**:`SandboxStartRequest.image: str`、`SandboxRecord.image = Column(String(512))` 不修改。`Image` 在 `Sandbox.start()` 的 HTTP 调用之前已解析为字符串。 + +7. **Pydantic 序列化兼容**:`Image` 的 `model_serializer(mode="wrap")` 在 `model_dump(mode="json")` 时输出 image_name 字符串。确保 `HarborJobConfig.to_harbor_yaml()` 和 `RockEnvironmentConfig.to_harbor_environment()` 的序列化路径不受影响。 + +8. 
**image_name 4 段拼接**:`from_dockerfile()` 不再接受 `image_name=` 参数,命名一律走 4 段拼接 `{registry_url}/{namespace}/{repository}:{tag}`: + - `registry_url` 默认 `ROCK_IMAGE_REGISTRY`(仅 registry host,如 `registry.cn-hangzhou.aliyuncs.com`) + - `namespace` 默认 `ROCK_IMAGE_NAMESPACE`(默认值 `"rock"`) + - `repository` 默认 `SandboxConfig.user_id`(缺失时 `"default"`),由 `Sandbox.start()` 注入 + - `tag` 强制 = `content_hash()`(完整 64 hex SHA-256),不允许用户传入 + + content_hash 作为 tag 意味着内容变化自动产生新 tag,无需额外的 label 对比即可识别缓存。凭证 `registry_username` / `registry_password` 为空时自动从 `ROCK_IMAGE_REGISTRY_USERNAME` / `ROCK_IMAGE_REGISTRY_PASSWORD` 读取。 + +9. **`SandboxConfig.user_id` 默认值保持不变**:`user_id` 仍是 `None` 默认,避免影响 `_add_user_defined_tag_into_headers` 当前的 `if self.config.user_id:` 判断(None 时不发 `X-User-Id` header)。仅在 `Sandbox.start()` 注入 `image.repository` 时本地 fallback 到 `"default"`。 + +10. **Builder 镜像选择**:`_ImageResolver` 启动 builder sandbox 时默认使用 `docker:28.3.3-dind`(Docker 官方 DinD 镜像),与 Harbor Daytona 环境使用的构建镜像一致。可通过 `ROCK_IMAGE_BUILDER_IMAGE` 环境变量或 `builder_image` 参数覆盖。 + +--- + +## Env Vars + +| 变量 | 现状 | 新方案 | +|---|---|---| +| `ROCK_IMAGE_REGISTRY` | 已有,旧实现把它当成完整镜像前缀直接拼 `:tag`(拼出 `host:tag` 缺 namespace/repo 段,docker push 必失败) | 重新定义为**仅 registry host**,参与 4 段拼接的第一段 | +| `ROCK_IMAGE_NAMESPACE` | 不存在 | **新增**,默认 `"rock"`,作为 4 段拼接的第二段 | +| `ROCK_IMAGE_REGISTRY_USERNAME` | 已有 | 不变 | +| `ROCK_IMAGE_REGISTRY_PASSWORD` | 已有 | 不变 | + +[rock/env_vars.py](../../../rock/env_vars.py) 改动: + +```python +ROCK_IMAGE_NAMESPACE: str = "rock" +... 
+"ROCK_IMAGE_NAMESPACE": lambda: os.getenv("ROCK_IMAGE_NAMESPACE", "rock"), +``` + +--- + +## Tag 长度决策(content_hash 截断长度) + +### 碰撞概率(Birthday paradox) + +| 截断长度 | bit | 50% 碰撞所需镜像数 | ROCK 实际场景(user < 10⁴ 镜像)下的碰撞概率 | +|---|---|---|---| +| 12 hex | 48 | ~ 2 × 10⁷ | ~ 1.8 × 10⁻⁷ ⚠️ | +| 16 hex | 64 | ~ 5 × 10⁹ | ~ 2.7 × 10⁻¹² | +| 20 hex | 80 | ~ 1.1 × 10¹² | ~ 4.1 × 10⁻¹⁷ | +| 24 hex | 96 | ~ 7 × 10¹⁴ | ~ 6.3 × 10⁻²² | +| **64 hex** | **256** | **~ 4 × 10³⁸** | **~ 0** ✅ | + +### 业界基线 + +| 系统 | 短哈希长度 | bit | 哈希算法 | 说明 | +|---|---|---|---|---| +| Docker short image ID | 12 hex | 48 | SHA-256 截断 | `docker images` 默认显示,仅本地肉眼识别,靠 daemon 索引去重 | +| Git short SHA | 默认 7 hex | 28 | SHA-1 | 不够长时会自动延长(`core.abbrev` + 冲突检测) | +| Kubernetes pod-template-hash | 10 字符 base32 | ~50 | FNV-1a | ReplicaSet 区分模板,依赖 controller 重试 | +| OCI image digest (`sha256:...`) | 64 hex | 256 | SHA-256 | registry 标准,完全去重 | +| Modal image hash | 平台内部,不暴露 | — | — | 用户不感知,平台层处理碰撞 | +| Daytona snapshot hash | 平台内部 24h 缓存 | — | — | 同上 | + +### 决策 + +ROCK 场景属于 **"registry-stored、跨进程持久化、不靠中心去重"**:不同于 Docker short ID(仅本地肉眼)和 Git short SHA(有自动延长机制),我们没有冲突回退机制。 + +**采用 OCI digest 标准长度(64 hex / 256 bit SHA-256,不截断)。** 按上表同样的 10⁴ 镜像规模估算(n²/2N),碰撞概率 ~4 × 10⁻⁷⁰,与 OCI manifest digest 完全一致的安全级别。 + +位置仍是 `:` 而非 OCI 标准的 `@sha256:`:OCI digest 是 push 完成后 registry 端算的 manifest digest,我们的 `content_hash` 是 build context 的 SHA-256,必须在 build 前作缓存键,所以放 tag 位。代价是 tag 总长 ~70 字符,registry UI 不如短哈希好读,但用户主要通过 `Image.from_dockerfile()` 接口操作,可接受。 + +--- + +## Validation Plan + +### 测试数据 + +路径:`tests/integration/test_data/image_from_dockerfile/` + +包含最小构建上下文:`Dockerfile`(`FROM python:3.11` + `COPY hello.txt`)和 `hello.txt` 标记文件。 + +### 单元测试 — `tests/unit/sdk/sandbox/test_image.py` + +| 用例 | 断言 | +|---|---| +| `from_dockerfile` 拒绝 `image_name=` kwarg | 抛 `TypeError` | +| `_resolve_full_name` 缺 `registry_url` / `namespace` / `repository` | 抛 `ValueError`,message 列出缺失字段 | +| `_resolve_full_name` 正确拼接 | 等于 `f"{registry_url}/{namespace}/{repository}:{hash}"` | +|
`registry_url` 末尾 `/` 被剥掉 | 拼接结果不出现 `//` | +| env vars 默认生效 | 不传 `registry_url`/`namespace` 走 env;`namespace` 默认 `"rock"` | +| Tag 长度 = 64 | `len(tag) == 64`,`re.fullmatch(r"[0-9a-f]{64}", tag)` | +| `Sandbox.start()` 注入 user_id | `image.repository is None` 时被设为 `config.user_id`;都缺则用 `"default"` | +| `Image.base("python:3.11")` 不走 4 段解析 | `_resolve_full_name` 不被调用,`build()` 直接返回 `image_name` | + +### 集成测试 — `tests/integration/sdk/sandbox/test_image_build.py` + +测试 helper `_create_image` 与 `local_registry_info` fixture 按 4 段字段构造(不再传 `image_name=`): + +```python +def _create_image(env_dir, registry_info, **kwargs): + return Image.from_dockerfile( + env_dir, + registry_url=registry_info["registry_url"], + namespace=registry_info["namespace"], + repository=registry_info["repository"], + registry_username=registry_info["registry_username"], + registry_password=registry_info["registry_password"], + **kwargs, + ) +``` + +| 测试名 | 验证点 | marker | +|--------|--------|--------| +| `test_from_dockerfile_build_and_start` | `Image.from_dockerfile(path)` → `Sandbox.start()` → `cat /opt/hello.txt` 验证 COPY 文件可访问 | `@pytest.mark.need_admin` | +| `test_from_dockerfile_cache_skip` | 第二次 start 同一 Image,缓存命中跳过构建,耗时显著低于首次 | `@pytest.mark.need_admin` | +| `test_from_dockerfile_force_build` | `force_build=True` 即使镜像已存在也重新构建,验证文件内容正确 | `@pytest.mark.need_admin` | + +### 回归测试 + +```bash +uv run pytest -m "not need_ray and not need_admin and not need_admin_and_network" --reruns 1 +``` + +--- + +## Rollback + +- 删除新文件 `rock/sdk/sandbox/image.py`、`rock/sdk/sandbox/image_resolver.py` 及对应测试 +- 还原 `rock/sdk/sandbox/config.py`(`image: str | Image` → `image: str`,移除 validator) +- 还原 `rock/sdk/sandbox/client.py`(移除 `start()` 中的 Image 解析与 `repository` 注入、`__str__` 类型检查) +- 还原 `rock/env_vars.py`(移除 `ROCK_IMAGE_NAMESPACE`) +- Admin 侧无变更需回滚 diff --git a/rock/env_vars.py b/rock/env_vars.py index 11a9830613..db99684c8a 100644 --- a/rock/env_vars.py +++ b/rock/env_vars.py @@ -73,6 +73,12 @@ 
ROCK_MODEL_SERVICE_INSTALL_CMD: str + # Image Registry Config + ROCK_IMAGE_REGISTRY: str | None = None + ROCK_IMAGE_NAMESPACE: str = "rock" + ROCK_IMAGE_REGISTRY_USERNAME: str | None = None + ROCK_IMAGE_REGISTRY_PASSWORD: str | None = None + environment_variables: dict[str, Callable[[], Any]] = { "ROCK_LOGGING_PATH": lambda: os.getenv("ROCK_LOGGING_PATH"), @@ -89,6 +95,10 @@ "ROCK_CODE_SANDBOX_BASE_URL": lambda: os.getenv("ROCK_CODE_SANDBOX_BASE_URL", ""), "ROCK_ENVHUB_BASE_URL": lambda: os.getenv("ROCK_ENVHUB_BASE_URL", "http://localhost:8081"), "ROCK_ENVHUB_DEFAULT_DOCKER_IMAGE": lambda: os.getenv("ROCK_ENVHUB_DEFAULT_DOCKER_IMAGE", "python:3.11"), + "ROCK_IMAGE_BUILDER_IMAGE": lambda: os.getenv( + "ROCK_IMAGE_BUILDER_IMAGE", + "rock-n-roll-registry.cn-hangzhou.cr.aliyuncs.com/rock/rock-env-builder:latest", + ), "ROCK_ENVHUB_DB_URL": lambda: os.getenv( "ROCK_ENVHUB_DB_URL", f"sqlite:///{Path.home() / '.rock' / 'rock_envs.db'}" ), @@ -147,6 +157,11 @@ ), # Docker temp auth directory "ROCK_DOCKER_TEMP_AUTH_DIR": lambda: os.getenv("ROCK_DOCKER_TEMP_AUTH_DIR"), + # Image Registry Config + "ROCK_IMAGE_REGISTRY": lambda: os.getenv("ROCK_IMAGE_REGISTRY"), + "ROCK_IMAGE_NAMESPACE": lambda: os.getenv("ROCK_IMAGE_NAMESPACE", "rock"), + "ROCK_IMAGE_REGISTRY_USERNAME": lambda: os.getenv("ROCK_IMAGE_REGISTRY_USERNAME"), + "ROCK_IMAGE_REGISTRY_PASSWORD": lambda: os.getenv("ROCK_IMAGE_REGISTRY_PASSWORD"), } diff --git a/rock/sdk/sandbox/client.py b/rock/sdk/sandbox/client.py index 3a57eacda5..0a1820eea1 100644 --- a/rock/sdk/sandbox/client.py +++ b/rock/sdk/sandbox/client.py @@ -161,6 +161,24 @@ async def _parse_error_message_from_status(self, status: dict): return None async def start(self): + # ── Image 解析 ── + from rock.sdk.sandbox.image import Image + + if isinstance(self.config.image, Image): + image_obj = self.config.image + if image_obj.needs_build and image_obj.repository is None: + image_obj.repository = self.config.user_id or "default" + self.config.image = await 
image_obj.build( + base_url=self.config.base_url, + cluster=self.config.cluster, + extra_headers=self.config.extra_headers, + ) + # Sync image's registry credentials to SandboxConfig so admin can pull the + # built image. Don't override caller-provided creds. + if image_obj.registry_username and not self.config.registry_username: + self.config.registry_username = image_obj.registry_username + self.config.registry_password = image_obj.registry_password + url = f"{self._url}/start_async" headers = self._build_headers() data = { @@ -876,11 +894,17 @@ async def close(self) -> CloseResponse: def __str__(self): """Return user-friendly string representation with key attributes.""" + from rock.sdk.sandbox.image import Image + + image_display = self.config.image + if isinstance(image_display, Image): + image_display = f"Image(image_name={image_display.image_name}, dockerfile={image_display.dockerfile_path})" + return ( f"Sandbox(sandbox_id={self._sandbox_id}, " f"host_name={self._host_name!r}, " f"host_ip={self._host_ip}, " - f"image={self.config.image}, " + f"image={image_display}, " f"cluster={self._cluster})" ) diff --git a/rock/sdk/sandbox/config.py b/rock/sdk/sandbox/config.py index 4fcf59e030..dcee95f128 100644 --- a/rock/sdk/sandbox/config.py +++ b/rock/sdk/sandbox/config.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, Field, field_validator from rock import env_vars +from rock.sdk.sandbox.image import Image class BaseConfig(BaseModel): @@ -28,7 +29,7 @@ def validate_xrl_authorization(cls, v): class SandboxConfig(BaseConfig): - image: str = "python:3.11" + image: str | Image = "python:3.11" image_os: str = "linux" auto_clear_seconds: int = 60 * 5 route_key: str | None = None @@ -46,6 +47,20 @@ class SandboxConfig(BaseConfig): sandbox_id: str | None = None auto_delete_seconds: int | None = None + @field_validator("image", mode="before") + @classmethod + def _coerce_image(cls, v): + from rock.sdk.sandbox.image import Image + + if isinstance(v, str | Image): + return 
v + if isinstance(v, dict): + try: + return Image(**v) + except Exception: + pass + return v + @field_validator("auto_delete_seconds") @classmethod def validate_auto_delete_seconds(cls, v): diff --git a/rock/sdk/sandbox/image.py b/rock/sdk/sandbox/image.py new file mode 100644 index 0000000000..923b1a592e --- /dev/null +++ b/rock/sdk/sandbox/image.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +import hashlib +from pathlib import Path + +from pydantic import BaseModel, Field, model_serializer, model_validator + + +class Image(BaseModel): + """镜像声明,不直接构造,通过静态工厂方法创建。 + + 示例: + Image.base("python:3.11") + Image.from_dockerfile("/path/to/env_dir") + Image.from_dockerfile( + "/path/to/env_dir", + registry_url="reg.io", + namespace="rock", + repository="my-env", + registry_username="user", + registry_password="pass", + ) + """ + + # ── base() 路径 ── + image_name: str | None = None # 仅 Image.base() 使用 + + # ── from_dockerfile() 路径,4 段拼接 ── + dockerfile_path: str | None = None + registry_url: str | None = None # 默认 env_vars.ROCK_IMAGE_REGISTRY + namespace: str | None = None # 默认 env_vars.ROCK_IMAGE_NAMESPACE ("rock") + repository: str | None = None # 默认 SandboxConfig.user_id(Sandbox.start() 注入) + # tag = content_hash()(完整 64 hex SHA-256),不暴露字段 + + # ── 通用 ── + force_build: bool = False + build_args: dict[str, str] = Field(default_factory=dict) + registry_username: str | None = None + registry_password: str | None = None + + @staticmethod + def base(image: str) -> Image: + """从已有镜像创建。等价于直接使用字符串。""" + return Image(image_name=image) + + @staticmethod + def from_dockerfile( + path: str | Path, + *, + registry_url: str | None = None, + namespace: str | None = None, + repository: str | None = None, + registry_username: str | None = None, + registry_password: str | None = None, + force_build: bool = False, + build_args: dict[str, str] | None = None, + ) -> Image: + """从包含 Dockerfile 的本地目录创建。 + + 镜像名按 4 段拼接:`{registry_url}/{namespace}/{repository}:{tag}`, + 其中 tag = 
build context 的完整 SHA-256 (64 hex)。 + + Args: + path: 本地目录,包含 Dockerfile 和构建上下文文件。 + registry_url: registry host。不传则使用 ROCK_IMAGE_REGISTRY。 + namespace: 命名空间。不传则使用 ROCK_IMAGE_NAMESPACE(默认 "rock")。 + repository: 仓库名。不传则在 Sandbox.start() 时使用 SandboxConfig.user_id + (都缺失则退化为 "default")。 + registry_username: 镜像仓库用户名。不传则使用 ROCK_IMAGE_REGISTRY_USERNAME。 + registry_password: 镜像仓库密码。不传则使用 ROCK_IMAGE_REGISTRY_PASSWORD。 + force_build: 强制重新构建,即使镜像已存在。 + build_args: Docker build 参数(--build-arg)。 + """ + return Image( + dockerfile_path=str(Path(path).resolve()), + registry_url=registry_url, + namespace=namespace, + repository=repository, + registry_username=registry_username, + registry_password=registry_password, + force_build=force_build, + build_args=build_args or {}, + ) + + @model_validator(mode="after") + def _validate(self) -> Image: + if self.image_name is None and self.dockerfile_path is None: + raise ValueError("Image must have either 'image_name' or 'dockerfile_path'") + if self.dockerfile_path is not None: + p = Path(self.dockerfile_path) + if not p.is_dir(): + raise ValueError(f"dockerfile_path is not a directory: {self.dockerfile_path}") + if not (p / "Dockerfile").exists(): + raise ValueError(f"No Dockerfile found in: {self.dockerfile_path}") + if self.registry_username is None or self.registry_password is None: + from rock import env_vars + + if self.registry_username is None: + self.registry_username = env_vars.ROCK_IMAGE_REGISTRY_USERNAME + if self.registry_password is None: + self.registry_password = env_vars.ROCK_IMAGE_REGISTRY_PASSWORD + return self + + @property + def needs_build(self) -> bool: + return self.dockerfile_path is not None + + def content_hash(self) -> str: + """计算 dockerfile_path 目录的内容哈希(SHA-256, 64 hex)。""" + if not self.dockerfile_path: + return "" + h = hashlib.sha256() + env_dir = Path(self.dockerfile_path) + for f in sorted(env_dir.rglob("*")): + if f.is_file() and ".git" not in f.parts: + h.update(str(f.relative_to(env_dir)).encode()) + 
h.update(f.read_bytes()) + return h.hexdigest() + + def _resolve_full_name(self) -> str: + """拼接 registry_url/namespace/repository:tag。 + 由 Sandbox.start() 在注入 repository 之后调用。 + """ + from rock import env_vars + + registry_url = self.registry_url or env_vars.ROCK_IMAGE_REGISTRY + namespace = self.namespace or env_vars.ROCK_IMAGE_NAMESPACE + repository = self.repository + if not (registry_url and namespace and repository): + missing = [ + k + for k, v in [ + ("registry_url", registry_url), + ("namespace", namespace), + ("repository", repository), + ] + if not v + ] + raise ValueError(f"Cannot resolve image name, missing: {missing}") + tag = self.content_hash() + return f"{registry_url.rstrip('/')}/{namespace}/{repository}:{tag}" + + async def build( + self, + *, + base_url: str, + cluster: str, + extra_headers: dict[str, str] | None = None, + ) -> str: + """将 Image 构建为镜像 tag 字符串。 + + 对于 base image 直接返回 image_name。 + 对于 dockerfile image,启动 builder sandbox 完成 DinD 构建和推送。 + """ + if not self.needs_build: + return self.image_name + + from rock.sdk.sandbox.image_builder import ImageBuilder + + builder = ImageBuilder( + base_url=base_url, + cluster=cluster, + extra_headers=extra_headers, + ) + return await builder.build(self) + + @model_serializer(mode="wrap") + def _serialize(self, handler): + if self.image_name is not None: + return self.image_name + return handler(self) diff --git a/rock/sdk/sandbox/image_builder.py b/rock/sdk/sandbox/image_builder.py new file mode 100644 index 0000000000..7974e16a5e --- /dev/null +++ b/rock/sdk/sandbox/image_builder.py @@ -0,0 +1,235 @@ +from __future__ import annotations + +import io +import logging +import os +import shlex +import tarfile +import tempfile +from pathlib import Path + +from rock import env_vars +from rock.actions import CreateBashSessionRequest +from rock.sdk.sandbox.client import Sandbox +from rock.sdk.sandbox.config import SandboxConfig +from rock.sdk.sandbox.image import Image +from rock.utils import ImageUtil + 
+logger = logging.getLogger(__name__) + +_DOCKERD_SCRIPT = r"""#!/bin/bash +set -e +export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:$PATH" + +if command -v dockerd &>/dev/null; then + if ! pgrep -x dockerd &>/dev/null; then + echo "Starting dockerd..." + nohup dockerd &>/var/log/dockerd.log & + fi + for i in $(seq 1 60); do + if docker info &>/dev/null; then echo "DOCKERD_OK"; break; fi + sleep 1 + if [ "$i" -eq 60 ]; then + echo "DOCKERD_FAIL" + cat /var/log/dockerd.log 2>/dev/null | tail -50 + exit 1 + fi + done +fi +""" + +_BUILD_SCRIPT_TEMPLATE = r"""#!/bin/bash +set -e + +IMAGE_NAME={image_name} +CONTENT_HASH={content_hash} +FORCE_BUILD={force_build} + +# ── Cache check ── +if [ "$FORCE_BUILD" != "true" ]; then + if docker manifest inspect "$IMAGE_NAME" > /dev/null 2>&1; then + docker pull "$IMAGE_NAME" > /dev/null 2>&1 || true + REMOTE_HASH=$(docker inspect --format='{{{{index .Config.Labels "rock.content_hash"}}}}' "$IMAGE_NAME" 2>/dev/null || true) + if [ "$REMOTE_HASH" = "$CONTENT_HASH" ]; then + echo "CACHE_HIT" + echo "BUILD_OK" + exit 0 + else + echo "Cache miss: content changed, rebuilding" + fi + fi +fi + +# ── Docker build ── +echo "Building image $IMAGE_NAME..." +docker build {build_arg_flags} --label rock.content_hash="$CONTENT_HASH" -t "$IMAGE_NAME" {context_path} +echo "BUILD_OK" +""" + +_PUSH_SCRIPT_TEMPLATE = r"""#!/bin/bash +set -e + +IMAGE_NAME={image_name} +REGISTRY={registry} +REG_USER={registry_username} +REG_PASS={registry_password} + +# ── Registry login ── +if [ -n "$REG_USER" ] && [ -n "$REG_PASS" ]; then + echo "$REG_PASS" | docker login "$REGISTRY" -u "$REG_USER" --password-stdin +else + echo "No registry credentials, skipping login" +fi + +# ── Docker push ── +echo "Pushing image $IMAGE_NAME..." 
+docker push "$IMAGE_NAME" +echo "PUSH_OK" +""" + + +class ImageBuilder: + """将 Image 声明解析为镜像 tag 字符串。 + + 对于 base image 直接返回 tag。 + 对于 dockerfile image,启动一个 builder sandbox 完成 DinD 构建和推送。 + """ + + BUILD_SESSION = "build" + + def __init__( + self, + *, + base_url: str, + cluster: str, + extra_headers: dict[str, str] | None = None, + builder_image: str | None = None, + _sandbox_factory=None, + ): + self._base_url = base_url + self._cluster = cluster + self._extra_headers = extra_headers or {} + self._builder_image = builder_image + self._sandbox_factory = _sandbox_factory + + def create_builder(self) -> Sandbox: + """Construct (but do not start) the builder sandbox. + + Exposed so callers can start, customise (e.g. inject test-only NAT rules), then + hand the running builder to :meth:`build_with_builder`. + """ + builder_image = self._builder_image or env_vars.ROCK_IMAGE_BUILDER_IMAGE + builder_cfg = SandboxConfig( + image=builder_image, + base_url=self._base_url, + cluster=self._cluster, + extra_headers=self._extra_headers, + startup_timeout=600.0, + auto_clear_seconds=60 * 30, + ) + factory = self._sandbox_factory or Sandbox + return factory(builder_cfg) + + async def build(self, image: Image) -> str: + """Build `image` by managing the builder lifecycle internally.""" + if not image.needs_build: + return image.image_name + + builder = self.create_builder() + try: + await builder.start() + return await self.build_with_builder(image, builder) + finally: + try: + await builder.stop() + except Exception: + logger.warning("Failed to stop builder sandbox: %s", builder.sandbox_id, exc_info=True) + + async def build_with_builder(self, image: Image, builder: Sandbox) -> str: + """Run the build/push pipeline against an externally-managed, already-started + builder sandbox. + + The caller owns `builder`'s lifecycle (start/stop) and is free to perform any + environment-specific setup (firewall rules, mounts, etc.) before calling this. 
+ """ + if not image.needs_build: + return image.image_name + + full_name = image._resolve_full_name() + session = self.BUILD_SESSION + await builder.create_session(CreateBashSessionRequest(session=session)) + + # ── Phase 1: Start dockerd ── + await self._run_script(builder, session, _DOCKERD_SCRIPT, "/tmp/rock_dockerd.sh", "DOCKERD_OK", 120) + + # ── Phase 2: Build image ── + content_hash = image.content_hash() + context_path = await self._upload_context(builder, session, image) + build_script = self._gen_build_script(image, full_name, content_hash, context_path) + build_output = await self._run_script(builder, session, build_script, "/tmp/rock_build.sh", "BUILD_OK", 600) + if "CACHE_HIT" in build_output: + logger.info("Image %s cache hit, skipping push", full_name) + return full_name + + # ── Phase 3: Login and push ── + push_script = self._gen_push_script(image, full_name) + await self._run_script(builder, session, push_script, "/tmp/rock_push.sh", "PUSH_OK", 300) + + logger.info("Successfully built and pushed image %s", full_name) + return full_name + + async def _run_script( + self, builder, session: str, script: str, remote_path: str, success_marker: str, timeout: int + ) -> str: + await builder.write_file_by_path(script, remote_path) + obs = await builder.arun(cmd=f"bash {remote_path}", session=session, wait_timeout=timeout, mode="nohup") + output = obs.output or "" + if obs.exit_code != 0 or success_marker not in output: + raise RuntimeError(f"Script {remote_path} failed (exit_code={obs.exit_code}): {output}") + return output + + def _gen_build_script(self, image: Image, full_name: str, content_hash: str, context_path: str) -> str: + build_arg_flags = " ".join(f"--build-arg {shlex.quote(f'{k}={v}')}" for k, v in image.build_args.items()) + return _BUILD_SCRIPT_TEMPLATE.format( + image_name=shlex.quote(full_name), + content_hash=shlex.quote(content_hash), + force_build="true" if image.force_build else "false", + build_arg_flags=build_arg_flags, + 
context_path=shlex.quote(context_path), + ) + + def _gen_push_script(self, image: Image, full_name: str) -> str: + registry, _ = ImageUtil.parse_registry_and_others(full_name) + return _PUSH_SCRIPT_TEMPLATE.format( + image_name=shlex.quote(full_name), + registry=shlex.quote(registry or "docker.io"), + registry_username=shlex.quote(image.registry_username or ""), + registry_password=shlex.quote(image.registry_password or ""), + ) + + async def _upload_context(self, builder, session: str, image: Image) -> str: + remote_tar = "/tmp/rock_env_dir.tar.gz" + remote_ctx = "/tmp/rock_env_dir_ctx" + + env_dir = Path(image.dockerfile_path) + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tar: + tar.add(env_dir, arcname=".", filter=lambda ti: None if ".git" in Path(ti.name).parts else ti)  # arcname="." names members "./.git/...", so match by path component + tar_bytes = buf.getvalue() + + with tempfile.NamedTemporaryFile(prefix="rock_env_dir_", suffix=".tar.gz", delete=False) as f: + f.write(tar_bytes) + local_tar_path = f.name + try: + upload_resp = await builder.upload_by_path(file_path=local_tar_path, target_path=remote_tar) + if not upload_resp.success: + raise RuntimeError(f"Failed to upload build context: {upload_resp.message}") + finally: + try: + os.remove(local_tar_path) + except OSError: + pass + + await builder.arun(cmd=f"mkdir -p {remote_ctx}", session=session) + await builder.arun(cmd=f"tar -xzf {remote_tar} -C {remote_ctx}", session=session) + return remote_ctx diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b39a9a385d..6a2c2cd1e7 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -275,7 +275,10 @@ async def local_registry(): check=True, ) - # 4. Wait for registry to be ready + # 4. Wait for registry to be ready. Use `localhost` so the registry name falls in + # the `127.0.0.0/8` CIDR that both outer dockerd and inner builder dockerd trust by + # default.
Tests using this fixture from inside a builder sandbox must inject an + # iptables NAT rule mapping 127.0.0.1:port → host_ip:port (see test_image_build.py). registry_url = f"localhost:{port}" for _ in range(30): try: diff --git a/tests/integration/sdk/sandbox/test_image_build.py b/tests/integration/sdk/sandbox/test_image_build.py new file mode 100644 index 0000000000..37d387240f --- /dev/null +++ b/tests/integration/sdk/sandbox/test_image_build.py @@ -0,0 +1,195 @@ +"""Integration tests for Image.from_dockerfile() → Sandbox.start() flow. + +Verifies that a sandbox can be started from a local Dockerfile directory, +including build, cache skip, and content-change rebuild scenarios. + +Run: pytest tests/integration/sdk/sandbox/test_image_build.py -v -m need_admin +""" + +import shutil +import time +from contextlib import asynccontextmanager +from pathlib import Path + +import pytest + +from rock.actions.sandbox.request import CreateBashSessionRequest +from rock.logger import init_logger +from rock.sdk.sandbox.client import Sandbox +from rock.sdk.sandbox.config import SandboxConfig +from rock.sdk.sandbox.image import Image +from rock.sdk.sandbox.image_builder import ImageBuilder + +logger = init_logger(__name__) + +TEST_DATA_DIR = Path(__file__).resolve().parents[2] / "test_data" / "image_from_dockerfile" +EXPECTED_FILE_CONTENT = "rock-image-from-dockerfile-ok" +MODIFIED_CONTENT = "rock-content-changed" + + +# ── Helpers ── + + +def _create_image(env_dir, registry_info, **kwargs): + return Image.from_dockerfile( + env_dir, + registry_url=registry_info["registry_url"], + namespace=registry_info["namespace"], + repository=registry_info["repository"], + registry_username=registry_info["registry_username"], + registry_password=registry_info["registry_password"], + **kwargs, + ) + + +def _create_config(image, admin_remote_server, registry_info=None): + """Build a SandboxConfig for the just-built image. 
@asynccontextmanager
async def _run_sandbox(config):
    """Yield a started sandbox that has a ``default`` bash session.

    The sandbox is stopped on exit in every case; a failure while stopping
    is logged as a warning instead of being raised.
    """
    box = Sandbox(config)
    try:
        await box.start()
        await box.create_session(CreateBashSessionRequest(session="default"))
        yield box
    finally:
        try:
            await box.stop()
        except Exception as exc:
            logger.warning("Failed to stop sandbox: %s", exc)


async def _assert_file_content(sandbox, expected):
    """Assert that ``/opt/hello.txt`` inside ``sandbox`` holds ``expected`` (whitespace-stripped)."""
    observation = await sandbox.arun(cmd="cat /opt/hello.txt", session="default")
    content = observation.output
    assert content is not None
    assert content.strip() == expected


# ── Fixtures / helpers ──


async def _inject_loopback_nat(builder, port: int) -> None:
    """Make ``127.0.0.1:port`` inside the builder reach ``builder.host_ip:port``.

    The local_registry fixture serves on the host's loopback
    (``localhost:port``). Inside the builder's own network namespace that
    address is the builder's loopback, where nothing listens, so three steps
    are chained in a single shell command:
      1. enable route_localnet (the kernel otherwise refuses to route 127.x off lo)
      2. OUTPUT DNAT 127.0.0.1:port → host_ip:port (rewrite the outgoing destination)
      3. POSTROUTING MASQUERADE for host_ip:port (rewrite the source so replies route back)

    Raises:
        RuntimeError: If the command exits with a non-zero code.
    """
    host_ip = builder.host_ip
    steps = (
        "echo 1 | tee /proc/sys/net/ipv4/conf/all/route_localnet "
        "/proc/sys/net/ipv4/conf/lo/route_localnet > /dev/null",
        f"iptables -t nat -A OUTPUT -p tcp -d 127.0.0.1 --dport {port} "
        f"-j DNAT --to-destination {host_ip}:{port}",
        f"iptables -t nat -A POSTROUTING -p tcp -d {host_ip} --dport {port} -j MASQUERADE",
    )
    cmd = " && ".join(steps)
    logger.info("Injecting builder loopback NAT: 127.0.0.1:%s -> %s:%s", port, host_ip, port)
    obs = await builder.arun(cmd=cmd, session=ImageBuilder.BUILD_SESSION, mode="normal")
    if obs.exit_code == 0:
        return
    raise RuntimeError(f"NAT setup failed (exit_code={obs.exit_code}): {obs.failure_reason or obs.output}")
async def _build_with_loopback_nat(image: Image, admin_remote_server) -> str:
    """Build ``image`` on a builder owned by the test so test-only NAT can be injected.

    A builder is created and started, a build session is opened, and — when
    the image's registry URL is ``localhost:<port>`` or ``127.x:<port>`` —
    the loopback NAT rules are installed before the build runs. The builder
    is always stopped afterwards; a failure while stopping is only logged.

    Returns:
        The resolved image name once build and push have completed.
    """
    endpoint = f"{admin_remote_server.endpoint}:{admin_remote_server.port}"
    image_builder = ImageBuilder(base_url=endpoint, cluster="default")
    builder = image_builder.create_builder()
    await builder.start()
    try:
        await builder.create_session(CreateBashSessionRequest(session=ImageBuilder.BUILD_SESSION))
        host, _, port = (image.registry_url or "").partition(":")
        targets_loopback = host == "localhost" or host.startswith("127.")
        if targets_loopback and port:
            await _inject_loopback_nat(builder, int(port))
        return await image_builder.build_with_builder(image, builder)
    finally:
        try:
            await builder.stop()
        except Exception:
            logger.warning("Failed to stop builder sandbox: %s", builder.sandbox_id, exc_info=True)


@pytest.fixture
def local_registry_info(local_registry):
    """Expose the ``local_registry`` tuple as the keyword set the image helpers expect."""
    url, user, secret = local_registry
    return dict(
        registry_url=url,
        namespace="rock-test",
        repository="image-from-dockerfile",
        registry_username=user,
        registry_password=secret,
    )
@pytest.fixture
def modified_env_dir(tmp_path):
    """Return a copy of the test data whose ``hello.txt`` has different content."""
    target = tmp_path / "env"
    shutil.copytree(TEST_DATA_DIR, target)
    marker = target / "hello.txt"
    marker.write_text(f"{MODIFIED_CONTENT}\n")
    return target


# ── Tests ──


@pytest.mark.need_admin
@pytest.mark.asyncio
async def test_from_dockerfile_build_and_start(local_registry_info, admin_remote_server):
    """Build and push via the test-managed builder, then start a sandbox from the result."""
    built_name = await _build_with_loopback_nat(
        _create_image(TEST_DATA_DIR, local_registry_info),
        admin_remote_server,
    )
    sandbox_config = _create_config(built_name, admin_remote_server, local_registry_info)
    async with _run_sandbox(sandbox_config) as sandbox:
        await _assert_file_content(sandbox, EXPECTED_FILE_CONTENT)


@pytest.mark.need_admin
@pytest.mark.asyncio
async def test_from_dockerfile_cache_skip(local_registry_info, admin_remote_server):
    """Building the same Image twice yields the same name, the second run being faster."""
    image = _create_image(TEST_DATA_DIR, local_registry_info)

    async def _timed_build():
        started = time.monotonic()
        name = await _build_with_loopback_nat(image, admin_remote_server)
        return name, time.monotonic() - started

    first_name, first_duration = await _timed_build()
    second_name, second_duration = await _timed_build()

    assert first_name == second_name
    logger.info("First build: %.1fs, second build: %.1fs", first_duration, second_duration)
    # NOTE(review): this timing comparison only holds if the first build was
    # not itself a cache hit — confirm the registry starts without this tag.
    assert second_duration < first_duration


@pytest.mark.need_admin
@pytest.mark.asyncio
async def test_from_dockerfile_rebuilds_on_content_change(local_registry_info, admin_remote_server, modified_env_dir):
    """A sandbox built from the modified context serves the modified file content."""
    built_name = await _build_with_loopback_nat(
        _create_image(modified_env_dir, local_registry_info),
        admin_remote_server,
    )
    sandbox_config = _create_config(built_name, admin_remote_server, local_registry_info)
    async with _run_sandbox(sandbox_config) as sandbox:
        await _assert_file_content(sandbox, MODIFIED_CONTENT)
config = _create_config(resolved, admin_remote_server, local_registry_info) + async with _run_sandbox(config) as sandbox: + await _assert_file_content(sandbox, MODIFIED_CONTENT) diff --git a/tests/integration/test_data/image_from_dockerfile/Dockerfile b/tests/integration/test_data/image_from_dockerfile/Dockerfile new file mode 100644 index 0000000000..2ca1dbccfc --- /dev/null +++ b/tests/integration/test_data/image_from_dockerfile/Dockerfile @@ -0,0 +1,2 @@ +FROM rock-n-roll-registry.cn-hangzhou.cr.aliyuncs.com/rock/example-sandbox:py311 +COPY hello.txt /opt/hello.txt diff --git a/tests/integration/test_data/image_from_dockerfile/hello.txt b/tests/integration/test_data/image_from_dockerfile/hello.txt new file mode 100644 index 0000000000..dbb76d57e7 --- /dev/null +++ b/tests/integration/test_data/image_from_dockerfile/hello.txt @@ -0,0 +1 @@ +rock-image-from-dockerfile-ok diff --git a/tests/unit/sdk/sandbox/test_image.py b/tests/unit/sdk/sandbox/test_image.py new file mode 100644 index 0000000000..35d8765e61 --- /dev/null +++ b/tests/unit/sdk/sandbox/test_image.py @@ -0,0 +1,109 @@ +"""Unit tests for Image — covers 4-segment image name composition. 
@pytest.fixture
def env_dir(tmp_path: Path) -> Path:
    """Create a minimal build context: a Dockerfile plus the file it copies."""
    context = tmp_path / "env"
    context.mkdir()
    contents = {
        "Dockerfile": "FROM python:3.11\nCOPY hello.txt /opt/hello.txt\n",
        "hello.txt": "hi\n",
    }
    for file_name, text in contents.items():
        (context / file_name).write_text(text)
    return context


def test_from_dockerfile_rejects_image_name_kwarg(env_dir: Path) -> None:
    """``from_dockerfile`` no longer accepts ``image_name=`` and raises TypeError."""
    rejected = {"image_name": "reg.io/ns/repo:tag"}
    with pytest.raises(TypeError):
        Image.from_dockerfile(env_dir, **rejected)


def test_resolve_full_name_concatenates_four_segments(env_dir: Path) -> None:
    """Explicit segments are joined; a trailing slash on the registry is dropped."""
    image = Image.from_dockerfile(env_dir, registry_url="reg.io/", namespace="myns", repository="myrepo")
    resolved = image._resolve_full_name()
    expected_tag = image.content_hash()
    assert resolved == f"reg.io/myns/myrepo:{expected_tag}"


def test_resolve_full_name_raises_when_segments_missing(env_dir: Path) -> None:
    """With no segment available, the ValueError names every missing one."""
    with (
        patch("rock.env_vars.ROCK_IMAGE_REGISTRY", None),
        patch("rock.env_vars.ROCK_IMAGE_NAMESPACE", None),
    ):
        image = Image.from_dockerfile(env_dir)  # repository is not supplied either
        with pytest.raises(ValueError) as raised:
            image._resolve_full_name()
    message = str(raised.value)
    assert all(segment in message for segment in ("registry_url", "namespace", "repository"))


def test_resolve_full_name_uses_env_defaults(env_dir: Path) -> None:
    """Omitted ``registry_url`` / ``namespace`` fall back to the env-var values."""
    registry_patch = patch("rock.env_vars.ROCK_IMAGE_REGISTRY", "env-reg.io")
    namespace_patch = patch("rock.env_vars.ROCK_IMAGE_NAMESPACE", "env-ns")
    with registry_patch, namespace_patch:
        image = Image.from_dockerfile(env_dir, repository="myrepo")
        resolved = image._resolve_full_name()
        assert resolved.startswith("env-reg.io/env-ns/myrepo:")
def test_tag_is_64_hex_sha256(env_dir: Path) -> None:
    """The tag part of the resolved name is 64 lowercase hex characters."""
    image = Image.from_dockerfile(env_dir, registry_url="reg.io", namespace="ns", repository="repo")
    full_name = image._resolve_full_name()
    tag = full_name.rsplit(":", 1)[1]
    sha256_hex = re.compile(r"[0-9a-f]{64}")
    assert sha256_hex.fullmatch(tag)


class _CapturedRepository(Exception):
    """Raised by the patched ``Image.build`` to carry the repository it saw."""

    def __init__(self, repository):
        super().__init__(repository)
        self.repository = repository


async def _repository_seen_by_build(env_dir, monkeypatch, **config_overrides):
    """Start a sandbox with ``Image.build`` patched and return the repository it received.

    ``Image.build`` is replaced by a coroutine that raises
    ``_CapturedRepository``, so ``Sandbox.start()`` is expected to fail with
    that exception; its ``repository`` attribute is returned.
    """
    from rock.sdk.sandbox.client import Sandbox
    from rock.sdk.sandbox.config import SandboxConfig
    from rock.sdk.sandbox.image import Image

    async def fake_build(self, **kwargs):
        raise _CapturedRepository(self.repository)

    monkeypatch.setattr(Image, "build", fake_build)

    image = Image.from_dockerfile(env_dir, registry_url="reg.io", namespace="ns")
    sandbox = Sandbox(SandboxConfig(image=image, base_url="http://x", **config_overrides))
    with pytest.raises(_CapturedRepository) as excinfo:
        await sandbox.start()
    return excinfo.value.repository


@pytest.mark.asyncio
async def test_sandbox_start_injects_user_id_as_repository(env_dir, monkeypatch):
    """With ``user_id`` set on the config, the build sees it as the repository."""
    seen = await _repository_seen_by_build(env_dir, monkeypatch, user_id="alice")
    assert seen == "alice"


@pytest.mark.asyncio
async def test_sandbox_start_falls_back_to_default_repository(env_dir, monkeypatch):
    """Without ``user_id`` on the config, the build sees ``"default"`` as the repository."""
    seen = await _repository_seen_by_build(env_dir, monkeypatch)  # no user_id
    assert seen == "default"