diff --git a/backend/.sqlx/query-8c08eb619da44fd89d5f6f787b96f6a9252200e32280b2c6fa49a58ae2cb676f.json b/backend/.sqlx/query-8c08eb619da44fd89d5f6f787b96f6a9252200e32280b2c6fa49a58ae2cb676f.json new file mode 100644 index 0000000000000..63c335a09c6d5 --- /dev/null +++ b/backend/.sqlx/query-8c08eb619da44fd89d5f6f787b96f6a9252200e32280b2c6fa49a58ae2cb676f.json @@ -0,0 +1,23 @@ +{ + "db_name": "PostgreSQL", + "query": "SELECT value FROM resource WHERE workspace_id = $1 AND path = $2 AND resource_type = 'ai_agent'", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "value", + "type_info": "Jsonb" + } + ], + "parameters": { + "Left": [ + "Text", + "Text" + ] + }, + "nullable": [ + true + ] + }, + "hash": "8c08eb619da44fd89d5f6f787b96f6a9252200e32280b2c6fa49a58ae2cb676f" +} diff --git a/backend/migrations/20260627124035_ai_agent_resource_type.down.sql b/backend/migrations/20260627124035_ai_agent_resource_type.down.sql new file mode 100644 index 0000000000000..cfe9c91cae7f4 --- /dev/null +++ b/backend/migrations/20260627124035_ai_agent_resource_type.down.sql @@ -0,0 +1 @@ +DELETE FROM resource_type WHERE workspace_id = 'admins' AND name = 'ai_agent'; diff --git a/backend/migrations/20260627124035_ai_agent_resource_type.up.sql b/backend/migrations/20260627124035_ai_agent_resource_type.up.sql new file mode 100644 index 0000000000000..4d18883df4509 --- /dev/null +++ b/backend/migrations/20260627124035_ai_agent_resource_type.up.sql @@ -0,0 +1,41 @@ +-- Built-in `ai_agent` resource type backing reusable AI agent steps. +-- A resource of this type stores an agent's brain (provider/model/system prompt/etc.), +-- its tool set, and its eval suite. Flow steps link to it via FlowModuleValue::AIAgent.agent. +-- +-- Seeded into the `admins` workspace: list_resource_types unions `workspace_id = 'admins'`, +-- so this single row is visible from every workspace (existing and future), mirroring how +-- hub-synced built-in types (e.g. s3object) are made globally available. +INSERT INTO resource_type (workspace_id, name, schema, description, edited_at) VALUES + ('admins', 'ai_agent', '{ + "type": "object", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "required": ["provider"], + "properties": { + "provider": { + "type": "object", + "format": "ai-provider", + "description": "AI provider + model + credentials resource for the agent." + }, + "system_prompt": { "type": "string", "description": "System prompt for the agent." }, + "temperature": { "type": "number", "description": "Sampling temperature (0.0-2.0)." }, + "max_completion_tokens": { "type": "number", "description": "Maximum output tokens." }, + "max_iterations": { "type": "number", "description": "Max reasoning/tool-use loops." }, + "output_type": { "type": "string", "enum": ["text", "image"], "default": "text" }, + "output_schema": { "type": "object", "format": "json-schema", "description": "Structured-output JSON schema." }, + "streaming": { "type": "boolean" }, + "memory": { "type": "object", "description": "Conversation memory config (off/auto/manual)." }, + "tools": { "type": "array", "description": "Reusable tool definitions available to the agent." }, + "evals": { + "type": "object", + "description": "Eval suite: cases graded by deterministic assertions and/or an LLM judge.", + "properties": { + "cases": { "type": "array" }, + "judge": { "type": "object" } + } + } + } + }'::jsonb, + 'A reusable AI agent: provider/model, system prompt, tools and an eval suite. Referenced by AI agent flow steps.', + now()) +ON CONFLICT (workspace_id, name) DO UPDATE + SET schema = EXCLUDED.schema, description = EXCLUDED.description, edited_at = now(); diff --git a/backend/windmill-ai/src/types.rs b/backend/windmill-ai/src/types.rs index 8f864468dbcbb..2bd2e6017f8ed 100644 --- a/backend/windmill-ai/src/types.rs +++ b/backend/windmill-ai/src/types.rs @@ -160,6 +160,95 @@ impl From for AIAgentArgs { } } +// =========================================================================== +// Reusable AI agent — eval suite types +// +// An `ai_agent` resource stores an `AIAgentConfig` (brain + tools + evals). The +// brain fields mirror `AIAgentArgsRaw` and are merged into `AIAgentArgs` at +// runtime via a plain JSON merge, so they are not re-declared here. The types +// below model the eval suite, which the judge/run endpoints inspect directly. +// =========================================================================== + +/// One eval case stored under an `ai_agent` resource's `evals.cases`. +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct AgentEvalCase { + pub id: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub name: Option, + pub input: EvalInput, + /// LLM-judge acceptance criteria (each a single bullet the output must satisfy). + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub judge_checklist: Vec, + /// Deterministic checks evaluated without an LLM. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub assertions: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +pub struct EvalInput { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub user_message: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub user_attachments: Option>, +} + +/// Deterministic, LLM-free check run against an agent's output. +#[derive(Serialize, Deserialize, Debug, Clone)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum Assertion { + Contains { + value: String, + #[serde(default)] + case_sensitive: bool, + }, + NotContains { + value: String, + #[serde(default)] + case_sensitive: bool, + }, + Regex { + pattern: String, + }, + /// JSONPath-style dotted path into a structured output equals the given value. + JsonPathEquals { + path: String, + value: serde_json::Value, + }, + /// Output validates against the agent's configured `output_schema`. + OutputSchemaValid, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct AssertionResult { + pub assertion: Assertion, + pub passed: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub detail: Option, +} + +/// LLM-judge verdict for one case (0-100 score + pass/fail + rationale). +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct JudgeResult { + pub score: u8, + pub pass: bool, + pub summary: String, +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct EvalCaseResult { + pub case_id: String, + pub passed: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub output: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub assertions: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub judge: Option, + pub latency_ms: u64, +} + #[derive(Deserialize, Debug)] pub struct ProviderResource { #[serde(alias = "apiKey", default, deserialize_with = "empty_string_as_none")] diff --git a/backend/windmill-api/openapi.yaml b/backend/windmill-api/openapi.yaml index e9c57c3fb0523..5108c37c801c2 100644 --- a/backend/windmill-api/openapi.yaml +++ b/backend/windmill-api/openapi.yaml @@ -6768,6 +6768,79 @@ paths: schema: type: string + /w/{workspace}/ai_agents/run: + post: + summary: run a saved AI agent once on an input + operationId: runAiAgent + tags: + - ai_agent + parameters: + - $ref: "#/components/parameters/WorkspaceId" + requestBody: + description: agent resource path and input + required: true + content: + application/json: + schema: + type: object + required: + - agent + properties: + agent: + type: string + description: Path of a saved ai_agent resource + input: + type: object + properties: + user_message: + type: string + user_attachments: + type: array + items: + type: object + responses: + "200": + description: agent output + content: + application/json: + schema: {} + + /w/{workspace}/ai_agents/eval_case: + post: + summary: run a single eval case against a saved AI agent + operationId: evalAiAgentCase + tags: + - ai_agent + parameters: + - $ref: "#/components/parameters/WorkspaceId" + requestBody: + description: agent resource path, eval case, and optional judge provider + required: true + content: + application/json: + schema: + type: object + required: + - agent + - case + properties: + agent: + type: string + description: Path of a saved ai_agent resource + case: + type: object + description: AgentEvalCase (id, input, judge_checklist, assertions) + judge_provider: + type: object + description: Optional ai-provider override for the judge; defaults to the agent's provider + responses: + "200": + description: eval case result (pass/fail, judge verdict, assertion results, output) + content: + application/json: + schema: + type: object + /w/{workspace}/resources/delete/{path}: delete: summary: delete resource diff --git a/backend/windmill-api/src/ai_agents.rs b/backend/windmill-api/src/ai_agents.rs new file mode 100644 index 0000000000000..3871f514f08ca --- /dev/null +++ b/backend/windmill-api/src/ai_agents.rs @@ -0,0 +1,502 @@ +//! Endpoints for reusable AI agents (the `ai_agent` resource type). +//! +//! Agents are run by pushing a single-module flow-preview job whose one step is an +//! `AIAgent` module. For a saved agent that step is *linked* (`agent: Some(path)`), so the +//! brain config + tools resolve at runtime from the resource via the same hybrid-linking path +//! the flow executor uses. The eval judge is itself run as an inline `AIAgent` step with a +//! structured `output_schema`, so the whole feature reuses the existing job machinery. + +use std::collections::HashMap; +use std::time::Instant; + +use axum::{extract::Path, routing::post, Extension, Json, Router}; +use serde::Deserialize; +use serde_json::value::RawValue; + +use windmill_ai::types::{ + AgentEvalCase, Assertion, AssertionResult, EvalCaseResult, EvalInput, JudgeResult, +}; +use windmill_common::error::{self, Error}; +use windmill_common::flows::{FlowModule, FlowModuleValue, FlowValue, InputTransform}; +use windmill_common::jobs::JobPayload; +use windmill_common::worker::to_raw_value; +use windmill_queue::{push, PushArgs, PushIsolationLevel}; + +use crate::db::{ApiAuthed, DB}; +use crate::jobs::run_wait_result_internal; +use crate::utils::check_scopes; +use windmill_common::db::UserDB; +use windmill_common::users::username_to_permissioned_as; + +pub fn workspaced_service() -> Router { + Router::new() + .route("/run", post(run_agent)) + .route("/eval_case", post(eval_case)) +} + +#[derive(Deserialize)] +struct RunAgentRequest { + /// Path of a saved `ai_agent` resource. + agent: String, + #[serde(default)] + input: EvalInput, +} + +#[derive(Deserialize)] +struct EvalCaseRequest { + /// Path of a saved `ai_agent` resource. + agent: String, + case: AgentEvalCase, + /// Optional judge provider (ai-provider shape, may contain nested `$res:`). Defaults to the + /// agent's own provider when omitted. + #[serde(default)] + judge_provider: Option>, +} + +/// Run a saved agent once on a given input and return its raw output. +async fn run_agent( + authed: ApiAuthed, + Extension(db): Extension, + Extension(user_db): Extension, + Path(w_id): Path, + Json(req): Json, +) -> error::Result>> { + check_scopes(&authed, || format!("jobs:run"))?; + let flow = build_linked_agent_flow(&req.agent, &req.input); + let (output, success, _ms) = + run_preview(&db, &user_db, &authed, &w_id, flow).await?; + if !success { + return Err(Error::ExecutionErr(format!( + "agent run failed: {}", + output.get() + ))); + } + Ok(Json(output)) +} + +/// Run a single eval case against a saved agent: execute, apply deterministic assertions, and +/// (if the case has a checklist) grade with an LLM judge. +async fn eval_case( + authed: ApiAuthed, + Extension(db): Extension, + Extension(user_db): Extension, + Path(w_id): Path, + Json(req): Json, +) -> error::Result> { + check_scopes(&authed, || format!("jobs:run"))?; + let case = req.case; + + // 1. Run the agent under test. + let started = Instant::now(); + let flow = build_linked_agent_flow(&req.agent, &case.input); + let run = run_preview(&db, &user_db, &authed, &w_id, flow).await; + let latency_ms = started.elapsed().as_millis() as u64; + + let (output, success) = match run { + Ok((output, success, _)) => (output, success), + Err(e) => { + return Ok(Json(EvalCaseResult { + case_id: case.id, + passed: false, + output: None, + error: Some(e.to_string()), + assertions: vec![], + judge: None, + latency_ms, + })); + } + }; + + if !success { + return Ok(Json(EvalCaseResult { + case_id: case.id, + passed: false, + output: Some(output.clone()), + error: Some(format!("agent run failed: {}", output.get())), + assertions: vec![], + judge: None, + latency_ms, + })); + } + + // 2. Deterministic assertions. + let assertion_results: Vec = case + .assertions + .iter() + .map(|a| evaluate_assertion(a, &output)) + .collect(); + let assertions_pass = assertion_results.iter().all(|r| r.passed); + + // 3. LLM judge (only when a checklist is provided). + let judge = if case.judge_checklist.is_empty() { + None + } else { + let provider = match req.judge_provider { + Some(p) => p, + None => agent_provider(&db, &w_id, &req.agent).await?, + }; + match run_judge(&db, &user_db, &authed, &w_id, provider, &output, &case.judge_checklist) + .await + { + Ok(j) => Some(j), + Err(e) => Some(JudgeResult { + score: 0, + pass: false, + summary: format!("judge failed: {e}"), + }), + } + }; + + let passed = assertions_pass && judge.as_ref().map(|j| j.pass).unwrap_or(true); + + Ok(Json(EvalCaseResult { + case_id: case.id, + passed, + output: Some(output), + error: None, + assertions: assertion_results, + judge, + latency_ms, + })) +} + +// --------------------------------------------------------------------------- +// Flow construction +// --------------------------------------------------------------------------- + +fn static_transform(value: &T) -> InputTransform { + InputTransform::new_static_value(to_raw_value(value)) +} + +fn single_module_flow(value: FlowModuleValue) -> FlowValue { + FlowValue { + modules: vec![FlowModule { + id: "a".to_string(), + value: to_raw_value(&value), + ..Default::default() + }], + ..Default::default() + } +} + +/// A one-step flow whose AIAgent module links to the saved agent resource. Only the flow-local +/// inputs are set locally; the brain + tools resolve from the resource at runtime. +fn build_linked_agent_flow(agent_path: &str, input: &EvalInput) -> FlowValue { + let mut input_transforms = HashMap::new(); + if let Some(msg) = &input.user_message { + input_transforms.insert("user_message".to_string(), static_transform(msg)); + } + if let Some(att) = &input.user_attachments { + input_transforms.insert("user_attachments".to_string(), static_transform(att)); + } + single_module_flow(FlowModuleValue::AIAgent { + input_transforms, + tools: vec![], + tag: None, + omit_output_from_conversation: false, + agent: Some(agent_path.to_string()), + }) +} + +const JUDGE_SYSTEM_PROMPT: &str = "You are a strict evaluator. You are given the OUTPUT produced \ +by an AI agent and a CHECKLIST of acceptance criteria. Judge whether the output satisfies every \ +criterion. Respond ONLY via the structured output: `score` is an integer 0-100 reflecting overall \ +quality against the checklist, `pass` is true only if all criteria are satisfied, and `summary` \ +is one or two sentences explaining the verdict."; + +/// Run the judge as an inline AIAgent step with a structured output schema. +async fn run_judge( + db: &DB, + user_db: &UserDB, + authed: &ApiAuthed, + w_id: &str, + provider: Box, + output: &RawValue, + checklist: &[String], +) -> error::Result { + let checklist_rendered = checklist + .iter() + .map(|c| format!("- {c}")) + .collect::>() + .join("\n"); + let user_message = format!( + "OUTPUT:\n{}\n\nCHECKLIST:\n{}", + output.get(), + checklist_rendered + ); + + let output_schema = serde_json::json!({ + "type": "object", + "properties": { + "score": { "type": "integer", "description": "0-100 overall quality" }, + "pass": { "type": "boolean", "description": "true only if all criteria pass" }, + "summary": { "type": "string", "description": "short rationale" } + }, + "required": ["score", "pass", "summary"] + }); + + let mut it = HashMap::new(); + it.insert("provider".to_string(), InputTransform::new_static_value(provider)); + it.insert("system_prompt".to_string(), static_transform(&JUDGE_SYSTEM_PROMPT)); + it.insert("user_message".to_string(), static_transform(&user_message)); + it.insert("output_type".to_string(), static_transform(&"text")); + it.insert("output_schema".to_string(), static_transform(&output_schema)); + it.insert("max_iterations".to_string(), static_transform(&1)); + + let flow = single_module_flow(FlowModuleValue::AIAgent { + input_transforms: it, + tools: vec![], + tag: None, + omit_output_from_conversation: false, + agent: None, + }); + + let (result, success, _) = run_preview(db, user_db, authed, w_id, flow).await?; + if !success { + return Err(Error::ExecutionErr(format!( + "judge run failed: {}", + result.get() + ))); + } + parse_judge_result(&result) +} + +/// Lenient parse: tolerate float scores and out-of-range values from the model. +fn parse_judge_result(value: &RawValue) -> error::Result { + let v: serde_json::Value = serde_json::from_str(value.get()) + .map_err(|e| Error::internal_err(format!("judge returned non-JSON output: {e}")))?; + let score = v + .get("score") + .and_then(|s| s.as_f64()) + .map(|s| s.round().clamp(0.0, 100.0) as u8) + .unwrap_or(0); + let pass = v.get("pass").and_then(|p| p.as_bool()).unwrap_or(false); + let summary = v + .get("summary") + .and_then(|s| s.as_str()) + .unwrap_or_default() + .to_string(); + Ok(JudgeResult { score, pass, summary }) +} + +/// Read the saved agent's `provider` field to use as the default judge provider. +async fn agent_provider(db: &DB, w_id: &str, path: &str) -> error::Result> { + let value = sqlx::query_scalar!( + "SELECT value FROM resource WHERE workspace_id = $1 AND path = $2 AND resource_type = 'ai_agent'", + w_id, + path + ) + .fetch_optional(db) + .await? + .flatten() + .ok_or_else(|| Error::NotFound(format!("ai_agent resource {path} not found")))?; + + value + .get("provider") + .map(to_raw_value) + .ok_or_else(|| { + Error::BadRequest(format!( + "ai_agent resource {path} has no provider; pass judge_provider explicitly" + )) + }) +} + +// --------------------------------------------------------------------------- +// Job push + wait +// --------------------------------------------------------------------------- + +/// Push a single-step flow preview and block until it completes. Returns (result, success, ms). +async fn run_preview( + db: &DB, + user_db: &UserDB, + authed: &ApiAuthed, + w_id: &str, + flow: FlowValue, +) -> error::Result<(Box, bool, u64)> { + let started = Instant::now(); + let args: HashMap> = HashMap::new(); + let tx = PushIsolationLevel::Isolated(user_db.clone(), authed.clone().into()); + + let (uuid, tx) = push( + db, + tx, + w_id, + JobPayload::RawFlow { value: flow, path: Some("ai_agent/eval".to_string()), restarted_from: None }, + PushArgs::from(&args), + authed.display_username(), + &authed.email, + username_to_permissioned_as(&authed.username), + authed.token_prefix.as_deref(), + None, + None, + None, + None, + None, + None, + false, + false, + None, + true, + None, + None, + None, + None, + Some(&authed.clone().into()), + false, + None, + None, + None, + ) + .await?; + tx.commit().await?; + + let (result, success) = + run_wait_result_internal(db, uuid, w_id, None, false, &authed.username).await?; + Ok((result, success, started.elapsed().as_millis() as u64)) +} + +// --------------------------------------------------------------------------- +// Deterministic assertions +// --------------------------------------------------------------------------- + +/// Render an agent output as a plain string for text-based assertions. A JSON string output is +/// unwrapped; anything else uses its compact JSON text. +fn output_as_string(output: &RawValue) -> String { + match serde_json::from_str::(output.get()) { + Ok(serde_json::Value::String(s)) => s, + _ => output.get().to_string(), + } +} + +fn evaluate_assertion(assertion: &Assertion, output: &RawValue) -> AssertionResult { + let (passed, detail) = match assertion { + Assertion::Contains { value, case_sensitive } => { + let (haystack, needle) = casefold(output_as_string(output), value, *case_sensitive); + (haystack.contains(&needle), None) + } + Assertion::NotContains { value, case_sensitive } => { + let (haystack, needle) = casefold(output_as_string(output), value, *case_sensitive); + (!haystack.contains(&needle), None) + } + Assertion::Regex { pattern } => match regex::Regex::new(pattern) { + Ok(re) => (re.is_match(&output_as_string(output)), None), + Err(e) => (false, Some(format!("invalid regex: {e}"))), + }, + Assertion::JsonPathEquals { path, value } => { + match serde_json::from_str::(output.get()) { + Ok(json) => { + let found = json_path_get(&json, path); + (found == Some(value), found.map(|f| f.to_string())) + } + Err(e) => (false, Some(format!("output is not JSON: {e}"))), + } + } + Assertion::OutputSchemaValid => { + // v1: a basic structural check that the output is valid, non-null JSON. A full + // schema validation against the agent's output_schema is a fast-follow. + match serde_json::from_str::(output.get()) { + Ok(serde_json::Value::Null) | Err(_) => (false, None), + Ok(_) => (true, None), + } + } + }; + AssertionResult { assertion: assertion.clone(), passed, detail } +} + +fn casefold(haystack: String, needle: &str, case_sensitive: bool) -> (String, String) { + if case_sensitive { + (haystack, needle.to_string()) + } else { + (haystack.to_lowercase(), needle.to_lowercase()) + } +} + +/// Navigate a dotted JSON path (e.g. `data.items.0.name`) into a value. +fn json_path_get<'a>(value: &'a serde_json::Value, path: &str) -> Option<&'a serde_json::Value> { + let mut current = value; + for segment in path.split('.') { + current = match current { + serde_json::Value::Object(map) => map.get(segment)?, + serde_json::Value::Array(arr) => arr.get(segment.parse::().ok()?)?, + _ => return None, + }; + } + Some(current) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn raw(s: &str) -> Box { + RawValue::from_string(s.to_string()).unwrap() + } + + #[test] + fn output_as_string_unwraps_json_strings() { + assert_eq!(output_as_string(&raw("\"hello\"")), "hello"); + assert_eq!(output_as_string(&raw("{\"a\":1}")), "{\"a\":1}"); + assert_eq!(output_as_string(&raw("42")), "42"); + } + + #[test] + fn contains_assertion_respects_case_sensitivity() { + let out = raw("\"Hello World\""); + let sensitive = Assertion::Contains { value: "hello".into(), case_sensitive: true }; + assert!(!evaluate_assertion(&sensitive, &out).passed); + let insensitive = Assertion::Contains { value: "hello".into(), case_sensitive: false }; + assert!(evaluate_assertion(&insensitive, &out).passed); + } + + #[test] + fn not_contains_and_regex_and_schema_assertions() { + let out = raw("\"order 123 shipped\""); + assert!(evaluate_assertion( + &Assertion::NotContains { value: "refund".into(), case_sensitive: false }, + &out + ) + .passed); + assert!(evaluate_assertion(&Assertion::Regex { pattern: r"order \d+".into() }, &out).passed); + assert!(!evaluate_assertion(&Assertion::Regex { pattern: "(".into() }, &out).passed); + assert!(evaluate_assertion(&Assertion::OutputSchemaValid, &raw("{\"a\":1}")).passed); + assert!(!evaluate_assertion(&Assertion::OutputSchemaValid, &raw("null")).passed); + } + + #[test] + fn json_path_navigates_objects_and_arrays() { + let v = json!({"data": {"items": [{"name": "a"}, {"name": "b"}]}}); + assert_eq!(json_path_get(&v, "data.items.1.name"), Some(&json!("b"))); + assert_eq!(json_path_get(&v, "data.missing"), None); + assert_eq!(json_path_get(&v, "data.items.9"), None); + } + + #[test] + fn json_path_equals_assertion() { + let out = raw("{\"status\":\"ok\",\"count\":3}"); + assert!(evaluate_assertion( + &Assertion::JsonPathEquals { path: "status".into(), value: json!("ok") }, + &out + ) + .passed); + assert!(!evaluate_assertion( + &Assertion::JsonPathEquals { path: "count".into(), value: json!(5) }, + &out + ) + .passed); + } + + #[test] + fn parse_judge_result_is_lenient() { + // Float score is rounded and clamped; pass/summary parsed. + let j = parse_judge_result(&raw("{\"score\": 87.6, \"pass\": true, \"summary\": \"good\"}")) + .unwrap(); + assert_eq!(j.score, 88); + assert!(j.pass); + assert_eq!(j.summary, "good"); + // Missing fields default safely. + let j2 = parse_judge_result(&raw("{\"score\": 200}")).unwrap(); + assert_eq!(j2.score, 100); + assert!(!j2.pass); + } +} diff --git a/backend/windmill-api/src/lib.rs b/backend/windmill-api/src/lib.rs index cbbb406db4fc8..12a47821fd58f 100644 --- a/backend/windmill-api/src/lib.rs +++ b/backend/windmill-api/src/lib.rs @@ -66,6 +66,7 @@ use crate::scim_oss::has_scim_token; use windmill_common::error::AppError; mod ai; +mod ai_agents; mod ai_skills; mod apps; pub mod args; @@ -621,6 +622,7 @@ pub async fn run_server( Router::new() }) .nest("/ai", ai::workspaced_service()) + .nest("/ai_agents", ai_agents::workspaced_service()) .nest("/ai_skills", ai_skills::workspaced_service()) .nest("/npm_proxy", windmill_api_npm_proxy::workspaced_service()) .nest( diff --git a/backend/windmill-types/src/flows.rs b/backend/windmill-types/src/flows.rs index 006185dbd9184..bed0502c80d2f 100644 --- a/backend/windmill-types/src/flows.rs +++ b/backend/windmill-types/src/flows.rs @@ -1000,6 +1000,11 @@ pub enum FlowModuleValue { tag: Option, #[serde(default, skip_serializing_if = "is_false")] omit_output_from_conversation: bool, + /// When set, the agent brain config (provider/model/system prompt/etc.) and tools are + /// resolved at runtime from this `ai_agent` resource path (hybrid linking). The module's + /// `input_transforms` then only carry the flow-local inputs (user_message/user_attachments). + #[serde(default, skip_serializing_if = "Option::is_none")] + agent: Option, }, } @@ -1035,6 +1040,7 @@ struct UntaggedFlowModuleValue { assets: Option>, tools: Option>, omit_output_from_conversation: Option, + agent: Option, pass_flow_input_directly: Option, squash: Option, #[serde(flatten)] @@ -1133,13 +1139,14 @@ impl<'de> Deserialize<'de> for FlowModuleValue { "identity" => Ok(FlowModuleValue::Identity), "aiagent" => Ok(FlowModuleValue::AIAgent { input_transforms: untagged.input_transforms.unwrap_or_default(), - tools: untagged - .tools - .ok_or_else(|| serde::de::Error::missing_field("tools"))?, + // Tools default to empty: a linked agent (see `agent`) resolves its tools from + // the referenced resource, so the module itself may carry none. + tools: untagged.tools.unwrap_or_default(), tag: untagged.tag, omit_output_from_conversation: untagged .omit_output_from_conversation .unwrap_or(false), + agent: untagged.agent, }), other => Err(serde::de::Error::unknown_variant( other, diff --git a/backend/windmill-worker/src/ai_executor.rs b/backend/windmill-worker/src/ai_executor.rs index 2591cc649527a..5e6900fd7e713 100644 --- a/backend/windmill-worker/src/ai_executor.rs +++ b/backend/windmill-worker/src/ai_executor.rs @@ -35,7 +35,7 @@ use windmill_common::{ error::{self, Error}, flow_conversations::MessageType, flow_status::AgentAction, - flows::{FlowModule, FlowModuleValue, ToolValue}, + flows::{AgentTool, FlowModule, FlowModuleValue, ToolValue}, get_latest_hash_for_path, jobs::JobKind, scripts::get_full_hub_script_by_path, @@ -156,14 +156,20 @@ pub async fn handle_ai_agent_job( has_stream: &mut bool, ) -> Result, Error> { // build_args_map returns None if no $res:/$var: transforms needed, in which case use original args - let args = match build_args_map(job, client, conn).await? { + let local_args = match build_args_map(job, client, conn).await? { Some(transformed) => transformed, None => job.args.as_ref().map(|a| a.0.clone()).unwrap_or_default(), }; - let args = serde_json::from_str::(&serde_json::to_string(&args)?)?; - // Handle dry_run mode - check credentials without making API calls - if args.credentials_check { + // Handle dry_run mode - check credentials without making API calls. + // The credentials check is always invoked inline (provider present, no agent link and no + // parent flow), so it resolves before any flow/agent-resource context is fetched. + let is_credentials_check = local_args + .get("credentials_check") + .map(|v| v.get().trim() == "true") + .unwrap_or(false); + if is_credentials_check { + let args = serde_json::from_str::(&serde_json::to_string(&local_args)?)?; return handle_credentials_check(&args.provider).await; } @@ -249,14 +255,75 @@ pub async fn handle_ai_agent_job( let summary = module.summary.clone(); - let FlowModuleValue::AIAgent { tools, omit_output_from_conversation, .. } = - module.get_value()? + let FlowModuleValue::AIAgent { + tools: module_tools, omit_output_from_conversation, agent, .. + } = module.get_value()? else { return Err(Error::internal_err( "AI agent module is not an AI agent".to_string(), )); }; + // Hybrid linking: when the step references a saved `ai_agent` resource, the brain config + // (provider/model/system prompt/etc.) and the tool set come from that resource; only the + // flow-local inputs (user_message/user_attachments) come from the step. When not linked, the + // brain is the step's resolved input_transforms and the tools are the module's own. + // v1 boundary: a linked agent's tools come wholly from the resource; per-tool flow-context + // wiring is a fast-follow. + let (args, tools): (AIAgentArgs, Vec) = if let Some(agent_ref) = agent.as_deref() { + let agent_path = agent_ref + .trim_start_matches("$res:") + .trim_start_matches("res://"); + let resource_value = client + .get_resource_value_interpolated::( + agent_path, + Some(job.id.to_string()), + ) + .await + .map_err(|e| { + Error::internal_err(format!( + "failed to load ai_agent resource {agent_path}: {e}" + )) + })?; + let mut config = match resource_value { + serde_json::Value::Object(map) => map, + _ => { + return Err(Error::internal_err(format!( + "ai_agent resource {agent_path} must be a JSON object" + ))) + } + }; + // Overlay flow-local inputs onto the agent brain. + for key in ["user_message", "user_attachments"] { + if let Some(v) = local_args.get(key) { + config.insert( + key.to_string(), + serde_json::from_str(v.get()).unwrap_or(serde_json::Value::Null), + ); + } + } + // A linked agent is rigid: brain and tools come wholly from the resource. To diverge, the + // step must be unlinked (forked), at which point the config is copied into the step. + let tools = match config.remove("tools") { + Some(t) => serde_json::from_value::>(t).map_err(|e| { + Error::internal_err(format!( + "invalid tools in ai_agent resource {agent_path}: {e}" + )) + })?, + None => Vec::new(), + }; + let args = serde_json::from_value::(serde_json::Value::Object(config)) + .map_err(|e| { + Error::internal_err(format!( + "invalid ai_agent resource config {agent_path}: {e}" + )) + })?; + (args, tools) + } else { + let args = serde_json::from_str::(&serde_json::to_string(&local_args)?)?; + (args, module_tools) + }; + // Separate Windmill tools from MCP tools, websearch, and extract MCP resource configs let mut windmill_modules: Vec = Vec::new(); #[allow(unused_mut)] diff --git a/backend/windmill-worker/src/worker_lockfiles.rs b/backend/windmill-worker/src/worker_lockfiles.rs index d98a2cd2e06f8..e63d2260dfa8a 100644 --- a/backend/windmill-worker/src/worker_lockfiles.rs +++ b/backend/windmill-worker/src/worker_lockfiles.rs @@ -1250,6 +1250,7 @@ async fn lock_modules( mut tools, tag, omit_output_from_conversation, + agent, } => { // Extract FlowModules from tools and track their original indices // MCP tools don't need locking, so we filter them out @@ -1302,6 +1303,7 @@ async fn lock_modules( tools, tag, omit_output_from_conversation, + agent, } .into(); } diff --git a/docs/reusable-ai-agents.md b/docs/reusable-ai-agents.md new file mode 100644 index 0000000000000..94b3a824d179b --- /dev/null +++ b/docs/reusable-ai-agents.md @@ -0,0 +1,36 @@ +# Reusable AI Agents + +An AI agent flow step can be saved as a **reusable agent** — a resource of the built-in +`ai_agent` resource type that bundles the agent's brain (provider/model, system prompt, +temperature, output schema, memory…), its tool set, and an **eval suite**. Other flows can +link to the same agent, and edits to the agent propagate to every linked step. + +## Hybrid linking + +`FlowModuleValue::AIAgent` has an optional `agent` field holding the resource path. When set: + +- The brain config and tools are resolved at runtime from the resource + (`windmill-worker/src/ai_executor.rs`, via `get_resource_value_interpolated` — nested + provider `$res:` credentials resolve automatically). +- The flow step keeps only the flow-local inputs (`user_message`, `user_attachments`) in its + `input_transforms`. +- v1 boundary: a linked agent's tools come wholly from the resource (per-tool flow-context + wiring is a fast-follow). + +In the flow editor, the AI agent step's **Step Input** tab shows a bar to *Save as agent*, +*Use saved agent* (picker), or *Unlink* (snapshots the resolved config back into the step). + +## Evals + +A saved agent carries `evals.cases`. Each case has an input message, an optional LLM-judge +checklist, and optional deterministic assertions (`contains` / `not_contains` / `regex` / +`json_path_equals` / `output_schema_valid`). The **Evals** tab authors and runs them. + +Cases run for real (they cost tokens): the backend pushes a single-step AIAgent flow-preview +job and grades the output. The judge itself runs as an inline AIAgent step with a structured +output schema (`{score, pass, summary}`), defaulting to the agent's own provider. + +- `POST /w/{workspace}/ai_agents/run` — run a saved agent once on an input. +- `POST /w/{workspace}/ai_agents/eval_case` — run one eval case (execute + assertions + judge). + +Sharing works through standard resource folder permissions (save agents under `f/...`). diff --git a/frontend/src/lib/components/flows/agentResourceUtils.test.ts b/frontend/src/lib/components/flows/agentResourceUtils.test.ts new file mode 100644 index 0000000000000..18963e27cb340 --- /dev/null +++ b/frontend/src/lib/components/flows/agentResourceUtils.test.ts @@ -0,0 +1,38 @@ +import { describe, expect, it } from 'vitest' + +import { summarizeAgentBrain } from './agentResourceUtils' + +describe('summarizeAgentBrain', () => { + it('returns only set fields, in brain-key order, formatted', () => { + const rows = summarizeAgentBrain({ + provider: { kind: 'openai', model: 'gpt-4o', resource: '$res:f/x/openai' } as any, + system_prompt: 'You are helpful', + temperature: 0.7, + streaming: true, + max_iterations: 10 + }) + expect(rows).toEqual([ + { label: 'Model', value: 'openai · gpt-4o' }, + { label: 'System prompt', value: 'You are helpful' }, + { label: 'Streaming', value: 'on' }, + { label: 'Temperature', value: '0.7' }, + { label: 'Max iterations', value: '10' } + ]) + }) + + it('skips empty/undefined fields', () => { + expect(summarizeAgentBrain({ system_prompt: '', provider: undefined as any })).toEqual([]) + expect(summarizeAgentBrain(undefined)).toEqual([]) + }) + + it('summarizes structured fields compactly', () => { + const rows = summarizeAgentBrain({ + memory: { type: 'auto', context_length: 20 } as any, + output_schema: { type: 'object' } as any + }) + expect(rows).toEqual([ + { label: 'Memory', value: 'auto' }, + { label: 'Output schema', value: 'configured' } + ]) + }) +}) diff --git a/frontend/src/lib/components/flows/agentResourceUtils.ts b/frontend/src/lib/components/flows/agentResourceUtils.ts new file mode 100644 index 0000000000000..d6cfc7ec8718c --- /dev/null +++ b/frontend/src/lib/components/flows/agentResourceUtils.ts @@ -0,0 +1,155 @@ +import type { InputTransform } from '$lib/gen' + +// The brain fields stored flat in an `ai_agent` resource value. The flow-local inputs +// (user_message/user_attachments) are intentionally excluded — they are supplied per-flow. +export const AGENT_BRAIN_KEYS = [ + 'provider', + 'output_type', + 'system_prompt', + 'streaming', + 'memory', + 'output_schema', + 'max_completion_tokens', + 'temperature', + 'max_iterations' +] as const + +export const AGENT_FLOW_LOCAL_KEYS = ['user_message', 'user_attachments'] as const + +export type AgentTool = Record + +export type AgentAssertion = + | { kind: 'contains'; value: string; case_sensitive?: boolean } + | { kind: 'not_contains'; value: string; case_sensitive?: boolean } + | { kind: 'regex'; pattern: string } + | { kind: 'json_path_equals'; path: string; value: unknown } + | { kind: 'output_schema_valid' } + +export interface AgentEvalCase { + id: string + name?: string + input: { user_message?: string; user_attachments?: unknown[] } + judge_checklist?: string[] + assertions?: AgentAssertion[] +} + +export interface AgentEvalSuite { + cases: AgentEvalCase[] + judge?: unknown +} + +/** Brain keys whose step transform is non-static and would be dropped by a save-as-agent snapshot. */ +export function nonStaticBrainKeys( + inputTransforms: Record | undefined +): string[] { + return AGENT_BRAIN_KEYS.filter((key) => { + const t = inputTransforms?.[key] as any + return t && t.type !== 'static' + }) +} + +export interface AIAgentConfig { + provider?: unknown + output_type?: string + system_prompt?: string + streaming?: boolean + memory?: unknown + output_schema?: unknown + max_completion_tokens?: number + temperature?: number + max_iterations?: number + tools?: AgentTool[] + evals?: AgentEvalSuite +} + +/** Extract the static brain values from a step's input_transforms into a flat agent config. */ +export function inputTransformsToAgentConfig( + inputTransforms: Record | undefined, + tools: AgentTool[] | undefined, + evals?: AgentEvalSuite +): AIAgentConfig { + const config: AIAgentConfig = { tools: tools ?? [], evals: evals ?? { cases: [] } } + for (const key of AGENT_BRAIN_KEYS) { + const t = inputTransforms?.[key] as any + if (t && t.type === 'static' && t.value !== undefined) { + ;(config as any)[key] = t.value + } + } + return config +} + +/** + * Reduce the AI agent schema to only the flow-local inputs. Used when a step is linked to a saved + * agent: the brain fields come from the resource, so only user_message/user_attachments stay editable. + */ +export function flowLocalAgentSchema(schema: any): any { + if (!schema?.properties) { + return schema + } + const properties: Record = {} + for (const key of AGENT_FLOW_LOCAL_KEYS) { + if (schema.properties[key]) { + properties[key] = schema.properties[key] + } + } + return { + ...schema, + properties, + order: (schema.order ?? Object.keys(properties)).filter((k: string) => k in properties), + required: (schema.required ?? []).filter((k: string) => k in properties) + } +} + +const AGENT_BRAIN_LABELS: Record = { + provider: 'Model', + output_type: 'Output type', + system_prompt: 'System prompt', + streaming: 'Streaming', + memory: 'Memory', + output_schema: 'Output schema', + max_completion_tokens: 'Max tokens', + temperature: 'Temperature', + max_iterations: 'Max iterations' +} + +/** Flatten a saved agent's brain config into human-readable label/value rows for a read-only + * display on a linked step. Only set fields are returned, in the canonical brain-key order. */ +export function summarizeAgentBrain( + config: AIAgentConfig | undefined +): { label: string; value: string }[] { + const rows: { label: string; value: string }[] = [] + for (const key of AGENT_BRAIN_KEYS) { + const v = (config as any)?.[key] + if (v === undefined || v === null || v === '') continue + let value: string + if (key === 'provider') { + value = [v.kind, v.model].filter(Boolean).join(' · ') || 'configured' + } else if (key === 'memory') { + value = typeof v === 'object' ? (v.type ?? 'configured') : String(v) + } else if (key === 'output_schema') { + value = 'configured' + } else if (typeof v === 'boolean') { + value = v ? 'on' : 'off' + } else if (typeof v === 'object') { + value = JSON.stringify(v) + } else { + value = String(v) + } + rows.push({ label: AGENT_BRAIN_LABELS[key] ?? key, value }) + } + return rows +} + +/** Inverse: wrap brain config values as static input_transforms (used when unlinking a step). */ +export function agentConfigToInputTransforms( + config: AIAgentConfig +): Record { + const it: Record = {} + for (const key of AGENT_BRAIN_KEYS) { + const v = (config as any)[key] + if (v !== undefined) { + it[key] = { type: 'static', value: v } as InputTransform + } + } + return it +} diff --git a/frontend/src/lib/components/flows/content/AgentEvalsPanel.svelte b/frontend/src/lib/components/flows/content/AgentEvalsPanel.svelte new file mode 100644 index 0000000000000..c426994823470 --- /dev/null +++ b/frontend/src/lib/components/flows/content/AgentEvalsPanel.svelte @@ -0,0 +1,369 @@ + + +
+
+ Evals + {#if ranCount > 0} + {passedCount}/{ranCount} passed + {/if} +
+ + + +
+
+ + {#if loading} +
+ Loading… +
+ {:else if cases.length === 0} +

+ No eval cases yet. Add a case with an input message, then grade it with a judge checklist + and/or deterministic assertions. +

+ {:else} + {#each cases as c (c.id)} + {@const r = results[c.id]} +
+
+ + {#if r === 'running'} + + {:else if r} + {#if r.passed} + + Pass + + {:else} + + Fail + + {/if} + {#if r.judge} + judge {r.judge.score} + {/if} + {r.latency_ms}ms + {/if} + +
+ + + + + +
+ Assertions + {#each c.assertions ?? [] as a, i (i)} +
+ {a.kind} + {#if a.kind === 'contains' || a.kind === 'not_contains'} + + {:else if a.kind === 'regex'} + + {:else if a.kind === 'json_path_equals'} + + { + try { + a.value = JSON.parse(e.currentTarget.value) + } catch { + a.value = e.currentTarget.value + } + }} + placeholder="expected value" + /> + {:else} + output is valid non-null JSON + {/if} +
+ {/each} +
+
+ + +
+ {#snippet actions()} + + {/snippet} + + diff --git a/frontend/src/lib/components/flows/content/FlowModuleComponent.svelte b/frontend/src/lib/components/flows/content/FlowModuleComponent.svelte index 46a5c78d329d9..2fcf6fbce2b77 100644 --- a/frontend/src/lib/components/flows/content/FlowModuleComponent.svelte +++ b/frontend/src/lib/components/flows/content/FlowModuleComponent.svelte @@ -36,6 +36,9 @@ import FlowModuleSleep from './FlowModuleSleep.svelte' import FlowPathViewer from './FlowPathViewer.svelte' import InputTransformSchemaForm from '$lib/components/InputTransformSchemaForm.svelte' + import AgentResourceBar from './AgentResourceBar.svelte' + import AgentEvalsPanel from './AgentEvalsPanel.svelte' + import { flowLocalAgentSchema } from '../agentResourceUtils' import FlowModuleMockTransitionMessage from './FlowModuleMockTransitionMessage.svelte' import Tooltip from '$lib/components/Tooltip.svelte' import { SecondsInput } from '$lib/components/common' @@ -164,6 +167,9 @@ flowModule.value.type === 'aiagent' ) let visibleSelected = $derived(selected === 'chat' && !canShowChatTab ? 'inputs' : selected) + let agentLinked = $derived( + flowModule.value.type === 'aiagent' && Boolean((flowModule.value as any).agent) + ) let advancedSelected = $state('retries') let advancedRuntimeSelected = $state('concurrency') let s3Kind = $state('s3_client') @@ -1028,12 +1034,43 @@ label="Chat" /> {/if} + {#if flowModule.value.type === 'aiagent' && !isAgentTool} + + {/if} {#if !preprocessorModule && !isAgentTool} {/if} {#if visibleSelected === 'inputs' && (flowModule.value.type == 'rawscript' || flowModule.value.type == 'script' || flowModule.value.type == 'flow' || flowModule.value.type == 'aiagent')}
+ {#if flowModule.value.type === 'aiagent'} + (flowModule.value as any).agent, + (v) => { + if (flowModule.value.type === 'aiagent') { + ;(flowModule.value as any).agent = v + } + } + } + bind:inputTransforms={ + () => (flowModule.value as any).input_transforms, + (v) => { + if (flowModule.value.type === 'aiagent') { + ;(flowModule.value as any).input_transforms = v + } + } + } + bind:tools={ + () => (flowModule.value as any).tools ?? [], + (v) => { + if (flowModule.value.type === 'aiagent') { + ;(flowModule.value as any).tools = v + } + } + } + /> + {/if} { @@ -1127,6 +1166,17 @@ />
+ {:else if visibleSelected === 'evals' && flowModule.value.type === 'aiagent'} +
+ {#if agentLinked && (flowModule.value as any).agent} + + {:else} +
+ Save this step as a reusable agent (in the Step Input tab) to author and + run evals against it. +
+ {/if} +
{:else if visibleSelected === 'advanced'}