From 8e3400c439144797d61dcc2750ebece44a768685 Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Fri, 19 Jun 2026 01:00:07 +0300 Subject: [PATCH 1/8] Add read-only Cypher query engine over the in-memory graph Introduce a hand-written Cypher subset engine (lexer, recursive-descent parser, and tree-walking executor) that runs read-only queries directly against the in-memory Graph, with no external parser or database dependency and no graph duplication. The graph is exposed as a property graph: node labels (Document, Definition, Declaration plus kind sub-labels and the Namespace grouping) and relationship types (DEFINES, DECLARES, CONTAINS, INHERITS, INCLUDES, PREPENDS, EXTENDS, OWNS, ANCESTOR, DESCENDANT, REFERENCES) mirror the DOT exporter's schema. Supported syntax: MATCH with node patterns (label disjunction, inline properties), relationship patterns (directions, type lists, variable length), WHERE (comparisons, CONTAINS/STARTS WITH/ENDS WITH, AND/OR/NOT), RETURN with DISTINCT/aliases/aggregates, and ORDER BY/SKIP/LIMIT. Results render as a text table or JSON. A static description of the queryable schema (labels, relationship types, and properties) is also available via `cypher::schema`. 
--- rust/rubydex/src/query.rs | 2 + rust/rubydex/src/query/cypher/ast.rs | 204 +++++++ rust/rubydex/src/query/cypher/error.rs | 40 ++ rust/rubydex/src/query/cypher/executor.rs | 574 +++++++++++++++++++ rust/rubydex/src/query/cypher/format.rs | 104 ++++ rust/rubydex/src/query/cypher/lexer.rs | 205 +++++++ rust/rubydex/src/query/cypher/mod.rs | 51 ++ rust/rubydex/src/query/cypher/parser.rs | 548 ++++++++++++++++++ rust/rubydex/src/query/cypher/schema.rs | 495 ++++++++++++++++ rust/rubydex/src/query/cypher/schema_info.rs | 369 ++++++++++++ rust/rubydex/src/query/cypher/tests.rs | 294 ++++++++++ rust/rubydex/src/query/cypher/value.rs | 148 +++++ 12 files changed, 3034 insertions(+) create mode 100644 rust/rubydex/src/query/cypher/ast.rs create mode 100644 rust/rubydex/src/query/cypher/error.rs create mode 100644 rust/rubydex/src/query/cypher/executor.rs create mode 100644 rust/rubydex/src/query/cypher/format.rs create mode 100644 rust/rubydex/src/query/cypher/lexer.rs create mode 100644 rust/rubydex/src/query/cypher/mod.rs create mode 100644 rust/rubydex/src/query/cypher/parser.rs create mode 100644 rust/rubydex/src/query/cypher/schema.rs create mode 100644 rust/rubydex/src/query/cypher/schema_info.rs create mode 100644 rust/rubydex/src/query/cypher/tests.rs create mode 100644 rust/rubydex/src/query/cypher/value.rs diff --git a/rust/rubydex/src/query.rs b/rust/rubydex/src/query.rs index 85f6f4ec9..dd005ec25 100644 --- a/rust/rubydex/src/query.rs +++ b/rust/rubydex/src/query.rs @@ -15,6 +15,8 @@ use crate::model::keywords::{self, Keyword}; use crate::model::name::NameRef; use crate::model::visibility::Visibility; +pub mod cypher; + /// Controls how declaration names are matched against the search query. #[derive(Default)] pub enum MatchMode { diff --git a/rust/rubydex/src/query/cypher/ast.rs b/rust/rubydex/src/query/cypher/ast.rs new file mode 100644 index 000000000..867c46f86 --- /dev/null +++ b/rust/rubydex/src/query/cypher/ast.rs @@ -0,0 +1,204 @@ +//! 
Abstract syntax tree for the supported subset of Cypher. + +/// A complete parsed query. +#[derive(Debug, Clone, PartialEq)] +pub struct Query { + /// One or more comma-separated path patterns from the `MATCH` clause. + pub patterns: Vec, + pub where_clause: Option, + pub return_clause: Return, + pub order_by: Vec, + pub skip: Option, + pub limit: Option, +} + +/// A path pattern: a starting node followed by zero or more relationship/node hops. +#[derive(Debug, Clone, PartialEq)] +pub struct PathPattern { + pub start: NodePattern, + pub rest: Vec<(RelPattern, NodePattern)>, +} + +/// A node pattern such as `(c:Class {name: 'Foo'})` or `(c:Class|Module)`. +#[derive(Debug, Clone, PartialEq)] +pub struct NodePattern { + pub var: Option, + /// Labels in a disjunction: a node matches if it has **any** of these labels. Empty means + /// "any node". + pub labels: Vec, + pub props: Vec<(String, Literal)>, +} + +/// A relationship pattern such as `-[:INHERITS*1..3]->`. +#[derive(Debug, Clone, PartialEq)] +pub struct RelPattern { + pub var: Option, + /// Relationship types; empty means "any type". + pub types: Vec, + pub direction: Direction, + /// Variable-length specification, if the pattern used `*`. + pub length: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Direction { + /// `-[]->` + Outgoing, + /// `<-[]-` + Incoming, + /// `-[]-` + Both, +} + +/// A variable-length relationship bound, from `*`, `*n`, `*min..`, `*..max`, or `*min..max`. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct VarLength { + pub min: u32, + pub max: Option, +} + +#[derive(Debug, Clone, PartialEq)] +pub enum Literal { + Str(String), + Int(i64), + Bool(bool), + Null, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct Return { + pub distinct: bool, + pub items: Vec, +} + +#[derive(Debug, Clone, PartialEq)] +pub struct ReturnItem { + pub expr: Expr, + pub alias: Option, +} + +impl ReturnItem { + /// The output column name: the explicit alias, or a name derived from the expression. + #[must_use] + pub fn column_name(&self) -> String { + self.alias.clone().unwrap_or_else(|| self.expr.display_name()) + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum Expr { + Var(String), + Property(String, String), + Literal(Literal), + Not(Box), + And(Box, Box), + Or(Box, Box), + Compare(Box, CmpOp, Box), + Aggregate { + func: AggFn, + arg: Option>, + distinct: bool, + }, +} + +impl Expr { + /// Whether this expression tree contains an aggregate function call. + #[must_use] + #[allow(clippy::match_same_arms)] + pub fn contains_aggregate(&self) -> bool { + match self { + Expr::Aggregate { .. } => true, + Expr::Not(inner) => inner.contains_aggregate(), + Expr::And(a, b) | Expr::Or(a, b) => a.contains_aggregate() || b.contains_aggregate(), + Expr::Compare(a, _, b) => a.contains_aggregate() || b.contains_aggregate(), + Expr::Var(_) | Expr::Property(..) | Expr::Literal(_) => false, + } + } + + /// A human-readable name for the expression, used as a default column header. 
+ #[must_use] + pub fn display_name(&self) -> String { + match self { + Expr::Var(v) => v.clone(), + Expr::Property(v, p) => format!("{v}.{p}"), + Expr::Literal(lit) => match lit { + Literal::Str(s) => format!("'{s}'"), + Literal::Int(i) => i.to_string(), + Literal::Bool(b) => b.to_string(), + Literal::Null => "null".to_string(), + }, + Expr::Not(inner) => format!("NOT {}", inner.display_name()), + Expr::And(a, b) => format!("{} AND {}", a.display_name(), b.display_name()), + Expr::Or(a, b) => format!("{} OR {}", a.display_name(), b.display_name()), + Expr::Compare(a, op, b) => format!("{} {} {}", a.display_name(), op.as_str(), b.display_name()), + Expr::Aggregate { func, arg, distinct } => { + let inner = match arg { + Some(expr) => expr.display_name(), + None => "*".to_string(), + }; + let distinct = if *distinct { "DISTINCT " } else { "" }; + format!("{}({distinct}{inner})", func.as_str()) + } + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CmpOp { + Eq, + Neq, + Lt, + Lte, + Gt, + Gte, + Contains, + StartsWith, + EndsWith, +} + +impl CmpOp { + #[must_use] + pub fn as_str(self) -> &'static str { + match self { + CmpOp::Eq => "=", + CmpOp::Neq => "<>", + CmpOp::Lt => "<", + CmpOp::Lte => "<=", + CmpOp::Gt => ">", + CmpOp::Gte => ">=", + CmpOp::Contains => "CONTAINS", + CmpOp::StartsWith => "STARTS WITH", + CmpOp::EndsWith => "ENDS WITH", + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AggFn { + Count, + Collect, + Min, + Max, + Sum, + Avg, +} + +impl AggFn { + #[must_use] + pub fn as_str(self) -> &'static str { + match self { + AggFn::Count => "count", + AggFn::Collect => "collect", + AggFn::Min => "min", + AggFn::Max => "max", + AggFn::Sum => "sum", + AggFn::Avg => "avg", + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub struct OrderItem { + pub expr: Expr, + pub descending: bool, +} diff --git a/rust/rubydex/src/query/cypher/error.rs b/rust/rubydex/src/query/cypher/error.rs new file mode 100644 index 
000000000..100fc37c0 --- /dev/null +++ b/rust/rubydex/src/query/cypher/error.rs @@ -0,0 +1,40 @@ +use std::fmt; + +/// An error produced while lexing, parsing, or executing a Cypher query. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum CypherError { + /// A lexing or parsing error, with a byte position into the source query. + Syntax { message: String, position: usize }, + /// A semantic or execution error (e.g. unknown property, type mismatch). + Execution { message: String }, +} + +impl CypherError { + pub(crate) fn syntax(message: impl Into, position: usize) -> Self { + Self::Syntax { + message: message.into(), + position, + } + } + + pub(crate) fn execution(message: impl Into) -> Self { + Self::Execution { + message: message.into(), + } + } +} + +impl fmt::Display for CypherError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + CypherError::Syntax { message, position } => { + write!(f, "Cypher syntax error at position {position}: {message}") + } + CypherError::Execution { message } => { + write!(f, "Cypher execution error: {message}") + } + } + } +} + +impl std::error::Error for CypherError {} diff --git a/rust/rubydex/src/query/cypher/executor.rs b/rust/rubydex/src/query/cypher/executor.rs new file mode 100644 index 000000000..88c746514 --- /dev/null +++ b/rust/rubydex/src/query/cypher/executor.rs @@ -0,0 +1,574 @@ +use std::collections::HashMap; +use std::collections::HashSet; + +use crate::model::graph::Graph; + +use super::ast::{AggFn, CmpOp, Direction, Expr, Literal, NodePattern, OrderItem, PathPattern, Query, ReturnItem}; +use super::error::CypherError; +use super::schema::{self, NodeRef, RelType}; +use super::value::CypherValue; + +/// The tabular result of executing a query. +#[derive(Debug, Clone, PartialEq)] +pub struct ResultSet { + pub columns: Vec, + pub rows: Vec>, +} + +/// A single binding row: maps pattern variable names to graph nodes. +type Row = HashMap; + +/// Executes a parsed query against the graph. 
+/// +/// # Errors +/// +/// Returns a [`CypherError::Execution`] for unknown relationship types, aggregates used in `WHERE`, +/// or `ORDER BY` expressions that cannot be resolved under aggregation. +pub fn execute(graph: &Graph, query: &Query) -> Result { + let mut executor = Executor { + graph, + reverse_cache: HashMap::new(), + }; + executor.run(query) +} + +struct Executor<'a> { + graph: &'a Graph, + reverse_cache: HashMap>>, +} + +impl Executor<'_> { + fn run(&mut self, query: &Query) -> Result { + let mut rows: Vec = vec![Row::new()]; + for pattern in &query.patterns { + rows = self.eval_pattern(rows, pattern)?; + } + + if let Some(predicate) = &query.where_clause { + let mut filtered = Vec::with_capacity(rows.len()); + for row in rows { + if self.eval_expr(&row, predicate)?.is_truthy() { + filtered.push(row); + } + } + rows = filtered; + } + + self.project(query, &rows) + } + + // ---- Pattern matching ------------------------------------------------ + + fn eval_pattern(&mut self, base: Vec, pattern: &PathPattern) -> Result, CypherError> { + let mut working: Vec<(Row, NodeRef)> = Vec::new(); + + for row in base { + for node in self.candidates_for_node(&row, &pattern.start) { + let mut new_row = row.clone(); + if let Some(var) = &pattern.start.var { + new_row.insert(var.clone(), node); + } + working.push((new_row, node)); + } + } + + for (rel, node) in &pattern.rest { + working = self.expand_step(working, rel, node)?; + } + + Ok(working.into_iter().map(|(row, _)| row).collect()) + } + + fn candidates_for_node(&self, row: &Row, pattern: &NodePattern) -> Vec { + if let Some(var) = &pattern.var + && let Some(existing) = row.get(var) + { + return if self.node_matches(*existing, pattern) { + vec![*existing] + } else { + Vec::new() + }; + } + + schema::scan(self.graph, &pattern.labels) + .into_iter() + .filter(|node| self.props_match(*node, pattern)) + .collect() + } + + fn expand_step( + &mut self, + working: Vec<(Row, NodeRef)>, + rel: &super::ast::RelPattern, + 
node: &NodePattern, + ) -> Result, CypherError> { + let rel_types = resolve_rel_types(rel)?; + let mut next = Vec::new(); + + for (row, current) in working { + let targets = self.step_targets(current, &rel_types, rel.direction, rel.length.as_ref()); + for target in targets { + if !self.node_matches(target, node) { + continue; + } + if let Some(var) = &node.var + && let Some(existing) = row.get(var) + && *existing != target + { + continue; + } + let mut new_row = row.clone(); + if let Some(var) = &node.var { + new_row.insert(var.clone(), target); + } + next.push((new_row, target)); + } + } + + Ok(next) + } + + fn step_targets( + &mut self, + node: NodeRef, + rel_types: &[RelType], + direction: Direction, + length: Option<&super::ast::VarLength>, + ) -> Vec { + if let Some(var_length) = length { + return self.var_length_targets(node, rel_types, direction, var_length); + } + + let mut seen = HashSet::new(); + let mut targets = Vec::new(); + for rel in rel_types { + for target in self.step_once(node, *rel, direction) { + if seen.insert(target) { + targets.push(target); + } + } + } + targets + } + + fn step_once(&mut self, node: NodeRef, rel: RelType, direction: Direction) -> Vec { + match direction { + Direction::Outgoing => schema::expand_out(self.graph, node, rel), + Direction::Incoming => self.incoming(node, rel), + Direction::Both => { + let mut seen = HashSet::new(); + let mut targets = Vec::new(); + for target in schema::expand_out(self.graph, node, rel) + .into_iter() + .chain(self.incoming(node, rel)) + { + if seen.insert(target) { + targets.push(target); + } + } + targets + } + } + } + + fn incoming(&mut self, node: NodeRef, rel: RelType) -> Vec { + if !self.reverse_cache.contains_key(&rel) { + let mut reverse: HashMap> = HashMap::new(); + for source in schema::rel_source_nodes(self.graph, rel) { + for target in schema::expand_out(self.graph, source, rel) { + reverse.entry(target).or_default().push(source); + } + } + self.reverse_cache.insert(rel, reverse); + 
} + + self.reverse_cache + .get(&rel) + .and_then(|reverse| reverse.get(&node)) + .cloned() + .unwrap_or_default() + } + + fn var_length_targets( + &mut self, + start: NodeRef, + rel_types: &[RelType], + direction: Direction, + var_length: &super::ast::VarLength, + ) -> Vec { + let max = var_length.max.unwrap_or(u32::MAX); + let mut results = Vec::new(); + let mut result_seen = HashSet::new(); + + if var_length.min == 0 { + results.push(start); + result_seen.insert(start); + } + + let mut visited = HashSet::new(); + visited.insert(start); + let mut frontier = vec![start]; + let mut depth = 0u32; + + while depth < max && !frontier.is_empty() { + depth += 1; + let mut next = Vec::new(); + for node in &frontier { + for rel in rel_types { + for target in self.step_once(*node, *rel, direction) { + if visited.insert(target) { + next.push(target); + if depth >= var_length.min && result_seen.insert(target) { + results.push(target); + } + } + } + } + } + frontier = next; + } + + results + } + + fn node_matches(&self, node: NodeRef, pattern: &NodePattern) -> bool { + if !schema::matches_labels(self.graph, node, &pattern.labels) { + return false; + } + self.props_match(node, pattern) + } + + fn props_match(&self, node: NodeRef, pattern: &NodePattern) -> bool { + pattern + .props + .iter() + .all(|(key, literal)| schema::property(self.graph, node, key) == literal_to_value(literal)) + } + + // ---- Expression evaluation ------------------------------------------- + + fn eval_expr(&self, row: &Row, expr: &Expr) -> Result { + match expr { + Expr::Literal(literal) => Ok(literal_to_value(literal)), + Expr::Var(name) => Ok(row.get(name).map_or(CypherValue::Null, |node| self.node_value(*node))), + Expr::Property(var, prop) => Ok(row + .get(var) + .map_or(CypherValue::Null, |node| schema::property(self.graph, *node, prop))), + Expr::Not(inner) => Ok(CypherValue::Bool(!self.eval_expr(row, inner)?.is_truthy())), + Expr::And(a, b) => Ok(CypherValue::Bool( + self.eval_expr(row, 
a)?.is_truthy() && self.eval_expr(row, b)?.is_truthy(), + )), + Expr::Or(a, b) => Ok(CypherValue::Bool( + self.eval_expr(row, a)?.is_truthy() || self.eval_expr(row, b)?.is_truthy(), + )), + Expr::Compare(a, op, b) => { + let left = self.eval_expr(row, a)?; + let right = self.eval_expr(row, b)?; + Ok(CypherValue::Bool(compare_values(&left, *op, &right))) + } + Expr::Aggregate { .. } => Err(CypherError::execution("aggregate functions are only allowed in RETURN")), + } + } + + fn node_value(&self, node: NodeRef) -> CypherValue { + CypherValue::Node { + label: schema::node_label(self.graph, node), + name: schema::node_name(self.graph, node), + } + } + + // ---- Projection ------------------------------------------------------ + + fn project(&self, query: &Query, rows: &[Row]) -> Result { + let items = &query.return_clause.items; + let columns: Vec = items.iter().map(ReturnItem::column_name).collect(); + + let has_aggregate = items.iter().any(|item| item.expr.contains_aggregate()); + + let mut values = if has_aggregate { + self.project_aggregated(query, rows)? + } else { + self.project_simple(query, rows)? + }; + + if query.return_clause.distinct { + dedupe(&mut values); + } + + apply_order_skip_limit(query, &mut values, &columns)?; + + Ok(ResultSet { columns, rows: values }) + } + + fn project_simple(&self, query: &Query, rows: &[Row]) -> Result>, CypherError> { + let items = &query.return_clause.items; + let mut output = Vec::with_capacity(rows.len()); + for row in rows { + let mut values = Vec::with_capacity(items.len()); + for item in items { + values.push(self.eval_expr(row, &item.expr)?); + } + output.push(values); + } + Ok(output) + } + + fn project_aggregated(&self, query: &Query, rows: &[Row]) -> Result>, CypherError> { + let items = &query.return_clause.items; + + // Group rows by the values of the non-aggregate (grouping) return items. 
+ let mut group_order: Vec> = Vec::new(); + let mut groups: HashMap, Vec> = HashMap::new(); + + for (index, row) in rows.iter().enumerate() { + let mut key = Vec::new(); + for item in items { + if !item.expr.contains_aggregate() { + key.push(self.eval_expr(row, &item.expr)?); + } + } + if !groups.contains_key(&key) { + group_order.push(key.clone()); + } + groups.entry(key).or_default().push(index); + } + + // With no grouping keys and no input rows, aggregates still produce a single row. + let grouping_keys = items.iter().filter(|item| !item.expr.contains_aggregate()).count(); + if group_order.is_empty() && grouping_keys == 0 { + group_order.push(Vec::new()); + groups.insert(Vec::new(), Vec::new()); + } + + let mut output = Vec::with_capacity(group_order.len()); + for key in group_order { + let row_indices = &groups[&key]; + let group_rows: Vec<&Row> = row_indices.iter().map(|index| &rows[*index]).collect(); + + let mut values = Vec::with_capacity(items.len()); + let mut key_iter = key.iter(); + for item in items { + if item.expr.contains_aggregate() { + values.push(self.eval_aggregate(&item.expr, &group_rows)?); + } else { + values.push(key_iter.next().cloned().unwrap_or(CypherValue::Null)); + } + } + output.push(values); + } + + Ok(output) + } + + fn eval_aggregate(&self, expr: &Expr, group: &[&Row]) -> Result { + let Expr::Aggregate { func, arg, distinct } = expr else { + return Err(CypherError::execution("expected an aggregate function")); + }; + + // count(*) does not evaluate an argument. 
+ if *func == AggFn::Count && arg.is_none() { + return Ok(CypherValue::Int(i64::try_from(group.len()).unwrap_or(i64::MAX))); + } + + let arg_expr = arg + .as_ref() + .ok_or_else(|| CypherError::execution("aggregate function requires an argument"))?; + + let mut values = Vec::new(); + for row in group { + let value = self.eval_expr(row, arg_expr)?; + if value != CypherValue::Null { + values.push(value); + } + } + + if *distinct { + values = dedupe_values(values); + } + + Ok(match func { + AggFn::Count => CypherValue::Int(i64::try_from(values.len()).unwrap_or(i64::MAX)), + AggFn::Collect => CypherValue::List(values), + AggFn::Min => values + .into_iter() + .min_by(CypherValue::total_cmp) + .unwrap_or(CypherValue::Null), + AggFn::Max => values + .into_iter() + .max_by(CypherValue::total_cmp) + .unwrap_or(CypherValue::Null), + AggFn::Sum => CypherValue::Int(values.iter().filter_map(CypherValue::as_int).sum()), + AggFn::Avg => { + let numbers: Vec = values.iter().filter_map(CypherValue::as_int).collect(); + if numbers.is_empty() { + CypherValue::Null + } else { + CypherValue::Int(numbers.iter().sum::() / i64::try_from(numbers.len()).unwrap_or(1)) + } + } + }) + } +} + +fn apply_order_skip_limit( + query: &Query, + values: &mut Vec>, + columns: &[String], +) -> Result<(), CypherError> { + // ORDER BY operates on the projected value rows: each ORDER BY expression must resolve to + // a RETURN column (by identical expression or by naming a column/alias). 
+ if !query.order_by.is_empty() { + let mut keys: Vec = Vec::with_capacity(query.order_by.len()); + for item in &query.order_by { + keys.push(resolve_order_column(item, &query.return_clause.items, columns)?); + } + + values.sort_by(|a, b| { + for (key_index, order_item) in keys.iter().zip(&query.order_by) { + let ordering = a[*key_index].total_cmp(&b[*key_index]); + let ordering = if order_item.descending { + ordering.reverse() + } else { + ordering + }; + if ordering != std::cmp::Ordering::Equal { + return ordering; + } + } + std::cmp::Ordering::Equal + }); + } + + if let Some(skip) = query.skip { + if skip >= values.len() { + values.clear(); + } else { + values.drain(0..skip); + } + } + + if let Some(limit) = query.limit + && values.len() > limit + { + values.truncate(limit); + } + + Ok(()) +} + +fn resolve_order_column( + order_item: &OrderItem, + items: &[ReturnItem], + columns: &[String], +) -> Result { + // Match by identical return expression first. + if let Some(index) = items.iter().position(|item| item.expr == order_item.expr) { + return Ok(index); + } + + // Otherwise, a bare variable in ORDER BY may name a return column or alias. 
+ if let Expr::Var(name) = &order_item.expr + && let Some(index) = columns.iter().position(|column| column == name) + { + return Ok(index); + } + + Err(CypherError::execution(format!( + "ORDER BY expression `{}` must also appear in RETURN", + order_item.expr.display_name() + ))) +} + +fn resolve_rel_types(rel: &super::ast::RelPattern) -> Result, CypherError> { + if rel.types.is_empty() { + return Ok(RelType::all().to_vec()); + } + + let mut types = Vec::with_capacity(rel.types.len()); + for name in &rel.types { + let rel_type = RelType::parse(name) + .ok_or_else(|| CypherError::execution(format!("unknown relationship type `{name}`")))?; + types.push(rel_type); + } + Ok(types) +} + +fn literal_to_value(literal: &Literal) -> CypherValue { + match literal { + Literal::Str(value) => CypherValue::Str(value.clone()), + Literal::Int(value) => CypherValue::Int(*value), + Literal::Bool(value) => CypherValue::Bool(*value), + Literal::Null => CypherValue::Null, + } +} + +fn compare_values(left: &CypherValue, op: CmpOp, right: &CypherValue) -> bool { + if matches!(left, CypherValue::Null) || matches!(right, CypherValue::Null) { + return false; + } + + match op { + CmpOp::Eq => values_equal(left, right), + CmpOp::Neq => !values_equal(left, right), + CmpOp::Lt | CmpOp::Lte | CmpOp::Gt | CmpOp::Gte => { + if !same_type(left, right) { + return false; + } + let ordering = left.total_cmp(right); + match op { + CmpOp::Lt => ordering.is_lt(), + CmpOp::Lte => ordering.is_le(), + CmpOp::Gt => ordering.is_gt(), + CmpOp::Gte => ordering.is_ge(), + _ => unreachable!(), + } + } + CmpOp::Contains => string_op(left, right, |haystack, needle| haystack.contains(needle)), + CmpOp::StartsWith => string_op(left, right, |haystack, needle| haystack.starts_with(needle)), + CmpOp::EndsWith => string_op(left, right, |haystack, needle| haystack.ends_with(needle)), + } +} + +fn values_equal(left: &CypherValue, right: &CypherValue) -> bool { + same_type(left, right) && left == right +} + +fn 
same_type(left: &CypherValue, right: &CypherValue) -> bool { + matches!( + (left, right), + (CypherValue::Bool(_), CypherValue::Bool(_)) + | (CypherValue::Int(_), CypherValue::Int(_)) + | (CypherValue::Str(_), CypherValue::Str(_)) + | (CypherValue::Node { .. }, CypherValue::Node { .. }) + | (CypherValue::List(_), CypherValue::List(_)) + ) +} + +fn string_op(left: &CypherValue, right: &CypherValue, op: impl Fn(&str, &str) -> bool) -> bool { + match (left.as_str(), right.as_str()) { + (Some(haystack), Some(needle)) => op(haystack, needle), + _ => false, + } +} + +fn dedupe(output: &mut Vec>) { + let mut seen: Vec> = Vec::new(); + output.retain(|values| { + if seen.iter().any(|existing| existing == values) { + false + } else { + seen.push(values.clone()); + true + } + }); +} + +fn dedupe_values(values: Vec) -> Vec { + let mut result: Vec = Vec::new(); + for value in values { + if !result.contains(&value) { + result.push(value); + } + } + result +} diff --git a/rust/rubydex/src/query/cypher/format.rs b/rust/rubydex/src/query/cypher/format.rs new file mode 100644 index 000000000..6104f0952 --- /dev/null +++ b/rust/rubydex/src/query/cypher/format.rs @@ -0,0 +1,104 @@ +use std::fmt::Write; + +use super::executor::ResultSet; +use super::value::{CypherValue, write_json_string}; + +/// The output format for query results. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum OutputFormat { + Table, + Json, +} + +/// Renders a result set in the requested format. 
+#[must_use]
+pub fn format(result: &ResultSet, format: OutputFormat) -> String {
+    match format {
+        OutputFormat::Table => format_table(result),
+        OutputFormat::Json => format_json(result),
+    }
+}
+
+fn format_table(result: &ResultSet) -> String {
+    if result.columns.is_empty() {
+        return String::new();
+    }
+
+    let rendered: Vec<Vec<String>> = result
+        .rows
+        .iter()
+        .map(|row| row.iter().map(CypherValue::to_display_string).collect())
+        .collect();
+
+    let mut widths: Vec<usize> = result.columns.iter().map(|name| name.chars().count()).collect();
+    for row in &rendered {
+        for (index, cell) in row.iter().enumerate() {
+            if let Some(width) = widths.get_mut(index) {
+                *width = (*width).max(cell.chars().count()); // chars, the same unit as the header widths
+            }
+        }
+    }
+
+    let mut output = String::new();
+    push_row(&mut output, &result.columns, &widths);
+    push_separator(&mut output, &widths);
+    for row in &rendered {
+        push_row(&mut output, row, &widths);
+    }
+
+    let count = result.rows.len();
+    let suffix = if count == 1 { "row" } else { "rows" };
+    let _ = write!(output, "\n{count} {suffix}\n");
+    output
+}
+
+fn push_row(output: &mut String, cells: &[String], widths: &[usize]) {
+    for (index, width) in widths.iter().enumerate() {
+        if index > 0 {
+            output.push_str(" | ");
+        }
+        let empty = String::new();
+        let cell = cells.get(index).unwrap_or(&empty);
+        let pad = width.saturating_sub(cell.chars().count());
+        output.push_str(cell);
+        for _ in 0..pad {
+            output.push(' ');
+        }
+    }
+    output.push('\n');
+}
+
+fn push_separator(output: &mut String, widths: &[usize]) {
+    for (index, width) in widths.iter().enumerate() {
+        if index > 0 {
+            output.push_str("-+-");
+        }
+        for _ in 0..*width {
+            output.push('-');
+        }
+    }
+    output.push('\n');
+}
+
+fn format_json(result: &ResultSet) -> String {
+    let mut output = String::from("[");
+    for (row_index, row) in result.rows.iter().enumerate() {
+        if row_index > 0 {
+            output.push(',');
+        }
+        output.push('{');
+        for (column_index, column) in result.columns.iter().enumerate() {
+            if column_index > 0 {
+                output.push(',');
+            }
+            write_json_string(&mut output, column);
+            output.push(':');
+            row.get(column_index)
+                .unwrap_or(&CypherValue::Null)
+                .write_json(&mut output);
+        }
+        output.push('}');
+    }
+    output.push(']');
+    output
+}
diff --git a/rust/rubydex/src/query/cypher/lexer.rs b/rust/rubydex/src/query/cypher/lexer.rs
new file mode 100644
index 000000000..de2fb24c9
--- /dev/null
+++ b/rust/rubydex/src/query/cypher/lexer.rs
@@ -0,0 +1,205 @@
+use super::error::CypherError;
+
+/// A lexical token together with the byte position where it starts in the source query.
+#[derive(Debug, Clone, PartialEq)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub position: usize,
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub enum TokenKind {
+    Ident(String),
+    Int(i64),
+    Str(String),
+    LParen,
+    RParen,
+    LBracket,
+    RBracket,
+    LBrace,
+    RBrace,
+    Comma,
+    Colon,
+    Dot,
+    DotDot,
+    Star,
+    Pipe,
+    Eq,
+    Neq,
+    Lt,
+    Lte,
+    Gt,
+    Gte,
+    Minus,
+}
+
+/// Tokenizes a Cypher query string into a flat token stream.
+///
+/// # Errors
+///
+/// Returns a [`CypherError::Syntax`] if an unterminated string or unexpected character is found.
+pub fn tokenize(input: &str) -> Result<Vec<Token>, CypherError> {
+    let chars: Vec<(usize, char)> = input.char_indices().collect(); // (byte offset, char) pairs
+    let mut tokens = Vec::new();
+    let mut index = 0;
+
+    while index < chars.len() {
+        let (start, ch) = chars[index]; // `start` is a byte offset; `index` counts chars
+
+        if ch.is_whitespace() {
+            index += 1;
+            continue;
+        }
+
+        let lookahead = chars.get(index + 1).map(|&(_, next)| next); // peek for `..`, `<=`, `<>`, `>=`
+
+        match ch {
+            '(' => push(&mut tokens, TokenKind::LParen, start, &mut index),
+            ')' => push(&mut tokens, TokenKind::RParen, start, &mut index),
+            '[' => push(&mut tokens, TokenKind::LBracket, start, &mut index),
+            ']' => push(&mut tokens, TokenKind::RBracket, start, &mut index),
+            '{' => push(&mut tokens, TokenKind::LBrace, start, &mut index),
+            '}' => push(&mut tokens, TokenKind::RBrace, start, &mut index),
+            ',' => push(&mut tokens, TokenKind::Comma, start, &mut index),
+            ':' => push(&mut tokens, TokenKind::Colon, start, &mut index),
+            '*' => push(&mut tokens, TokenKind::Star, start, &mut index),
+            '|' => push(&mut tokens, TokenKind::Pipe, start, &mut index),
+            '-' => push(&mut tokens, TokenKind::Minus, start, &mut index),
+            '=' => push(&mut tokens, TokenKind::Eq, start, &mut index),
+            '.' => {
+                if lookahead == Some('.') {
+                    tokens.push(Token {
+                        kind: TokenKind::DotDot,
+                        position: start,
+                    });
+                    index += 2;
+                } else {
+                    push(&mut tokens, TokenKind::Dot, start, &mut index);
+                }
+            }
+            '<' => {
+                if lookahead == Some('=') {
+                    tokens.push(Token {
+                        kind: TokenKind::Lte,
+                        position: start,
+                    });
+                    index += 2;
+                } else if lookahead == Some('>') {
+                    tokens.push(Token {
+                        kind: TokenKind::Neq,
+                        position: start,
+                    });
+                    index += 2;
+                } else {
+                    push(&mut tokens, TokenKind::Lt, start, &mut index);
+                }
+            }
+            '>' => {
+                if lookahead == Some('=') {
+                    tokens.push(Token {
+                        kind: TokenKind::Gte,
+                        position: start,
+                    });
+                    index += 2;
+                } else {
+                    push(&mut tokens, TokenKind::Gt, start, &mut index);
+                }
+            }
+            '\'' | '"' => {
+                let (value, next) = lex_string(&chars, index, ch)?;
+                tokens.push(Token {
+                    kind: TokenKind::Str(value),
+                    position: start,
+                });
+                index = next;
+            }
+            c if c.is_ascii_digit() => {
+                let (value, next) = lex_number(&chars, index)?;
+                tokens.push(Token {
+                    kind: TokenKind::Int(value),
+                    position: start,
+                });
+                index = next;
+            }
+            c if is_ident_start(c) => {
+                let (value, next) = lex_ident(&chars, index);
+                tokens.push(Token {
+                    kind: TokenKind::Ident(value),
+                    position: start,
+                });
+                index = next;
+            }
+            other => {
+                return Err(CypherError::syntax(format!("unexpected character `{other}`"), start));
+            }
+        }
+    }
+
+    Ok(tokens)
+}
+
+fn push(tokens: &mut Vec<Token>, kind: TokenKind, position: usize, index: &mut usize) {
+    tokens.push(Token { kind, position });
+    *index += 1;
+}
+
+fn lex_string(chars: &[(usize, char)], start: usize, quote: char) -> Result<(String, usize), CypherError> {
+    let mut value = String::new();
+    let mut index = start + 1; // skip the opening quote
+
+    while index < chars.len() {
+        let (offset, ch) = chars[index];
+        if ch == '\\' {
+            if let Some(&(_, next)) = chars.get(index + 1) {
+                match next {
+                    'n' => value.push('\n'),
+                    't' => value.push('\t'),
+                    'r' => value.push('\r'),
+                    '\\' => value.push('\\'),
+                    '\'' => value.push('\''),
+                    '"' => value.push('"'),
+                    other => value.push(other),
+                }
+                index += 2;
+                continue;
+            }
+            return Err(CypherError::syntax("unterminated escape in string literal", offset));
+        }
+        if ch == quote {
+            return Ok((value, index + 1)); // resume lexing after the closing quote
+        }
+        value.push(ch);
+        index += 1;
+    }
+
+    Err(CypherError::syntax("unterminated string literal", chars[start].0))
+}
+
+fn lex_number(chars: &[(usize, char)], start: usize) -> Result<(i64, usize), CypherError> {
+    let mut index = start;
+    while index < chars.len() && chars[index].1.is_ascii_digit() {
+        index += 1;
+    }
+    let text: String = chars[start..index].iter().map(|&(_, ch)| ch).collect();
+    let value = text
+        .parse::<i64>()
+        .map_err(|_| CypherError::syntax(format!("invalid integer `{text}`"), chars[start].0))?;
+    Ok((value, index))
+}
+
+fn lex_ident(chars: &[(usize, char)], start: usize) -> (String, usize) {
+    let mut index = start;
+    while index < chars.len() && is_ident_continue(chars[index].1) {
+        index += 1;
+    }
+    let text: String = chars[start..index].iter().map(|&(_, ch)| ch).collect();
+    (text, index)
+}
+
+fn is_ident_start(ch: char) -> bool {
+    ch.is_ascii_alphabetic() || ch == '_'
+}
+
+fn is_ident_continue(ch: char) -> bool {
+    ch.is_ascii_alphanumeric() || ch == '_'
+}
diff --git a/rust/rubydex/src/query/cypher/mod.rs b/rust/rubydex/src/query/cypher/mod.rs
new file mode 100644
index 000000000..978098c93
--- /dev/null
+++ b/rust/rubydex/src/query/cypher/mod.rs
@@ -0,0 +1,51 @@
+//! A small Cypher query engine that runs read-only queries directly against the in-memory
+//! [`Graph`](crate::model::graph::Graph).
+//!
+//! Supported subset:
+//! - `MATCH` with node patterns `(v:Label {prop: value})` — labels may be a disjunction
+//!   (`(v:Class|Module)` matches a node with **any** of the listed labels) — and relationship
+//!   patterns `-[:TYPE]->`, `<-[:TYPE]-`, `-[:TYPE]-`, including variable-length `-[:TYPE*min..max]->`.
+//! - `WHERE` with `=`, `<>`, `<`, `<=`, `>`, `>=`, `CONTAINS`, `STARTS WITH`, `ENDS WITH`,
+//!   combined with `AND`, `OR`, `NOT`.
+//! - `RETURN` with `DISTINCT`, `AS` aliases, and the aggregates `count`, `collect`, `min`, `max`,
+//!   `sum`, `avg`.
+//! - `ORDER BY`, `SKIP`, `LIMIT`.
+//!
+//! See [`schema`] for the node labels and relationship types exposed to queries.
+
+pub mod ast;
+pub mod error;
+pub mod executor;
+pub mod format;
+pub mod lexer;
+pub mod parser;
+pub mod schema;
+pub mod schema_info;
+pub mod value;
+
+pub use error::CypherError;
+pub use executor::ResultSet;
+pub use format::OutputFormat;
+
+use crate::model::graph::Graph;
+
+/// Parses and executes a Cypher query against the graph, returning the formatted output.
+///
+/// # Errors
+///
+/// Returns a [`CypherError`] if the query cannot be parsed or executed.
+pub fn run_query(graph: &Graph, query: &str, output_format: OutputFormat) -> Result<String, CypherError> {
+    let parsed = parser::parse(query)?;
+    let result = executor::execute(graph, &parsed)?;
+    Ok(format::format(&result, output_format))
+}
+
+/// Returns a description of the queryable schema (node labels, relationship types, and properties)
+/// in the requested format. The schema is static and does not require a graph.
+#[must_use]
+pub fn schema(output_format: OutputFormat) -> String {
+    schema_info::describe(output_format)
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/rust/rubydex/src/query/cypher/parser.rs b/rust/rubydex/src/query/cypher/parser.rs
new file mode 100644
index 000000000..b08182649
--- /dev/null
+++ b/rust/rubydex/src/query/cypher/parser.rs
@@ -0,0 +1,548 @@
+use super::ast::{
+    AggFn, CmpOp, Direction, Expr, Literal, NodePattern, OrderItem, PathPattern, Query, RelPattern, Return, ReturnItem,
+    VarLength,
+};
+use super::error::CypherError;
+use super::lexer::{Token, TokenKind, tokenize};
+
+/// Parses a Cypher query string into a [`Query`] AST.
+///
+/// # Errors
+///
+/// Returns a [`CypherError::Syntax`] on any lexical or grammatical error.
+pub fn parse(input: &str) -> Result { + let tokens = tokenize(input)?; + let mut parser = Parser { + tokens, + position: 0, + source_len: input.len(), + }; + let query = parser.parse_query()?; + if let Some(token) = parser.peek() { + return Err(CypherError::syntax("unexpected trailing input", token.position)); + } + Ok(query) +} + +struct Parser { + tokens: Vec, + position: usize, + source_len: usize, +} + +impl Parser { + fn peek(&self) -> Option<&Token> { + self.tokens.get(self.position) + } + + fn peek_kind(&self) -> Option<&TokenKind> { + self.tokens.get(self.position).map(|t| &t.kind) + } + + fn current_position(&self) -> usize { + self.tokens + .get(self.position) + .map_or(self.source_len, |token| token.position) + } + + fn advance(&mut self) -> Option { + let token = self.tokens.get(self.position).cloned(); + if token.is_some() { + self.position += 1; + } + token + } + + fn expect(&mut self, kind: &TokenKind, description: &str) -> Result<(), CypherError> { + match self.peek_kind() { + Some(actual) if actual == kind => { + self.position += 1; + Ok(()) + } + _ => Err(CypherError::syntax( + format!("expected {description}"), + self.current_position(), + )), + } + } + + fn at_keyword(&self, keyword: &str) -> bool { + matches!(self.peek_kind(), Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case(keyword)) + } + + fn eat_keyword(&mut self, keyword: &str) -> bool { + if self.at_keyword(keyword) { + self.position += 1; + true + } else { + false + } + } + + fn expect_keyword(&mut self, keyword: &str) -> Result<(), CypherError> { + if self.eat_keyword(keyword) { + Ok(()) + } else { + Err(CypherError::syntax( + format!("expected keyword `{keyword}`"), + self.current_position(), + )) + } + } + + fn expect_ident(&mut self, description: &str) -> Result { + match self.advance() { + Some(Token { + kind: TokenKind::Ident(name), + .. 
+ }) => Ok(name), + _ => Err(CypherError::syntax( + format!("expected {description}"), + self.current_position(), + )), + } + } + + fn parse_query(&mut self) -> Result { + self.expect_keyword("MATCH")?; + let patterns = self.parse_patterns()?; + + let where_clause = if self.eat_keyword("WHERE") { + Some(self.parse_expr()?) + } else { + None + }; + + self.expect_keyword("RETURN")?; + let return_clause = self.parse_return()?; + + let mut order_by = Vec::new(); + if self.eat_keyword("ORDER") { + self.expect_keyword("BY")?; + order_by = self.parse_order_by()?; + } + + let skip = if self.eat_keyword("SKIP") { + Some(self.parse_usize()?) + } else { + None + }; + + let limit = if self.eat_keyword("LIMIT") { + Some(self.parse_usize()?) + } else { + None + }; + + Ok(Query { + patterns, + where_clause, + return_clause, + order_by, + skip, + limit, + }) + } + + fn parse_usize(&mut self) -> Result { + match self.advance() { + Some(Token { + kind: TokenKind::Int(value), + position, + }) => usize::try_from(value).map_err(|_| CypherError::syntax("expected a non-negative integer", position)), + _ => Err(CypherError::syntax("expected an integer", self.current_position())), + } + } + + fn parse_patterns(&mut self) -> Result, CypherError> { + let mut patterns = vec![self.parse_path_pattern()?]; + while matches!(self.peek_kind(), Some(TokenKind::Comma)) { + self.position += 1; + patterns.push(self.parse_path_pattern()?); + } + Ok(patterns) + } + + fn parse_path_pattern(&mut self) -> Result { + let start = self.parse_node_pattern()?; + let mut rest = Vec::new(); + while self.at_relationship_start() { + let rel = self.parse_rel_pattern()?; + let node = self.parse_node_pattern()?; + rest.push((rel, node)); + } + Ok(PathPattern { start, rest }) + } + + fn at_relationship_start(&self) -> bool { + matches!(self.peek_kind(), Some(TokenKind::Minus | TokenKind::Lt)) + } + + fn parse_node_pattern(&mut self) -> Result { + self.expect(&TokenKind::LParen, "`(` to start a node pattern")?; + + let 
var = match self.peek_kind() { + Some(TokenKind::Ident(name)) => { + let name = name.clone(); + self.position += 1; + Some(name) + } + _ => None, + }; + + let mut labels = Vec::new(); + if matches!(self.peek_kind(), Some(TokenKind::Colon)) { + self.position += 1; + labels.push(self.expect_ident("a node label after `:`")?); + while matches!(self.peek_kind(), Some(TokenKind::Pipe)) { + self.position += 1; + labels.push(self.expect_ident("a node label after `|`")?); + } + } + + let props = if matches!(self.peek_kind(), Some(TokenKind::LBrace)) { + self.parse_prop_map()? + } else { + Vec::new() + }; + + self.expect(&TokenKind::RParen, "`)` to close a node pattern")?; + + Ok(NodePattern { var, labels, props }) + } + + fn parse_prop_map(&mut self) -> Result, CypherError> { + self.expect(&TokenKind::LBrace, "`{`")?; + let mut props = Vec::new(); + + if !matches!(self.peek_kind(), Some(TokenKind::RBrace)) { + loop { + let key = self.expect_ident("a property name")?; + self.expect(&TokenKind::Colon, "`:` after property name")?; + let value = self.parse_literal()?; + props.push((key, value)); + + if matches!(self.peek_kind(), Some(TokenKind::Comma)) { + self.position += 1; + } else { + break; + } + } + } + + self.expect(&TokenKind::RBrace, "`}` to close a property map")?; + Ok(props) + } + + fn parse_rel_pattern(&mut self) -> Result { + let leading_in = matches!(self.peek_kind(), Some(TokenKind::Lt)); + if leading_in { + self.position += 1; + } + self.expect(&TokenKind::Minus, "`-` in relationship pattern")?; + + let mut var = None; + let mut types = Vec::new(); + let mut length = None; + + if matches!(self.peek_kind(), Some(TokenKind::LBracket)) { + self.position += 1; + + if let Some(TokenKind::Ident(name)) = self.peek_kind() { + var = Some(name.clone()); + self.position += 1; + } + + if matches!(self.peek_kind(), Some(TokenKind::Colon)) { + self.position += 1; + types.push(self.expect_ident("a relationship type after `:`")?); + while matches!(self.peek_kind(), 
Some(TokenKind::Pipe)) { + self.position += 1; + types.push(self.expect_ident("a relationship type after `|`")?); + } + } + + if matches!(self.peek_kind(), Some(TokenKind::Star)) { + self.position += 1; + length = Some(self.parse_var_length()?); + } + + self.expect(&TokenKind::RBracket, "`]` to close a relationship pattern")?; + } + + self.expect(&TokenKind::Minus, "`-` in relationship pattern")?; + let trailing_out = matches!(self.peek_kind(), Some(TokenKind::Gt)); + if trailing_out { + self.position += 1; + } + + let direction = match (leading_in, trailing_out) { + (true, false) => Direction::Incoming, + (false, true) => Direction::Outgoing, + (false, false) => Direction::Both, + (true, true) => { + return Err(CypherError::syntax( + "a relationship cannot point in both directions", + self.current_position(), + )); + } + }; + + Ok(RelPattern { + var, + types, + direction, + length, + }) + } + + fn parse_var_length(&mut self) -> Result { + let mut min = 1; + let mut max = None; + + if let Some(TokenKind::Int(value)) = self.peek_kind() { + let lower = u32::try_from(*value) + .map_err(|_| CypherError::syntax("variable-length bound is too large", self.current_position()))?; + self.position += 1; + + if matches!(self.peek_kind(), Some(TokenKind::DotDot)) { + self.position += 1; + min = lower; + max = self.parse_optional_length_bound()?; + } else { + // `*n` means exactly n hops. 
+ min = lower; + max = Some(lower); + } + } else if matches!(self.peek_kind(), Some(TokenKind::DotDot)) { + self.position += 1; + max = self.parse_optional_length_bound()?; + } + + Ok(VarLength { min, max }) + } + + fn parse_optional_length_bound(&mut self) -> Result, CypherError> { + if let Some(TokenKind::Int(value)) = self.peek_kind() { + let upper = u32::try_from(*value) + .map_err(|_| CypherError::syntax("variable-length bound is too large", self.current_position()))?; + self.position += 1; + Ok(Some(upper)) + } else { + Ok(None) + } + } + + fn parse_return(&mut self) -> Result { + let distinct = self.eat_keyword("DISTINCT"); + let mut items = vec![self.parse_return_item()?]; + while matches!(self.peek_kind(), Some(TokenKind::Comma)) { + self.position += 1; + items.push(self.parse_return_item()?); + } + Ok(Return { distinct, items }) + } + + fn parse_return_item(&mut self) -> Result { + let expr = self.parse_expr()?; + let alias = if self.eat_keyword("AS") { + Some(self.expect_ident("an alias after `AS`")?) + } else { + None + }; + Ok(ReturnItem { expr, alias }) + } + + fn parse_order_by(&mut self) -> Result, CypherError> { + let mut items = vec![self.parse_order_item()?]; + while matches!(self.peek_kind(), Some(TokenKind::Comma)) { + self.position += 1; + items.push(self.parse_order_item()?); + } + Ok(items) + } + + fn parse_order_item(&mut self) -> Result { + let expr = self.parse_expr()?; + let descending = if self.eat_keyword("DESC") { + true + } else { + // ASC is the default and optional. 
+ let _ = self.eat_keyword("ASC"); + false + }; + Ok(OrderItem { expr, descending }) + } + + fn parse_expr(&mut self) -> Result { + self.parse_or() + } + + fn parse_or(&mut self) -> Result { + let mut left = self.parse_and()?; + while self.eat_keyword("OR") { + let right = self.parse_and()?; + left = Expr::Or(Box::new(left), Box::new(right)); + } + Ok(left) + } + + fn parse_and(&mut self) -> Result { + let mut left = self.parse_not()?; + while self.eat_keyword("AND") { + let right = self.parse_not()?; + left = Expr::And(Box::new(left), Box::new(right)); + } + Ok(left) + } + + fn parse_not(&mut self) -> Result { + if self.eat_keyword("NOT") { + let inner = self.parse_not()?; + Ok(Expr::Not(Box::new(inner))) + } else { + self.parse_comparison() + } + } + + fn parse_comparison(&mut self) -> Result { + let left = self.parse_primary()?; + if let Some(op) = self.parse_comparison_op()? { + let right = self.parse_primary()?; + Ok(Expr::Compare(Box::new(left), op, Box::new(right))) + } else { + Ok(left) + } + } + + fn parse_comparison_op(&mut self) -> Result, CypherError> { + let op = match self.peek_kind() { + Some(TokenKind::Eq) => CmpOp::Eq, + Some(TokenKind::Neq) => CmpOp::Neq, + Some(TokenKind::Lt) => CmpOp::Lt, + Some(TokenKind::Lte) => CmpOp::Lte, + Some(TokenKind::Gt) => CmpOp::Gt, + Some(TokenKind::Gte) => CmpOp::Gte, + Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case("CONTAINS") => { + self.position += 1; + return Ok(Some(CmpOp::Contains)); + } + Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case("STARTS") => { + self.position += 1; + self.expect_keyword("WITH")?; + return Ok(Some(CmpOp::StartsWith)); + } + Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case("ENDS") => { + self.position += 1; + self.expect_keyword("WITH")?; + return Ok(Some(CmpOp::EndsWith)); + } + _ => return Ok(None), + }; + self.position += 1; + Ok(Some(op)) + } + + fn parse_primary(&mut self) -> Result { + match self.peek_kind() { + Some(TokenKind::LParen) => { + 
self.position += 1; + let expr = self.parse_or()?; + self.expect(&TokenKind::RParen, "`)` to close a grouped expression")?; + Ok(expr) + } + Some(TokenKind::Str(_) | TokenKind::Int(_)) => Ok(Expr::Literal(self.parse_literal()?)), + Some(TokenKind::Ident(name)) => { + let name = name.clone(); + if name.eq_ignore_ascii_case("TRUE") + || name.eq_ignore_ascii_case("FALSE") + || name.eq_ignore_ascii_case("NULL") + { + return Ok(Expr::Literal(self.parse_literal()?)); + } + if let Some(func) = aggregate_function(&name) + && matches!( + self.tokens.get(self.position + 1).map(|t| &t.kind), + Some(TokenKind::LParen) + ) + { + return self.parse_aggregate(func); + } + self.position += 1; + if matches!(self.peek_kind(), Some(TokenKind::Dot)) { + self.position += 1; + let prop = self.expect_ident("a property name after `.`")?; + Ok(Expr::Property(name, prop)) + } else { + Ok(Expr::Var(name)) + } + } + _ => Err(CypherError::syntax("expected an expression", self.current_position())), + } + } + + fn parse_aggregate(&mut self, func: AggFn) -> Result { + self.position += 1; // function name + self.expect(&TokenKind::LParen, "`(` after aggregate function")?; + let distinct = self.eat_keyword("DISTINCT"); + + let arg = if matches!(self.peek_kind(), Some(TokenKind::Star)) { + if func != AggFn::Count { + return Err(CypherError::syntax( + "only count(*) may use `*`", + self.current_position(), + )); + } + self.position += 1; + None + } else { + Some(Box::new(self.parse_or()?)) + }; + + self.expect(&TokenKind::RParen, "`)` to close aggregate function")?; + Ok(Expr::Aggregate { func, arg, distinct }) + } + + fn parse_literal(&mut self) -> Result { + match self.advance() { + Some(Token { + kind: TokenKind::Str(value), + .. + }) => Ok(Literal::Str(value)), + Some(Token { + kind: TokenKind::Int(value), + .. 
+ }) => Ok(Literal::Int(value)), + Some(Token { + kind: TokenKind::Ident(name), + position, + }) => { + if name.eq_ignore_ascii_case("true") { + Ok(Literal::Bool(true)) + } else if name.eq_ignore_ascii_case("false") { + Ok(Literal::Bool(false)) + } else if name.eq_ignore_ascii_case("null") { + Ok(Literal::Null) + } else { + Err(CypherError::syntax( + format!("expected a literal, found `{name}`"), + position, + )) + } + } + _ => Err(CypherError::syntax("expected a literal value", self.current_position())), + } + } +} + +fn aggregate_function(name: &str) -> Option { + match name.to_ascii_lowercase().as_str() { + "count" => Some(AggFn::Count), + "collect" => Some(AggFn::Collect), + "min" => Some(AggFn::Min), + "max" => Some(AggFn::Max), + "sum" => Some(AggFn::Sum), + "avg" => Some(AggFn::Avg), + _ => None, + } +} diff --git a/rust/rubydex/src/query/cypher/schema.rs b/rust/rubydex/src/query/cypher/schema.rs new file mode 100644 index 000000000..02b024264 --- /dev/null +++ b/rust/rubydex/src/query/cypher/schema.rs @@ -0,0 +1,495 @@ +//! Maps the rubydex [`Graph`] onto a property-graph schema for Cypher execution. +//! +//! Node labels: +//! - `Document` — a source file. +//! - `Definition` — a per-file occurrence of a Ruby construct. +//! - `Declaration` — the global, merged concept of a named entity. Declarations also carry +//! kind sub-labels (`Class`, `Module`, `SingletonClass`, `Method`, `Constant`, `ConstantAlias`, +//! `GlobalVariable`, `InstanceVariable`, `ClassVariable`) plus the grouping label `Namespace` +//! (any of `Class`/`Module`/`SingletonClass`). +//! +//! Relationship types mirror `dot.rs`: +//! - `DEFINES`: `Document` → `Definition` +//! - `DECLARES`: `Definition` → `Declaration` +//! - `CONTAINS`: `Definition` → `Definition` (lexical nesting) +//! - `INHERITS`: `Declaration` → `Declaration` (superclass) +//! - `INCLUDES` / `PREPENDS` / `EXTENDS`: `Declaration` → `Declaration` (mixins) +//! - `OWNS`: `Declaration` → `Declaration` (members) +//! 
- `ANCESTOR`: `Declaration` → `Declaration` (linearized ancestor chain) +//! - `DESCENDANT`: `Declaration` → `Declaration` +//! - `REFERENCES`: `Document` → `Declaration` (constant references) + +use std::collections::{HashSet, VecDeque}; + +use crate::model::declaration::Declaration; +use crate::model::definitions::{Definition, Mixin}; +use crate::model::graph::Graph; +use crate::model::ids::{ConstantReferenceId, DeclarationId, DefinitionId, UriId}; + +use super::value::CypherValue; + +/// A handle to a node in the graph. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NodeRef { + Declaration(DeclarationId), + Definition(DefinitionId), + Document(UriId), +} + +/// A relationship type. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum RelType { + Defines, + Declares, + Contains, + Inherits, + Includes, + Prepends, + Extends, + Owns, + Ancestor, + Descendant, + References, +} + +impl RelType { + /// Parses a relationship type name (case-insensitive). Returns `None` if unknown. + #[must_use] + pub fn parse(name: &str) -> Option { + match name.to_ascii_uppercase().as_str() { + "DEFINES" => Some(RelType::Defines), + "DECLARES" => Some(RelType::Declares), + "CONTAINS" => Some(RelType::Contains), + "INHERITS" => Some(RelType::Inherits), + "INCLUDES" => Some(RelType::Includes), + "PREPENDS" => Some(RelType::Prepends), + "EXTENDS" => Some(RelType::Extends), + "OWNS" => Some(RelType::Owns), + "ANCESTOR" => Some(RelType::Ancestor), + "DESCENDANT" => Some(RelType::Descendant), + "REFERENCES" => Some(RelType::References), + _ => None, + } + } + + /// All relationship types, used when a pattern leaves the type unspecified. 
+ #[must_use] + pub fn all() -> &'static [RelType] { + &[ + RelType::Defines, + RelType::Declares, + RelType::Contains, + RelType::Inherits, + RelType::Includes, + RelType::Prepends, + RelType::Extends, + RelType::Owns, + RelType::Ancestor, + RelType::Descendant, + RelType::References, + ] + } +} + +/// Returns all nodes matching the given labels. An empty slice matches every node; otherwise a node +/// is returned if it matches **any** of the labels (label disjunction, e.g. `(:Class|Module)`). +#[must_use] +pub fn scan(graph: &Graph, labels: &[String]) -> Vec { + if labels.is_empty() { + let mut nodes = Vec::new(); + nodes.extend(graph.documents().keys().map(|id| NodeRef::Document(*id))); + nodes.extend(graph.definitions().keys().map(|id| NodeRef::Definition(*id))); + nodes.extend(graph.declarations().keys().map(|id| NodeRef::Declaration(*id))); + return nodes; + } + + let mut seen = HashSet::new(); + let mut nodes = Vec::new(); + for label in labels { + for node in scan_label(graph, label) { + if seen.insert(node) { + nodes.push(node); + } + } + } + nodes +} + +/// Returns all nodes matching a single label. +fn scan_label(graph: &Graph, label: &str) -> Vec { + match label { + "Document" => graph.documents().keys().map(|id| NodeRef::Document(*id)).collect(), + "Definition" => graph.definitions().keys().map(|id| NodeRef::Definition(*id)).collect(), + other => graph + .declarations() + .iter() + .filter(|(_, declaration)| declaration_matches_label(declaration, other)) + .map(|(id, _)| NodeRef::Declaration(*id)) + .collect(), + } +} + +/// Returns whether a node matches the given labels. An empty slice matches any node; otherwise the +/// node must match **at least one** of the labels. +#[must_use] +pub fn matches_labels(graph: &Graph, node: NodeRef, labels: &[String]) -> bool { + if labels.is_empty() { + return true; + } + labels.iter().any(|label| matches_label(graph, node, label)) +} + +/// Returns whether a node matches a single label. 
+#[must_use] +pub fn matches_label(graph: &Graph, node: NodeRef, label: &str) -> bool { + match node { + NodeRef::Document(_) => label == "Document", + NodeRef::Definition(_) => label == "Definition", + NodeRef::Declaration(id) => graph + .declarations() + .get(&id) + .is_some_and(|declaration| declaration_matches_label(declaration, label)), + } +} + +fn declaration_matches_label(declaration: &Declaration, label: &str) -> bool { + match label { + "Declaration" => true, + "Namespace" => declaration.as_namespace().is_some(), + other => declaration.kind() == other, + } +} + +/// Returns the top-level label name of a node, used for display and JSON output. +#[must_use] +pub fn node_label(graph: &Graph, node: NodeRef) -> String { + match node { + NodeRef::Document(_) => "Document".to_string(), + NodeRef::Definition(id) => graph + .definitions() + .get(&id) + .map_or_else(|| "Definition".to_string(), |definition| definition.kind().to_string()), + NodeRef::Declaration(id) => graph.declarations().get(&id).map_or_else( + || "Declaration".to_string(), + |declaration| declaration.kind().to_string(), + ), + } +} + +/// The primary display name of a node (FQN for declarations, URI basename for documents). +#[must_use] +pub fn node_name(graph: &Graph, node: NodeRef) -> String { + match node { + NodeRef::Declaration(id) => graph + .declarations() + .get(&id) + .map_or_else(String::new, |declaration| declaration.name().to_string()), + NodeRef::Definition(id) => graph + .definitions() + .get(&id) + .and_then(|definition| graph.definition_to_declaration_id(definition)) + .and_then(|decl_id| graph.declarations().get(decl_id)) + .map_or_else(String::new, |declaration| declaration.name().to_string()), + NodeRef::Document(id) => graph.documents().get(&id).map_or_else(String::new, |document| { + let uri = document.uri(); + uri.rsplit('/').next().unwrap_or(uri).to_string() + }), + } +} + +/// Resolves a node property to a value. Unknown properties yield `NULL`. 
+#[must_use] +pub fn property(graph: &Graph, node: NodeRef, prop: &str) -> CypherValue { + match prop { + "label" | "kind" => CypherValue::Str(node_label(graph, node)), + _ => match node { + NodeRef::Declaration(id) => declaration_property(graph, id, prop), + NodeRef::Definition(id) => definition_property(graph, id, prop), + NodeRef::Document(id) => document_property(graph, id, prop), + }, + } +} + +fn declaration_property(graph: &Graph, id: DeclarationId, prop: &str) -> CypherValue { + let Some(declaration) = graph.declarations().get(&id) else { + return CypherValue::Null; + }; + + match prop { + "name" => CypherValue::Str(declaration.name().to_string()), + "unqualified_name" => CypherValue::Str(declaration.unqualified_name()), + "visibility" => graph + .visibility(&id) + .map_or(CypherValue::Null, |visibility| CypherValue::Str(visibility.to_string())), + "definition_count" => CypherValue::Int(i64::try_from(declaration.definitions().len()).unwrap_or(i64::MAX)), + _ => CypherValue::Null, + } +} + +fn definition_property(graph: &Graph, id: DefinitionId, prop: &str) -> CypherValue { + let Some(definition) = graph.definitions().get(&id) else { + return CypherValue::Null; + }; + + match prop { + "name" => CypherValue::Str(node_name(graph, NodeRef::Definition(id))), + "file" => graph + .documents() + .get(definition.uri_id()) + .map_or(CypherValue::Null, |document| { + CypherValue::Str(document.uri().to_string()) + }), + "line" => graph + .documents() + .get(definition.uri_id()) + .map_or(CypherValue::Null, |document| { + let location = definition.offset().to_location(document).to_presentation(); + CypherValue::Int(i64::from(location.start_line())) + }), + _ => CypherValue::Null, + } +} + +fn document_property(graph: &Graph, id: UriId, prop: &str) -> CypherValue { + let Some(document) = graph.documents().get(&id) else { + return CypherValue::Null; + }; + + match prop { + "uri" => CypherValue::Str(document.uri().to_string()), + "path" | "name" => { + let uri = 
document.uri(); + CypherValue::Str(uri.rsplit('/').next().unwrap_or(uri).to_string()) + } + _ => CypherValue::Null, + } +} + +/// Returns the candidate source nodes for a relationship type, used to build reverse adjacency. +#[must_use] +pub fn rel_source_nodes(graph: &Graph, rel: RelType) -> Vec { + match rel { + RelType::Defines | RelType::References => graph.documents().keys().map(|id| NodeRef::Document(*id)).collect(), + RelType::Declares | RelType::Contains => { + graph.definitions().keys().map(|id| NodeRef::Definition(*id)).collect() + } + RelType::Inherits + | RelType::Includes + | RelType::Prepends + | RelType::Extends + | RelType::Owns + | RelType::Ancestor + | RelType::Descendant => graph + .declarations() + .keys() + .map(|id| NodeRef::Declaration(*id)) + .collect(), + } +} + +/// Expands the outgoing edges of `node` for the given relationship type. +#[must_use] +pub fn expand_out(graph: &Graph, node: NodeRef, rel: RelType) -> Vec { + match (node, rel) { + (NodeRef::Document(uri_id), RelType::Defines) => graph + .documents() + .get(&uri_id) + .map(|document| { + document + .definitions() + .iter() + .map(|id| NodeRef::Definition(*id)) + .collect() + }) + .unwrap_or_default(), + (NodeRef::Document(uri_id), RelType::References) => document_references(graph, uri_id), + (NodeRef::Definition(def_id), RelType::Declares) => graph + .definitions() + .get(&def_id) + .and_then(|definition| graph.definition_to_declaration_id(definition)) + .map(|decl_id| vec![NodeRef::Declaration(*decl_id)]) + .unwrap_or_default(), + (NodeRef::Definition(def_id), RelType::Contains) => definition_children(graph, def_id), + (NodeRef::Declaration(decl_id), RelType::Inherits) => superclasses(graph, decl_id), + (NodeRef::Declaration(decl_id), RelType::Includes) => mixin_targets(graph, decl_id, MixinKind::Include), + (NodeRef::Declaration(decl_id), RelType::Prepends) => mixin_targets(graph, decl_id, MixinKind::Prepend), + (NodeRef::Declaration(decl_id), RelType::Extends) => 
mixin_targets(graph, decl_id, MixinKind::Extend), + (NodeRef::Declaration(decl_id), RelType::Owns) => members(graph, decl_id), + (NodeRef::Declaration(decl_id), RelType::Ancestor) => ancestors(graph, decl_id), + (NodeRef::Declaration(decl_id), RelType::Descendant) => descendants(graph, decl_id), + _ => Vec::new(), + } +} + +fn document_references(graph: &Graph, uri_id: UriId) -> Vec { + let Some(document) = graph.documents().get(&uri_id) else { + return Vec::new(); + }; + + let mut seen = HashSet::new(); + let mut targets = Vec::new(); + for ref_id in document.constant_references() { + if let Some(decl_id) = resolve_ref(graph, *ref_id) + && seen.insert(decl_id) + { + targets.push(NodeRef::Declaration(decl_id)); + } + } + targets +} + +fn definition_children(graph: &Graph, def_id: DefinitionId) -> Vec { + let Some(definition) = graph.definitions().get(&def_id) else { + return Vec::new(); + }; + + let children: &[DefinitionId] = match definition { + Definition::Class(d) => d.members(), + Definition::Module(d) => d.members(), + Definition::SingletonClass(d) => d.members(), + _ => &[], + }; + children.iter().map(|id| NodeRef::Definition(*id)).collect() +} + +fn superclasses(graph: &Graph, decl_id: DeclarationId) -> Vec { + let Some(declaration) = graph.declarations().get(&decl_id) else { + return Vec::new(); + }; + + let mut seen = HashSet::new(); + let mut targets = Vec::new(); + for definition_id in declaration.definitions() { + if let Some(Definition::Class(class_def)) = graph.definitions().get(definition_id) + && let Some(superclass_ref) = class_def.superclass_ref() + && let Some(target) = resolve_ref_to_namespace(graph, *superclass_ref) + && seen.insert(target) + { + targets.push(NodeRef::Declaration(target)); + } + } + targets +} + +#[derive(Clone, Copy)] +enum MixinKind { + Include, + Prepend, + Extend, +} + +fn mixin_targets(graph: &Graph, decl_id: DeclarationId, kind: MixinKind) -> Vec { + let Some(declaration) = graph.declarations().get(&decl_id) else { + 
return Vec::new(); + }; + + let mut seen = HashSet::new(); + let mut targets = Vec::new(); + for definition_id in declaration.definitions() { + let mixins: &[Mixin] = match graph.definitions().get(definition_id) { + Some(Definition::Class(d)) => d.mixins(), + Some(Definition::Module(d)) => d.mixins(), + Some(Definition::SingletonClass(d)) => d.mixins(), + _ => &[], + }; + + for mixin in mixins { + let matches = matches!( + (kind, mixin), + (MixinKind::Include, Mixin::Include(_)) + | (MixinKind::Prepend, Mixin::Prepend(_)) + | (MixinKind::Extend, Mixin::Extend(_)) + ); + if matches + && let Some(target) = resolve_ref_to_namespace(graph, *mixin.constant_reference_id()) + && seen.insert(target) + { + targets.push(NodeRef::Declaration(target)); + } + } + } + targets +} + +fn members(graph: &Graph, decl_id: DeclarationId) -> Vec { + graph + .declarations() + .get(&decl_id) + .and_then(Declaration::as_namespace) + .map(|namespace| { + namespace + .members() + .values() + .map(|id| NodeRef::Declaration(*id)) + .collect() + }) + .unwrap_or_default() +} + +fn ancestors(graph: &Graph, decl_id: DeclarationId) -> Vec { + use crate::model::declaration::Ancestor; + + graph + .declarations() + .get(&decl_id) + .and_then(Declaration::as_namespace) + .map(|namespace| { + namespace + .ancestors() + .iter() + .filter_map(|ancestor| match ancestor { + Ancestor::Complete(id) if *id != decl_id => Some(NodeRef::Declaration(*id)), + _ => None, + }) + .collect() + }) + .unwrap_or_default() +} + +fn descendants(graph: &Graph, decl_id: DeclarationId) -> Vec { + graph + .declarations() + .get(&decl_id) + .and_then(Declaration::as_namespace) + .map(|namespace| { + namespace + .descendants() + .iter() + .map(|id| NodeRef::Declaration(*id)) + .collect() + }) + .unwrap_or_default() +} + +/// Resolves a constant reference to the declaration of the name it points to. 
+fn resolve_ref(graph: &Graph, ref_id: ConstantReferenceId) -> Option { + let constant_ref = graph.constant_references().get(&ref_id)?; + graph.name_id_to_declaration_id(*constant_ref.name_id()).copied() +} + +/// Resolves a constant reference to a namespace declaration, following constant aliases. +fn resolve_ref_to_namespace(graph: &Graph, ref_id: ConstantReferenceId) -> Option { + resolve_to_namespace(graph, resolve_ref(graph, ref_id)?) +} + +/// Walks constant-alias chains until reaching a namespace declaration. +fn resolve_to_namespace(graph: &Graph, declaration_id: DeclarationId) -> Option { + let mut queue = VecDeque::from([declaration_id]); + let mut seen = HashSet::new(); + + while let Some(current_id) = queue.pop_front() { + if !seen.insert(current_id) { + continue; + } + + match graph.declarations().get(¤t_id)? { + Declaration::Namespace(_) => return Some(current_id), + Declaration::ConstantAlias(_) => { + queue.extend(graph.alias_targets(¤t_id)?); + } + _ => {} + } + } + + None +} diff --git a/rust/rubydex/src/query/cypher/schema_info.rs b/rust/rubydex/src/query/cypher/schema_info.rs new file mode 100644 index 000000000..2d3f06d73 --- /dev/null +++ b/rust/rubydex/src/query/cypher/schema_info.rs @@ -0,0 +1,369 @@ +//! Static, self-describing catalog of the Cypher property-graph model: the node labels, +//! relationship types, and node properties that queries can use. This mirrors the mapping +//! implemented in [`super::schema`] and is exposed via `--schema` for discoverability. + +use super::format::OutputFormat; +use super::value::write_json_string; + +/// A node label and what graph entity it matches. +struct LabelInfo { + label: &'static str, + matches: &'static str, + description: &'static str, +} + +/// A relationship type and its endpoints. +struct RelInfo { + name: &'static str, + from: &'static str, + to: &'static str, + description: &'static str, +} + +/// A property exposed on a node type. 
+struct PropInfo { + node_type: &'static str, + property: &'static str, + description: &'static str, +} + +const LABELS: &[LabelInfo] = &[ + LabelInfo { + label: "Document", + matches: "source files", + description: "A source file in the workspace", + }, + LabelInfo { + label: "Definition", + matches: "per-file occurrences", + description: "A single occurrence of a Ruby construct in one file", + }, + LabelInfo { + label: "Declaration", + matches: "merged entities", + description: "The global, merged concept of a named entity", + }, + LabelInfo { + label: "Namespace", + matches: "Class | Module | SingletonClass declarations", + description: "Grouping label for namespace-like declarations", + }, + LabelInfo { + label: "Class", + matches: "declarations of kind Class", + description: "A class declaration", + }, + LabelInfo { + label: "Module", + matches: "declarations of kind Module", + description: "A module declaration", + }, + LabelInfo { + label: "SingletonClass", + matches: "declarations of kind SingletonClass", + description: "A singleton class declaration", + }, + LabelInfo { + label: "Method", + matches: "declarations of kind Method", + description: "A method declaration", + }, + LabelInfo { + label: "Constant", + matches: "declarations of kind Constant", + description: "A constant declaration", + }, + LabelInfo { + label: "ConstantAlias", + matches: "declarations of kind ConstantAlias", + description: "A constant alias declaration", + }, + LabelInfo { + label: "GlobalVariable", + matches: "declarations of kind GlobalVariable", + description: "A global variable declaration", + }, + LabelInfo { + label: "InstanceVariable", + matches: "declarations of kind InstanceVariable", + description: "An instance variable declaration", + }, + LabelInfo { + label: "ClassVariable", + matches: "declarations of kind ClassVariable", + description: "A class variable declaration", + }, +]; + +const RELATIONSHIPS: &[RelInfo] = &[ + RelInfo { + name: "DEFINES", + from: "Document", + 
to: "Definition", + description: "A file defines a construct occurrence", + }, + RelInfo { + name: "DECLARES", + from: "Definition", + to: "Declaration", + description: "An occurrence contributes to a declaration", + }, + RelInfo { + name: "CONTAINS", + from: "Definition", + to: "Definition", + description: "Lexical nesting of definitions", + }, + RelInfo { + name: "INHERITS", + from: "Class", + to: "Class", + description: "Superclass relationship", + }, + RelInfo { + name: "INCLUDES", + from: "Declaration", + to: "Declaration", + description: "`include` mixin", + }, + RelInfo { + name: "PREPENDS", + from: "Declaration", + to: "Declaration", + description: "`prepend` mixin", + }, + RelInfo { + name: "EXTENDS", + from: "Declaration", + to: "Declaration", + description: "`extend` mixin", + }, + RelInfo { + name: "OWNS", + from: "Declaration", + to: "Declaration", + description: "A namespace owns a member declaration", + }, + RelInfo { + name: "ANCESTOR", + from: "Declaration", + to: "Declaration", + description: "An entry in the linearized ancestor chain", + }, + RelInfo { + name: "DESCENDANT", + from: "Declaration", + to: "Declaration", + description: "A declaration that descends from this one", + }, + RelInfo { + name: "REFERENCES", + from: "Document", + to: "Declaration", + description: "A file references a constant declaration", + }, +]; + +const PROPERTIES: &[PropInfo] = &[ + PropInfo { + node_type: "(any)", + property: "label", + description: "The node's top-level label / kind", + }, + PropInfo { + node_type: "(any)", + property: "kind", + description: "Alias of `label`", + }, + PropInfo { + node_type: "Declaration", + property: "name", + description: "Fully qualified name", + }, + PropInfo { + node_type: "Declaration", + property: "unqualified_name", + description: "Name without its namespace prefix", + }, + PropInfo { + node_type: "Declaration", + property: "visibility", + description: "public / protected / private (when applicable)", + }, + PropInfo { + 
node_type: "Declaration", + property: "definition_count", + description: "Number of definitions that compose the declaration", + }, + PropInfo { + node_type: "Definition", + property: "name", + description: "Name of the declaration this definition contributes to", + }, + PropInfo { + node_type: "Definition", + property: "file", + description: "URI of the file containing the definition", + }, + PropInfo { + node_type: "Definition", + property: "line", + description: "1-indexed start line of the definition", + }, + PropInfo { + node_type: "Document", + property: "uri", + description: "Full document URI", + }, + PropInfo { + node_type: "Document", + property: "path", + description: "Basename of the document URI", + }, +]; + +/// Renders the schema catalog in the requested format. +#[must_use] +pub fn describe(format: OutputFormat) -> String { + match format { + OutputFormat::Table => render_table(), + OutputFormat::Json => render_json(), + } +} + +fn render_table() -> String { + let mut out = String::new(); + + out.push_str("Node labels\n"); + let label_rows: Vec<[&str; 3]> = LABELS.iter().map(|l| [l.label, l.matches, l.description]).collect(); + push_table(&mut out, &["Label", "Matches", "Description"], &label_rows); + + out.push_str("\nRelationship types\n"); + let rel_rows: Vec<[&str; 4]> = RELATIONSHIPS + .iter() + .map(|r| [r.name, r.from, r.to, r.description]) + .collect(); + push_table(&mut out, &["Type", "From", "To", "Description"], &rel_rows); + + out.push_str("\nProperties\n"); + let prop_rows: Vec<[&str; 3]> = PROPERTIES + .iter() + .map(|p| [p.node_type, p.property, p.description]) + .collect(); + push_table(&mut out, &["Node type", "Property", "Description"], &prop_rows); + + out +} + +/// Renders a single aligned table section. `N` is the column count. 
+fn push_table(out: &mut String, headers: &[&str; N], rows: &[[&str; N]]) { + let mut widths: [usize; N] = std::array::from_fn(|i| headers[i].chars().count()); + for row in rows { + for (index, cell) in row.iter().enumerate() { + widths[index] = widths[index].max(cell.chars().count()); + } + } + + push_table_row(out, headers, &widths); + for (index, width) in widths.iter().enumerate() { + if index > 0 { + out.push_str("-+-"); + } + for _ in 0..*width { + out.push('-'); + } + } + out.push('\n'); + for row in rows { + push_table_row(out, row, &widths); + } +} + +fn push_table_row(out: &mut String, cells: &[&str; N], widths: &[usize; N]) { + for (index, width) in widths.iter().enumerate() { + if index > 0 { + out.push_str(" | "); + } + let cell = cells[index]; + out.push_str(cell); + for _ in 0..width.saturating_sub(cell.chars().count()) { + out.push(' '); + } + } + out.push('\n'); +} + +fn render_json() -> String { + let mut out = String::from("{\"node_labels\":["); + for (index, label) in LABELS.iter().enumerate() { + if index > 0 { + out.push(','); + } + out.push_str("{\"label\":"); + write_json_string(&mut out, label.label); + out.push_str(",\"matches\":"); + write_json_string(&mut out, label.matches); + out.push_str(",\"description\":"); + write_json_string(&mut out, label.description); + out.push('}'); + } + + out.push_str("],\"relationships\":["); + for (index, rel) in RELATIONSHIPS.iter().enumerate() { + if index > 0 { + out.push(','); + } + out.push_str("{\"type\":"); + write_json_string(&mut out, rel.name); + out.push_str(",\"from\":"); + write_json_string(&mut out, rel.from); + out.push_str(",\"to\":"); + write_json_string(&mut out, rel.to); + out.push_str(",\"description\":"); + write_json_string(&mut out, rel.description); + out.push('}'); + } + + out.push_str("],\"properties\":["); + for (index, prop) in PROPERTIES.iter().enumerate() { + if index > 0 { + out.push(','); + } + out.push_str("{\"node_type\":"); + write_json_string(&mut out, prop.node_type); 
+ out.push_str(",\"property\":"); + write_json_string(&mut out, prop.property); + out.push_str(",\"description\":"); + write_json_string(&mut out, prop.description); + out.push('}'); + } + + out.push_str("]}"); + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn table_lists_labels_relationships_and_properties() { + let output = describe(OutputFormat::Table); + assert!(output.contains("Node labels")); + assert!(output.contains("Relationship types")); + assert!(output.contains("Properties")); + assert!(output.contains("Namespace")); + assert!(output.contains("INHERITS")); + assert!(output.contains("unqualified_name")); + } + + #[test] + fn json_is_well_formed_object() { + let output = describe(OutputFormat::Json); + assert!(output.starts_with("{\"node_labels\":[")); + assert!(output.contains("\"relationships\":[")); + assert!(output.contains("\"properties\":[")); + assert!(output.contains("\"type\":\"DEFINES\"")); + assert!(output.ends_with("]}")); + } +} diff --git a/rust/rubydex/src/query/cypher/tests.rs b/rust/rubydex/src/query/cypher/tests.rs new file mode 100644 index 000000000..fa88734b5 --- /dev/null +++ b/rust/rubydex/src/query/cypher/tests.rs @@ -0,0 +1,294 @@ +use super::ast::{AggFn, CmpOp, Direction, Expr, Literal}; +use super::executor::{self, ResultSet}; +use super::parser; +use super::value::CypherValue; +use super::{OutputFormat, run_query}; +use crate::model::graph::Graph; +use crate::test_utils::GraphTest; + +// ---- Parser tests -------------------------------------------------------- + +#[test] +fn parses_basic_match_return() { + let query = parser::parse("MATCH (c:Class) RETURN c.name").unwrap(); + assert_eq!(query.patterns.len(), 1); + let start = &query.patterns[0].start; + assert_eq!(start.var.as_deref(), Some("c")); + assert_eq!(start.labels, vec!["Class".to_string()]); + assert!(query.patterns[0].rest.is_empty()); + assert_eq!(query.return_clause.items.len(), 1); + assert_eq!( + query.return_clause.items[0].expr, + 
Expr::Property("c".into(), "name".into()) + ); +} + +#[test] +fn parses_label_disjunction() { + let query = parser::parse("MATCH (n:Class|Module) RETURN n").unwrap(); + assert_eq!( + query.patterns[0].start.labels, + vec!["Class".to_string(), "Module".to_string()] + ); +} + +#[test] +fn parses_inline_properties() { + let query = parser::parse("MATCH (c:Class {name: 'Foo'}) RETURN c").unwrap(); + let props = &query.patterns[0].start.props; + assert_eq!(props.len(), 1); + assert_eq!(props[0].0, "name"); + assert_eq!(props[0].1, Literal::Str("Foo".into())); +} + +#[test] +fn parses_relationship_directions() { + let outgoing = parser::parse("MATCH (a)-[:INHERITS]->(b) RETURN a").unwrap(); + assert_eq!(outgoing.patterns[0].rest[0].0.direction, Direction::Outgoing); + assert_eq!(outgoing.patterns[0].rest[0].0.types, vec!["INHERITS".to_string()]); + + let incoming = parser::parse("MATCH (a)<-[:INHERITS]-(b) RETURN a").unwrap(); + assert_eq!(incoming.patterns[0].rest[0].0.direction, Direction::Incoming); + + let both = parser::parse("MATCH (a)-[:INHERITS]-(b) RETURN a").unwrap(); + assert_eq!(both.patterns[0].rest[0].0.direction, Direction::Both); +} + +#[test] +fn parses_variable_length() { + let query = parser::parse("MATCH (a)-[:INHERITS*2..5]->(b) RETURN a").unwrap(); + let length = query.patterns[0].rest[0].0.length.unwrap(); + assert_eq!(length.min, 2); + assert_eq!(length.max, Some(5)); + + let unbounded = parser::parse("MATCH (a)-[:OWNS*]->(b) RETURN a").unwrap(); + let length = unbounded.patterns[0].rest[0].0.length.unwrap(); + assert_eq!(length.min, 1); + assert_eq!(length.max, None); + + let exact = parser::parse("MATCH (a)-[:OWNS*3]->(b) RETURN a").unwrap(); + let length = exact.patterns[0].rest[0].0.length.unwrap(); + assert_eq!(length.min, 3); + assert_eq!(length.max, Some(3)); +} + +#[test] +fn parses_aggregation_and_alias() { + let query = parser::parse("MATCH (c:Class) RETURN c.name, count(*) AS total").unwrap(); + 
assert_eq!(query.return_clause.items[1].alias.as_deref(), Some("total")); + assert_eq!( + query.return_clause.items[1].expr, + Expr::Aggregate { + func: AggFn::Count, + arg: None, + distinct: false, + } + ); +} + +#[test] +fn parses_where_and_order_limit() { + let query = + parser::parse("MATCH (c:Class) WHERE c.name CONTAINS 'Service' RETURN c.name ORDER BY c.name DESC LIMIT 5") + .unwrap(); + let Some(Expr::Compare(_, op, _)) = query.where_clause else { + panic!("expected comparison"); + }; + assert_eq!(op, CmpOp::Contains); + assert_eq!(query.order_by.len(), 1); + assert!(query.order_by[0].descending); + assert_eq!(query.limit, Some(5)); +} + +#[test] +fn rejects_invalid_syntax() { + assert!(parser::parse("MATCH (c:Class RETURN c").is_err()); + assert!(parser::parse("RETURN c").is_err()); + assert!(parser::parse("MATCH (c) RETURN").is_err()); + assert!(parser::parse("MATCH (a)<-[:INHERITS]->(b) RETURN a").is_err()); +} + +// ---- Executor tests ------------------------------------------------------ + +fn fixture_graph() -> Graph { + let mut context = GraphTest::new(); + context.index_uri( + "file:///zoo.rb", + " + module Walkable + end + + class Animal + def speak; end + end + + class Dog < Animal + include Walkable + end + + class Cat < Animal + end + ", + ); + context.resolve(); + context.into_graph() +} + +fn run(graph: &Graph, query: &str) -> ResultSet { + let parsed = parser::parse(query).unwrap(); + executor::execute(graph, &parsed).unwrap() +} + +fn column_strings(result: &ResultSet, column: usize) -> Vec { + let mut values: Vec = result.rows.iter().map(|row| row[column].to_display_string()).collect(); + values.sort(); + values +} + +#[test] +fn scans_declarations_by_label_and_property() { + let graph = fixture_graph(); + let result = run(&graph, "MATCH (c:Class {name: 'Dog'}) RETURN c.name"); + assert_eq!(result.columns, vec!["c.name".to_string()]); + assert_eq!(column_strings(&result, 0), vec!["Dog".to_string()]); +} + +#[test] +fn 
scans_label_disjunction() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (n:Class|Module) WHERE n.name = 'Animal' OR n.name = 'Walkable' RETURN n.name, n.kind", + ); + let names = column_strings(&result, 0); + assert_eq!(names, vec!["Animal".to_string(), "Walkable".to_string()]); +} + +#[test] +fn follows_inherits_relationship() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (c:Class)-[:INHERITS]->(p:Class) WHERE c.name = 'Dog' RETURN p.name", + ); + assert_eq!(column_strings(&result, 0), vec!["Animal".to_string()]); +} + +#[test] +fn follows_incoming_relationship() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (p:Class)<-[:INHERITS]-(c:Class) WHERE p.name = 'Animal' RETURN c.name", + ); + assert_eq!(column_strings(&result, 0), vec!["Cat".to_string(), "Dog".to_string()]); +} + +#[test] +fn follows_includes_relationship() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (c:Class)-[:INCLUDES]->(m) WHERE c.name = 'Dog' RETURN m.name", + ); + assert_eq!(column_strings(&result, 0), vec!["Walkable".to_string()]); +} + +#[test] +fn follows_owns_to_method() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (c:Class)-[:OWNS]->(m:Method) WHERE c.name = 'Animal' RETURN m.unqualified_name", + ); + assert!(column_strings(&result, 0).iter().any(|name| name.contains("speak"))); +} + +#[test] +fn variable_length_ancestor_chain() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (c:Class)-[:ANCESTOR]->(a) WHERE c.name = 'Dog' RETURN a.name", + ); + let ancestors = column_strings(&result, 0); + assert!(ancestors.contains(&"Animal".to_string())); + assert!(ancestors.contains(&"Walkable".to_string())); + assert!(ancestors.contains(&"Object".to_string())); +} + +#[test] +fn traverses_document_to_declaration() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (d:Document)-[:DEFINES]->(def:Definition)-[:DECLARES]->(decl) 
WHERE decl.name = 'Dog' RETURN decl.name", + ); + assert_eq!(column_strings(&result, 0), vec!["Dog".to_string()]); +} + +#[test] +fn aggregation_counts_subclasses() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (c:Class)-[:INHERITS]->(p:Class) WHERE p.name = 'Animal' RETURN p.name, count(c) AS subclasses", + ); + assert_eq!(result.rows.len(), 1); + assert_eq!(result.rows[0][0], CypherValue::Str("Animal".into())); + assert_eq!(result.rows[0][1], CypherValue::Int(2)); +} + +#[test] +fn distinct_and_order_and_limit() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (c:Class)-[:INHERITS]->(p:Class) RETURN DISTINCT p.name ORDER BY p.name LIMIT 1", + ); + assert_eq!(result.rows.len(), 1); + assert_eq!(result.rows[0][0], CypherValue::Str("Animal".into())); +} + +#[test] +fn where_with_boolean_operators() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (c:Class) WHERE c.name = 'Dog' OR c.name = 'Cat' RETURN c.name", + ); + assert_eq!(column_strings(&result, 0), vec!["Cat".to_string(), "Dog".to_string()]); +} + +#[test] +fn run_query_table_output() { + let graph = fixture_graph(); + let output = run_query( + &graph, + "MATCH (c:Class {name: 'Dog'}) RETURN c.name", + OutputFormat::Table, + ) + .unwrap(); + assert!(output.contains("c.name")); + assert!(output.contains("Dog")); + assert!(output.contains("1 row")); +} + +#[test] +fn run_query_json_output() { + let graph = fixture_graph(); + let output = run_query( + &graph, + "MATCH (c:Class {name: 'Dog'}) RETURN c.name", + OutputFormat::Json, + ) + .unwrap(); + assert_eq!(output, "[{\"c.name\":\"Dog\"}]"); +} + +#[test] +fn unknown_relationship_type_errors() { + let graph = fixture_graph(); + let parsed = parser::parse("MATCH (a)-[:BOGUS]->(b) RETURN a").unwrap(); + assert!(executor::execute(&graph, &parsed).is_err()); +} diff --git a/rust/rubydex/src/query/cypher/value.rs b/rust/rubydex/src/query/cypher/value.rs new file mode 100644 index 
000000000..b14ddc60f --- /dev/null +++ b/rust/rubydex/src/query/cypher/value.rs @@ -0,0 +1,148 @@ +use std::cmp::Ordering; +use std::fmt::Write; + +/// A scalar or composite value produced by evaluating a Cypher expression or projecting a result. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum CypherValue { + Null, + Bool(bool), + Int(i64), + Str(String), + /// A graph node, rendered with its label and primary name. + Node { + label: String, + name: String, + }, + List(Vec), +} + +impl CypherValue { + /// Returns the truthiness of a value for use in `WHERE` filtering. + /// `NULL` and `false` are falsy; everything else (including bound nodes) is truthy. + #[must_use] + pub fn is_truthy(&self) -> bool { + match self { + CypherValue::Null => false, + CypherValue::Bool(b) => *b, + _ => true, + } + } + + /// Returns a numeric view of the value if it is an integer. + #[must_use] + pub fn as_int(&self) -> Option { + match self { + CypherValue::Int(i) => Some(*i), + _ => None, + } + } + + /// Returns a string view of the value if it is a string. + #[must_use] + pub fn as_str(&self) -> Option<&str> { + match self { + CypherValue::Str(s) => Some(s), + _ => None, + } + } + + /// Rank of each variant, used to order values of differing types deterministically. + fn type_rank(&self) -> u8 { + match self { + CypherValue::Null => 0, + CypherValue::Bool(_) => 1, + CypherValue::Int(_) => 2, + CypherValue::Str(_) => 3, + CypherValue::Node { .. } => 4, + CypherValue::List(_) => 5, + } + } + + /// Total ordering across all value types. `NULL` sorts first. Differing types are ordered by + /// their variant rank so that sorting is always deterministic. + #[must_use] + pub fn total_cmp(&self, other: &CypherValue) -> Ordering { + match (self, other) { + (CypherValue::Bool(a), CypherValue::Bool(b)) => a.cmp(b), + (CypherValue::Int(a), CypherValue::Int(b)) => a.cmp(b), + (CypherValue::Str(a), CypherValue::Str(b)) + | (CypherValue::Node { name: a, .. 
}, CypherValue::Node { name: b, .. }) => a.cmp(b), + (CypherValue::List(a), CypherValue::List(b)) => { + for (x, y) in a.iter().zip(b.iter()) { + let ordering = x.total_cmp(y); + if ordering != Ordering::Equal { + return ordering; + } + } + a.len().cmp(&b.len()) + } + _ => self.type_rank().cmp(&other.type_rank()), + } + } + + /// Renders the value for display in a plain-text table cell. + #[must_use] + pub fn to_display_string(&self) -> String { + match self { + CypherValue::Null => String::new(), + CypherValue::Bool(b) => b.to_string(), + CypherValue::Int(i) => i.to_string(), + CypherValue::Str(s) => s.clone(), + CypherValue::Node { name, .. } => name.clone(), + CypherValue::List(items) => { + let rendered: Vec = items.iter().map(CypherValue::to_display_string).collect(); + format!("[{}]", rendered.join(", ")) + } + } + } + + /// Renders the value as a JSON fragment, appending to `out`. + pub fn write_json(&self, out: &mut String) { + match self { + CypherValue::Null => out.push_str("null"), + CypherValue::Bool(b) => { + let _ = write!(out, "{b}"); + } + CypherValue::Int(i) => { + let _ = write!(out, "{i}"); + } + CypherValue::Str(s) => write_json_string(out, s), + CypherValue::Node { label, name } => { + out.push_str("{\"label\":"); + write_json_string(out, label); + out.push_str(",\"name\":"); + write_json_string(out, name); + out.push('}'); + } + CypherValue::List(items) => { + out.push('['); + for (index, item) in items.iter().enumerate() { + if index > 0 { + out.push(','); + } + item.write_json(out); + } + out.push(']'); + } + } + } +} + +/// Escapes and quotes a string as a JSON string literal. 
+pub fn write_json_string(out: &mut String, value: &str) { + out.push('"'); + for ch in value.chars() { + match ch { + '"' => out.push_str("\\\""), + '\\' => out.push_str("\\\\"), + '\n' => out.push_str("\\n"), + '\r' => out.push_str("\\r"), + '\t' => out.push_str("\\t"), + c if (c as u32) < 0x20 => { + let _ = write!(out, "\\u{:04x}", c as u32); + } + c => out.push(c), + } + } + out.push('"'); +} From e3c1975e52dfe77dee99e1ab8cc469c8fb50dbbc Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Fri, 19 Jun 2026 01:00:13 +0300 Subject: [PATCH 2/8] Add --query and --schema flags to rubydex_cli Wire the Cypher engine into the CLI with --query to run a query and --schema to print the queryable schema (labels, relationships, properties). The output format is selected with --format (default table). Queries run after resolution; --schema is static and exits before indexing. Parse and execution errors go to stderr with a non-zero exit. Add CLI integration tests for query output, schema output, and error handling. 
--- rust/rubydex/src/main.rs | 58 +++++++++++++++++++++++++++++++++ rust/rubydex/tests/cli.rs | 68 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+) diff --git a/rust/rubydex/src/main.rs b/rust/rubydex/src/main.rs index d6a4c7f87..bebd17c20 100644 --- a/rust/rubydex/src/main.rs +++ b/rust/rubydex/src/main.rs @@ -6,6 +6,7 @@ use rubydex::{ indexing::{self, IndexerBackend}, integrity, listing, model::graph::Graph, + query::cypher::{self, OutputFormat}, resolution::Resolver, stats::{ memory::MemoryStats, @@ -52,6 +53,38 @@ struct Args { help = "Write orphan definitions report to specified file" )] report_orphans: Option, + + #[arg(long = "query", value_name = "CYPHER", help = "Run a Cypher query against the graph")] + query: Option, + + #[arg( + long = "schema", + help = "Describe the queryable Cypher schema (labels, relationships, properties) and exit" + )] + schema: bool, + + #[arg( + long = "format", + value_enum, + default_value = "table", + help = "Output format for --query and --schema results" + )] + format: Format, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +enum Format { + Table, + Json, +} + +impl From for OutputFormat { + fn from(format: Format) -> Self { + match format { + Format::Table => OutputFormat::Table, + Format::Json => OutputFormat::Json, + } + } } #[derive(Debug, Clone, ValueEnum)] @@ -88,6 +121,12 @@ fn exit(print_stats: bool) { fn main() { let args = Args::parse(); + // The Cypher schema is static, so describe it without indexing the workspace. 
+ if args.schema { + print!("{}", cypher::schema(args.format.into())); + std::process::exit(0); + } + if args.stats { Timer::set_global_timer(Timer::new()); } @@ -173,6 +212,25 @@ fn main() { } } + // Cypher query + if let Some(query) = &args.query { + match time_it!(querying, { cypher::run_query(&graph, query, args.format.into()) }) { + Ok(output) => print!("{output}"), + Err(error) => { + eprintln!("{error}"); + std::process::exit(1); + } + } + + if args.stats { + Timer::print_breakdown(); + MemoryStats::print_memory_usage(); + } + + mem::forget(graph); + return; + } + // Generate visualization or print statistics if args.dot { println!("{}", dot::DotBuilder::generate(&graph, args.show_builtins)); diff --git a/rust/rubydex/tests/cli.rs b/rust/rubydex/tests/cli.rs index 5c5a4cb59..7227dea35 100644 --- a/rust/rubydex/tests/cli.rs +++ b/rust/rubydex/tests/cli.rs @@ -89,6 +89,74 @@ fn dot_flag() { }); } +#[test] +fn query_flag_table_output() { + with_context(|context| { + context.write("zoo.rb", "class Animal\nend\n\nclass Dog < Animal\nend\n"); + + rdx(&[ + context.absolute_path().to_str().unwrap(), + "--query", + "MATCH (c:Class)-[:INHERITS]->(p:Class) WHERE c.name = 'Dog' RETURN p.name", + ]) + .success() + .stdout(predicate::str::contains("p.name")) + .stdout(predicate::str::contains("Animal")) + .stdout(predicate::str::contains("1 row")); + }); +} + +#[test] +fn query_flag_json_output() { + with_context(|context| { + context.write("zoo.rb", "class Animal\nend\n\nclass Dog < Animal\nend\n"); + + rdx(&[ + context.absolute_path().to_str().unwrap(), + "--query", + "MATCH (c:Class {name: 'Dog'}) RETURN c.name", + "--format", + "json", + ]) + .success() + .stdout(predicate::str::contains("[{\"c.name\":\"Dog\"}]")); + }); +} + +#[test] +fn schema_flag_describes_model() { + rdx(&["--schema"]) + .success() + .stdout(predicate::str::contains("Node labels")) + .stdout(predicate::str::contains("Relationship types")) + .stdout(predicate::str::contains("Properties")) + 
.stdout(predicate::str::contains("INHERITS")) + .stdout(predicate::str::contains("unqualified_name")); +} + +#[test] +fn schema_flag_json_format() { + rdx(&["--schema", "--format", "json"]) + .success() + .stdout(predicate::str::contains("\"node_labels\":[")) + .stdout(predicate::str::contains("\"type\":\"DEFINES\"")); +} + +#[test] +fn query_flag_reports_syntax_error() { + with_context(|context| { + context.write("zoo.rb", "class Animal\nend\n"); + + rdx(&[ + context.absolute_path().to_str().unwrap(), + "--query", + "MATCH (c RETURN c", + ]) + .failure() + .stderr(predicate::str::contains("Cypher syntax error")); + }); +} + #[test] fn stop_after() { with_context(|context| { From 13f37ecd7a7435f5ebd43e78144cd32a9ea16749 Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Fri, 19 Jun 2026 01:00:22 +0300 Subject: [PATCH 3/8] Expose Cypher via Rubydex::Graph and the rdx command CLI Add FFI exports (rdx_graph_query and rdx_cypher_schema) in rubydex-sys, bind them as the Graph#query instance method and the Graph.cypher_schema class method, and add their Sorbet signatures. query accepts an optional format (String or Symbol, default :table) and raises ArgumentError on parse, execution, or format errors. Restructure the exe/rdx executable around subcommands: `rdx query `, `rdx schema`, and `rdx console` (the interactive session), each with a --format option where applicable. Cover the Ruby API with tests for query output, schema output, format coercion, label disjunction, and error handling. 
--- exe/rdx | 110 +++++++++++++++++++++++------- ext/rubydex/graph.c | 65 ++++++++++++++++++ rbi/rubydex.rbi | 8 +++ rust/rubydex-sys/src/graph_api.rs | 86 +++++++++++++++++++++++ test/graph_test.rb | 107 +++++++++++++++++++++++++++++ 5 files changed, 352 insertions(+), 24 deletions(-) diff --git a/exe/rdx b/exe/rdx index 38143b2ef..bb5356e3d 100755 --- a/exe/rdx +++ b/exe/rdx @@ -5,40 +5,100 @@ $LOAD_PATH.unshift(File.expand_path("../lib", __dir__)) require "optparse" -options = {} +USAGE = <<~TEXT + Usage: rdx [options] -OptionParser.new do |parser| - parser.on("--version", "Print the gem's version") do - require "rubydex/version" - puts "v#{Rubydex::VERSION}" - exit - end + Commands: + query Run a Cypher query against the workspace graph and print the result + schema Describe the queryable Cypher schema (labels, relationships, properties) + console Open an interactive session with a populated graph for the current workspace + help Show this help message - parser.on("-h", "--help", "Prints this help") do - puts parser - exit - end + Run `rdx --help` for command-specific options. +TEXT - parser.on("-i", "--interactive", "Open an interactive session with a populated graph for the current workspace") do - options[:interactive] = true - end -end.parse! +def abort_with_usage(message) + warn(message) + warn("") + warn(USAGE) + exit(1) +end -require "rubydex" +# Top-level --version / --help / bare invocation, handled before command dispatch. 
+case ARGV.first +when "--version", "version" + require "rubydex/version" + puts "v#{Rubydex::VERSION}" + exit +when nil, "-h", "--help", "help" + puts USAGE + exit +end + +command = ARGV.shift -def __with_timer(message, &block) - print(message) +def with_timer(io, message) + io.print(message) start = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - block.call + yield duration = Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_millisecond) - start - puts " finished in #{duration.round(2)}ms" + io.puts(" finished in #{duration.round(2)}ms") +end + +# Builds the workspace graph, sending progress messages to `progress_io`. +def build_graph(progress_io) + graph = Rubydex::Graph.new + with_timer(progress_io, "Indexing workspace...") { graph.index_workspace } + with_timer(progress_io, "Resolving graph...") { graph.resolve } + graph end -graph = Rubydex::Graph.new -__with_timer("Indexing workspace...") { graph.index_workspace } -__with_timer("Resolving graph...") { graph.resolve } +# Parses `--format`/`--help` for a command and returns the chosen format. +def parse_format(usage) + format = "table" + OptionParser.new do |parser| + parser.banner = usage + parser.on("--format FORMAT", ["table", "json"], "Output format (table or json)") { |value| format = value } + parser.on("-h", "--help", "Show this help") do + puts parser + exit + end + end.parse! + format +end + +case command +when "query" + format = parse_format("Usage: rdx query [options]") + query = ARGV.shift + abort_with_usage("`query` requires a Cypher query argument") if query.nil? || query.empty? + + require "rubydex" + # Progress goes to stderr so stdout carries only the query result (e.g. for piping JSON). 
+ graph = build_graph($stderr) + begin + print(graph.query(query, format)) + rescue ArgumentError => e + abort(e.message) + end +when "schema" + format = parse_format("Usage: rdx schema [options]") + + require "rubydex" + # The schema is static, so describe it without indexing the workspace. + print(Rubydex::Graph.cypher_schema(format)) +when "console" + OptionParser.new do |parser| + parser.banner = "Usage: rdx console" + parser.on("-h", "--help", "Show this help") do + puts parser + exit + end + end.parse! + + require "rubydex" + graph = build_graph($stdout) -if options[:interactive] begin require "irb" IRB.setup(nil) @@ -48,4 +108,6 @@ if options[:interactive] rescue LoadError abort("Interactive mode requires `irb` to be in the bundle") end +else + abort_with_usage("unknown command: #{command}") end diff --git a/ext/rubydex/graph.c b/ext/rubydex/graph.c index 15912be63..91a481f58 100644 --- a/ext/rubydex/graph.c +++ b/ext/rubydex/graph.c @@ -750,6 +750,68 @@ static VALUE rdxr_graph_keyword(VALUE self, VALUE name) { return rb_class_new_instance(2, argv, cKeyword); } +// Graph#query: (String query, ?(String | Symbol) format) -> String +// Runs a Cypher query against the graph and returns the formatted output. +// `format` may be "table" (default) or "json". Raises ArgumentError on a parse, execution, or +// format error. 
+static VALUE rdxr_graph_query(int argc, VALUE *argv, VALUE self) { + VALUE query, format; + rb_scan_args(argc, argv, "11", &query, &format); + Check_Type(query, T_STRING); + + const char *format_str = "table"; + if (!NIL_P(format)) { + if (RB_TYPE_P(format, T_SYMBOL)) { + format = rb_sym2str(format); + } + Check_Type(format, T_STRING); + format_str = StringValueCStr(format); + } + + void *graph; + TypedData_Get_Struct(self, void *, &graph_type, graph); + + struct CQueryResult result = rdx_graph_query(graph, StringValueCStr(query), format_str); + + if (result.error != NULL) { + VALUE message = rb_utf8_str_new_cstr(result.error); + free_c_string(result.error); + rb_raise(rb_eArgError, "%s", StringValueCStr(message)); + } + + VALUE output = result.output == NULL ? rb_utf8_str_new_cstr("") : rb_utf8_str_new_cstr(result.output); + if (result.output != NULL) { + free_c_string(result.output); + } + + return output; +} + +// Rubydex::Graph.cypher_schema(format = :table) -> String +// Returns a description of the queryable Cypher schema. `format` may be "table" (default) or "json". +// The schema is static, so this is a class method and does not require a graph instance. +static VALUE rdxr_cypher_schema(int argc, VALUE *argv, VALUE self) { + VALUE format; + rb_scan_args(argc, argv, "01", &format); + + const char *format_str = "table"; + if (!NIL_P(format)) { + if (RB_TYPE_P(format, T_SYMBOL)) { + format = rb_sym2str(format); + } + Check_Type(format, T_STRING); + format_str = StringValueCStr(format); + } + + const char *output = rdx_cypher_schema(format_str); + VALUE result = output == NULL ? 
rb_utf8_str_new_cstr("") : rb_utf8_str_new_cstr(output); + if (output != NULL) { + free_c_string(output); + } + + return result; +} + void rdxi_initialize_graph(VALUE moduleRubydex) { mRubydex = moduleRubydex; cGraph = rb_define_class_under(mRubydex, "Graph", rb_cObject); @@ -784,4 +846,7 @@ void rdxi_initialize_graph(VALUE moduleRubydex) { rb_define_method(cGraph, "exclude_paths", rdxr_graph_exclude_paths, 1); rb_define_method(cGraph, "excluded_paths", rdxr_graph_excluded_paths, 0); rb_define_method(cGraph, "keyword", rdxr_graph_keyword, 1); + rb_define_method(cGraph, "query", rdxr_graph_query, -1); + + rb_define_singleton_method(cGraph, "cypher_schema", rdxr_cypher_schema, -1); } diff --git a/rbi/rubydex.rbi b/rbi/rubydex.rbi index dd413eed8..231c7e80a 100644 --- a/rbi/rubydex.rbi +++ b/rbi/rubydex.rbi @@ -275,6 +275,11 @@ class Rubydex::IntegrityFailure < Rubydex::Failure; end class Rubydex::Graph IGNORED_DIRECTORIES = T.let(T.unsafe(nil), T::Array[String]) + class << self + sig { params(format: T.any(String, Symbol)).returns(String) } + def cypher_schema(format = :table); end + end + sig { params(workspace_path: T.nilable(String)).void } def initialize(workspace_path: nil); end @@ -324,6 +329,9 @@ class Rubydex::Graph sig { params(require_path: String, load_paths: T::Array[String]).returns(T.nilable(Rubydex::Document)) } def resolve_require_path(require_path, load_paths); end + sig { params(query: String, format: T.any(String, Symbol)).returns(String) } + def query(query, format = :table); end + sig { params(query: String).returns(T::Enumerable[Rubydex::Declaration]) } def search(query); end diff --git a/rust/rubydex-sys/src/graph_api.rs b/rust/rubydex-sys/src/graph_api.rs index 562963f7b..1caab7a50 100644 --- a/rust/rubydex-sys/src/graph_api.rs +++ b/rust/rubydex-sys/src/graph_api.rs @@ -14,6 +14,7 @@ use rubydex::model::ids::{DeclarationId, NameId, UriId}; use rubydex::model::keywords; use rubydex::model::name::NameRef; use 
rubydex::model::visibility::Visibility; +use rubydex::query::cypher::{self, OutputFormat}; use rubydex::query::{CompletionCandidate, CompletionContext, CompletionReceiver}; use rubydex::resolution::Resolver; use rubydex::{indexing, integrity, listing, query}; @@ -977,6 +978,91 @@ pub unsafe extern "C" fn rdx_keyword_get(name: *const c_char) -> *const CKeyword } } +/// The result of running a Cypher query, carrying either the formatted output or an error message. +#[repr(C)] +pub struct CQueryResult { + /// Non-null on success; null on error. Caller must free with `free_c_string`. + pub output: *const c_char, + /// Non-null on error; null on success. Caller must free with `free_c_string`. + pub error: *const c_char, +} + +impl CQueryResult { + fn success(output: &str) -> Self { + match CString::new(output) { + Ok(c_string) => Self { + output: c_string.into_raw().cast_const(), + error: ptr::null(), + }, + Err(_) => Self::error("query output contained an interior NUL byte"), + } + } + + fn error(message: &str) -> Self { + Self { + output: ptr::null(), + error: CString::new(message).map_or(ptr::null(), |s| s.into_raw().cast_const()), + } + } +} + +/// Returns a description of the queryable Cypher schema (node labels, relationship types, and +/// properties) in the given format (`"table"` or `"json"`). The schema is static and requires no +/// graph. Caller must free the returned pointer with `free_c_string`. +/// +/// # Safety +/// +/// - `format` must be a valid, null-terminated UTF-8 string. 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn rdx_cypher_schema(format: *const c_char) -> *const c_char { + let format_str = unsafe { utils::convert_char_ptr_to_string(format) }.unwrap_or_else(|_| "table".to_string()); + let output_format = if format_str == "json" { + OutputFormat::Json + } else { + OutputFormat::Table + }; + + CString::new(cypher::schema(output_format)).map_or(ptr::null(), |s| s.into_raw().cast_const()) +} + +/// Runs a Cypher query against the graph and returns the formatted output or an error message. +/// +/// `format` must be `"table"` or `"json"`. +/// +/// # Safety +/// +/// - `pointer` must be a valid `GraphPointer` previously returned by this crate. +/// - `query` and `format` must be valid, null-terminated UTF-8 strings. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn rdx_graph_query( + pointer: GraphPointer, + query: *const c_char, + format: *const c_char, +) -> CQueryResult { + let Ok(query_str) = (unsafe { utils::convert_char_ptr_to_string(query) }) else { + return CQueryResult::error("query is not valid UTF-8"); + }; + + let Ok(format_str) = (unsafe { utils::convert_char_ptr_to_string(format) }) else { + return CQueryResult::error("format is not valid UTF-8"); + }; + + let output_format = match format_str.as_str() { + "table" => OutputFormat::Table, + "json" => OutputFormat::Json, + other => { + return CQueryResult::error(&format!("unknown query format `{other}` (expected `table` or `json`)")); + } + }; + + with_graph(pointer, |graph| { + match cypher::run_query(graph, &query_str, output_format) { + Ok(output) => CQueryResult::success(&output), + Err(error) => CQueryResult::error(&error.to_string()), + } + }) +} + #[repr(u8)] #[derive(Debug, Clone, Copy)] pub enum CVisibility { diff --git a/test/graph_test.rb b/test/graph_test.rb index 5a73a5958..51ddae1f0 100644 --- a/test/graph_test.rb +++ b/test/graph_test.rb @@ -2,6 +2,7 @@ require "test_helper" require "helpers/context" +require "json" class GraphTest < Minitest::Test include 
Test::Helpers::WithContext @@ -1440,6 +1441,112 @@ def test_document_returns_correct_document_with_multiple_documents assert_equal("file:///bar.rb", document.uri) end + def test_cypher_schema_table + output = Rubydex::Graph.cypher_schema + + assert_match(/Node labels/, output) + assert_match(/Relationship types/, output) + assert_match(/Properties/, output) + assert_match(/INHERITS/, output) + assert_match(/unqualified_name/, output) + end + + def test_cypher_schema_json + output = Rubydex::Graph.cypher_schema(:json) + + parsed = JSON.parse(output) + assert_equal(["node_labels", "relationships", "properties"], parsed.keys) + assert(parsed["relationships"].any? { |r| r["type"] == "DEFINES" }) + end + + def test_query_returns_table_output + with_context do |context| + context.write!("zoo.rb", <<~RUBY) + class Animal; end + class Dog < Animal; end + class Cat < Animal; end + RUBY + + graph = Rubydex::Graph.new + graph.index_all(context.glob("**/*.rb")) + graph.resolve + + output = graph.query("MATCH (c:Class)-[:INHERITS]->(p:Class) WHERE p.name = 'Animal' RETURN c.name ORDER BY c.name") + + assert_match(/c\.name/, output) + assert_match(/Cat/, output) + assert_match(/Dog/, output) + assert_match(/2 rows/, output) + end + end + + def test_query_label_disjunction + with_context do |context| + context.write!("zoo.rb", <<~RUBY) + class Animal; end + module Walkable; end + class Dog < Animal; end + RUBY + + graph = Rubydex::Graph.new + graph.index_all(context.glob("**/*.rb")) + graph.resolve + + output = graph.query( + "MATCH (n:Class|Module) WHERE n.name = 'Animal' OR n.name = 'Walkable' RETURN n.name ORDER BY n.name", + :json, + ) + + assert_equal("[{\"n.name\":\"Animal\"},{\"n.name\":\"Walkable\"}]", output) + end + end + + def test_query_returns_json_output + with_context do |context| + context.write!("zoo.rb", "class Animal; end\nclass Dog < Animal; end\n") + + graph = Rubydex::Graph.new + graph.index_all(context.glob("**/*.rb")) + graph.resolve + + assert_equal( + 
"[{\"c.name\":\"Dog\"}]", + graph.query("MATCH (c:Class {name: 'Dog'}) RETURN c.name", :json), + ) + end + end + + def test_query_accepts_string_format + with_context do |context| + context.write!("zoo.rb", "class Dog; end\n") + + graph = Rubydex::Graph.new + graph.index_all(context.glob("**/*.rb")) + graph.resolve + + assert_equal( + "[{\"c.name\":\"Dog\"}]", + graph.query("MATCH (c:Class {name: 'Dog'}) RETURN c.name", "json"), + ) + end + end + + def test_query_raises_on_syntax_error + graph = Rubydex::Graph.new + graph.resolve + + error = assert_raises(ArgumentError) { graph.query("MATCH (c RETURN c") } + assert_match(/Cypher syntax error/, error.message) + end + + def test_query_raises_on_invalid_format + graph = Rubydex::Graph.new + graph.resolve + + error = assert_raises(ArgumentError) { graph.query("MATCH (c:Class) RETURN c", :yaml) } + assert_match(/unknown query format/, error.message) + end + private def assert_diagnostics(expected, actual) From af7ea68cb27d4f033d6f0f2da18105a598077476 Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Tue, 23 Jun 2026 19:53:52 +0300 Subject: [PATCH 4/8] Extract the Cypher engine into the standalone cypher-parser crate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the entire Cypher engine — lexer, parser, AST, the tree-walking executor, values, and result formatting — out of rubydex and into the standalone, published `cypher-parser` crate (depended on from crates.io). The executor is generic over `cypher_parser::GraphProvider`, so rubydex only provides the rubydex-specific mapping by implementing that trait for `Graph` (in `query::cypher::schema`), plus the static `--schema` description (in `query::cypher::schema_info`). This separates the query language and its execution from the rubydex graph, letting the engine be versioned, tested, and reused independently. 
The executor's own tests live in the cypher-parser crate (against an in-memory provider); rubydex keeps end-to-end tests against a real Graph. --- rust/Cargo.lock | 7 + rust/rubydex/Cargo.toml | 1 + rust/rubydex/src/query/cypher/ast.rs | 204 ------- rust/rubydex/src/query/cypher/error.rs | 40 -- rust/rubydex/src/query/cypher/executor.rs | 574 ------------------- rust/rubydex/src/query/cypher/format.rs | 104 ---- rust/rubydex/src/query/cypher/lexer.rs | 205 ------- rust/rubydex/src/query/cypher/mod.rs | 20 +- rust/rubydex/src/query/cypher/parser.rs | 548 ------------------ rust/rubydex/src/query/cypher/schema.rs | 68 ++- rust/rubydex/src/query/cypher/schema_info.rs | 4 +- rust/rubydex/src/query/cypher/tests.rs | 120 +--- rust/rubydex/src/query/cypher/value.rs | 148 ----- 13 files changed, 81 insertions(+), 1962 deletions(-) delete mode 100644 rust/rubydex/src/query/cypher/ast.rs delete mode 100644 rust/rubydex/src/query/cypher/error.rs delete mode 100644 rust/rubydex/src/query/cypher/executor.rs delete mode 100644 rust/rubydex/src/query/cypher/format.rs delete mode 100644 rust/rubydex/src/query/cypher/lexer.rs delete mode 100644 rust/rubydex/src/query/cypher/parser.rs delete mode 100644 rust/rubydex/src/query/cypher/value.rs diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 98fab4410..70a1189d7 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -318,6 +318,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "cypher-parser" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed0d1e561d51e651bdf70f8439da293fd0f1fe34d8431059061eadaefc7abb1" + [[package]] name = "darling" version = "0.23.0" @@ -1057,6 +1063,7 @@ dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", + "cypher-parser", "glob", "libc", "line-index", diff --git a/rust/rubydex/Cargo.toml 
b/rust/rubydex/Cargo.toml index 2d6021eca..f1bb044bd 100644 --- a/rust/rubydex/Cargo.toml +++ b/rust/rubydex/Cargo.toml @@ -22,6 +22,7 @@ crate-type = ["rlib"] test_utils = ["dep:tempfile"] [dependencies] +cypher-parser = "0.2" ruby-prism = "1.9.0" ruby-rbs = "0.3" url = "2.5.4" diff --git a/rust/rubydex/src/query/cypher/ast.rs b/rust/rubydex/src/query/cypher/ast.rs deleted file mode 100644 index 867c46f86..000000000 --- a/rust/rubydex/src/query/cypher/ast.rs +++ /dev/null @@ -1,204 +0,0 @@ -//! Abstract syntax tree for the supported subset of Cypher. - -/// A complete parsed query. -#[derive(Debug, Clone, PartialEq)] -pub struct Query { - /// One or more comma-separated path patterns from the `MATCH` clause. - pub patterns: Vec, - pub where_clause: Option, - pub return_clause: Return, - pub order_by: Vec, - pub skip: Option, - pub limit: Option, -} - -/// A path pattern: a starting node followed by zero or more relationship/node hops. -#[derive(Debug, Clone, PartialEq)] -pub struct PathPattern { - pub start: NodePattern, - pub rest: Vec<(RelPattern, NodePattern)>, -} - -/// A node pattern such as `(c:Class {name: 'Foo'})` or `(c:Class|Module)`. -#[derive(Debug, Clone, PartialEq)] -pub struct NodePattern { - pub var: Option, - /// Labels in a disjunction: a node matches if it has **any** of these labels. Empty means - /// "any node". - pub labels: Vec, - pub props: Vec<(String, Literal)>, -} - -/// A relationship pattern such as `-[:INHERITS*1..3]->`. -#[derive(Debug, Clone, PartialEq)] -pub struct RelPattern { - pub var: Option, - /// Relationship types; empty means "any type". - pub types: Vec, - pub direction: Direction, - /// Variable-length specification, if the pattern used `*`. - pub length: Option, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum Direction { - /// `-[]->` - Outgoing, - /// `<-[]-` - Incoming, - /// `-[]-` - Both, -} - -/// A variable-length relationship bound, from `*`, `*n`, `*min..`, `*..max`, or `*min..max`. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct VarLength { - pub min: u32, - pub max: Option, -} - -#[derive(Debug, Clone, PartialEq)] -pub enum Literal { - Str(String), - Int(i64), - Bool(bool), - Null, -} - -#[derive(Debug, Clone, PartialEq)] -pub struct Return { - pub distinct: bool, - pub items: Vec, -} - -#[derive(Debug, Clone, PartialEq)] -pub struct ReturnItem { - pub expr: Expr, - pub alias: Option, -} - -impl ReturnItem { - /// The output column name: the explicit alias, or a name derived from the expression. - #[must_use] - pub fn column_name(&self) -> String { - self.alias.clone().unwrap_or_else(|| self.expr.display_name()) - } -} - -#[derive(Debug, Clone, PartialEq)] -pub enum Expr { - Var(String), - Property(String, String), - Literal(Literal), - Not(Box), - And(Box, Box), - Or(Box, Box), - Compare(Box, CmpOp, Box), - Aggregate { - func: AggFn, - arg: Option>, - distinct: bool, - }, -} - -impl Expr { - /// Whether this expression tree contains an aggregate function call. - #[must_use] - #[allow(clippy::match_same_arms)] - pub fn contains_aggregate(&self) -> bool { - match self { - Expr::Aggregate { .. } => true, - Expr::Not(inner) => inner.contains_aggregate(), - Expr::And(a, b) | Expr::Or(a, b) => a.contains_aggregate() || b.contains_aggregate(), - Expr::Compare(a, _, b) => a.contains_aggregate() || b.contains_aggregate(), - Expr::Var(_) | Expr::Property(..) | Expr::Literal(_) => false, - } - } - - /// A human-readable name for the expression, used as a default column header. 
- #[must_use] - pub fn display_name(&self) -> String { - match self { - Expr::Var(v) => v.clone(), - Expr::Property(v, p) => format!("{v}.{p}"), - Expr::Literal(lit) => match lit { - Literal::Str(s) => format!("'{s}'"), - Literal::Int(i) => i.to_string(), - Literal::Bool(b) => b.to_string(), - Literal::Null => "null".to_string(), - }, - Expr::Not(inner) => format!("NOT {}", inner.display_name()), - Expr::And(a, b) => format!("{} AND {}", a.display_name(), b.display_name()), - Expr::Or(a, b) => format!("{} OR {}", a.display_name(), b.display_name()), - Expr::Compare(a, op, b) => format!("{} {} {}", a.display_name(), op.as_str(), b.display_name()), - Expr::Aggregate { func, arg, distinct } => { - let inner = match arg { - Some(expr) => expr.display_name(), - None => "*".to_string(), - }; - let distinct = if *distinct { "DISTINCT " } else { "" }; - format!("{}({distinct}{inner})", func.as_str()) - } - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum CmpOp { - Eq, - Neq, - Lt, - Lte, - Gt, - Gte, - Contains, - StartsWith, - EndsWith, -} - -impl CmpOp { - #[must_use] - pub fn as_str(self) -> &'static str { - match self { - CmpOp::Eq => "=", - CmpOp::Neq => "<>", - CmpOp::Lt => "<", - CmpOp::Lte => "<=", - CmpOp::Gt => ">", - CmpOp::Gte => ">=", - CmpOp::Contains => "CONTAINS", - CmpOp::StartsWith => "STARTS WITH", - CmpOp::EndsWith => "ENDS WITH", - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum AggFn { - Count, - Collect, - Min, - Max, - Sum, - Avg, -} - -impl AggFn { - #[must_use] - pub fn as_str(self) -> &'static str { - match self { - AggFn::Count => "count", - AggFn::Collect => "collect", - AggFn::Min => "min", - AggFn::Max => "max", - AggFn::Sum => "sum", - AggFn::Avg => "avg", - } - } -} - -#[derive(Debug, Clone, PartialEq)] -pub struct OrderItem { - pub expr: Expr, - pub descending: bool, -} diff --git a/rust/rubydex/src/query/cypher/error.rs b/rust/rubydex/src/query/cypher/error.rs deleted file mode 100644 index 
100fc37c0..000000000 --- a/rust/rubydex/src/query/cypher/error.rs +++ /dev/null @@ -1,40 +0,0 @@ -use std::fmt; - -/// An error produced while lexing, parsing, or executing a Cypher query. -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum CypherError { - /// A lexing or parsing error, with a byte position into the source query. - Syntax { message: String, position: usize }, - /// A semantic or execution error (e.g. unknown property, type mismatch). - Execution { message: String }, -} - -impl CypherError { - pub(crate) fn syntax(message: impl Into, position: usize) -> Self { - Self::Syntax { - message: message.into(), - position, - } - } - - pub(crate) fn execution(message: impl Into) -> Self { - Self::Execution { - message: message.into(), - } - } -} - -impl fmt::Display for CypherError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - CypherError::Syntax { message, position } => { - write!(f, "Cypher syntax error at position {position}: {message}") - } - CypherError::Execution { message } => { - write!(f, "Cypher execution error: {message}") - } - } - } -} - -impl std::error::Error for CypherError {} diff --git a/rust/rubydex/src/query/cypher/executor.rs b/rust/rubydex/src/query/cypher/executor.rs deleted file mode 100644 index 88c746514..000000000 --- a/rust/rubydex/src/query/cypher/executor.rs +++ /dev/null @@ -1,574 +0,0 @@ -use std::collections::HashMap; -use std::collections::HashSet; - -use crate::model::graph::Graph; - -use super::ast::{AggFn, CmpOp, Direction, Expr, Literal, NodePattern, OrderItem, PathPattern, Query, ReturnItem}; -use super::error::CypherError; -use super::schema::{self, NodeRef, RelType}; -use super::value::CypherValue; - -/// The tabular result of executing a query. -#[derive(Debug, Clone, PartialEq)] -pub struct ResultSet { - pub columns: Vec, - pub rows: Vec>, -} - -/// A single binding row: maps pattern variable names to graph nodes. -type Row = HashMap; - -/// Executes a parsed query against the graph. 
-/// -/// # Errors -/// -/// Returns a [`CypherError::Execution`] for unknown relationship types, aggregates used in `WHERE`, -/// or `ORDER BY` expressions that cannot be resolved under aggregation. -pub fn execute(graph: &Graph, query: &Query) -> Result { - let mut executor = Executor { - graph, - reverse_cache: HashMap::new(), - }; - executor.run(query) -} - -struct Executor<'a> { - graph: &'a Graph, - reverse_cache: HashMap>>, -} - -impl Executor<'_> { - fn run(&mut self, query: &Query) -> Result { - let mut rows: Vec = vec![Row::new()]; - for pattern in &query.patterns { - rows = self.eval_pattern(rows, pattern)?; - } - - if let Some(predicate) = &query.where_clause { - let mut filtered = Vec::with_capacity(rows.len()); - for row in rows { - if self.eval_expr(&row, predicate)?.is_truthy() { - filtered.push(row); - } - } - rows = filtered; - } - - self.project(query, &rows) - } - - // ---- Pattern matching ------------------------------------------------ - - fn eval_pattern(&mut self, base: Vec, pattern: &PathPattern) -> Result, CypherError> { - let mut working: Vec<(Row, NodeRef)> = Vec::new(); - - for row in base { - for node in self.candidates_for_node(&row, &pattern.start) { - let mut new_row = row.clone(); - if let Some(var) = &pattern.start.var { - new_row.insert(var.clone(), node); - } - working.push((new_row, node)); - } - } - - for (rel, node) in &pattern.rest { - working = self.expand_step(working, rel, node)?; - } - - Ok(working.into_iter().map(|(row, _)| row).collect()) - } - - fn candidates_for_node(&self, row: &Row, pattern: &NodePattern) -> Vec { - if let Some(var) = &pattern.var - && let Some(existing) = row.get(var) - { - return if self.node_matches(*existing, pattern) { - vec![*existing] - } else { - Vec::new() - }; - } - - schema::scan(self.graph, &pattern.labels) - .into_iter() - .filter(|node| self.props_match(*node, pattern)) - .collect() - } - - fn expand_step( - &mut self, - working: Vec<(Row, NodeRef)>, - rel: &super::ast::RelPattern, - 
node: &NodePattern, - ) -> Result, CypherError> { - let rel_types = resolve_rel_types(rel)?; - let mut next = Vec::new(); - - for (row, current) in working { - let targets = self.step_targets(current, &rel_types, rel.direction, rel.length.as_ref()); - for target in targets { - if !self.node_matches(target, node) { - continue; - } - if let Some(var) = &node.var - && let Some(existing) = row.get(var) - && *existing != target - { - continue; - } - let mut new_row = row.clone(); - if let Some(var) = &node.var { - new_row.insert(var.clone(), target); - } - next.push((new_row, target)); - } - } - - Ok(next) - } - - fn step_targets( - &mut self, - node: NodeRef, - rel_types: &[RelType], - direction: Direction, - length: Option<&super::ast::VarLength>, - ) -> Vec { - if let Some(var_length) = length { - return self.var_length_targets(node, rel_types, direction, var_length); - } - - let mut seen = HashSet::new(); - let mut targets = Vec::new(); - for rel in rel_types { - for target in self.step_once(node, *rel, direction) { - if seen.insert(target) { - targets.push(target); - } - } - } - targets - } - - fn step_once(&mut self, node: NodeRef, rel: RelType, direction: Direction) -> Vec { - match direction { - Direction::Outgoing => schema::expand_out(self.graph, node, rel), - Direction::Incoming => self.incoming(node, rel), - Direction::Both => { - let mut seen = HashSet::new(); - let mut targets = Vec::new(); - for target in schema::expand_out(self.graph, node, rel) - .into_iter() - .chain(self.incoming(node, rel)) - { - if seen.insert(target) { - targets.push(target); - } - } - targets - } - } - } - - fn incoming(&mut self, node: NodeRef, rel: RelType) -> Vec { - if !self.reverse_cache.contains_key(&rel) { - let mut reverse: HashMap> = HashMap::new(); - for source in schema::rel_source_nodes(self.graph, rel) { - for target in schema::expand_out(self.graph, source, rel) { - reverse.entry(target).or_default().push(source); - } - } - self.reverse_cache.insert(rel, reverse); - 
} - - self.reverse_cache - .get(&rel) - .and_then(|reverse| reverse.get(&node)) - .cloned() - .unwrap_or_default() - } - - fn var_length_targets( - &mut self, - start: NodeRef, - rel_types: &[RelType], - direction: Direction, - var_length: &super::ast::VarLength, - ) -> Vec { - let max = var_length.max.unwrap_or(u32::MAX); - let mut results = Vec::new(); - let mut result_seen = HashSet::new(); - - if var_length.min == 0 { - results.push(start); - result_seen.insert(start); - } - - let mut visited = HashSet::new(); - visited.insert(start); - let mut frontier = vec![start]; - let mut depth = 0u32; - - while depth < max && !frontier.is_empty() { - depth += 1; - let mut next = Vec::new(); - for node in &frontier { - for rel in rel_types { - for target in self.step_once(*node, *rel, direction) { - if visited.insert(target) { - next.push(target); - if depth >= var_length.min && result_seen.insert(target) { - results.push(target); - } - } - } - } - } - frontier = next; - } - - results - } - - fn node_matches(&self, node: NodeRef, pattern: &NodePattern) -> bool { - if !schema::matches_labels(self.graph, node, &pattern.labels) { - return false; - } - self.props_match(node, pattern) - } - - fn props_match(&self, node: NodeRef, pattern: &NodePattern) -> bool { - pattern - .props - .iter() - .all(|(key, literal)| schema::property(self.graph, node, key) == literal_to_value(literal)) - } - - // ---- Expression evaluation ------------------------------------------- - - fn eval_expr(&self, row: &Row, expr: &Expr) -> Result { - match expr { - Expr::Literal(literal) => Ok(literal_to_value(literal)), - Expr::Var(name) => Ok(row.get(name).map_or(CypherValue::Null, |node| self.node_value(*node))), - Expr::Property(var, prop) => Ok(row - .get(var) - .map_or(CypherValue::Null, |node| schema::property(self.graph, *node, prop))), - Expr::Not(inner) => Ok(CypherValue::Bool(!self.eval_expr(row, inner)?.is_truthy())), - Expr::And(a, b) => Ok(CypherValue::Bool( - self.eval_expr(row, 
a)?.is_truthy() && self.eval_expr(row, b)?.is_truthy(), - )), - Expr::Or(a, b) => Ok(CypherValue::Bool( - self.eval_expr(row, a)?.is_truthy() || self.eval_expr(row, b)?.is_truthy(), - )), - Expr::Compare(a, op, b) => { - let left = self.eval_expr(row, a)?; - let right = self.eval_expr(row, b)?; - Ok(CypherValue::Bool(compare_values(&left, *op, &right))) - } - Expr::Aggregate { .. } => Err(CypherError::execution("aggregate functions are only allowed in RETURN")), - } - } - - fn node_value(&self, node: NodeRef) -> CypherValue { - CypherValue::Node { - label: schema::node_label(self.graph, node), - name: schema::node_name(self.graph, node), - } - } - - // ---- Projection ------------------------------------------------------ - - fn project(&self, query: &Query, rows: &[Row]) -> Result { - let items = &query.return_clause.items; - let columns: Vec = items.iter().map(ReturnItem::column_name).collect(); - - let has_aggregate = items.iter().any(|item| item.expr.contains_aggregate()); - - let mut values = if has_aggregate { - self.project_aggregated(query, rows)? - } else { - self.project_simple(query, rows)? - }; - - if query.return_clause.distinct { - dedupe(&mut values); - } - - apply_order_skip_limit(query, &mut values, &columns)?; - - Ok(ResultSet { columns, rows: values }) - } - - fn project_simple(&self, query: &Query, rows: &[Row]) -> Result>, CypherError> { - let items = &query.return_clause.items; - let mut output = Vec::with_capacity(rows.len()); - for row in rows { - let mut values = Vec::with_capacity(items.len()); - for item in items { - values.push(self.eval_expr(row, &item.expr)?); - } - output.push(values); - } - Ok(output) - } - - fn project_aggregated(&self, query: &Query, rows: &[Row]) -> Result>, CypherError> { - let items = &query.return_clause.items; - - // Group rows by the values of the non-aggregate (grouping) return items. 
- let mut group_order: Vec> = Vec::new(); - let mut groups: HashMap, Vec> = HashMap::new(); - - for (index, row) in rows.iter().enumerate() { - let mut key = Vec::new(); - for item in items { - if !item.expr.contains_aggregate() { - key.push(self.eval_expr(row, &item.expr)?); - } - } - if !groups.contains_key(&key) { - group_order.push(key.clone()); - } - groups.entry(key).or_default().push(index); - } - - // With no grouping keys and no input rows, aggregates still produce a single row. - let grouping_keys = items.iter().filter(|item| !item.expr.contains_aggregate()).count(); - if group_order.is_empty() && grouping_keys == 0 { - group_order.push(Vec::new()); - groups.insert(Vec::new(), Vec::new()); - } - - let mut output = Vec::with_capacity(group_order.len()); - for key in group_order { - let row_indices = &groups[&key]; - let group_rows: Vec<&Row> = row_indices.iter().map(|index| &rows[*index]).collect(); - - let mut values = Vec::with_capacity(items.len()); - let mut key_iter = key.iter(); - for item in items { - if item.expr.contains_aggregate() { - values.push(self.eval_aggregate(&item.expr, &group_rows)?); - } else { - values.push(key_iter.next().cloned().unwrap_or(CypherValue::Null)); - } - } - output.push(values); - } - - Ok(output) - } - - fn eval_aggregate(&self, expr: &Expr, group: &[&Row]) -> Result { - let Expr::Aggregate { func, arg, distinct } = expr else { - return Err(CypherError::execution("expected an aggregate function")); - }; - - // count(*) does not evaluate an argument. 
- if *func == AggFn::Count && arg.is_none() { - return Ok(CypherValue::Int(i64::try_from(group.len()).unwrap_or(i64::MAX))); - } - - let arg_expr = arg - .as_ref() - .ok_or_else(|| CypherError::execution("aggregate function requires an argument"))?; - - let mut values = Vec::new(); - for row in group { - let value = self.eval_expr(row, arg_expr)?; - if value != CypherValue::Null { - values.push(value); - } - } - - if *distinct { - values = dedupe_values(values); - } - - Ok(match func { - AggFn::Count => CypherValue::Int(i64::try_from(values.len()).unwrap_or(i64::MAX)), - AggFn::Collect => CypherValue::List(values), - AggFn::Min => values - .into_iter() - .min_by(CypherValue::total_cmp) - .unwrap_or(CypherValue::Null), - AggFn::Max => values - .into_iter() - .max_by(CypherValue::total_cmp) - .unwrap_or(CypherValue::Null), - AggFn::Sum => CypherValue::Int(values.iter().filter_map(CypherValue::as_int).sum()), - AggFn::Avg => { - let numbers: Vec = values.iter().filter_map(CypherValue::as_int).collect(); - if numbers.is_empty() { - CypherValue::Null - } else { - CypherValue::Int(numbers.iter().sum::() / i64::try_from(numbers.len()).unwrap_or(1)) - } - } - }) - } -} - -fn apply_order_skip_limit( - query: &Query, - values: &mut Vec>, - columns: &[String], -) -> Result<(), CypherError> { - // ORDER BY operates on the projected value rows: each ORDER BY expression must resolve to - // a RETURN column (by identical expression or by naming a column/alias). 
- if !query.order_by.is_empty() { - let mut keys: Vec = Vec::with_capacity(query.order_by.len()); - for item in &query.order_by { - keys.push(resolve_order_column(item, &query.return_clause.items, columns)?); - } - - values.sort_by(|a, b| { - for (key_index, order_item) in keys.iter().zip(&query.order_by) { - let ordering = a[*key_index].total_cmp(&b[*key_index]); - let ordering = if order_item.descending { - ordering.reverse() - } else { - ordering - }; - if ordering != std::cmp::Ordering::Equal { - return ordering; - } - } - std::cmp::Ordering::Equal - }); - } - - if let Some(skip) = query.skip { - if skip >= values.len() { - values.clear(); - } else { - values.drain(0..skip); - } - } - - if let Some(limit) = query.limit - && values.len() > limit - { - values.truncate(limit); - } - - Ok(()) -} - -fn resolve_order_column( - order_item: &OrderItem, - items: &[ReturnItem], - columns: &[String], -) -> Result { - // Match by identical return expression first. - if let Some(index) = items.iter().position(|item| item.expr == order_item.expr) { - return Ok(index); - } - - // Otherwise, a bare variable in ORDER BY may name a return column or alias. 
- if let Expr::Var(name) = &order_item.expr - && let Some(index) = columns.iter().position(|column| column == name) - { - return Ok(index); - } - - Err(CypherError::execution(format!( - "ORDER BY expression `{}` must also appear in RETURN", - order_item.expr.display_name() - ))) -} - -fn resolve_rel_types(rel: &super::ast::RelPattern) -> Result, CypherError> { - if rel.types.is_empty() { - return Ok(RelType::all().to_vec()); - } - - let mut types = Vec::with_capacity(rel.types.len()); - for name in &rel.types { - let rel_type = RelType::parse(name) - .ok_or_else(|| CypherError::execution(format!("unknown relationship type `{name}`")))?; - types.push(rel_type); - } - Ok(types) -} - -fn literal_to_value(literal: &Literal) -> CypherValue { - match literal { - Literal::Str(value) => CypherValue::Str(value.clone()), - Literal::Int(value) => CypherValue::Int(*value), - Literal::Bool(value) => CypherValue::Bool(*value), - Literal::Null => CypherValue::Null, - } -} - -fn compare_values(left: &CypherValue, op: CmpOp, right: &CypherValue) -> bool { - if matches!(left, CypherValue::Null) || matches!(right, CypherValue::Null) { - return false; - } - - match op { - CmpOp::Eq => values_equal(left, right), - CmpOp::Neq => !values_equal(left, right), - CmpOp::Lt | CmpOp::Lte | CmpOp::Gt | CmpOp::Gte => { - if !same_type(left, right) { - return false; - } - let ordering = left.total_cmp(right); - match op { - CmpOp::Lt => ordering.is_lt(), - CmpOp::Lte => ordering.is_le(), - CmpOp::Gt => ordering.is_gt(), - CmpOp::Gte => ordering.is_ge(), - _ => unreachable!(), - } - } - CmpOp::Contains => string_op(left, right, |haystack, needle| haystack.contains(needle)), - CmpOp::StartsWith => string_op(left, right, |haystack, needle| haystack.starts_with(needle)), - CmpOp::EndsWith => string_op(left, right, |haystack, needle| haystack.ends_with(needle)), - } -} - -fn values_equal(left: &CypherValue, right: &CypherValue) -> bool { - same_type(left, right) && left == right -} - -fn 
same_type(left: &CypherValue, right: &CypherValue) -> bool { - matches!( - (left, right), - (CypherValue::Bool(_), CypherValue::Bool(_)) - | (CypherValue::Int(_), CypherValue::Int(_)) - | (CypherValue::Str(_), CypherValue::Str(_)) - | (CypherValue::Node { .. }, CypherValue::Node { .. }) - | (CypherValue::List(_), CypherValue::List(_)) - ) -} - -fn string_op(left: &CypherValue, right: &CypherValue, op: impl Fn(&str, &str) -> bool) -> bool { - match (left.as_str(), right.as_str()) { - (Some(haystack), Some(needle)) => op(haystack, needle), - _ => false, - } -} - -fn dedupe(output: &mut Vec>) { - let mut seen: Vec> = Vec::new(); - output.retain(|values| { - if seen.iter().any(|existing| existing == values) { - false - } else { - seen.push(values.clone()); - true - } - }); -} - -fn dedupe_values(values: Vec) -> Vec { - let mut result: Vec = Vec::new(); - for value in values { - if !result.contains(&value) { - result.push(value); - } - } - result -} diff --git a/rust/rubydex/src/query/cypher/format.rs b/rust/rubydex/src/query/cypher/format.rs deleted file mode 100644 index 6104f0952..000000000 --- a/rust/rubydex/src/query/cypher/format.rs +++ /dev/null @@ -1,104 +0,0 @@ -use std::fmt::Write; - -use super::executor::ResultSet; -use super::value::{CypherValue, write_json_string}; - -/// The output format for query results. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum OutputFormat { - Table, - Json, -} - -/// Renders a result set in the requested format. 
-#[must_use] -pub fn format(result: &ResultSet, format: OutputFormat) -> String { - match format { - OutputFormat::Table => format_table(result), - OutputFormat::Json => format_json(result), - } -} - -fn format_table(result: &ResultSet) -> String { - if result.columns.is_empty() { - return String::new(); - } - - let rendered: Vec> = result - .rows - .iter() - .map(|row| row.iter().map(CypherValue::to_display_string).collect()) - .collect(); - - let mut widths: Vec = result.columns.iter().map(String::len).collect(); - for row in &rendered { - for (index, cell) in row.iter().enumerate() { - if let Some(width) = widths.get_mut(index) { - *width = (*width).max(cell.chars().count()); - } - } - } - - let mut output = String::new(); - push_row(&mut output, &result.columns, &widths); - push_separator(&mut output, &widths); - for row in &rendered { - push_row(&mut output, row, &widths); - } - - let count = result.rows.len(); - let suffix = if count == 1 { "row" } else { "rows" }; - let _ = write!(output, "\n{count} {suffix}\n"); - output -} - -fn push_row(output: &mut String, cells: &[String], widths: &[usize]) { - for (index, width) in widths.iter().enumerate() { - if index > 0 { - output.push_str(" | "); - } - let empty = String::new(); - let cell = cells.get(index).unwrap_or(&empty); - let pad = width.saturating_sub(cell.chars().count()); - output.push_str(cell); - for _ in 0..pad { - output.push(' '); - } - } - output.push('\n'); -} - -fn push_separator(output: &mut String, widths: &[usize]) { - for (index, width) in widths.iter().enumerate() { - if index > 0 { - output.push_str("-+-"); - } - for _ in 0..*width { - output.push('-'); - } - } - output.push('\n'); -} - -fn format_json(result: &ResultSet) -> String { - let mut output = String::from("["); - for (row_index, row) in result.rows.iter().enumerate() { - if row_index > 0 { - output.push(','); - } - output.push('{'); - for (column_index, column) in result.columns.iter().enumerate() { - if column_index > 0 { - 
output.push(','); - } - write_json_string(&mut output, column); - output.push(':'); - row.get(column_index) - .unwrap_or(&CypherValue::Null) - .write_json(&mut output); - } - output.push('}'); - } - output.push(']'); - output -} diff --git a/rust/rubydex/src/query/cypher/lexer.rs b/rust/rubydex/src/query/cypher/lexer.rs deleted file mode 100644 index de2fb24c9..000000000 --- a/rust/rubydex/src/query/cypher/lexer.rs +++ /dev/null @@ -1,205 +0,0 @@ -use super::error::CypherError; - -/// A lexical token together with the byte position where it starts in the source query. -#[derive(Debug, Clone, PartialEq)] -pub struct Token { - pub kind: TokenKind, - pub position: usize, -} - -#[derive(Debug, Clone, PartialEq)] -pub enum TokenKind { - Ident(String), - Int(i64), - Str(String), - LParen, - RParen, - LBracket, - RBracket, - LBrace, - RBrace, - Comma, - Colon, - Dot, - DotDot, - Star, - Pipe, - Eq, - Neq, - Lt, - Lte, - Gt, - Gte, - Minus, -} - -/// Tokenizes a Cypher query string into a flat token stream. -/// -/// # Errors -/// -/// Returns a [`CypherError::Syntax`] if an unterminated string or unexpected character is found. 
-pub fn tokenize(input: &str) -> Result, CypherError> { - let chars: Vec = input.chars().collect(); - let mut tokens = Vec::new(); - let mut index = 0; - - while index < chars.len() { - let ch = chars[index]; - - if ch.is_whitespace() { - index += 1; - continue; - } - - let start = index; - - match ch { - '(' => push(&mut tokens, TokenKind::LParen, start, &mut index), - ')' => push(&mut tokens, TokenKind::RParen, start, &mut index), - '[' => push(&mut tokens, TokenKind::LBracket, start, &mut index), - ']' => push(&mut tokens, TokenKind::RBracket, start, &mut index), - '{' => push(&mut tokens, TokenKind::LBrace, start, &mut index), - '}' => push(&mut tokens, TokenKind::RBrace, start, &mut index), - ',' => push(&mut tokens, TokenKind::Comma, start, &mut index), - ':' => push(&mut tokens, TokenKind::Colon, start, &mut index), - '*' => push(&mut tokens, TokenKind::Star, start, &mut index), - '|' => push(&mut tokens, TokenKind::Pipe, start, &mut index), - '-' => push(&mut tokens, TokenKind::Minus, start, &mut index), - '=' => push(&mut tokens, TokenKind::Eq, start, &mut index), - '.' 
=> { - if chars.get(index + 1) == Some(&'.') { - tokens.push(Token { - kind: TokenKind::DotDot, - position: start, - }); - index += 2; - } else { - push(&mut tokens, TokenKind::Dot, start, &mut index); - } - } - '<' => { - if chars.get(index + 1) == Some(&'=') { - tokens.push(Token { - kind: TokenKind::Lte, - position: start, - }); - index += 2; - } else if chars.get(index + 1) == Some(&'>') { - tokens.push(Token { - kind: TokenKind::Neq, - position: start, - }); - index += 2; - } else { - push(&mut tokens, TokenKind::Lt, start, &mut index); - } - } - '>' => { - if chars.get(index + 1) == Some(&'=') { - tokens.push(Token { - kind: TokenKind::Gte, - position: start, - }); - index += 2; - } else { - push(&mut tokens, TokenKind::Gt, start, &mut index); - } - } - '\'' | '"' => { - let (value, next) = lex_string(&chars, index, ch)?; - tokens.push(Token { - kind: TokenKind::Str(value), - position: start, - }); - index = next; - } - c if c.is_ascii_digit() => { - let (value, next) = lex_number(&chars, index)?; - tokens.push(Token { - kind: TokenKind::Int(value), - position: start, - }); - index = next; - } - c if is_ident_start(c) => { - let (value, next) = lex_ident(&chars, index); - tokens.push(Token { - kind: TokenKind::Ident(value), - position: start, - }); - index = next; - } - other => { - return Err(CypherError::syntax(format!("unexpected character `{other}`"), start)); - } - } - } - - Ok(tokens) -} - -fn push(tokens: &mut Vec, kind: TokenKind, position: usize, index: &mut usize) { - tokens.push(Token { kind, position }); - *index += 1; -} - -fn lex_string(chars: &[char], start: usize, quote: char) -> Result<(String, usize), CypherError> { - let mut value = String::new(); - let mut index = start + 1; - - while index < chars.len() { - let ch = chars[index]; - if ch == '\\' { - if let Some(&next) = chars.get(index + 1) { - match next { - 'n' => value.push('\n'), - 't' => value.push('\t'), - 'r' => value.push('\r'), - '\\' => value.push('\\'), - '\'' => 
value.push('\''), - '"' => value.push('"'), - other => value.push(other), - } - index += 2; - continue; - } - return Err(CypherError::syntax("unterminated escape in string literal", index)); - } - if ch == quote { - return Ok((value, index + 1)); - } - value.push(ch); - index += 1; - } - - Err(CypherError::syntax("unterminated string literal", start)) -} - -fn lex_number(chars: &[char], start: usize) -> Result<(i64, usize), CypherError> { - let mut index = start; - while index < chars.len() && chars[index].is_ascii_digit() { - index += 1; - } - let text: String = chars[start..index].iter().collect(); - let value = text - .parse::() - .map_err(|_| CypherError::syntax(format!("invalid integer `{text}`"), start))?; - Ok((value, index)) -} - -fn lex_ident(chars: &[char], start: usize) -> (String, usize) { - let mut index = start; - while index < chars.len() && is_ident_continue(chars[index]) { - index += 1; - } - let text: String = chars[start..index].iter().collect(); - (text, index) -} - -fn is_ident_start(ch: char) -> bool { - ch.is_ascii_alphabetic() || ch == '_' -} - -fn is_ident_continue(ch: char) -> bool { - ch.is_ascii_alphanumeric() || ch == '_' -} diff --git a/rust/rubydex/src/query/cypher/mod.rs b/rust/rubydex/src/query/cypher/mod.rs index 978098c93..1e28bc0af 100644 --- a/rust/rubydex/src/query/cypher/mod.rs +++ b/rust/rubydex/src/query/cypher/mod.rs @@ -13,19 +13,13 @@ //! //! See [`schema`] for the node labels and relationship types exposed to queries. -pub mod ast; -pub mod error; -pub mod executor; -pub mod format; -pub mod lexer; -pub mod parser; +// The whole Cypher engine — lexer, parser, AST, executor, values, and formatting — lives in the +// graph-independent `cypher-parser` crate. rubydex only provides the `GraphProvider` mapping for its +// `Graph` (in `schema`) and the static schema description (in `schema_info`). 
+pub use cypher_parser::{CypherError, OutputFormat}; + pub mod schema; pub mod schema_info; -pub mod value; - -pub use error::CypherError; -pub use executor::ResultSet; -pub use format::OutputFormat; use crate::model::graph::Graph; @@ -35,9 +29,7 @@ use crate::model::graph::Graph; /// /// Returns a [`CypherError`] if the query cannot be parsed or executed. pub fn run_query(graph: &Graph, query: &str, output_format: OutputFormat) -> Result { - let parsed = parser::parse(query)?; - let result = executor::execute(graph, &parsed)?; - Ok(format::format(&result, output_format)) + cypher_parser::run_query(graph, query, output_format) } /// Returns a description of the queryable schema (node labels, relationship types, and properties) diff --git a/rust/rubydex/src/query/cypher/parser.rs b/rust/rubydex/src/query/cypher/parser.rs deleted file mode 100644 index b08182649..000000000 --- a/rust/rubydex/src/query/cypher/parser.rs +++ /dev/null @@ -1,548 +0,0 @@ -use super::ast::{ - AggFn, CmpOp, Direction, Expr, Literal, NodePattern, OrderItem, PathPattern, Query, RelPattern, Return, ReturnItem, - VarLength, -}; -use super::error::CypherError; -use super::lexer::{Token, TokenKind, tokenize}; - -/// Parses a Cypher query string into a [`Query`] AST. -/// -/// # Errors -/// -/// Returns a [`CypherError::Syntax`] on any lexical or grammatical error. 
-pub fn parse(input: &str) -> Result { - let tokens = tokenize(input)?; - let mut parser = Parser { - tokens, - position: 0, - source_len: input.len(), - }; - let query = parser.parse_query()?; - if let Some(token) = parser.peek() { - return Err(CypherError::syntax("unexpected trailing input", token.position)); - } - Ok(query) -} - -struct Parser { - tokens: Vec, - position: usize, - source_len: usize, -} - -impl Parser { - fn peek(&self) -> Option<&Token> { - self.tokens.get(self.position) - } - - fn peek_kind(&self) -> Option<&TokenKind> { - self.tokens.get(self.position).map(|t| &t.kind) - } - - fn current_position(&self) -> usize { - self.tokens - .get(self.position) - .map_or(self.source_len, |token| token.position) - } - - fn advance(&mut self) -> Option { - let token = self.tokens.get(self.position).cloned(); - if token.is_some() { - self.position += 1; - } - token - } - - fn expect(&mut self, kind: &TokenKind, description: &str) -> Result<(), CypherError> { - match self.peek_kind() { - Some(actual) if actual == kind => { - self.position += 1; - Ok(()) - } - _ => Err(CypherError::syntax( - format!("expected {description}"), - self.current_position(), - )), - } - } - - fn at_keyword(&self, keyword: &str) -> bool { - matches!(self.peek_kind(), Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case(keyword)) - } - - fn eat_keyword(&mut self, keyword: &str) -> bool { - if self.at_keyword(keyword) { - self.position += 1; - true - } else { - false - } - } - - fn expect_keyword(&mut self, keyword: &str) -> Result<(), CypherError> { - if self.eat_keyword(keyword) { - Ok(()) - } else { - Err(CypherError::syntax( - format!("expected keyword `{keyword}`"), - self.current_position(), - )) - } - } - - fn expect_ident(&mut self, description: &str) -> Result { - match self.advance() { - Some(Token { - kind: TokenKind::Ident(name), - .. 
- }) => Ok(name), - _ => Err(CypherError::syntax( - format!("expected {description}"), - self.current_position(), - )), - } - } - - fn parse_query(&mut self) -> Result { - self.expect_keyword("MATCH")?; - let patterns = self.parse_patterns()?; - - let where_clause = if self.eat_keyword("WHERE") { - Some(self.parse_expr()?) - } else { - None - }; - - self.expect_keyword("RETURN")?; - let return_clause = self.parse_return()?; - - let mut order_by = Vec::new(); - if self.eat_keyword("ORDER") { - self.expect_keyword("BY")?; - order_by = self.parse_order_by()?; - } - - let skip = if self.eat_keyword("SKIP") { - Some(self.parse_usize()?) - } else { - None - }; - - let limit = if self.eat_keyword("LIMIT") { - Some(self.parse_usize()?) - } else { - None - }; - - Ok(Query { - patterns, - where_clause, - return_clause, - order_by, - skip, - limit, - }) - } - - fn parse_usize(&mut self) -> Result { - match self.advance() { - Some(Token { - kind: TokenKind::Int(value), - position, - }) => usize::try_from(value).map_err(|_| CypherError::syntax("expected a non-negative integer", position)), - _ => Err(CypherError::syntax("expected an integer", self.current_position())), - } - } - - fn parse_patterns(&mut self) -> Result, CypherError> { - let mut patterns = vec![self.parse_path_pattern()?]; - while matches!(self.peek_kind(), Some(TokenKind::Comma)) { - self.position += 1; - patterns.push(self.parse_path_pattern()?); - } - Ok(patterns) - } - - fn parse_path_pattern(&mut self) -> Result { - let start = self.parse_node_pattern()?; - let mut rest = Vec::new(); - while self.at_relationship_start() { - let rel = self.parse_rel_pattern()?; - let node = self.parse_node_pattern()?; - rest.push((rel, node)); - } - Ok(PathPattern { start, rest }) - } - - fn at_relationship_start(&self) -> bool { - matches!(self.peek_kind(), Some(TokenKind::Minus | TokenKind::Lt)) - } - - fn parse_node_pattern(&mut self) -> Result { - self.expect(&TokenKind::LParen, "`(` to start a node pattern")?; - - let 
var = match self.peek_kind() { - Some(TokenKind::Ident(name)) => { - let name = name.clone(); - self.position += 1; - Some(name) - } - _ => None, - }; - - let mut labels = Vec::new(); - if matches!(self.peek_kind(), Some(TokenKind::Colon)) { - self.position += 1; - labels.push(self.expect_ident("a node label after `:`")?); - while matches!(self.peek_kind(), Some(TokenKind::Pipe)) { - self.position += 1; - labels.push(self.expect_ident("a node label after `|`")?); - } - } - - let props = if matches!(self.peek_kind(), Some(TokenKind::LBrace)) { - self.parse_prop_map()? - } else { - Vec::new() - }; - - self.expect(&TokenKind::RParen, "`)` to close a node pattern")?; - - Ok(NodePattern { var, labels, props }) - } - - fn parse_prop_map(&mut self) -> Result, CypherError> { - self.expect(&TokenKind::LBrace, "`{`")?; - let mut props = Vec::new(); - - if !matches!(self.peek_kind(), Some(TokenKind::RBrace)) { - loop { - let key = self.expect_ident("a property name")?; - self.expect(&TokenKind::Colon, "`:` after property name")?; - let value = self.parse_literal()?; - props.push((key, value)); - - if matches!(self.peek_kind(), Some(TokenKind::Comma)) { - self.position += 1; - } else { - break; - } - } - } - - self.expect(&TokenKind::RBrace, "`}` to close a property map")?; - Ok(props) - } - - fn parse_rel_pattern(&mut self) -> Result { - let leading_in = matches!(self.peek_kind(), Some(TokenKind::Lt)); - if leading_in { - self.position += 1; - } - self.expect(&TokenKind::Minus, "`-` in relationship pattern")?; - - let mut var = None; - let mut types = Vec::new(); - let mut length = None; - - if matches!(self.peek_kind(), Some(TokenKind::LBracket)) { - self.position += 1; - - if let Some(TokenKind::Ident(name)) = self.peek_kind() { - var = Some(name.clone()); - self.position += 1; - } - - if matches!(self.peek_kind(), Some(TokenKind::Colon)) { - self.position += 1; - types.push(self.expect_ident("a relationship type after `:`")?); - while matches!(self.peek_kind(), 
Some(TokenKind::Pipe)) { - self.position += 1; - types.push(self.expect_ident("a relationship type after `|`")?); - } - } - - if matches!(self.peek_kind(), Some(TokenKind::Star)) { - self.position += 1; - length = Some(self.parse_var_length()?); - } - - self.expect(&TokenKind::RBracket, "`]` to close a relationship pattern")?; - } - - self.expect(&TokenKind::Minus, "`-` in relationship pattern")?; - let trailing_out = matches!(self.peek_kind(), Some(TokenKind::Gt)); - if trailing_out { - self.position += 1; - } - - let direction = match (leading_in, trailing_out) { - (true, false) => Direction::Incoming, - (false, true) => Direction::Outgoing, - (false, false) => Direction::Both, - (true, true) => { - return Err(CypherError::syntax( - "a relationship cannot point in both directions", - self.current_position(), - )); - } - }; - - Ok(RelPattern { - var, - types, - direction, - length, - }) - } - - fn parse_var_length(&mut self) -> Result { - let mut min = 1; - let mut max = None; - - if let Some(TokenKind::Int(value)) = self.peek_kind() { - let lower = u32::try_from(*value) - .map_err(|_| CypherError::syntax("variable-length bound is too large", self.current_position()))?; - self.position += 1; - - if matches!(self.peek_kind(), Some(TokenKind::DotDot)) { - self.position += 1; - min = lower; - max = self.parse_optional_length_bound()?; - } else { - // `*n` means exactly n hops. 
- min = lower; - max = Some(lower); - } - } else if matches!(self.peek_kind(), Some(TokenKind::DotDot)) { - self.position += 1; - max = self.parse_optional_length_bound()?; - } - - Ok(VarLength { min, max }) - } - - fn parse_optional_length_bound(&mut self) -> Result, CypherError> { - if let Some(TokenKind::Int(value)) = self.peek_kind() { - let upper = u32::try_from(*value) - .map_err(|_| CypherError::syntax("variable-length bound is too large", self.current_position()))?; - self.position += 1; - Ok(Some(upper)) - } else { - Ok(None) - } - } - - fn parse_return(&mut self) -> Result { - let distinct = self.eat_keyword("DISTINCT"); - let mut items = vec![self.parse_return_item()?]; - while matches!(self.peek_kind(), Some(TokenKind::Comma)) { - self.position += 1; - items.push(self.parse_return_item()?); - } - Ok(Return { distinct, items }) - } - - fn parse_return_item(&mut self) -> Result { - let expr = self.parse_expr()?; - let alias = if self.eat_keyword("AS") { - Some(self.expect_ident("an alias after `AS`")?) - } else { - None - }; - Ok(ReturnItem { expr, alias }) - } - - fn parse_order_by(&mut self) -> Result, CypherError> { - let mut items = vec![self.parse_order_item()?]; - while matches!(self.peek_kind(), Some(TokenKind::Comma)) { - self.position += 1; - items.push(self.parse_order_item()?); - } - Ok(items) - } - - fn parse_order_item(&mut self) -> Result { - let expr = self.parse_expr()?; - let descending = if self.eat_keyword("DESC") { - true - } else { - // ASC is the default and optional. 
- let _ = self.eat_keyword("ASC"); - false - }; - Ok(OrderItem { expr, descending }) - } - - fn parse_expr(&mut self) -> Result { - self.parse_or() - } - - fn parse_or(&mut self) -> Result { - let mut left = self.parse_and()?; - while self.eat_keyword("OR") { - let right = self.parse_and()?; - left = Expr::Or(Box::new(left), Box::new(right)); - } - Ok(left) - } - - fn parse_and(&mut self) -> Result { - let mut left = self.parse_not()?; - while self.eat_keyword("AND") { - let right = self.parse_not()?; - left = Expr::And(Box::new(left), Box::new(right)); - } - Ok(left) - } - - fn parse_not(&mut self) -> Result { - if self.eat_keyword("NOT") { - let inner = self.parse_not()?; - Ok(Expr::Not(Box::new(inner))) - } else { - self.parse_comparison() - } - } - - fn parse_comparison(&mut self) -> Result { - let left = self.parse_primary()?; - if let Some(op) = self.parse_comparison_op()? { - let right = self.parse_primary()?; - Ok(Expr::Compare(Box::new(left), op, Box::new(right))) - } else { - Ok(left) - } - } - - fn parse_comparison_op(&mut self) -> Result, CypherError> { - let op = match self.peek_kind() { - Some(TokenKind::Eq) => CmpOp::Eq, - Some(TokenKind::Neq) => CmpOp::Neq, - Some(TokenKind::Lt) => CmpOp::Lt, - Some(TokenKind::Lte) => CmpOp::Lte, - Some(TokenKind::Gt) => CmpOp::Gt, - Some(TokenKind::Gte) => CmpOp::Gte, - Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case("CONTAINS") => { - self.position += 1; - return Ok(Some(CmpOp::Contains)); - } - Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case("STARTS") => { - self.position += 1; - self.expect_keyword("WITH")?; - return Ok(Some(CmpOp::StartsWith)); - } - Some(TokenKind::Ident(name)) if name.eq_ignore_ascii_case("ENDS") => { - self.position += 1; - self.expect_keyword("WITH")?; - return Ok(Some(CmpOp::EndsWith)); - } - _ => return Ok(None), - }; - self.position += 1; - Ok(Some(op)) - } - - fn parse_primary(&mut self) -> Result { - match self.peek_kind() { - Some(TokenKind::LParen) => { - 
self.position += 1; - let expr = self.parse_or()?; - self.expect(&TokenKind::RParen, "`)` to close a grouped expression")?; - Ok(expr) - } - Some(TokenKind::Str(_) | TokenKind::Int(_)) => Ok(Expr::Literal(self.parse_literal()?)), - Some(TokenKind::Ident(name)) => { - let name = name.clone(); - if name.eq_ignore_ascii_case("TRUE") - || name.eq_ignore_ascii_case("FALSE") - || name.eq_ignore_ascii_case("NULL") - { - return Ok(Expr::Literal(self.parse_literal()?)); - } - if let Some(func) = aggregate_function(&name) - && matches!( - self.tokens.get(self.position + 1).map(|t| &t.kind), - Some(TokenKind::LParen) - ) - { - return self.parse_aggregate(func); - } - self.position += 1; - if matches!(self.peek_kind(), Some(TokenKind::Dot)) { - self.position += 1; - let prop = self.expect_ident("a property name after `.`")?; - Ok(Expr::Property(name, prop)) - } else { - Ok(Expr::Var(name)) - } - } - _ => Err(CypherError::syntax("expected an expression", self.current_position())), - } - } - - fn parse_aggregate(&mut self, func: AggFn) -> Result { - self.position += 1; // function name - self.expect(&TokenKind::LParen, "`(` after aggregate function")?; - let distinct = self.eat_keyword("DISTINCT"); - - let arg = if matches!(self.peek_kind(), Some(TokenKind::Star)) { - if func != AggFn::Count { - return Err(CypherError::syntax( - "only count(*) may use `*`", - self.current_position(), - )); - } - self.position += 1; - None - } else { - Some(Box::new(self.parse_or()?)) - }; - - self.expect(&TokenKind::RParen, "`)` to close aggregate function")?; - Ok(Expr::Aggregate { func, arg, distinct }) - } - - fn parse_literal(&mut self) -> Result { - match self.advance() { - Some(Token { - kind: TokenKind::Str(value), - .. - }) => Ok(Literal::Str(value)), - Some(Token { - kind: TokenKind::Int(value), - .. 
- }) => Ok(Literal::Int(value)), - Some(Token { - kind: TokenKind::Ident(name), - position, - }) => { - if name.eq_ignore_ascii_case("true") { - Ok(Literal::Bool(true)) - } else if name.eq_ignore_ascii_case("false") { - Ok(Literal::Bool(false)) - } else if name.eq_ignore_ascii_case("null") { - Ok(Literal::Null) - } else { - Err(CypherError::syntax( - format!("expected a literal, found `{name}`"), - position, - )) - } - } - _ => Err(CypherError::syntax("expected a literal value", self.current_position())), - } - } -} - -fn aggregate_function(name: &str) -> Option { - match name.to_ascii_lowercase().as_str() { - "count" => Some(AggFn::Count), - "collect" => Some(AggFn::Collect), - "min" => Some(AggFn::Min), - "max" => Some(AggFn::Max), - "sum" => Some(AggFn::Sum), - "avg" => Some(AggFn::Avg), - _ => None, - } -} diff --git a/rust/rubydex/src/query/cypher/schema.rs b/rust/rubydex/src/query/cypher/schema.rs index 02b024264..ee3237367 100644 --- a/rust/rubydex/src/query/cypher/schema.rs +++ b/rust/rubydex/src/query/cypher/schema.rs @@ -26,7 +26,7 @@ use crate::model::definitions::{Definition, Mixin}; use crate::model::graph::Graph; use crate::model::ids::{ConstantReferenceId, DeclarationId, DefinitionId, UriId}; -use super::value::CypherValue; +use cypher_parser::{CypherValue, GraphProvider}; /// A handle to a node in the graph. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -89,6 +89,62 @@ impl RelType { RelType::References, ] } + + /// The canonical uppercase name of this relationship type. 
+ #[must_use] + pub fn name(self) -> &'static str { + match self { + RelType::Defines => "DEFINES", + RelType::Declares => "DECLARES", + RelType::Contains => "CONTAINS", + RelType::Inherits => "INHERITS", + RelType::Includes => "INCLUDES", + RelType::Prepends => "PREPENDS", + RelType::Extends => "EXTENDS", + RelType::Owns => "OWNS", + RelType::Ancestor => "ANCESTOR", + RelType::Descendant => "DESCENDANT", + RelType::References => "REFERENCES", + } + } +} + +/// Exposes the rubydex [`Graph`] to the `cypher-parser` executor as a property graph. This is the +/// rubydex-specific mapping; the executor itself is generic over this trait. +impl GraphProvider for Graph { + type NodeId = NodeRef; + + fn scan(&self, labels: &[String]) -> Vec { + scan(self, labels) + } + + fn matches_label(&self, node: NodeRef, label: &str) -> bool { + matches_label(self, node, label) + } + + fn relationship_types(&self) -> Vec { + RelType::all().iter().map(|rel| rel.name().to_string()).collect() + } + + fn expand(&self, node: NodeRef, rel_type: &str) -> Vec { + RelType::parse(rel_type).map_or_else(Vec::new, |rel| expand_out(self, node, rel)) + } + + fn rel_sources(&self, rel_type: &str) -> Vec { + RelType::parse(rel_type).map_or_else(Vec::new, |rel| rel_source_nodes(self, rel)) + } + + fn property(&self, node: NodeRef, prop: &str) -> CypherValue { + property(self, node, prop) + } + + fn label(&self, node: NodeRef) -> String { + node_label(self, node) + } + + fn name(&self, node: NodeRef) -> String { + node_name(self, node) + } } /// Returns all nodes matching the given labels. An empty slice matches every node; otherwise a node @@ -129,16 +185,6 @@ fn scan_label(graph: &Graph, label: &str) -> Vec { } } -/// Returns whether a node matches the given labels. An empty slice matches any node; otherwise the -/// node must match **at least one** of the labels. 
-#[must_use] -pub fn matches_labels(graph: &Graph, node: NodeRef, labels: &[String]) -> bool { - if labels.is_empty() { - return true; - } - labels.iter().any(|label| matches_label(graph, node, label)) -} - /// Returns whether a node matches a single label. #[must_use] pub fn matches_label(graph: &Graph, node: NodeRef, label: &str) -> bool { diff --git a/rust/rubydex/src/query/cypher/schema_info.rs b/rust/rubydex/src/query/cypher/schema_info.rs index 2d3f06d73..8a5df4ef7 100644 --- a/rust/rubydex/src/query/cypher/schema_info.rs +++ b/rust/rubydex/src/query/cypher/schema_info.rs @@ -2,8 +2,8 @@ //! relationship types, and node properties that queries can use. This mirrors the mapping //! implemented in [`super::schema`] and is exposed via `--schema` for discoverability. -use super::format::OutputFormat; -use super::value::write_json_string; +use cypher_parser::OutputFormat; +use cypher_parser::value::write_json_string; /// A node label and what graph entity it matches. struct LabelInfo { diff --git a/rust/rubydex/src/query/cypher/tests.rs b/rust/rubydex/src/query/cypher/tests.rs index fa88734b5..3f02a2e45 100644 --- a/rust/rubydex/src/query/cypher/tests.rs +++ b/rust/rubydex/src/query/cypher/tests.rs @@ -1,114 +1,10 @@ -use super::ast::{AggFn, CmpOp, Direction, Expr, Literal}; -use super::executor::{self, ResultSet}; -use super::parser; -use super::value::CypherValue; -use super::{OutputFormat, run_query}; +use super::run_query; use crate::model::graph::Graph; use crate::test_utils::GraphTest; +use cypher_parser::{CypherValue, OutputFormat, ResultSet, execute, parse}; -// ---- Parser tests -------------------------------------------------------- - -#[test] -fn parses_basic_match_return() { - let query = parser::parse("MATCH (c:Class) RETURN c.name").unwrap(); - assert_eq!(query.patterns.len(), 1); - let start = &query.patterns[0].start; - assert_eq!(start.var.as_deref(), Some("c")); - assert_eq!(start.labels, vec!["Class".to_string()]); - 
assert!(query.patterns[0].rest.is_empty()); - assert_eq!(query.return_clause.items.len(), 1); - assert_eq!( - query.return_clause.items[0].expr, - Expr::Property("c".into(), "name".into()) - ); -} - -#[test] -fn parses_label_disjunction() { - let query = parser::parse("MATCH (n:Class|Module) RETURN n").unwrap(); - assert_eq!( - query.patterns[0].start.labels, - vec!["Class".to_string(), "Module".to_string()] - ); -} - -#[test] -fn parses_inline_properties() { - let query = parser::parse("MATCH (c:Class {name: 'Foo'}) RETURN c").unwrap(); - let props = &query.patterns[0].start.props; - assert_eq!(props.len(), 1); - assert_eq!(props[0].0, "name"); - assert_eq!(props[0].1, Literal::Str("Foo".into())); -} - -#[test] -fn parses_relationship_directions() { - let outgoing = parser::parse("MATCH (a)-[:INHERITS]->(b) RETURN a").unwrap(); - assert_eq!(outgoing.patterns[0].rest[0].0.direction, Direction::Outgoing); - assert_eq!(outgoing.patterns[0].rest[0].0.types, vec!["INHERITS".to_string()]); - - let incoming = parser::parse("MATCH (a)<-[:INHERITS]-(b) RETURN a").unwrap(); - assert_eq!(incoming.patterns[0].rest[0].0.direction, Direction::Incoming); - - let both = parser::parse("MATCH (a)-[:INHERITS]-(b) RETURN a").unwrap(); - assert_eq!(both.patterns[0].rest[0].0.direction, Direction::Both); -} - -#[test] -fn parses_variable_length() { - let query = parser::parse("MATCH (a)-[:INHERITS*2..5]->(b) RETURN a").unwrap(); - let length = query.patterns[0].rest[0].0.length.unwrap(); - assert_eq!(length.min, 2); - assert_eq!(length.max, Some(5)); - - let unbounded = parser::parse("MATCH (a)-[:OWNS*]->(b) RETURN a").unwrap(); - let length = unbounded.patterns[0].rest[0].0.length.unwrap(); - assert_eq!(length.min, 1); - assert_eq!(length.max, None); - - let exact = parser::parse("MATCH (a)-[:OWNS*3]->(b) RETURN a").unwrap(); - let length = exact.patterns[0].rest[0].0.length.unwrap(); - assert_eq!(length.min, 3); - assert_eq!(length.max, Some(3)); -} - -#[test] -fn 
parses_aggregation_and_alias() { - let query = parser::parse("MATCH (c:Class) RETURN c.name, count(*) AS total").unwrap(); - assert_eq!(query.return_clause.items[1].alias.as_deref(), Some("total")); - assert_eq!( - query.return_clause.items[1].expr, - Expr::Aggregate { - func: AggFn::Count, - arg: None, - distinct: false, - } - ); -} - -#[test] -fn parses_where_and_order_limit() { - let query = - parser::parse("MATCH (c:Class) WHERE c.name CONTAINS 'Service' RETURN c.name ORDER BY c.name DESC LIMIT 5") - .unwrap(); - let Some(Expr::Compare(_, op, _)) = query.where_clause else { - panic!("expected comparison"); - }; - assert_eq!(op, CmpOp::Contains); - assert_eq!(query.order_by.len(), 1); - assert!(query.order_by[0].descending); - assert_eq!(query.limit, Some(5)); -} - -#[test] -fn rejects_invalid_syntax() { - assert!(parser::parse("MATCH (c:Class RETURN c").is_err()); - assert!(parser::parse("RETURN c").is_err()); - assert!(parser::parse("MATCH (c) RETURN").is_err()); - assert!(parser::parse("MATCH (a)<-[:INHERITS]->(b) RETURN a").is_err()); -} - -// ---- Executor tests ------------------------------------------------------ +// Parser-only tests live in the `cypher-parser` crate. These exercise the executor and the +// end-to-end query/format path against a real graph. 
fn fixture_graph() -> Graph { let mut context = GraphTest::new(); @@ -135,8 +31,8 @@ fn fixture_graph() -> Graph { } fn run(graph: &Graph, query: &str) -> ResultSet { - let parsed = parser::parse(query).unwrap(); - executor::execute(graph, &parsed).unwrap() + let parsed = parse(query).unwrap(); + execute(graph, &parsed).unwrap() } fn column_strings(result: &ResultSet, column: usize) -> Vec<String> { @@ -289,6 +185,6 @@ fn run_query_json_output() { #[test] fn unknown_relationship_type_errors() { let graph = fixture_graph(); - let parsed = parser::parse("MATCH (a)-[:BOGUS]->(b) RETURN a").unwrap(); - assert!(executor::execute(&graph, &parsed).is_err()); + let parsed = parse("MATCH (a)-[:BOGUS]->(b) RETURN a").unwrap(); + assert!(execute(&graph, &parsed).is_err()); } diff --git a/rust/rubydex/src/query/cypher/value.rs b/rust/rubydex/src/query/cypher/value.rs deleted file mode 100644 index b14ddc60f..000000000 --- a/rust/rubydex/src/query/cypher/value.rs +++ /dev/null @@ -1,148 +0,0 @@ -use std::cmp::Ordering; -use std::fmt::Write; - -/// A scalar or composite value produced by evaluating a Cypher expression or projecting a result. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum CypherValue { - Null, - Bool(bool), - Int(i64), - Str(String), - /// A graph node, rendered with its label and primary name. - Node { - label: String, - name: String, - }, - List(Vec<CypherValue>), -} - -impl CypherValue { - /// Returns the truthiness of a value for use in `WHERE` filtering. - /// `NULL` and `false` are falsy; everything else (including bound nodes) is truthy. - #[must_use] - pub fn is_truthy(&self) -> bool { - match self { - CypherValue::Null => false, - CypherValue::Bool(b) => *b, - _ => true, - } - } - - /// Returns a numeric view of the value if it is an integer. - #[must_use] - pub fn as_int(&self) -> Option<i64> { - match self { - CypherValue::Int(i) => Some(*i), - _ => None, - } - } - - /// Returns a string view of the value if it is a string.
- #[must_use] - pub fn as_str(&self) -> Option<&str> { - match self { - CypherValue::Str(s) => Some(s), - _ => None, - } - } - - /// Rank of each variant, used to order values of differing types deterministically. - fn type_rank(&self) -> u8 { - match self { - CypherValue::Null => 0, - CypherValue::Bool(_) => 1, - CypherValue::Int(_) => 2, - CypherValue::Str(_) => 3, - CypherValue::Node { .. } => 4, - CypherValue::List(_) => 5, - } - } - - /// Total ordering across all value types. `NULL` sorts first. Differing types are ordered by - /// their variant rank so that sorting is always deterministic. - #[must_use] - pub fn total_cmp(&self, other: &CypherValue) -> Ordering { - match (self, other) { - (CypherValue::Bool(a), CypherValue::Bool(b)) => a.cmp(b), - (CypherValue::Int(a), CypherValue::Int(b)) => a.cmp(b), - (CypherValue::Str(a), CypherValue::Str(b)) - | (CypherValue::Node { name: a, .. }, CypherValue::Node { name: b, .. }) => a.cmp(b), - (CypherValue::List(a), CypherValue::List(b)) => { - for (x, y) in a.iter().zip(b.iter()) { - let ordering = x.total_cmp(y); - if ordering != Ordering::Equal { - return ordering; - } - } - a.len().cmp(&b.len()) - } - _ => self.type_rank().cmp(&other.type_rank()), - } - } - - /// Renders the value for display in a plain-text table cell. - #[must_use] - pub fn to_display_string(&self) -> String { - match self { - CypherValue::Null => String::new(), - CypherValue::Bool(b) => b.to_string(), - CypherValue::Int(i) => i.to_string(), - CypherValue::Str(s) => s.clone(), - CypherValue::Node { name, .. } => name.clone(), - CypherValue::List(items) => { - let rendered: Vec<String> = items.iter().map(CypherValue::to_display_string).collect(); - format!("[{}]", rendered.join(", ")) - } - } - } - - /// Renders the value as a JSON fragment, appending to `out`.
- pub fn write_json(&self, out: &mut String) { - match self { - CypherValue::Null => out.push_str("null"), - CypherValue::Bool(b) => { - let _ = write!(out, "{b}"); - } - CypherValue::Int(i) => { - let _ = write!(out, "{i}"); - } - CypherValue::Str(s) => write_json_string(out, s), - CypherValue::Node { label, name } => { - out.push_str("{\"label\":"); - write_json_string(out, label); - out.push_str(",\"name\":"); - write_json_string(out, name); - out.push('}'); - } - CypherValue::List(items) => { - out.push('['); - for (index, item) in items.iter().enumerate() { - if index > 0 { - out.push(','); - } - item.write_json(out); - } - out.push(']'); - } - } - } -} - -/// Escapes and quotes a string as a JSON string literal. -pub fn write_json_string(out: &mut String, value: &str) { - out.push('"'); - for ch in value.chars() { - match ch { - '"' => out.push_str("\\\""), - '\\' => out.push_str("\\\\"), - '\n' => out.push_str("\\n"), - '\r' => out.push_str("\\r"), - '\t' => out.push_str("\\t"), - c if (c as u32) < 0x20 => { - let _ = write!(out, "\\u{:04x}", c as u32); - } - c => out.push(c), - } - } - out.push('"'); -} From bc2a231149d62fb0efc87b92ddf9c3441c4f8d50 Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Tue, 23 Jun 2026 22:29:38 +0300 Subject: [PATCH 5/8] Parse Cypher queries into a reusable Query object before building the graph Split query handling into an explicit parse step and a render step so a malformed query fails fast, before the expensive workspace indexing and resolution. - rubydex_cli: parse `--query` up front (exiting on a syntax error before any listing/indexing), then run the pre-parsed query against the graph via cypher::run_parsed. - Gem: add an opaque `Rubydex::Query` object: * `Rubydex::Query.parse(str)` parses without a graph, raising ArgumentError on a syntax error; * `Query#render(graph, format)` runs the parsed query against a graph and returns the formatted output; * `Rubydex::Query.schema(format)` describes the queryable schema. 
Backed by new FFI exports (rdx_cypher_parse, rdx_cypher_query_free, rdx_query_run). The query API now lives entirely on `Rubydex::Query`: the previous `Graph#query` and `Graph.cypher_schema` methods are removed. - exe/rdx: `query` parses first, then builds the graph, then renders the parsed query against it; `schema` uses `Rubydex::Query.schema`. --- exe/rdx | 12 ++- ext/rubydex/graph.c | 124 +++++++++++++++++---------- rbi/rubydex.rbi | 19 ++-- rust/rubydex-sys/src/graph_api.rs | 94 +++++++++++++++----- rust/rubydex/src/main.rs | 15 +++- rust/rubydex/src/query/cypher/mod.rs | 16 +++- test/graph_test.rb | 80 ++++++++++------- 7 files changed, 251 insertions(+), 109 deletions(-) diff --git a/exe/rdx b/exe/rdx index bb5356e3d..8954d86b3 100755 --- a/exe/rdx +++ b/exe/rdx @@ -74,10 +74,18 @@ when "query" abort_with_usage("`query` requires a Cypher query argument") if query.nil? || query.empty? require "rubydex" + + # Parse the query first so a malformed query fails fast, before the expensive workspace indexing. + parsed = begin + Rubydex::Query.parse(query) + rescue ArgumentError => e + abort(e.message) + end + # Progress goes to stderr so stdout carries only the query result (e.g. for piping JSON). graph = build_graph($stderr) begin - print(graph.query(query, format)) + print(parsed.render(graph, format)) rescue ArgumentError => e abort(e.message) end @@ -86,7 +94,7 @@ when "schema" require "rubydex" # The schema is static, so describe it without indexing the workspace. 
- print(Rubydex::Graph.cypher_schema(format)) + print(Rubydex::Query.schema(format)) when "console" OptionParser.new do |parser| parser.banner = "Usage: rdx console" diff --git a/ext/rubydex/graph.c b/ext/rubydex/graph.c index 91a481f58..5e4414e6a 100644 --- a/ext/rubydex/graph.c +++ b/ext/rubydex/graph.c @@ -9,6 +9,7 @@ #include "utils.h" static VALUE cGraph; +static VALUE cQuery; static VALUE mRubydex; static VALUE cKeyword; static VALUE cKeywordParameter; @@ -16,6 +17,9 @@ static VALUE cKeywordParameter; // Interned once in `rdxi_initialize_graph` to avoid repeated symbol-table lookups on hot completion paths. static ID id_self_receiver; +// Coerces an optional format argument (String, Symbol, or nil) to a C string; defined below. +static const char *cypher_format_cstr(VALUE format); + // Extracts the required `self_receiver:` kwarg from `opts`. Returns NULL when the value is `nil`, // which means "no self-type to walk" (e.g., empty class body where the singleton class hasn't // been created). Raises ArgumentError if the kwarg is absent, of the wrong type, or an empty @@ -750,28 +754,84 @@ static VALUE rdxr_graph_keyword(VALUE self, VALUE name) { return rb_class_new_instance(2, argv, cKeyword); } -// Graph#query: (String query, ?(String | Symbol) format) -> String -// Runs a Cypher query against the graph and returns the formatted output. -// `format` may be "table" (default) or "json". Raises ArgumentError on a parse, execution, or -// format error. -static VALUE rdxr_graph_query(int argc, VALUE *argv, VALUE self) { - VALUE query, format; - rb_scan_args(argc, argv, "11", &query, &format); +// Rubydex::Query.schema(format = :table) -> String +// Returns a description of the queryable Cypher schema. `format` may be "table" (default) or "json". +// The schema is static, so this is a class method and does not require a graph. 
+static VALUE rdxr_cypher_schema(int argc, VALUE *argv, VALUE self) { + VALUE format; + rb_scan_args(argc, argv, "01", &format); + + const char *output = rdx_cypher_schema(cypher_format_cstr(format)); + VALUE result = output == NULL ? rb_utf8_str_new_cstr("") : rb_utf8_str_new_cstr(output); + if (output != NULL) { + free_c_string(output); + } + + return result; +} + +// Coerces an optional format argument (String, Symbol, or nil) to a C string, defaulting to "table". +static const char *cypher_format_cstr(VALUE format) { + if (NIL_P(format)) { + return "table"; + } + if (RB_TYPE_P(format, T_SYMBOL)) { + format = rb_sym2str(format); + } + Check_Type(format, T_STRING); + return StringValueCStr(format); +} + +// Free function for Rubydex::Query: releases the parsed query allocated by Rust. +static void query_free(void *ptr) { + if (ptr) { + rdx_cypher_query_free(ptr); + } +} + +static const rb_data_type_t query_type = { + .wrap_struct_name = "Rubydex::Query", + .function = { + .dmark = NULL, + .dfree = query_free, + .dsize = NULL, + .dcompact = NULL, + }, + .parent = NULL, + .data = NULL, + .flags = RUBY_TYPED_FREE_IMMEDIATELY, +}; + +// Rubydex::Query.parse(query) -> Rubydex::Query +// Parses a Cypher query into an opaque, reusable object, without needing a graph. Raises +// ArgumentError on a syntax error, so callers can validate a query before building a graph. 
+static VALUE rdxr_query_parse(VALUE klass, VALUE query) { Check_Type(query, T_STRING); - const char *format_str = "table"; - if (!NIL_P(format)) { - if (RB_TYPE_P(format, T_SYMBOL)) { - format = rb_sym2str(format); - } - Check_Type(format, T_STRING); - format_str = StringValueCStr(format); + struct CParseResult result = rdx_cypher_parse(StringValueCStr(query)); + if (result.error != NULL) { + VALUE message = rb_utf8_str_new_cstr(result.error); + free_c_string(result.error); + rb_raise(rb_eArgError, "%s", StringValueCStr(message)); } + return TypedData_Wrap_Struct(klass, &query_type, result.query); +} + +// Rubydex::Query#render(graph, format = :table) -> String +// Runs this parsed query against the given graph and returns the formatted output. `format` may be +// "table" (default) or "json". Raises ArgumentError on an execution or format error. +static VALUE rdxr_query_render(int argc, VALUE *argv, VALUE self) { + VALUE graph_obj, format; + rb_scan_args(argc, argv, "11", &graph_obj, &format); + + void *query; + TypedData_Get_Struct(self, void *, &query_type, query); + void *graph; - TypedData_Get_Struct(self, void *, &graph_type, graph); + TypedData_Get_Struct(graph_obj, void *, &graph_type, graph); - struct CQueryResult result = rdx_graph_query(graph, StringValueCStr(query), format_str); + struct CQueryResult result = rdx_query_run(query, graph, cypher_format_cstr(format)); if (result.error != NULL) { VALUE message = rb_utf8_str_new_cstr(result.error); @@ -787,31 +847,6 @@ static VALUE rdxr_graph_query(int argc, VALUE *argv, VALUE self) { return output; } -// Rubydex::Graph.cypher_schema(format = :table) -> String -// Returns a description of the queryable Cypher schema. `format` may be "table" (default) or "json". -// The schema is static, so this is a class method and does not require a graph instance. 
-static VALUE rdxr_cypher_schema(int argc, VALUE *argv, VALUE self) { - VALUE format; - rb_scan_args(argc, argv, "01", &format); - - const char *format_str = "table"; - if (!NIL_P(format)) { - if (RB_TYPE_P(format, T_SYMBOL)) { - format = rb_sym2str(format); - } - Check_Type(format, T_STRING); - format_str = StringValueCStr(format); - } - - const char *output = rdx_cypher_schema(format_str); - VALUE result = output == NULL ? rb_utf8_str_new_cstr("") : rb_utf8_str_new_cstr(output); - if (output != NULL) { - free_c_string(output); - } - - return result; -} - void rdxi_initialize_graph(VALUE moduleRubydex) { mRubydex = moduleRubydex; cGraph = rb_define_class_under(mRubydex, "Graph", rb_cObject); @@ -846,7 +881,10 @@ void rdxi_initialize_graph(VALUE moduleRubydex) { rb_define_method(cGraph, "exclude_paths", rdxr_graph_exclude_paths, 1); rb_define_method(cGraph, "excluded_paths", rdxr_graph_excluded_paths, 0); rb_define_method(cGraph, "keyword", rdxr_graph_keyword, 1); - rb_define_method(cGraph, "query", rdxr_graph_query, -1); - rb_define_singleton_method(cGraph, "cypher_schema", rdxr_cypher_schema, -1); + cQuery = rb_define_class_under(mRubydex, "Query", rb_cObject); + rb_undef_alloc_func(cQuery); + rb_define_singleton_method(cQuery, "parse", rdxr_query_parse, 1); + rb_define_singleton_method(cQuery, "schema", rdxr_cypher_schema, -1); + rb_define_method(cQuery, "render", rdxr_query_render, -1); } diff --git a/rbi/rubydex.rbi b/rbi/rubydex.rbi index 231c7e80a..618d34e62 100644 --- a/rbi/rubydex.rbi +++ b/rbi/rubydex.rbi @@ -272,14 +272,22 @@ end class Rubydex::IntegrityFailure < Rubydex::Failure; end -class Rubydex::Graph - IGNORED_DIRECTORIES = T.let(T.unsafe(nil), T::Array[String]) - +class Rubydex::Query class << self + sig { params(query: String).returns(Rubydex::Query) } + def parse(query); end + sig { params(format: T.any(String, Symbol)).returns(String) } - def cypher_schema(format = :table); end + def schema(format = :table); end end + sig { params(graph: 
Rubydex::Graph, format: T.any(String, Symbol)).returns(String) } + def render(graph, format = :table); end +end + +class Rubydex::Graph + IGNORED_DIRECTORIES = T.let(T.unsafe(nil), T::Array[String]) + sig { params(workspace_path: T.nilable(String)).void } def initialize(workspace_path: nil); end @@ -329,9 +337,6 @@ class Rubydex::Graph sig { params(require_path: String, load_paths: T::Array[String]).returns(T.nilable(Rubydex::Document)) } def resolve_require_path(require_path, load_paths); end - sig { params(query: String, format: T.any(String, Symbol)).returns(String) } - def query(query, format = :table); end - sig { params(query: String).returns(T::Enumerable[Rubydex::Declaration]) } def search(query); end diff --git a/rust/rubydex-sys/src/graph_api.rs b/rust/rubydex-sys/src/graph_api.rs index 1caab7a50..99c6e8ec1 100644 --- a/rust/rubydex-sys/src/graph_api.rs +++ b/rust/rubydex-sys/src/graph_api.rs @@ -1006,42 +1006,75 @@ impl CQueryResult { } } -/// Returns a description of the queryable Cypher schema (node labels, relationship types, and -/// properties) in the given format (`"table"` or `"json"`). The schema is static and requires no -/// graph. Caller must free the returned pointer with `free_c_string`. +/// The result of parsing a Cypher query into an opaque, reusable parsed-query object. +#[repr(C)] +pub struct CParseResult { + /// Non-null on success: a heap-allocated parsed query. Free with `rdx_cypher_query_free`. + pub query: *mut c_void, + /// Non-null on error; null on success. Caller must free with `free_c_string`. + pub error: *const c_char, +} + +/// Parses a Cypher query string into an opaque parsed-query object, without needing a graph. +/// +/// On success, `query` is a heap-allocated parsed query that can be executed against a graph with +/// `rdx_query_run` and must eventually be freed with `rdx_cypher_query_free`. On failure, `error` +/// holds the message. /// /// # Safety /// -/// - `format` must be a valid, null-terminated UTF-8 string. 
+/// - `query` must be a valid, null-terminated UTF-8 string. #[unsafe(no_mangle)] -pub unsafe extern "C" fn rdx_cypher_schema(format: *const c_char) -> *const c_char { - let format_str = unsafe { utils::convert_char_ptr_to_string(format) }.unwrap_or_else(|_| "table".to_string()); - let output_format = if format_str == "json" { - OutputFormat::Json - } else { - OutputFormat::Table +pub unsafe extern "C" fn rdx_cypher_parse(query: *const c_char) -> CParseResult { + let Ok(query_str) = (unsafe { utils::convert_char_ptr_to_string(query) }) else { + return CParseResult { + query: ptr::null_mut(), + error: CString::new("query is not valid UTF-8").map_or(ptr::null(), |s| s.into_raw().cast_const()), + }; }; - CString::new(cypher::schema(output_format)).map_or(ptr::null(), |s| s.into_raw().cast_const()) + match cypher::parse(&query_str) { + Ok(parsed) => CParseResult { + query: Box::into_raw(Box::new(parsed)).cast::<c_void>(), + error: ptr::null(), + }, + Err(error) => CParseResult { + query: ptr::null_mut(), + error: CString::new(error.to_string()).map_or(ptr::null(), |s| s.into_raw().cast_const()), + }, + } } -/// Runs a Cypher query against the graph and returns the formatted output or an error message. +/// Frees a parsed query previously returned by `rdx_cypher_parse`. +/// +/// # Safety /// -/// `format` must be `"table"` or `"json"`. +/// - `query` must be a pointer returned by `rdx_cypher_parse`, or null. It must not be used after. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn rdx_cypher_query_free(query: *mut c_void) { + if query.is_null() { + return; + } + let _ = unsafe { Box::from_raw(query.cast::<cypher::Query>()) }; +} + +/// Executes a previously parsed query (from `rdx_cypher_parse`) against the graph and returns the +/// formatted output or an error message. `format` must be `"table"` or `"json"`. /// /// # Safety /// +/// - `query` must be a valid pointer returned by `rdx_cypher_parse`. /// - `pointer` must be a valid `GraphPointer` previously returned by this crate.
-/// - `query` and `format` must be valid, null-terminated UTF-8 strings. +/// - `format` must be a valid, null-terminated UTF-8 string. #[unsafe(no_mangle)] -pub unsafe extern "C" fn rdx_graph_query( +pub unsafe extern "C" fn rdx_query_run( + query: *const c_void, pointer: GraphPointer, - query: *const c_char, format: *const c_char, ) -> CQueryResult { - let Ok(query_str) = (unsafe { utils::convert_char_ptr_to_string(query) }) else { - return CQueryResult::error("query is not valid UTF-8"); - }; + if query.is_null() { + return CQueryResult::error("query is null"); + } let Ok(format_str) = (unsafe { utils::convert_char_ptr_to_string(format) }) else { return CQueryResult::error("format is not valid UTF-8"); @@ -1055,14 +1088,35 @@ pub unsafe extern "C" fn rdx_graph_query( } }; + let parsed = unsafe { &*query.cast::<cypher::Query>() }; + with_graph(pointer, |graph| { - match cypher::run_query(graph, &query_str, output_format) { + match cypher::run_parsed(graph, parsed, output_format) { Ok(output) => CQueryResult::success(&output), Err(error) => CQueryResult::error(&error.to_string()), } }) } +/// Returns a description of the queryable Cypher schema (node labels, relationship types, and +/// properties) in the given format (`"table"` or `"json"`). The schema is static and requires no +/// graph. Caller must free the returned pointer with `free_c_string`. +/// +/// # Safety +/// +/// - `format` must be a valid, null-terminated UTF-8 string.
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn rdx_cypher_schema(format: *const c_char) -> *const c_char { + let format_str = unsafe { utils::convert_char_ptr_to_string(format) }.unwrap_or_else(|_| "table".to_string()); + let output_format = if format_str == "json" { + OutputFormat::Json + } else { + OutputFormat::Table + }; + + CString::new(cypher::schema(output_format)).map_or(ptr::null(), |s| s.into_raw().cast_const()) +} + #[repr(u8)] #[derive(Debug, Clone, Copy)] pub enum CVisibility { diff --git a/rust/rubydex/src/main.rs b/rust/rubydex/src/main.rs index bebd17c20..b95b65dd2 100644 --- a/rust/rubydex/src/main.rs +++ b/rust/rubydex/src/main.rs @@ -127,6 +127,15 @@ fn main() { std::process::exit(0); } + // Parse the query up front, before any indexing, so a malformed query fails fast. + let parsed_query = args.query.as_ref().map(|query| match cypher::parse(query) { + Ok(parsed) => parsed, + Err(error) => { + eprintln!("{error}"); + std::process::exit(1); + } + }); + if args.stats { Timer::set_global_timer(Timer::new()); } @@ -212,9 +221,9 @@ fn main() { } } - // Cypher query - if let Some(query) = &args.query { - match time_it!(querying, { cypher::run_query(&graph, query, args.format.into()) }) { + // Cypher query: execute the query parsed earlier against the now-built graph. + if let Some(query) = &parsed_query { + match time_it!(querying, { cypher::run_parsed(&graph, query, args.format.into()) }) { Ok(output) => print!("{output}"), Err(error) => { eprintln!("{error}"); diff --git a/rust/rubydex/src/query/cypher/mod.rs b/rust/rubydex/src/query/cypher/mod.rs index 1e28bc0af..63a7c0403 100644 --- a/rust/rubydex/src/query/cypher/mod.rs +++ b/rust/rubydex/src/query/cypher/mod.rs @@ -16,7 +16,10 @@ // The whole Cypher engine — lexer, parser, AST, executor, values, and formatting — lives in the // graph-independent `cypher-parser` crate. 
rubydex only provides the `GraphProvider` mapping for its // `Graph` (in `schema`) and the static schema description (in `schema_info`). -pub use cypher_parser::{CypherError, OutputFormat}; +// +// `Query` is the opaque parsed-query object: callers can `parse` a query string once (failing fast +// on syntax errors), then `run_parsed` it against a graph that was built afterwards. +pub use cypher_parser::{CypherError, OutputFormat, Query, parse}; pub mod schema; pub mod schema_info; @@ -32,6 +35,17 @@ pub fn run_query(graph: &Graph, query: &str, output_format: OutputFormat) -> Res cypher_parser::run_query(graph, query, output_format) } +/// Executes an already-parsed [`Query`] against the graph and formats the result. Pair with +/// [`parse`] to validate a query before building the graph. +/// +/// # Errors +/// +/// Returns a [`CypherError`] if the query cannot be executed. +pub fn run_parsed(graph: &Graph, query: &Query, output_format: OutputFormat) -> Result<String, CypherError> { + let result = cypher_parser::execute(graph, query)?; + Ok(cypher_parser::format::format(&result, output_format)) +} + /// Returns a description of the queryable schema (node labels, relationship types, and properties) /// in the requested format. The schema is static and does not require a graph.
#[must_use] diff --git a/test/graph_test.rb b/test/graph_test.rb index 51ddae1f0..b684c4a39 100644 --- a/test/graph_test.rb +++ b/test/graph_test.rb @@ -1442,7 +1442,7 @@ def test_document_returns_correct_document_with_multiple_documents end def test_cypher_schema_table - output = Rubydex::Graph.cypher_schema + output = Rubydex::Query.schema assert_match(/Node labels/, output) assert_match(/Relationship types/, output) @@ -1452,13 +1452,46 @@ def test_cypher_schema_table end def test_cypher_schema_json - output = Rubydex::Graph.cypher_schema(:json) + output = Rubydex::Query.schema(:json) parsed = JSON.parse(output) assert_equal(["node_labels", "relationships", "properties"], parsed.keys) assert(parsed["relationships"].any? { |r| r["type"] == "DEFINES" }) end + def test_parsed_query_runs_against_graph + with_context do |context| + context.write!("zoo.rb", "class Animal; end\nclass Dog < Animal; end\n") + + query = Rubydex::Query.parse("MATCH (c:Class)-[:INHERITS]->(p:Class) WHERE c.name = 'Dog' RETURN p.name") + assert_instance_of(Rubydex::Query, query) + + graph = Rubydex::Graph.new + graph.index_all(context.glob("**/*.rb")) + graph.resolve + + assert_equal("[{\"p.name\":\"Animal\"}]", query.render(graph, :json)) + end + end + + def test_parse_raises_on_syntax_error + error = assert_raises(ArgumentError) { Rubydex::Query.parse("MATCH (c RETURN c") } + assert_match(/Cypher syntax error/, error.message) + end + + def test_parsed_query_reusable_across_graphs + query = Rubydex::Query.parse("MATCH (c:Class {name: 'Dog'}) RETURN c.name") + + with_context do |context| + context.write!("zoo.rb", "class Dog; end\n") + graph = Rubydex::Graph.new + graph.index_all(context.glob("**/*.rb")) + graph.resolve + + assert_equal("[{\"c.name\":\"Dog\"}]", query.render(graph, :json)) + end + end + def test_query_returns_table_output with_context do |context| context.write!("zoo.rb", <<~RUBY) @@ -1471,7 +1504,8 @@ class Cat < Animal; end graph.index_all(context.glob("**/*.rb")) 
graph.resolve - output = graph.query("MATCH (c:Class)-[:INHERITS]->(p:Class) WHERE p.name = 'Animal' RETURN c.name ORDER BY c.name") + query = Rubydex::Query.parse("MATCH (c:Class)-[:INHERITS]->(p:Class) WHERE p.name = 'Animal' RETURN c.name ORDER BY c.name") + output = query.render(graph) assert_match(/c\.name/, output) assert_match(/Cat/, output) @@ -1492,31 +1526,28 @@ class Dog < Animal; end graph.index_all(context.glob("**/*.rb")) graph.resolve - output = graph.query( + query = Rubydex::Query.parse( "MATCH (n:Class|Module) WHERE n.name = 'Animal' OR n.name = 'Walkable' RETURN n.name ORDER BY n.name", - :json, ) - assert_equal("[{\"n.name\":\"Animal\"},{\"n.name\":\"Walkable\"}]", output) + assert_equal("[{\"n.name\":\"Animal\"},{\"n.name\":\"Walkable\"}]", query.render(graph, :json)) end end - def test_query_returns_json_output + def test_query_accepts_string_format with_context do |context| - context.write!("zoo.rb", "class Animal; end\nclass Dog < Animal; end\n") + context.write!("zoo.rb", "class Dog; end\n") graph = Rubydex::Graph.new graph.index_all(context.glob("**/*.rb")) graph.resolve - assert_equal( - "[{\"c.name\":\"Dog\"}]", - graph.query("MATCH (c:Class {name: 'Dog'}) RETURN c.name", :json), - ) + query = Rubydex::Query.parse("MATCH (c:Class {name: 'Dog'}) RETURN c.name") + assert_equal("[{\"c.name\":\"Dog\"}]", query.render(graph, "json")) end end - def test_query_accepts_string_format + def test_render_raises_on_invalid_format with_context do |context| context.write!("zoo.rb", "class Dog; end\n") @@ -1524,29 +1555,12 @@ def test_query_accepts_string_format graph.index_all(context.glob("**/*.rb")) graph.resolve - assert_equal( - "[{\"c.name\":\"Dog\"}]", - graph.query("MATCH (c:Class {name: 'Dog'}) RETURN c.name", "json"), - ) + query = Rubydex::Query.parse("MATCH (c:Class) RETURN c.name") + error = assert_raises(ArgumentError) { query.render(graph, :yaml) } + assert_match(/unknown query format/, error.message) end end - def 
test_query_raises_on_syntax_error - graph = Rubydex::Graph.new - graph.resolve - - error = assert_raises(ArgumentError) { graph.query("MATCH (c RETURN c") } - assert_match(/Cypher syntax error/, error.message) - end - - def test_query_raises_on_invalid_format - graph = Rubydex::Graph.new - graph.resolve - - error = assert_raises(ArgumentError) { graph.query("MATCH (c:Class) RETURN c", :yaml) } - assert_match(/unknown query format/, error.message) - end - private def assert_diagnostics(expected, actual) From 1fc859867339ac4cbda6acf1d13c3114348e24c0 Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Thu, 25 Jun 2026 22:20:51 +0300 Subject: [PATCH 6/8] Use cypher.rs instead of cypher/mod.rs module file Follow the modern Rust module convention (path.rs alongside a path/ directory) instead of the legacy path/mod.rs style. Pure file move; the cypher/ directory keeps the schema, schema_info, and tests submodules. --- rust/rubydex/src/query/{cypher/mod.rs => cypher.rs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename rust/rubydex/src/query/{cypher/mod.rs => cypher.rs} (100%) diff --git a/rust/rubydex/src/query/cypher/mod.rs b/rust/rubydex/src/query/cypher.rs similarity index 100% rename from rust/rubydex/src/query/cypher/mod.rs rename to rust/rubydex/src/query/cypher.rs From 15a5462b2c503dcf7952e22fbdf9e140b5722d1d Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Thu, 25 Jun 2026 22:22:10 +0300 Subject: [PATCH 7/8] Document the CONTAINS vs OWNS distinction on RelType CONTAINS is per-file lexical nesting (Definition -> Definition), e.g. a class written inside a module; OWNS is the declaration-level membership counterpart, merged across all files. Add per-variant doc comments to RelType and clarify both in the module-level schema docs. 
--- rust/rubydex/src/query/cypher/schema.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/rust/rubydex/src/query/cypher/schema.rs b/rust/rubydex/src/query/cypher/schema.rs index ee3237367..5fa2708b0 100644 --- a/rust/rubydex/src/query/cypher/schema.rs +++ b/rust/rubydex/src/query/cypher/schema.rs @@ -11,10 +11,12 @@ //! Relationship types mirror `dot.rs`: //! - `DEFINES`: `Document` → `Definition` //! - `DECLARES`: `Definition` → `Declaration` -//! - `CONTAINS`: `Definition` → `Definition` (lexical nesting) +//! - `CONTAINS`: `Definition` → `Definition` (lexical nesting in one file, e.g. a class written +//! inside a module; the source-level counterpart of declaration-level `OWNS`) //! - `INHERITS`: `Declaration` → `Declaration` (superclass) //! - `INCLUDES` / `PREPENDS` / `EXTENDS`: `Declaration` → `Declaration` (mixins) -//! - `OWNS`: `Declaration` → `Declaration` (members) +//! - `OWNS`: `Declaration` → `Declaration` (declaration-level membership, e.g. a namespace's methods +//! and nested constants, merged across all files) //! - `ANCESTOR`: `Declaration` → `Declaration` (linearized ancestor chain) //! - `DESCENDANT`: `Declaration` → `Declaration` //! - `REFERENCES`: `Document` → `Declaration` (constant references) @@ -39,16 +41,32 @@ pub enum NodeRef { /// A relationship type. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RelType { + /// `Document` → `Definition`: a file defines a construct occurrence. Defines, + /// `Definition` → `Declaration`: a per-file occurrence declares the global merged entity. Declares, + /// `Definition` → `Definition`: lexical nesting within a single file, e.g. a class written + /// inside a module. This is the source-level structure; the declaration-level (merged across + /// files) counterpart is [`RelType::Owns`]. Contains, + /// `Declaration` → `Declaration`: direct superclass (a single hop, not the full chain). 
Inherits, + /// `Declaration` → `Declaration`: an included module mixin. Includes, + /// `Declaration` → `Declaration`: a prepended module mixin. Prepends, + /// `Declaration` → `Declaration`: an extended module mixin. Extends, + /// `Declaration` → `Declaration`: declaration-level membership (a namespace's methods and + /// nested constants), merged across all files. The per-file source counterpart is + /// [`RelType::Contains`]. Owns, + /// `Declaration` → `Declaration`: an entry in the linearized ancestor chain (transitive + /// superclasses plus included/prepended modules). Ancestor, + /// `Declaration` → `Declaration`: the reverse of [`RelType::Ancestor`]. Descendant, + /// `Document` → `Declaration`: a constant reference in the file resolves to a declaration. References, } From 7e08a7b32eb4236b62815f8578e39f7e6a97222d Mon Sep 17 00:00:00 2001 From: Ufuk Kayserilioglu Date: Thu, 25 Jun 2026 22:25:52 +0300 Subject: [PATCH 8/8] Distinguish Document path from name and parse URIs robustly Previously the Document `path` property returned the URI basename, making it identical to a name and mislabeled. Split them: - `uri` -> full document URI (e.g. file:///app/models/user.rb) - `path` -> file-system path (e.g. /app/models/user.rb) - `name` -> base file name (e.g. user.rb) Add `Document::file_path` / `Document::file_name`, which decode the URI via the `url` crate (already a dependency) so percent-encoding and platform paths (including Windows drive paths) are handled correctly instead of naively splitting on '/'. `require_path` now reuses `file_path` instead of re-parsing the URI. Non-file:// URIs (the synthetic built-in document) fall back to the raw URI. Clarify that `prop` is the property name read off a node, and advertise the new `name` property in the schema. 
--- rust/rubydex/src/model/document.rs | 40 +++++++++++++++++--- rust/rubydex/src/query/cypher/schema.rs | 20 ++++++---- rust/rubydex/src/query/cypher/schema_info.rs | 7 +++- rust/rubydex/src/query/cypher/tests.rs | 23 +++++++++++ 4 files changed, 76 insertions(+), 14 deletions(-) diff --git a/rust/rubydex/src/model/document.rs b/rust/rubydex/src/model/document.rs index c99daa8e9..bd3419225 100644 --- a/rust/rubydex/src/model/document.rs +++ b/rust/rubydex/src/model/document.rs @@ -84,6 +84,39 @@ impl Document { self.diagnostics.push(diagnostic); } + /// The file-system path of this document, decoded from its URI. + /// + /// Returns `None` when the URI is not a `file://` URL (e.g. the synthetic built-in document) or + /// cannot be converted to a path. Uses `Url` so percent-encoding and platform-specific paths + /// (including Windows drive paths) are handled correctly. + #[must_use] + pub fn file_path(&self) -> Option<PathBuf> { + let url = Url::parse(&self.uri).ok()?; + if url.scheme() != "file" { + return None; + } + url.to_file_path().ok() + } + + /// The base file name of this document (the last path segment), decoded from its URI. + /// + /// Prefers the platform file path, but falls back to the last URL path segment so it still works + /// for `file://` URIs that don't convert to a local path on the current platform (e.g. a + /// drive-less path like `file:///foo.rb` on Windows). Returns `None` only when the URI has no + /// usable path segment (e.g. the synthetic built-in document). + #[must_use] + pub fn file_name(&self) -> Option<String> { + if let Some(path) = self.file_path() + && let Some(name) = path.file_name() + { + return Some(name.to_string_lossy().into_owned()); + } + + let url = Url::parse(&self.uri).ok()?; + let segment = url.path_segments()?.rfind(|segment| !segment.is_empty())?; + Some(segment.to_string()) + } + /// Computes the require path for this document given load paths.
/// /// Returns `None` if: @@ -97,12 +130,7 @@ impl Document { /// Panics if load path entries exceed u16. #[must_use] pub fn require_path(&self, load_paths: &[PathBuf]) -> Option<(String, u16)> { - let url = Url::parse(&self.uri).ok()?; - if url.scheme() != "file" { - return None; - } - - let file_path = url.to_file_path().ok()?; + let file_path = self.file_path()?; if file_path.extension().is_none_or(|ext| ext != "rb") { return None; } diff --git a/rust/rubydex/src/query/cypher/schema.rs b/rust/rubydex/src/query/cypher/schema.rs index 5fa2708b0..6bd485de9 100644 --- a/rust/rubydex/src/query/cypher/schema.rs +++ b/rust/rubydex/src/query/cypher/schema.rs @@ -255,13 +255,13 @@ pub fn node_name(graph: &Graph, node: NodeRef) -> String { .and_then(|decl_id| graph.declarations().get(decl_id)) .map_or_else(String::new, |declaration| declaration.name().to_string()), NodeRef::Document(id) => graph.documents().get(&id).map_or_else(String::new, |document| { - let uri = document.uri(); - uri.rsplit('/').next().unwrap_or(uri).to_string() + document.file_name().unwrap_or_else(|| document.uri().to_string()) }), } } -/// Resolves a node property to a value. Unknown properties yield `NULL`. +/// Resolves a node property to a value, where `prop` is the property name read off the node (the +/// `x` in `RETURN n.x` / `WHERE n.x = ...`). Unknown properties yield `NULL`. #[must_use] pub fn property(graph: &Graph, node: NodeRef, prop: &str) -> CypherValue { match prop { @@ -319,12 +319,18 @@ fn document_property(graph: &Graph, id: UriId, prop: &str) -> CypherValue { return CypherValue::Null; }; + // Non-`file://` URIs (the synthetic built-in document) have no file path, so `path`/`name` fall + // back to the raw URI. match prop { + // Full document URI, e.g. `file:///app/models/user.rb`. 
"uri" => CypherValue::Str(document.uri().to_string()), - "path" | "name" => { - let uri = document.uri(); - CypherValue::Str(uri.rsplit('/').next().unwrap_or(uri).to_string()) - } + // File-system path, e.g. `/app/models/user.rb`. + "path" => CypherValue::Str(document.file_path().map_or_else( + || document.uri().to_string(), + |path| path.to_string_lossy().into_owned(), + )), + // Base file name, e.g. `user.rb`. + "name" => CypherValue::Str(document.file_name().unwrap_or_else(|| document.uri().to_string())), _ => CypherValue::Null, } } diff --git a/rust/rubydex/src/query/cypher/schema_info.rs b/rust/rubydex/src/query/cypher/schema_info.rs index 8a5df4ef7..cb0a356bd 100644 --- a/rust/rubydex/src/query/cypher/schema_info.rs +++ b/rust/rubydex/src/query/cypher/schema_info.rs @@ -218,7 +218,12 @@ const PROPERTIES: &[PropInfo] = &[ PropInfo { node_type: "Document", property: "path", - description: "Basename of the document URI", + description: "File system path of the document", + }, + PropInfo { + node_type: "Document", + property: "name", + description: "Base file name of the document", }, ]; diff --git a/rust/rubydex/src/query/cypher/tests.rs b/rust/rubydex/src/query/cypher/tests.rs index 3f02a2e45..5b0dfce9f 100644 --- a/rust/rubydex/src/query/cypher/tests.rs +++ b/rust/rubydex/src/query/cypher/tests.rs @@ -188,3 +188,26 @@ fn unknown_relationship_type_errors() { let parsed = parse("MATCH (a)-[:BOGUS]->(b) RETURN a").unwrap(); assert!(execute(&graph, &parsed).is_err()); } + +#[test] +fn document_uri_path_and_name_are_distinct() { + let graph = fixture_graph(); + let result = run( + &graph, + "MATCH (d:Document) WHERE d.uri = 'file:///zoo.rb' RETURN d.uri, d.path, d.name", + ); + assert_eq!( + result.columns, + vec!["d.uri".to_string(), "d.path".to_string(), "d.name".to_string()] + ); + // `uri` is the full URI and `name` is the basename on every platform. 
+ assert_eq!(column_strings(&result, 0), vec!["file:///zoo.rb".to_string()]); + assert_eq!(column_strings(&result, 2), vec!["zoo.rb".to_string()]); + + // `path` is the decoded file-system path. A drive-less `file://` URI has no valid Windows path, + // so there it falls back to the raw URI; on Unix it decodes to `/zoo.rb`. + #[cfg(not(windows))] + assert_eq!(column_strings(&result, 1), vec!["/zoo.rb".to_string()]); + #[cfg(windows)] + assert_eq!(column_strings(&result, 1), vec!["file:///zoo.rb".to_string()]); +}