XLSForm · lognaturel · Dec 15, 2025 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
   "xlrd==2.0.1",        # Read XLS files
   "openpyxl==3.1.5",    # Read XLSX files
   "defusedxml==0.7.1",  # Parse XML
+  "lark==1.3.1",        # Parse custom grammars
 ]
 
 [project.optional-dependencies]

diff --git a/pyxform/entities/entities_parsing.py b/pyxform/entities/entities_parsing.py
@@ -260,11 +260,12 @@ def get_validated_repeat_name(entity) -> str | None:
         match = parse_pyxform_references(value=value, match_limit=1, match_full=True)
     except PyXFormError as e:
         e.context.update(sheet="entities", column="repeat", row=2)
+        raise
     else:
-        if not match or not is_xml_tag(match[0]):
+        if not match or match[0].last_saved:
             raise PyXFormError(ENTITY001.format(value=value))
         else:
-            return match[0]
+            return match[0].name
 
 
 def validate_entity_saveto(

diff --git a/pyxform/errors.py b/pyxform/errors.py
@@ -56,10 +56,19 @@ class ErrorCode(Enum):
         ),
     )
     PYREF_003: Detail = Detail(
-        name="PyXForm Reference Question Not Found",
+        name="PyXForm Reference Name Not Found",
         msg=(
             "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
-            "Reference variables must refer to a question name. Could not find '{q}'."
+            "Reference variables must contain a name from the 'survey' sheet. Could not "
+            "find the name '{q}'."
+        ),
+    )
+    PYREF_004: Detail = Detail(
+        name="PyXForm Reference Duplicate Name",
+        msg=(
+            "[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
+            "Reference variable names must be unique anywhere in the 'survey'. The name "
+            "'{q}' appears more than once."
         ),
     )
     INTERNAL_001: Detail = Detail(

diff --git a/pyxform/parsing/expression.py b/pyxform/parsing/expression.py
@@ -2,6 +2,8 @@
 from functools import lru_cache
 from typing import Any
 
+from lark import Lark, Token
+
 # ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
 # (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
 # They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
@@ -20,10 +22,6 @@
 ncname_regex_ns = rf"{ncname_regex}(?:\:{ncname_regex})?"
 ncname_regex_ns_named = rf"(?P<ncname_ns>{ncname_regex_ns})"
 
-date_regex = r"-?\d{4}-\d{2}-\d{2}"
-time_regex = r"\d{2}:\d{2}:\d{2}(\.\s+)?(((\+|\-)\d{2}:\d{2})|Z)?"
-date_time_regex = date_regex + "T" + time_regex
-
 # pyxform_ref_outer picks up possible refs, and matches unterminated refs to exclude them.
 pyxform_ref_outer = r"\$\{(?P<pyxform_ref>[^}]+)\}|\$\{[^}]*$"
 pyxform_ref_inner = rf"(?P<last_saved>last-saved#)?{ncname_regex_named}"
@@ -32,75 +30,82 @@
 )
 pyxform_ref = rf"(?P<pyxform_ref>\$\{{{pyxform_ref_inner}\}})"
 
-# Rule order is significant - match priority runs top to bottom.
-LEXER_RULES = {
-    # https://www.w3.org/TR/xmlschema-2/#dateTime
-    "DATETIME": date_time_regex,
-    "DATE": date_regex,
-    "TIME": time_regex,
-    "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
-    # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
-    "OPS_MATH": r"[\*\+\-]| mod | div ",
-    "OPS_COMP": r"\=|\!\=|\<|\>|\<=|>=",
-    "OPS_BOOL": r" and | or ",
-    "OPS_UNION": r"\|",
-    "OPEN_PAREN": r"\(",
-    "CLOSE_PAREN": r"\)",
-    "BRACKET": r"\[\]\{\}",
-    "PARENT_REF": r"\.\.",
-    "SELF_REF": r"\.",
-    "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
-    "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
-    "COMMA": r",",
-    "WHITESPACE": r"\s+",
-    "PYXFORM_REF": pyxform_ref,
-    "FUNC_CALL": ncname_regex_ns_named + r"\(",
-    "XPATH_PRED_START": ncname_regex_ns_named + r"\[",
-    "XPATH_PRED_END": r"\]",
-    "URI_SCHEME": ncname_regex_named + r"://",
-    "NAME": ncname_regex_named,  # Must be after rules containing ncname_regex.
-    "PYXFORM_REF_START": r"\$\{",
-    "PYXFORM_REF_END": r"\}",
-    "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
-}
-
+lark_grammar = rf"""
+    // Parser: accepts any sequence of tokens and whitespace.
+    start: (token | WHITESPACE)*
+    ?token: DATETIME
+          | DATE
+          | TIME
+          | NUMBER
+          | OPS_MATH
+          | OPS_COMP
+          | OPS_BOOL
+          | OPS_UNION
+          | OPEN_PAREN
+          | CLOSE_PAREN
+          | BRACKET
+          | PARENT_REF
+          | SELF_REF
+          | PATH_SEP
+          | SYSTEM_LITERAL
+          | COMMA
+          | PYXFORM_REF
+          | FUNC_CALL
+          | XPATH_PRED_START
+          | XPATH_PRED_END
+          | URI_SCHEME
+          | NAME
+          | PYXFORM_REF_START
+          | PYXFORM_REF_END
+          | OTHER
+
+    // Lexer. The ".N" suffix is the terminal priority: higher values are tried first.
+    // https://www.w3.org/TR/xmlschema-2/#dateTime
+    DATETIME.25: DATE "T" TIME
+    DATE.24: /-?\d{{4}}-\d{{2}}-\d{{2}}/
+    TIME.23: /\d{{2}}:\d{{2}}:\d{{2}}(\.\d+)?(((\+|\-)\d{{2}}:\d{{2}})|Z)?/
+    NUMBER.22: /-?\d+\.\d*|-?\.\d+|-?\d+/
+    // https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
+    OPS_MATH.21: /[\*\+\-]| mod | div /
+    OPS_COMP.20: /\!\=|\<=|>=|\=|\<|\>/
+    OPS_BOOL.19: / and | or /
+    OPS_UNION.18: /\|/
+    OPEN_PAREN.17: /\(/
+    CLOSE_PAREN.16: /\)/
+    BRACKET.15: /[\[\{{\}}]/
+    PARENT_REF.14: /\.\./
+    SELF_REF.13: /\./
+    // javarosa.xpath says "//" is an "unsupported construct".
+    PATH_SEP.12: /\//
+    SYSTEM_LITERAL.11: /"[^"]*"|'[^']*'/
+    COMMA.10: /,/
+    WHITESPACE.9: /\s+/
+    PYXFORM_REF.8: /\$\{{(?:last-saved#)?{ncname_regex}\}}/
+    FUNC_CALL.7: /{ncname_regex_ns}\(/
+    XPATH_PRED_START.6: /{ncname_regex_ns}\[/
+    XPATH_PRED_END.5: /\]/
+    URI_SCHEME.4: /{ncname_regex}:\/\//
+    // Must be lower priority than rules containing ncname_regex.
+    NAME.3: /{ncname_regex_ns}/
+    PYXFORM_REF_START.2: /\$\{{/
+    PYXFORM_REF_END.1: /\}}/
+    // Lowest priority: catch any other character so that lexing doesn't stop.
+    OTHER.0: /.+?/
+"""
 
 RE_NCNAME_NAMESPACED = re.compile(ncname_regex_ns_named)
 RE_PYXFORM_REF = re.compile(pyxform_ref)
 RE_PYXFORM_REF_OUTER = re.compile(pyxform_ref_outer)
 RE_PYXFORM_REF_INNER = re.compile(pyxform_ref_inner)
 
 
-def get_expression_lexer() -> re.Scanner:
-    def get_tokenizer(name):
-        def tokenizer(scan, value) -> ExpLexerToken | str:
-            match = scan.match
-            return ExpLexerToken(name, value, match.start(), match.end())
-
-        return tokenizer
-
-    lexicon = [(v, get_tokenizer(k)) for k, v in LEXER_RULES.items()]
-    # re.Scanner is undocumented but has been around since at least 2003
-    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
-    return re.Scanner(lexicon)
-
-
-class ExpLexerToken:
-    __slots__ = ("end", "name", "start", "value")
-
-    def __init__(self, name: str, value: str, start: int, end: int) -> None:
-        self.name: str = name
-        self.value: str = value
-        self.start: int = start
-        self.end: int = end
-
-
-# Scanner takes a few 100ms to compile so use the shared instance.
-_EXPRESSION_LEXER = get_expression_lexer()
+_EXPRESSION_LEXER = Lark(
+    lark_grammar, parser="lalr", start="start", propagate_positions=True
+)
 
 
 @lru_cache(maxsize=128)
-def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
+def parse_expression(text: str) -> tuple[Token, ...]:
     """
     Parse an expression.
 
@@ -109,8 +114,7 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
     :param text: The expression.
-    :return: The parsed tokens, and any remaining unparsed text.
+    :return: A tuple of the lexed tokens.
     """
-    tokens, remainder = _EXPRESSION_LEXER.scan(text)
-    return tokens, remainder
+    return tuple(_EXPRESSION_LEXER.lex(text))
 
 
 def is_xml_tag(value: str) -> bool:

diff --git a/pyxform/parsing/instance_expression.py b/pyxform/parsing/instance_expression.py
@@ -21,7 +21,7 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
     :param xml_text: XML text that may contain an instance expression.
     :return: Tokens in instance expression, and the string position boundaries.
     """
-    tokens, _ = parse_expression(xml_text)
+    tokens = parse_expression(xml_text)
     if not tokens:
         return []
     instance_enter = False
@@ -33,43 +33,43 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
     for t in tokens:
         emit = False
         # If an instance expression had started, note the string position boundary.
-        if not instance_enter and t.name == "FUNC_CALL" and t.value == "instance(":
+        if not instance_enter and t.type == "FUNC_CALL" and t.value == "instance(":
             instance_enter = True
             emit = True
-            boundaries.append(t.start)
+            boundaries.append(t.start_pos)
         # Tokens that are part of an instance expression.
         elif instance_enter:
             # Tokens that are part of the instance call.
             if (
-                t.name == "SYSTEM_LITERAL"
-                and last_token.name == "FUNC_CALL"
+                t.type == "SYSTEM_LITERAL"
+                and last_token.type == "FUNC_CALL"
                 and last_token.value == "instance("
             ):
                 emit = True
-            elif last_token.name == "SYSTEM_LITERAL" and t.name == "CLOSE_PAREN":
+            elif last_token.type == "SYSTEM_LITERAL" and t.type == "CLOSE_PAREN":
                 emit = True
-            elif t.name == "PATH_SEP" and last_token.name == "CLOSE_PAREN":
+            elif t.type == "PATH_SEP" and last_token.type == "CLOSE_PAREN":
                 emit = True
                 path_enter = True
             # A XPath path may continue after a predicate.
-            elif t.name == "PATH_SEP" and last_token.name == "XPATH_PRED_END":
+            elif t.type == "PATH_SEP" and last_token.type == "XPATH_PRED_END":
                 emit = True
                 path_enter = True
             # Tokens that are part of a XPath path.
             elif path_enter:
-                if t.name == "WHITESPACE":
+                if t.type == "WHITESPACE":
                     path_enter = False
-                elif t.name != "XPATH_PRED_START":
+                elif t.type != "XPATH_PRED_START":
                     emit = True
-                elif t.name == "XPATH_PRED_START":
+                elif t.type == "XPATH_PRED_START":
                     emit = True
                     path_enter = False
                     pred_enter = True
             # Tokens that are part of a XPath predicate.
             elif pred_enter:
-                if t.name != "XPATH_PRED_END":
+                if t.type != "XPATH_PRED_END":
                     emit = True
-                elif t.name == "XPATH_PRED_END":
+                elif t.type == "XPATH_PRED_END":
                     emit = True
                     pred_enter = False
         # Track instance expression tokens, ignore others.
@@ -78,10 +78,10 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
         # If an instance expression had ended, note the string position boundary.
         elif instance_enter:
             instance_enter = False
-            boundaries.append(last_token.end)
+            boundaries.append(last_token.end_pos)
 
     if last_token is not None:
-        boundaries.append(last_token.end)
+        boundaries.append(last_token.end_pos)
 
     # Pair up the boundaries [1, 2, 3, 4] -> [(1, 2), (3, 4)].
     bounds = iter(boundaries)