Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies = [
"xlrd==2.0.1", # Read XLS files
"openpyxl==3.1.5", # Read XLSX files
"defusedxml==0.7.1", # Parse XML
"lark==1.3.1", # Parse custom grammars
]

[project.optional-dependencies]
Expand Down
5 changes: 3 additions & 2 deletions pyxform/entities/entities_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,12 @@ def get_validated_repeat_name(entity) -> str | None:
match = parse_pyxform_references(value=value, match_limit=1, match_full=True)
except PyXFormError as e:
e.context.update(sheet="entities", column="repeat", row=2)
raise
else:
if not match or not is_xml_tag(match[0]):
if not match or match[0].last_saved:
raise PyXFormError(ENTITY001.format(value=value))
else:
return match[0]
return match[0].name


def validate_entity_saveto(
Expand Down
13 changes: 11 additions & 2 deletions pyxform/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,19 @@ class ErrorCode(Enum):
),
)
PYREF_003: Detail = Detail(
name="PyXForm Reference Question Not Found",
name="PyXForm Reference Name Not Found",
msg=(
"[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
"Reference variables must refer to a question name. Could not find '{q}'."
"Reference variables must contain a name from the 'survey' sheet. Could not "
"find the name '{q}'."
),
)
PYREF_004: Detail = Detail(
name="PyXForm Reference Duplicate Name",
msg=(
"[row : {row}] On the '{sheet}' sheet, the '{column}' value is invalid. "
"Reference variables names must be unique anywhere in the 'survey'. The name "
"'{q}' appears more than once."
),
)
INTERNAL_001: Detail = Detail(
Expand Down
134 changes: 69 additions & 65 deletions pyxform/parsing/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from functools import lru_cache
from typing import Any

from lark import Lark, Token

# ncname regex adapted from eulxml https://github.com/emory-libraries/eulxml/blob/2e1a9f71ffd1fd455bd8326ec82125e333b352e0/eulxml/xpath/lexrules.py
# (C) 2010,2011 Emory University Libraries [Apache v2.0 License]
# They in turn adapted it from https://www.w3.org/TR/REC-xml/#NT-NameStartChar
Expand All @@ -20,10 +22,6 @@
ncname_regex_ns = rf"{ncname_regex}(?:\:{ncname_regex})?"
ncname_regex_ns_named = rf"(?P<ncname_ns>{ncname_regex_ns})"

# Lexical forms follow https://www.w3.org/TR/xmlschema-2/#dateTime
# yyyy-mm-dd, with an optional leading "-" for negative years.
date_regex = r"-?\d{4}-\d{2}-\d{2}"
# hh:mm:ss, then optional fractional seconds (a "." followed by one or more
# digits), then an optional timezone: a +hh:mm / -hh:mm offset, or "Z".
# The "s+" in the spec's "('.' s+)?" means "one or more digits", not whitespace.
time_regex = r"\d{2}:\d{2}:\d{2}(\.\d+)?(((\+|\-)\d{2}:\d{2})|Z)?"
# A date and a time joined by a literal "T".
date_time_regex = date_regex + "T" + time_regex

# pyxform_ref_outer picks up possible refs, and matches unterminated refs to exclude them.
pyxform_ref_outer = r"\$\{(?P<pyxform_ref>[^}]+)\}|\$\{[^}]*$"
pyxform_ref_inner = rf"(?P<last_saved>last-saved#)?{ncname_regex_named}"
Expand All @@ -32,75 +30,82 @@
)
pyxform_ref = rf"(?P<pyxform_ref>\$\{{{pyxform_ref_inner}\}})"

# Rule order is significant - match priority runs top to bottom.
# Within a single rule the regex alternatives are also tried left to right and
# the first one that matches wins, so a longer alternative must be listed before
# any alternative that is a prefix of it (e.g. "<=" before "<").
LEXER_RULES = {
    # https://www.w3.org/TR/xmlschema-2/#dateTime
    "DATETIME": date_time_regex,
    "DATE": date_regex,
    "TIME": time_regex,
    "NUMBER": r"-?\d+\.\d*|-?\.\d+|-?\d+",
    # https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
    "OPS_MATH": r"[\*\+\-]| mod | div ",
    # Two-character operators first, otherwise "<=" / ">=" would be split into
    # "<" / ">" followed by a separate "=".
    "OPS_COMP": r"\!\=|\<=|>=|\=|\<|\>",
    "OPS_BOOL": r" and | or ",
    "OPS_UNION": r"\|",
    "OPEN_PAREN": r"\(",
    "CLOSE_PAREN": r"\)",
    # NOTE(review): as written this matches only the literal four-character
    # sequence "[]{}"; a character class may have been intended - confirm.
    "BRACKET": r"\[\]\{\}",
    "PARENT_REF": r"\.\.",
    "SELF_REF": r"\.",
    "PATH_SEP": r"\/",  # javarosa.xpath says "//" is an "unsupported construct".
    "SYSTEM_LITERAL": r""""[^"]*"|'[^']*'""",
    "COMMA": r",",
    "WHITESPACE": r"\s+",
    "PYXFORM_REF": pyxform_ref,
    "FUNC_CALL": ncname_regex_ns_named + r"\(",
    "XPATH_PRED_START": ncname_regex_ns_named + r"\[",
    "XPATH_PRED_END": r"\]",
    "URI_SCHEME": ncname_regex_named + r"://",
    "NAME": ncname_regex_named,  # Must be after rules containing ncname_regex.
    "PYXFORM_REF_START": r"\$\{",
    "PYXFORM_REF_END": r"\}",
    "OTHER": r".+?",  # Catch any other character so that parsing doesn't stop.
}

# Lark grammar for tokenising expressions. The parser section accepts any
# sequence of tokens and whitespace; the lexer section defines the terminals.
# The ".N" suffix on each terminal is its priority (higher is tried first).
# Within one terminal the regex alternatives are tried left to right, so longer
# alternatives must come before their prefixes ("<=" before "<").
# This is a raw f-string: "{{" / "}}" are literal braces, and "{name}"
# interpolates the ncname patterns defined earlier in this module.
lark_grammar = rf"""
// Parser
start: (token | WHITESPACE)*
?token: DATETIME
| DATE
| TIME
| NUMBER
| OPS_MATH
| OPS_COMP
| OPS_BOOL
| OPS_UNION
| OPEN_PAREN
| CLOSE_PAREN
| BRACKET
| PARENT_REF
| SELF_REF
| PATH_SEP
| SYSTEM_LITERAL
| COMMA
| PYXFORM_REF
| FUNC_CALL
| XPATH_PRED_START
| XPATH_PRED_END
| URI_SCHEME
| NAME
| PYXFORM_REF_START
| PYXFORM_REF_END
| OTHER

// Lexer
// https://www.w3.org/TR/xmlschema-2/#dateTime
DATETIME.25: DATE "T" TIME
DATE.24: /-?\d{{4}}-\d{{2}}-\d{{2}}/
TIME.23: /\d{{2}}:\d{{2}}:\d{{2}}(\.\d+)?(((\+|\-)\d{{2}}:\d{{2}})|Z)?/
NUMBER.22: /-?\d+\.\d*|-?\.\d+|-?\d+/
// https://www.w3.org/TR/1999/REC-xpath-19991116/#exprlex
OPS_MATH.21: /[\*\+\-]| mod | div /
OPS_COMP.20: /\!\=|\<=|>=|\=|\<|\>/
OPS_BOOL.19: / and | or /
OPS_UNION.18: /\|/
OPEN_PAREN.17: /\(/
CLOSE_PAREN.16: /\)/
BRACKET.15: /[\[\{{\}}]/
PARENT_REF.14: /\.\./
SELF_REF.13: /\./\
// # javarosa.xpath says "//" is an "unsupported construct".
PATH_SEP.12: /\//
SYSTEM_LITERAL.11: /"[^"]*"|'[^']*'/
COMMA.10: /,/
WHITESPACE.9: /\s+/
PYXFORM_REF.8: /\$\{{(?:last-saved#)?{ncname_regex}\}}/
FUNC_CALL.7: /{ncname_regex_ns}\(/
XPATH_PRED_START.6: /{ncname_regex_ns}\[/
XPATH_PRED_END.5: /\]/
URI_SCHEME.4: /{ncname_regex}:\/\//
// Must be lower priority than rules containing ncname_regex.
NAME.3: /{ncname_regex_ns}/
PYXFORM_REF_START.2: /\$\{{/
PYXFORM_REF_END.1: /\}}/\
// Catch any other character so that parsing doesn't stop.
OTHER.0: /.+?/\
"""

# Patterns compiled once at import time and shared as module-level constants.
# An NCName with an optional "prefix:" namespace part.
RE_NCNAME_NAMESPACED = re.compile(ncname_regex_ns_named)
# A complete "${name}" reference, optionally with the "last-saved#" prefix.
RE_PYXFORM_REF = re.compile(pyxform_ref)
# Any "${...}" candidate, plus unterminated "${..." so those can be excluded.
RE_PYXFORM_REF_OUTER = re.compile(pyxform_ref_outer)
# The content between the braces: optional "last-saved#" followed by an NCName.
RE_PYXFORM_REF_INNER = re.compile(pyxform_ref_inner)


def get_expression_lexer() -> re.Scanner:
    """
    Build a scanner from LEXER_RULES.

    Each rule's pattern is paired with an action that wraps the matched text in
    an ExpLexerToken carrying the rule name and the match start/end positions.

    :return: A re.Scanner whose lexicon follows the order of LEXER_RULES.
    """

    def make_action(rule_name):
        def action(scanner, text) -> ExpLexerToken | str:
            current = scanner.match
            return ExpLexerToken(rule_name, text, current.start(), current.end())

        return action

    rules = []
    for rule_name, pattern in LEXER_RULES.items():
        rules.append((pattern, make_action(rule_name)))
    # re.Scanner is undocumented but has been around since at least 2003
    # https://mail.python.org/pipermail/python-dev/2003-April/035075.html
    return re.Scanner(rules)


class ExpLexerToken:
    """
    A single lexed token.

    Holds the name of the rule that matched, the matched text, and the start and
    end positions of the match. Uses __slots__, so instances have no __dict__.
    """

    __slots__ = ("end", "name", "start", "value")

    def __init__(self, name: str, value: str, start: int, end: int) -> None:
        # Which rule matched, and the text it matched.
        self.name, self.value = name, value
        # Match boundaries as passed in by the caller.
        self.start, self.end = start, end


# Scanner takes a few 100ms to compile so use the shared instance.
_EXPRESSION_LEXER = get_expression_lexer()
_EXPRESSION_LEXER = Lark(
lark_grammar, parser="lalr", start="start", propagate_positions=True
)


@lru_cache(maxsize=128)
def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
def parse_expression(text: str) -> tuple[Token, ...]:
"""
Parse an expression.

Expand All @@ -109,8 +114,7 @@ def parse_expression(text: str) -> tuple[list[ExpLexerToken], str]:
:param text: The expression.
:return: A tuple of the lexed tokens.
"""
tokens, remainder = _EXPRESSION_LEXER.scan(text)
return tokens, remainder
return tuple(_EXPRESSION_LEXER.lex(text))


def is_xml_tag(value: str) -> bool:
Expand Down
30 changes: 15 additions & 15 deletions pyxform/parsing/instance_expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
:param xml_text: XML text that may contain an instance expression.
:return: Tokens in instance expression, and the string position boundaries.
"""
tokens, _ = parse_expression(xml_text)
tokens = parse_expression(xml_text)
if not tokens:
return []
instance_enter = False
Expand All @@ -33,43 +33,43 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
for t in tokens:
emit = False
# If an instance expression had started, note the string position boundary.
if not instance_enter and t.name == "FUNC_CALL" and t.value == "instance(":
if not instance_enter and t.type == "FUNC_CALL" and t.value == "instance(":
instance_enter = True
emit = True
boundaries.append(t.start)
boundaries.append(t.start_pos)
# Tokens that are part of an instance expression.
elif instance_enter:
# Tokens that are part of the instance call.
if (
t.name == "SYSTEM_LITERAL"
and last_token.name == "FUNC_CALL"
t.type == "SYSTEM_LITERAL"
and last_token.type == "FUNC_CALL"
and last_token.value == "instance("
):
emit = True
elif last_token.name == "SYSTEM_LITERAL" and t.name == "CLOSE_PAREN":
elif last_token.type == "SYSTEM_LITERAL" and t.type == "CLOSE_PAREN":
emit = True
elif t.name == "PATH_SEP" and last_token.name == "CLOSE_PAREN":
elif t.type == "PATH_SEP" and last_token.type == "CLOSE_PAREN":
emit = True
path_enter = True
# A XPath path may continue after a predicate.
elif t.name == "PATH_SEP" and last_token.name == "XPATH_PRED_END":
elif t.type == "PATH_SEP" and last_token.type == "XPATH_PRED_END":
emit = True
path_enter = True
# Tokens that are part of a XPath path.
elif path_enter:
if t.name == "WHITESPACE":
if t.type == "WHITESPACE":
path_enter = False
elif t.name != "XPATH_PRED_START":
elif t.type != "XPATH_PRED_START":
emit = True
elif t.name == "XPATH_PRED_START":
elif t.type == "XPATH_PRED_START":
emit = True
path_enter = False
pred_enter = True
# Tokens that are part of a XPath predicate.
elif pred_enter:
if t.name != "XPATH_PRED_END":
if t.type != "XPATH_PRED_END":
emit = True
elif t.name == "XPATH_PRED_END":
elif t.type == "XPATH_PRED_END":
emit = True
pred_enter = False
# Track instance expression tokens, ignore others.
Expand All @@ -78,10 +78,10 @@ def find_boundaries(xml_text: str) -> list[tuple[int, int]]:
# If an instance expression had ended, note the string position boundary.
elif instance_enter:
instance_enter = False
boundaries.append(last_token.end)
boundaries.append(last_token.end_pos)

if last_token is not None:
boundaries.append(last_token.end)
boundaries.append(last_token.end_pos)

# Pair up the boundaries [1, 2, 3, 4] -> [(1, 2), (3, 4)].
bounds = iter(boundaries)
Expand Down
Loading