fix(workflows): make expression operator/literal parsing quote-aware (#3197)

_evaluate_simple_expression split on operator keywords using naive str.find/split, so a keyword INSIDE a quoted operand was treated as an operator: `inputs.mode == 'read and write'` split on the inner ' and ' and evaluated as `(inputs.mode == 'read) and (write')`. The literal short-circuit was also too greedy -- `'a' == 'b'` matched startswith("'")/endswith("'") and was stripped to the garbage truthy string `a' == 'b`, so `'done' == 'failed'` evaluated truthy and gated the wrong branch. Add a quote/bracket-aware _find_top_level helper (mirroring the existing _split_top_level_commas) and use it for the and/or/comparison/in/not-in splits; tighten the literal short-circuit to fire only when the opening quote's match is the final char. The docstring already lists comparisons + and/or/not + in/not-in + string literals as supported, so this restores the documented contract. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-03 12:28:06 +08:00 · 2026-06-29 19:53:35 +05:00
parent fd185c1fd8
commit 2a9db1d350
2 changed files with 89 additions and 19 deletions
--- a/src/specify_cli/workflows/expressions.py
+++ b/src/specify_cli/workflows/expressions.py
@@ -180,6 +180,35 @@ def _split_top_level_commas(text: str) -> list[str]:
    return parts


+def _find_top_level(text: str, token: str) -> int:
+    """Return the index of the first occurrence of *token* in *text* that lies
+    outside any quoted string or nested bracket, or ``-1`` if there is none.
+
+    Used so operator/keyword splitting (``and``/``or``/``in``/comparisons) does
+    not match a separator that appears *inside* a quoted operand -- e.g. the
+    ``and`` in ``mode == 'read and write'`` or the ``or`` in ``'approve or reject'``.
+    """
+    quote: str | None = None
+    depth = 0
+    i = 0
+    n = len(text)
+    while i < n:
+        ch = text[i]
+        if quote is not None:
+            if ch == quote:
+                quote = None
+        elif ch in ("'", '"'):
+            quote = ch
+        elif ch in "([{":
+            depth += 1
+        elif ch in ")]}":
+            depth = max(0, depth - 1)
+        elif depth == 0 and text.startswith(token, i):
+            return i
+        i += 1
+    return -1
+
+
 def _evaluate_simple_expression(expr: str, namespace: dict[str, Any]) -> Any:
    """Evaluate a simple expression against the namespace.

@@ -193,11 +222,12 @@ def _evaluate_simple_expression(expr: str, namespace: dict[str, Any]) -> Any:
    """
    expr = expr.strip()

-    # String literal — check before pipes and operators so quoted strings
-    # containing | or operator keywords are not mis-parsed.
-    if (expr.startswith("'") and expr.endswith("'")) or (
-        expr.startswith('"') and expr.endswith('"')
-    ):
+    # String literal — only when the WHOLE expression is one quoted string,
+    # i.e. the opening quote's matching close is the final character. Checking
+    # startswith/endswith alone would also grab `'a' == 'b'` and strip it to the
+    # garbage `a' == 'b`; a genuine single literal short-circuits here so quoted
+    # strings containing `|` or operator keywords are not mis-parsed downstream.
+    if expr[:1] in ("'", '"') and expr.find(expr[0], 1) == len(expr) - 1:
        return expr[1:-1]

    # Handle pipe filters
@@ -262,29 +292,33 @@ def _evaluate_simple_expression(expr: str, namespace: dict[str, Any]) -> Any:
        )

    # Boolean operators — parse 'or' first (lower precedence) so that
-    # 'a or b and c' is evaluated as 'a or (b and c)'.
-    if " or " in expr:
-        parts = expr.split(" or ", 1)
-        left = _evaluate_simple_expression(parts[0].strip(), namespace)
-        right = _evaluate_simple_expression(parts[1].strip(), namespace)
+    # 'a or b and c' is evaluated as 'a or (b and c)'. Splits are quote/bracket
+    # aware so a keyword inside a quoted operand (e.g. the 'and' in
+    # 'read and write') is not mistaken for an operator.
+    or_idx = _find_top_level(expr, " or ")
+    if or_idx != -1:
+        left = _evaluate_simple_expression(expr[:or_idx].strip(), namespace)
+        right = _evaluate_simple_expression(expr[or_idx + 4:].strip(), namespace)
        return bool(left) or bool(right)

-    if " and " in expr:
-        parts = expr.split(" and ", 1)
-        left = _evaluate_simple_expression(parts[0].strip(), namespace)
-        right = _evaluate_simple_expression(parts[1].strip(), namespace)
+    and_idx = _find_top_level(expr, " and ")
+    if and_idx != -1:
+        left = _evaluate_simple_expression(expr[:and_idx].strip(), namespace)
+        right = _evaluate_simple_expression(expr[and_idx + 5:].strip(), namespace)
        return bool(left) and bool(right)

    if expr.startswith("not "):
        inner = _evaluate_simple_expression(expr[4:].strip(), namespace)
        return not bool(inner)

-    # Comparison operators (order matters — check multi-char ops first)
+    # Comparison operators (order matters — check multi-char ops first). Split at
+    # the first top-level occurrence so an operator inside a quoted operand is
+    # ignored.
    for op in ("!=", "==", ">=", "<=", ">", "<", " not in ", " in "):
-        if op in expr:
-            parts = expr.split(op, 1)
-            left = _evaluate_simple_expression(parts[0].strip(), namespace)
-            right = _evaluate_simple_expression(parts[1].strip(), namespace)
+        op_idx = _find_top_level(expr, op)
+        if op_idx != -1:
+            left = _evaluate_simple_expression(expr[:op_idx].strip(), namespace)
+            right = _evaluate_simple_expression(expr[op_idx + len(op):].strip(), namespace)
            if op == "==":
                return left == right
            if op == "!=":
--- a/tests/test_workflows.py
+++ b/tests/test_workflows.py
@@ -286,6 +286,42 @@ class TestExpressions:
        assert evaluate_expression('{{ [["a", "b"], "c"] }}', ctx) == [["a", "b"], "c"]
        assert evaluate_expression("{{ [[1, 2], [3, 4]] }}", ctx) == [[1, 2], [3, 4]]

+    def test_operator_splitting_is_quote_aware(self):
+        from specify_cli.workflows.expressions import (
+            evaluate_condition,
+            evaluate_expression,
+        )
+        from specify_cli.workflows.base import StepContext
+
+        # An 'and'/'or'/'in' keyword INSIDE a quoted operand must not be treated
+        # as a boolean/membership operator: the comparison applies to the whole
+        # string literal.
+        ctx = StepContext(inputs={"mode": "read and write"})
+        assert evaluate_expression("{{ inputs.mode == 'read and write' }}", ctx) is True
+        assert evaluate_expression("{{ inputs.mode == 'read or write' }}", ctx) is False
+        # ...also when the quoted literal is on the left of the operator.
+        left_ctx = StepContext(inputs={"x": "approve or reject"})
+        assert evaluate_expression("{{ 'approve or reject' == inputs.x }}", left_ctx) is True
+        # membership against a literal that contains a keyword
+        assert evaluate_expression("{{ 'cat' in 'cat and dog' }}", StepContext()) is True
+
+        # Literal-vs-literal equality no longer mis-strips to a garbage string
+        # (previously `'done' == 'failed'` short-circuited to the truthy string
+        # "done' == 'failed").
+        assert evaluate_condition("{{ 'done' == 'failed' }}", StepContext()) is False
+        assert evaluate_condition("{{ 'done' == 'done' }}", StepContext()) is True
+
+        # A single quoted literal that itself contains operator text is preserved.
+        assert evaluate_expression("{{ 'a == b' }}", StepContext()) == "a == b"
+        assert evaluate_expression("{{ 'x and y' }}", StepContext()) == "x and y"
+
+        # Regression: ordinary (unquoted-keyword) parsing still works.
+        plain = StepContext(inputs={"a": 1, "b": 2, "mode": "read"})
+        assert evaluate_expression("{{ inputs.mode == 'read' }}", plain) is True
+        assert evaluate_expression("{{ inputs.a == 1 and inputs.b == 2 }}", plain) is True
+        assert evaluate_expression("{{ inputs.a == 9 or inputs.b == 2 }}", plain) is True
+        assert evaluate_expression("{{ inputs.missing | default('a and b') }}", plain) == "a and b"
+
    def test_filter_default(self):
        from specify_cli.workflows.expressions import evaluate_expression
        from specify_cli.workflows.base import StepContext