From be67fcce0fa145a2a25fd7c90ac2b07f79f66a8e Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Thu, 18 Jun 2026 00:58:50 -0700
Subject: [PATCH] test(pty): detect stripAnsi-collapsed prose AUQs + judge
 spinner-precedence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The plan-eng/plan-design plan-mode + finding-floor smokes timed out even when
the skill HAD rendered a complete prose AskUserQuestion and was waiting: the PTY
strips cursor-positioning escapes, collapsing the option newlines/spaces so
"A) ..." arrives as "A(recommended)" / "-B:" and "Reply with A, B, or C" as
"ReplywithA,B,orC". Every line-anchored detector (Patterns 1-3) returns false on
those bytes, so proseAUQEverObserved never latched and the run timed out on a
question that was already on screen.

Add Pattern 4/5: a two-signal collapsed-form detector — a reply/recommendation
marker (space-insensitive "reply with [A-D]", "Recommendation:", or
"(recommended)") AND 2+ distinct uppercase A-D letters, each at a word boundary
and immediately followed by ) : or (. The conjunction is what separates a real
AUQ from incidental report prose; verified true on the verbatim failing-run
buffers where Patterns 1-3 return false.

Also fix the Haiku judge spinner bias: of 614 verdicts, 569 were 'working' and
95 of those noted a question was visible — Claude Code keeps the spinner
animating at an idle prose decision, so the judge coin-flipped. Add a precedence
override: when an option list AND a Recommendation/Reply instruction are both
visible, classify WAITING even with spinner glyphs. Kept the strict dual-signal
gate (never option-list-alone) so auto-decide-preserved doesn't flip.

5 unit tests pin the two-signal contract (2 true on real collapsed bytes, 3
false guards). 90 -> 95 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 test/helpers/claude-pty-runner.ts           | 42 ++++++++++++++++++-
 test/helpers/claude-pty-runner.unit.test.ts | 45 +++++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts
index 36b2efb1a..280139bbd 100644
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -407,8 +407,10 @@ export function judgePtyState(
   const prompt = `You are reading a snapshot of a terminal where Claude Code is running in plan mode for an automated test. Your job: classify the agent's current state.
 
 Pick exactly ONE:
-- WAITING — agent surfaced a question or option list and is sitting at the input prompt waiting for user reply. Signs: numbered/lettered options visible (1./2./3. or A)/B)/C)), "Recommendation:" line, cursor at empty input prompt with no recent generation activity.
+- WAITING — agent surfaced a question or option list and is sitting at the input prompt waiting for user reply. Signs: numbered/lettered options visible (1./2./3. or A)/B)/C)), "Recommendation:" line, cursor at empty input prompt with no recent generation activity, OR a fully-rendered question + reply-instruction (e.g. "Reply with A, B, or C" / "Recommendation:") is visible.
 - WORKING — agent is actively generating or running tools. Signs: spinner glyphs (✻ ✶ ✳ ✢ ✽), "Musing..." or "Churned for ..." text, recent tool-call blocks (Read/Edit/Bash/Grep), in-flight token output.
+
+PRECEDENCE OVERRIDE: if a lettered/numbered option list (A)/B)/1./2.) AND a "Recommendation:" or "Reply with"/"Reply A" instruction are BOTH visible in this snapshot, classify WAITING even when spinner glyphs (✻ ✶ ✳ ✢ ✽) are still animating — Claude Code keeps the spinner up at an idle prose decision, so a spinner alongside a fully-rendered question + reply-instruction is a residual render artifact, not active generation.
 - HUNG — agent has stopped without surfacing a question and without any spinner/work activity. Rare; usually means a crash.
 
 Respond with strict JSON ONLY (no markdown fences, no prose):
@@ -507,6 +509,13 @@ ${tail}
  *   - 3+ markdown bold-bullet options (`- **label**`) following an
  *     interrogative line — office-hours renders its mode question this way
  *     (`> - **Building a startup**`), which has no letter/number marker
+ *   - Pattern 4/5 (collapsed-form): a reply-instruction OR recommendation
+ *     marker PLUS 2+ distinct A-D letter markers each punctuated by ) : or (
+ *     anywhere in the tail. stripAnsi destroys the newlines + inter-word
+ *     spaces that the line-anchored patterns above need, so a real prose AUQ
+ *     arrives collapsed ("ReplywithA,B,orC", "A(recommended)", "-B:") and is
+ *     invisible to Patterns 1-3. This is the dominant Shape-B render mode in
+ *     the plan-design smoke + floor timeouts (verified against real run bytes).
  *
  * Used by classifyVisible and runPlanSkillFloorCheck to return outcome='asked'
  * (or auq_observed) instead of letting the harness time out when the model
@@ -567,6 +576,37 @@ export function isProseAUQVisible(visible: string): boolean {
     const boldBulletHits = (tail.match(/[-*•]\s+\*\*/g) || []).length;
     if (boldBulletHits >= 3) return true;
   }
+
+  // Pattern 4/5: collapsed-form prose AUQ. stripAnsi removes the
+  // cursor-positioning escapes that render option newlines + inter-word
+  // spaces, so "Reply with A, B, or C" arrives as "ReplywithA,B,orC" and
+  // "A) ..." as "A(recommended)" / "-B:" — defeating every line-anchored or
+  // ')'-anchored pattern above (Patterns 1-3 all return false on the real
+  // plan-design smoke + floor timeout bytes). Detect via two INDEPENDENT
+  // signals that must BOTH hold — the corroboration is what separates a real
+  // AUQ from incidental report prose that happens to mention a recommendation:
+  //   (1) a whitespace-insensitive "reply [with] <letter>" (under /i the
+  //       letter may be a-d too), "Recommendation:", or "(recommended)", AND
+  //   (2) 2+ distinct uppercase A-D letters, each preceded by a word boundary
+  //       and immediately followed by ) : or (, anywhere in the tail.
+  // A single 'B)' + the word "recommendation", or a comma-only collapsed
+  // "ReplywithA,B,orC" with no )/:/( punctuation on the letters, both stay
+  // false — the two-signal contract is pinned by unit tests.
+  const replyOrRec =
+    /reply\s*(?:with)?\s*[A-D]/i.test(tail) ||
+    /reply(?:with)?[A-D]/i.test(tail.replace(/\s+/g, '')) ||
+    /\bRecommendation\s*:/i.test(tail) ||
+    /\(recommended\)/i.test(tail);
+  if (replyOrRec) {
+    const collapsedLetterRe = /\b([A-D])[):(]/g;
+    const collapsedHits = new Set<string>();
+    let cm: RegExpExecArray | null;
+    while ((cm = collapsedLetterRe.exec(tail)) !== null) {
+      if (cm[1]) collapsedHits.add(cm[1]);
+    }
+    if (collapsedHits.size >= 2) return true;
+  }
+
   return false;
 }
 
diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts
index a42e12583..df4e5fad1 100644
--- a/test/helpers/claude-pty-runner.unit.test.ts
+++ b/test/helpers/claude-pty-runner.unit.test.ts
@@ -349,6 +349,51 @@ What should we do about this?
     // The ❯1. cursor gate fires first — native list handling owns this.
     expect(isProseAUQVisible(sample)).toBe(false);
   });
+
+  // Pattern 4/5: collapsed-form prose AUQ. stripAnsi destroys the newlines +
+  // inter-word spaces, so a real prose AUQ arrives collapsed and defeats the
+  // line-anchored Patterns 1-3. This is the dominant Shape-B render mode in
+  // the plan-design smoke + floor timeouts; the two positive samples below are
+  // verbatim de-spinnered bytes from the real failing runs (bdm3sucql.output).
+  test('matches the real collapsed floor render (colon-delimited, Pattern 4/5)', () => {
+    const sample =
+      'The review is blocked on D1—reply withA, B, r Cabovetocontinue:' +
+      '- A(recommended): Spec thefull P1AskUserQuestioncopy in this review' +
+      '-B:LeaveP1copytotheimplementerwithstructuralrequirements' +
+      'C: Add a placeholder template to the plan';
+    expect(isProseAUQVisible(sample)).toBe(true);
+  });
+
+  test('matches the real collapsed plan-mode render (Recommendation + collapsed A)/B), Pattern 4/5)', () => {
+    const sample =
+      'Recommendation:A—writethecopynow.(recommended)A) Writ the fullcopy in thisdesign review— now.' +
+      '(recommended) Completeness:10/10 B) Leveit to theimplemente — task spec is enough.' +
+      'Reply withA (write the copy now)orB(leavetoimplementer)';
+    expect(isProseAUQVisible(sample)).toBe(true);
+  });
+
+  test('collapsed-form requires BOTH signals — single B) + word "recommendation" stays false', () => {
+    // Only one punctuated letter marker: the two-signal contract is not met.
+    const sample =
+      'We should consider option B) here. My recommendation is to do it now.';
+    expect(isProseAUQVisible(sample)).toBe(false);
+  });
+
+  test('collapsed-form requires letter punctuation — comma-only "ReplywithA,B,orC" stays false', () => {
+    // Reply-instruction present, but the letters carry no ) : or ( punctuation,
+    // so they could be incidental enumerations in running prose. Stays false.
+    const sample = 'ReplywithA,B,orC';
+    expect(isProseAUQVisible(sample)).toBe(false);
+  });
+
+  test('collapsed-form does not regress the existing FP guard (see option B) ... point A))', () => {
+    // The classic citation FP: a model referencing prior options in prose.
+    // The sample has no reply-instruction / recommendation marker anywhere, so
+    // signal (1) is absent and the two punctuated letters alone do not fire.
+    const sample =
+      'As noted (see option B) above, and the earlier point A) we discussed, this is fine.';
+    expect(isProseAUQVisible(sample)).toBe(false);
+  });
 });
 
 describe('classifyVisible (runtime path through the runner classifier)', () => {