From be67fcce0fa145a2a25fd7c90ac2b07f79f66a8e Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 18 Jun 2026 00:58:50 -0700 Subject: [PATCH] test(pty): detect stripAnsi-collapsed prose AUQs + judge spinner-precedence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The plan-eng/plan-design plan-mode + finding-floor smokes timed out even when the skill HAD rendered a complete prose AskUserQuestion and was waiting: the PTY strips cursor-positioning escapes, collapsing the option newlines/spaces so "A) ..." arrives as "A(recommended)" / "-B:" and "Reply with A, B, or C" as "ReplywithA,B,orC". Every line-anchored detector (Patterns 1-3) returns false on those bytes, so proseAUQEverObserved never latched and the run timed out on a question that was already on screen. Add Pattern 4/5: a two-signal collapsed-form detector — a reply/recommendation marker (space-insensitive "reply with [A-D]", "Recommendation:", or "(recommended)") AND 2+ distinct A-D letters each punctuated by ) : or (. The conjunction is what separates a real AUQ from incidental report prose; verified true on the verbatim failing-run buffers where Patterns 1-3 return false. Also fix the Haiku judge spinner bias: of 614 verdicts, 569 were 'working' and 95 of those noted a question was visible — Claude Code keeps the spinner animating at an idle prose decision, so the judge coin-flipped. Add a precedence override: when an option list AND a Recommendation/Reply instruction are both visible, classify WAITING even with spinner glyphs. Kept the strict dual-signal gate (never option-list-alone) so auto-decide-preserved doesn't flip. 5 unit tests pin the two-signal contract (2 true on real collapsed bytes, 3 false guards). 90 -> 95 pass. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- test/helpers/claude-pty-runner.ts | 42 ++++++++++++++++++- test/helpers/claude-pty-runner.unit.test.ts | 45 +++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts index 36b2efb1a..280139bbd 100644 --- a/test/helpers/claude-pty-runner.ts +++ b/test/helpers/claude-pty-runner.ts @@ -407,8 +407,10 @@ export function judgePtyState( const prompt = `You are reading a snapshot of a terminal where Claude Code is running in plan mode for an automated test. Your job: classify the agent's current state. Pick exactly ONE: -- WAITING — agent surfaced a question or option list and is sitting at the input prompt waiting for user reply. Signs: numbered/lettered options visible (1./2./3. or A)/B)/C)), "Recommendation:" line, cursor at empty input prompt with no recent generation activity. +- WAITING — agent surfaced a question or option list and is sitting at the input prompt waiting for user reply. Signs: numbered/lettered options visible (1./2./3. or A)/B)/C)), "Recommendation:" line, cursor at empty input prompt with no recent generation activity, OR a fully-rendered question + reply-instruction (e.g. "Reply with A, B, or C" / "Recommendation:") is visible. - WORKING — agent is actively generating or running tools. Signs: spinner glyphs (✻ ✶ ✳ ✢ ✽), "Musing..." or "Churned for ..." text, recent tool-call blocks (Read/Edit/Bash/Grep), in-flight token output. + +PRECEDENCE OVERRIDE: if a lettered/numbered option list (A)/B)/1./2.) AND a "Recommendation:" or "Reply with"/"Reply A" instruction are BOTH visible in this snapshot, classify WAITING even when spinner glyphs (✻ ✶ ✳ ✢ ✽) are still animating — Claude Code keeps the spinner up at an idle prose decision, so a spinner alongside a fully-rendered question + reply-instruction is a residual render artifact, not active generation. 
- HUNG — agent has stopped without surfacing a question and without any spinner/work activity. Rare; usually means a crash. Respond with strict JSON ONLY (no markdown fences, no prose): @@ -507,6 +509,13 @@ ${tail} * - 3+ markdown bold-bullet options (`- **label**`) following an * interrogative line — office-hours renders its mode question this way * (`> - **Building a startup**`), which has no letter/number marker + * - Pattern 4/5 (collapsed-form): a reply-instruction OR recommendation + * marker PLUS 2+ distinct A-D letter markers each punctuated by ) : or ( + * anywhere in the tail. stripAnsi destroys the newlines + inter-word + * spaces that the line-anchored patterns above need, so a real prose AUQ + * arrives collapsed ("ReplywithA,B,orC", "A(recommended)", "-B:") and is + * invisible to Patterns 1-3. This is the dominant Shape-B render mode in + * the plan-design smoke + floor timeouts (verified against real run bytes). * * Used by classifyVisible and runPlanSkillFloorCheck to return outcome='asked' * (or auq_observed) instead of letting the harness time out when the model @@ -567,6 +576,37 @@ export function isProseAUQVisible(visible: string): boolean { const boldBulletHits = (tail.match(/[-*•]\s+\*\*/g) || []).length; if (boldBulletHits >= 3) return true; } + + // Pattern 4/5: collapsed-form prose AUQ. stripAnsi removes the + // cursor-positioning escapes that render option newlines + inter-word + // spaces, so "Reply with A, B, or C" arrives as "ReplywithA,B,orC" and + // "A) ..." as "A(recommended)" / "-B:" — defeating every line-anchored or + // ')'-anchored pattern above (Patterns 1-3 all return false on the real + // plan-design smoke + floor timeout bytes). 
Detect via two INDEPENDENT + // signals that must BOTH hold — the corroboration is what separates a real + // AUQ from incidental report prose that happens to mention a recommendation: + // (1) a reply-instruction matched space-insensitively OR a recommendation + // marker, AND + // (2) 2+ distinct A-D letter markers each punctuated by ) : or ( anywhere + // in the tail. + // A single 'B)' + the word "recommendation", or a comma-only collapsed + // "ReplywithA,B,orC" with no )/:/( punctuation on the letters, both stay + // false — the two-signal contract is pinned by unit tests. + const replyOrRec = + /reply\s*(?:with)?\s*[A-D]/i.test(tail) || + /reply(?:with)?[A-D]/i.test(tail.replace(/\s+/g, '')) || + /\bRecommendation\s*:/i.test(tail) || + /\(recommended\)/i.test(tail); + if (replyOrRec) { + const collapsedLetterRe = /\b([A-D])[):(]/g; + const collapsedHits = new Set(); + let cm: RegExpExecArray | null; + while ((cm = collapsedLetterRe.exec(tail)) !== null) { + if (cm[1]) collapsedHits.add(cm[1]); + } + if (collapsedHits.size >= 2) return true; + } + return false; } diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts index a42e12583..df4e5fad1 100644 --- a/test/helpers/claude-pty-runner.unit.test.ts +++ b/test/helpers/claude-pty-runner.unit.test.ts @@ -349,6 +349,51 @@ What should we do about this? // The ❯1. cursor gate fires first — native list handling owns this. expect(isProseAUQVisible(sample)).toBe(false); }); + + // Pattern 4/5: collapsed-form prose AUQ. stripAnsi destroys the newlines + + // inter-word spaces, so a real prose AUQ arrives collapsed and defeats the + // line-anchored Patterns 1-3. These are the dominant Shape-B render mode in + // the plan-design smoke + floor timeouts — verbatim de-spinnered bytes from + // the real failing runs (bdm3sucql.output). 
+ test('matches the real collapsed floor render (colon-delimited, Pattern 4/5)', () => { + const sample = + 'The review is blocked on D1—reply withA, B, r Cabovetocontinue:' + + '- A(recommended): Spec thefull P1AskUserQuestioncopy in this review' + + '-B:LeaveP1copytotheimplementerwithstructuralrequirements' + + 'C: Add a placeholder template to the plan'; + expect(isProseAUQVisible(sample)).toBe(true); + }); + + test('matches the real collapsed plan-mode render (Recommendation + collapsed A)/B), Pattern 4/5)', () => { + const sample = + 'Recommendation:A—writethecopynow.(recommended)A) Writ the fullcopy in thisdesign review— now.' + + '(recommended) Completeness:10/10 B) Leveit to theimplemente — task spec is enough.' + + 'Reply withA (write the copy now)orB(leavetoimplementer)'; + expect(isProseAUQVisible(sample)).toBe(true); + }); + + test('collapsed-form requires BOTH signals — single B) + word "recommendation" stays false', () => { + // Only one punctuated letter marker: the two-signal contract is not met. + const sample = + 'We should consider option B) here. My recommendation is to do it now.'; + expect(isProseAUQVisible(sample)).toBe(false); + }); + + test('collapsed-form requires letter punctuation — comma-only "ReplywithA,B,orC" stays false', () => { + // Reply-instruction present, but the letters carry no ) : or ( punctuation, + // so they could be incidental enumerations in running prose. Stays false. + const sample = 'ReplywithA,B,orC'; + expect(isProseAUQVisible(sample)).toBe(false); + }); + + test('collapsed-form does not regress the existing FP guard (see option B) ... point A))', () => { + // The classic citation FP: a model referencing prior options in prose. + // No reply-instruction / recommendation marker anywhere in the sample, so + // the collapsed-form signal does not fire either. 
+ const sample = + 'As noted (see option B) above, and the earlier point A) we discussed, this is fine.'; + expect(isProseAUQVisible(sample)).toBe(false); + }); }); describe('classifyVisible (runtime path through the runner classifier)', () => {