fix(search): support phrase search and fix CJK matching in history search (#14225)

### What this PR does

Before this PR:

- Searching CJK (Chinese/Japanese/Korean) text in whole-word mode silently failed because the Unicode word-boundary assertions (`\p{L}`) treat CJK characters as letters, causing the lookbehind/lookahead to reject valid matches within continuous CJK text.
- Multi-word English queries were split into individual words with OR logic, producing noisy results.
- Quoted phrases (e.g., `"machine learning"`) were not supported — quotes were stripped and each word was searched independently.

After this PR:

- CJK terms automatically degrade to substring matching in whole-word mode, so searches like "组合优于" correctly find results within longer CJK text.
- The search filter uses AND logic (`.every()`) instead of OR (`.some()`), so all terms must be present for a result to match.
- Quoted phrases (`"machine learning"` or `'neural network'`) are kept as a single search term.

Fixes #14212

### Why we need it and why it was done in this way

The root cause of issue #14212 is twofold:

1. **CJK whole-word matching failure**: `buildWholeWordPattern` wraps terms with `(?<![\p{L}\p{N}])…(?![\p{L}\p{N}])`. Since CJK characters match `\p{L}`, searching for a CJK substring inside continuous CJK text always fails the negative lookbehind. The fix detects CJK characters in the search term and skips word-boundary assertions, falling back to plain substring matching.
2. **No phrase support**: `splitKeywordsToTerms` simply split on whitespace. The fix uses a regex that extracts quoted substrings first, then splits remaining text on whitespace.

The following tradeoffs were made:

- CJK terms always use substring matching even in "whole-word" mode, since CJK languages don't have word boundaries in the same sense as Latin scripts. This is the expected behavior for CJK users.

The following alternatives were considered:

- Using a CJK segmentation library for proper word-boundary detection — rejected as too heavy for this use case.

### Breaking changes

None. Search behavior becomes more accurate; no API or data model changes.

### Special notes for your reviewer

- The `SearchResults.tsx` change is a single-line `.some()` → `.every()` switch (OR→AND logic).
- All new behavior is covered by tests in `keywordSearch.test.ts`.
- This is a minimal bug fix targeting the `main` branch via `hotfix/*` — no refactoring included.

### Checklist

- [x] PR: The PR description is expressive enough and will help future contributors
- [x] Code: [Write code that humans can understand](https://en.wikiquote.org/wiki/Martin_Fowler#code-for-humans) and [Keep it simple](https://en.wikipedia.org/wiki/KISS_principle)
- [x] Refactor: You have [left the code cleaner than you found it (Boy Scout Rule)](https://learning.oreilly.com/library/view/97-things-every/9780596809515/ch08.html)
- [x] Upgrade: Impact of this change on upgrade flows was considered and addressed if required
- [x] Documentation: A [user-guide update](https://docs.cherry-ai.com) was considered and is present (link) or not required. Check this only when the PR introduces or changes a user-facing feature or behavior.
- [x] Self-review: I have reviewed my own code (e.g., via [`/gh-pr-review`](/.claude/skills/gh-pr-review/SKILL.md), `gh pr diff`, or GitHub UI) before requesting review from others

### Release note

```release-note
Fix history search failing to find CJK text and not supporting quoted phrase search.
```

Signed-off-by: raymond <raymond@qlg.me>
Co-authored-by: SuYao <sy20010504@gmail.com>
2026-07-04 05:00:00 +08:00 · 2026-04-18 19:47:16 +08:00
parent fbda0d213d
commit d89ce08716
3 changed files with 83 additions and 5 deletions
--- a/src/renderer/src/pages/history/components/SearchResults.tsx
+++ b/src/renderer/src/pages/history/components/SearchResults.tsx
@@ -221,7 +221,7 @@ const SearchResults: FC<Props> = ({ keywords, onMessageClick, onTopicClick, ...p
      .filter((block) => block.type === MessageBlockType.MAIN_TEXT)
      .filter((block) => {
        const searchableContent = stripMarkdownFormatting(block.content)
-        return searchRegexes.some((regex) => regex.test(searchableContent))
+        return searchRegexes.every((regex) => regex.test(searchableContent))
      })

    const messages = topics?.flatMap((topic) => topic.messages)
--- a/src/renderer/src/utils/tests/keywordSearch.test.ts
+++ b/src/renderer/src/utils/tests/keywordSearch.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest'

 import {
  buildKeywordRegex,
+  buildKeywordRegexes,
  buildKeywordUnionRegex,
  type KeywordMatchMode,
  splitKeywordsToTerms
@@ -16,6 +17,55 @@ describe('keywordSearch', () => {
    it('returns empty array for empty input', () => {
      expect(splitKeywordsToTerms('')).toEqual([])
    })
+
+    describe('phrase search (quoted substrings)', () => {
+      it('extracts double-quoted phrases as single terms', () => {
+        expect(splitKeywordsToTerms('"machine learning" deep')).toEqual(['machine learning', 'deep'])
+      })
+
+      it('extracts single-quoted phrases as single terms', () => {
+        expect(splitKeywordsToTerms("'neural network' model")).toEqual(['neural network', 'model'])
+      })
+
+      it('handles multiple quoted phrases', () => {
+        expect(splitKeywordsToTerms('"hello world" "foo bar"')).toEqual(['hello world', 'foo bar'])
+      })
+
+      it('handles mixed quoted and unquoted terms', () => {
+        expect(splitKeywordsToTerms('test "some phrase" end')).toEqual(['test', 'some phrase', 'end'])
+      })
+
+      it('handles unclosed quotes gracefully', () => {
+        expect(splitKeywordsToTerms('"unclosed phrase')).toEqual(['unclosed phrase'])
+      })
+
+      it('skips empty quotes', () => {
+        expect(splitKeywordsToTerms('"" hello')).toEqual(['hello'])
+      })
+    })
+  })
+
+  describe('AND logic with buildKeywordRegexes', () => {
+    it('every() returns true when all terms are present', () => {
+      const terms = splitKeywordsToTerms('hello world')
+      const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
+      expect(regexes.every((r) => r.test('hello world'))).toBe(true)
+      expect(regexes.every((r) => r.test('world of hello'))).toBe(true)
+    })
+
+    it('every() returns false when only some terms are present', () => {
+      const terms = splitKeywordsToTerms('hello world')
+      const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
+      expect(regexes.every((r) => r.test('hello only'))).toBe(false)
+      expect(regexes.every((r) => r.test('world only'))).toBe(false)
+    })
+
+    it('every() works with phrase search', () => {
+      const terms = splitKeywordsToTerms('"machine learning" deep')
+      const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
+      expect(regexes.every((r) => r.test('deep machine learning is great'))).toBe(true)
+      expect(regexes.every((r) => r.test('deep learning but not machine'))).toBe(false)
+    })
  })

  describe('buildKeywordRegex (whole-word)', () => {
@@ -46,6 +96,17 @@ describe('keywordSearch', () => {
      expect(regex.test('mañana')).toBe(false)
      expect(regex.test('ana')).toBe(true)
    })
+
+    it('CJK terms degrade to substring in whole-word mode', () => {
+      const regex = buildKeywordRegex('组合优于', { matchMode })
+      expect(regex.test('投资组合优于其他策略')).toBe(true)
+      expect(regex.test('组合优于')).toBe(true)
+    })
+
+    it('CJK terms in whole-word mode also match when surrounded by non-CJK characters', () => {
+      const regex = buildKeywordRegex('组合优于', { matchMode })
+      expect(regex.test('abc组合优于def')).toBe(true)
+    })
  })

  describe('buildKeywordRegex (substring)', () => {
--- a/src/renderer/src/utils/keywordSearch.ts
+++ b/src/renderer/src/utils/keywordSearch.ts
@@ -5,13 +5,30 @@ export function escapeRegex(text: string): string {
 }

 export function splitKeywordsToTerms(keywords: string): string[] {
-  return (keywords || '')
-    .toLowerCase()
-    .split(/\s+/)
-    .filter((term) => term.length > 0)
+  const input = (keywords || '').trim()
+  if (input.length === 0) return []
+
+  const terms: string[] = []
+  const pattern = /"([^"]*)"?|'([^']*)'?|(\S+)/g
+  let match: RegExpExecArray | null
+  while ((match = pattern.exec(input)) !== null) {
+    const term = (match[1] ?? match[2] ?? match[3]).trim()
+    if (term.length > 0) {
+      terms.push(term.toLowerCase())
+    }
+  }
+  return terms
+}
+
+function containsCJK(text: string): boolean {
+  return /[\u4e00-\u9fff\u3400-\u4dbf\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(text)
 }

 function buildWholeWordPattern(escapedTerm: string): string {
+  // CJK text has no word boundaries — degrade to substring matching
+  if (containsCJK(escapedTerm)) {
+    return escapedTerm
+  }
  // "Whole word" here means: do not match inside a larger alphanumeric token.
  // This avoids false positives like:
  // - API keys: "IMr4WSMS5dwa52"