fix(search): support phrase search and fix CJK matching in history search (#14225)

### What this PR does

Before this PR:
- Searching CJK (Chinese/Japanese/Korean) text in whole-word mode
silently failed because Unicode word-boundary assertions (`\p{L}`) treat
CJK characters as letters, causing the lookbehind/lookahead to reject
valid matches within continuous CJK text.
- Multi-word English queries were split into individual words with OR
logic, producing noisy results.
- Quoted phrases (e.g., `"machine learning"`) were not supported —
the query was split on whitespace only, so the quote characters stayed
attached to the neighbouring words and each word was searched independently.

After this PR:
- CJK terms automatically degrade to substring matching in whole-word
mode, so searches like "组合优于" correctly find results within longer CJK
text.
- Search filter uses AND logic (`.every()`) instead of OR (`.some()`),
so all terms must be present for a result to match.
- Quoted phrases (`"machine learning"` or `'neural network'`) are kept
as a single search term.

Fixes #14212

### Why we need it and why it was done in this way

The root cause of issue #14212 is twofold:

1. **CJK whole-word matching failure**: `buildWholeWordPattern` wraps
terms with `(?<![\p{L}\p{N}])…(?![\p{L}\p{N}])`. Since CJK characters
match `\p{L}`, searching for a CJK substring inside continuous CJK text
always fails the negative lookbehind. The fix detects CJK characters in
the search term and skips word-boundary assertions, falling back to
plain substring matching.

2. **No phrase support**: `splitKeywordsToTerms` simply split on
whitespace. The fix uses a regex that extracts quoted substrings first,
then splits remaining text on whitespace.

The following tradeoffs were made:
- CJK terms always use substring matching even in "whole-word" mode,
since CJK languages don't have word boundaries in the same sense as
Latin scripts. This is the expected behavior for CJK users.

The following alternatives were considered:
- Using a CJK segmentation library for proper word-boundary detection —
rejected as too heavy for this use case.

### Breaking changes

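None at the API or data-model level. Note that multi-word queries now
require every term to match (AND) instead of any term (OR), so an existing
query may return fewer, more precise results.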

### Special notes for your reviewer

- The `SearchResults.tsx` change is a single-line `.some()` → `.every()`
switch (OR→AND logic).
- All new behavior is covered by tests in `keywordSearch.test.ts`.
- This is a minimal bug fix targeting the `main` branch via `hotfix/*` —
no refactoring included.

### Checklist

- [x] PR: The PR description is expressive enough and will help future
contributors
- [x] Code: [Write code that humans can
understand](https://en.wikiquote.org/wiki/Martin_Fowler#code-for-humans)
and [Keep it simple](https://en.wikipedia.org/wiki/KISS_principle)
- [x] Refactor: You have [left the code cleaner than you found it (Boy
Scout
Rule)](https://learning.oreilly.com/library/view/97-things-every/9780596809515/ch08.html)
- [x] Upgrade: Impact of this change on upgrade flows was considered and
addressed if required
- [x] Documentation: A [user-guide update](https://docs.cherry-ai.com)
was considered and is present (link) or not required. Check this only
when the PR introduces or changes a user-facing feature or behavior.
- [x] Self-review: I have reviewed my own code (e.g., via
[`/gh-pr-review`](/.claude/skills/gh-pr-review/SKILL.md), `gh pr diff`,
or GitHub UI) before requesting review from others

### Release note

```release-note
Fix history search failing to find CJK text and not supporting quoted phrase search.
```

Signed-off-by: raymond <raymond@qlg.me>
Co-authored-by: SuYao <sy20010504@gmail.com>
This commit is contained in:
Qin Lingguang
2026-04-18 19:47:16 +08:00
committed by GitHub
parent fbda0d213d
commit d89ce08716
3 changed files with 83 additions and 5 deletions

View File

@@ -221,7 +221,7 @@ const SearchResults: FC<Props> = ({ keywords, onMessageClick, onTopicClick, ...p
.filter((block) => block.type === MessageBlockType.MAIN_TEXT)
.filter((block) => {
const searchableContent = stripMarkdownFormatting(block.content)
return searchRegexes.some((regex) => regex.test(searchableContent))
return searchRegexes.every((regex) => regex.test(searchableContent))
})
const messages = topics?.flatMap((topic) => topic.messages)

View File

@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest'
import {
buildKeywordRegex,
buildKeywordRegexes,
buildKeywordUnionRegex,
type KeywordMatchMode,
splitKeywordsToTerms
@@ -16,6 +17,55 @@ describe('keywordSearch', () => {
it('returns empty array for empty input', () => {
expect(splitKeywordsToTerms('')).toEqual([])
})
// Quoted-phrase parsing: text wrapped in matching quotes is returned as one
// term, with the quote characters themselves removed from the result.
describe('phrase search (quoted substrings)', () => {
// Double quotes group the words inside them; the trailing bare word stays separate.
it('extracts double-quoted phrases as single terms', () => {
expect(splitKeywordsToTerms('"machine learning" deep')).toEqual(['machine learning', 'deep'])
})
// Single quotes are accepted as phrase delimiters as well.
it('extracts single-quoted phrases as single terms', () => {
expect(splitKeywordsToTerms("'neural network' model")).toEqual(['neural network', 'model'])
})
// Back-to-back phrases each become their own term.
it('handles multiple quoted phrases', () => {
expect(splitKeywordsToTerms('"hello world" "foo bar"')).toEqual(['hello world', 'foo bar'])
})
// Terms come back in the order they appear in the query, quoted or not.
it('handles mixed quoted and unquoted terms', () => {
expect(splitKeywordsToTerms('test "some phrase" end')).toEqual(['test', 'some phrase', 'end'])
})
// An opening quote with no closing partner: the rest of the input is taken as the phrase.
it('handles unclosed quotes gracefully', () => {
expect(splitKeywordsToTerms('"unclosed phrase')).toEqual(['unclosed phrase'])
})
// An empty quoted pair contributes no term at all.
it('skips empty quotes', () => {
expect(splitKeywordsToTerms('"" hello')).toEqual(['hello'])
})
})
})
// AND semantics: one regex is built per parsed term and a text only counts as
// a hit when every regex matches it (combined here with Array.prototype.every).
describe('AND logic with buildKeywordRegexes', () => {
// Each regex is tested independently, so the order of the terms in the text is irrelevant.
it('every() returns true when all terms are present', () => {
const terms = splitKeywordsToTerms('hello world')
const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
expect(regexes.every((r) => r.test('hello world'))).toBe(true)
expect(regexes.every((r) => r.test('world of hello'))).toBe(true)
})
// A single missing term is enough to reject the text.
it('every() returns false when only some terms are present', () => {
const terms = splitKeywordsToTerms('hello world')
const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
expect(regexes.every((r) => r.test('hello only'))).toBe(false)
expect(regexes.every((r) => r.test('world only'))).toBe(false)
})
// The quoted phrase is one term and must appear contiguously: both of its
// words being present separately (second assertion) is not a match.
it('every() works with phrase search', () => {
const terms = splitKeywordsToTerms('"machine learning" deep')
const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
expect(regexes.every((r) => r.test('deep machine learning is great'))).toBe(true)
expect(regexes.every((r) => r.test('deep learning but not machine'))).toBe(false)
})
})
describe('buildKeywordRegex (whole-word)', () => {
@@ -46,6 +96,17 @@ describe('keywordSearch', () => {
expect(regex.test('mañana')).toBe(false)
expect(regex.test('ana')).toBe(true)
})
// NOTE(review): `matchMode` comes from the enclosing describe (not visible here);
// presumably the whole-word mode — confirm.
// First assertion: the term sits inside continuous CJK text with CJK characters
// directly on both sides and must still match. Second: the term on its own matches.
it('CJK terms degrade to substring in whole-word mode', () => {
const regex = buildKeywordRegex('组合优于', { matchMode })
expect(regex.test('投资组合优于其他策略')).toBe(true)
expect(regex.test('组合优于')).toBe(true)
})
// The assertion below expects a match when the CJK term is glued to Latin
// letters on both sides, so the title states that (the previous title claimed
// the opposite of what was asserted).
it('CJK terms match as substring even when adjacent to non-CJK letters', () => {
  const regex = buildKeywordRegex('组合优于', { matchMode })
  expect(regex.test('abc组合优于def')).toBe(true)
})
})
describe('buildKeywordRegex (substring)', () => {

View File

@@ -5,13 +5,30 @@ export function escapeRegex(text: string): string {
}
/**
 * Splits a raw search query into lowercase search terms.
 *
 * - Text wrapped in double or single quotes is kept together as one phrase
 *   term; the quote characters are not part of the term.
 * - An opening quote without a closing partner takes the rest of the input
 *   as the phrase.
 * - Everything else is split on whitespace.
 * - Empty terms (e.g. `""`) are dropped.
 *
 * @param keywords - The raw query string; a falsy value is treated as empty.
 * @returns The terms in the order they appear in the query, lowercased.
 */
export function splitKeywordsToTerms(keywords: string): string[] {
  const input = (keywords || '').trim()
  if (input.length === 0) return []

  // Alternatives are tried left to right at each position: a double-quoted
  // phrase, a single-quoted phrase, then a bare run of non-whitespace. The
  // closing quote is optional so unclosed quotes still yield a term. No
  // alternative can match the empty string, so the scan always advances.
  const tokenPattern = /"([^"]*)"?|'([^']*)'?|(\S+)/g

  const terms: string[] = []
  for (const match of input.matchAll(tokenPattern)) {
    // Exactly one capture group participates per match; `??` (not `||`) keeps
    // an empty quoted group from falling through to the next alternative.
    const term = (match[1] ?? match[2] ?? match[3] ?? '').trim().toLowerCase()
    if (term.length > 0) {
      terms.push(term)
    }
  }
  return terms
}
/**
 * Reports whether `text` contains at least one CJK character: a Han
 * ideograph, Japanese kana, or Korean Hangul.
 *
 * The explicit BMP ranges (kana block U+3040–U+30FF, CJK Extension A,
 * CJK Unified Ideographs, Hangul Syllables) are kept so that every character
 * previously recognised still is — including block members such as the
 * prolonged sound mark "ー" whose Unicode script is Common. The script
 * property escapes extend coverage to characters outside those ranges:
 * supplementary-plane ideographs (CJK Extension B and later), compatibility
 * ideographs, half-width katakana and Hangul Jamo.
 *
 * @param text - The string to inspect.
 * @returns `true` if any CJK character is present, otherwise `false`.
 */
function containsCJK(text: string): boolean {
  // The `u` flag is required for \p{…} and makes the class match whole code
  // points, so astral ideographs are seen as one character rather than two
  // surrogate halves.
  return /[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u.test(
    text
  )
}
function buildWholeWordPattern(escapedTerm: string): string {
// CJK text has no word boundaries — degrade to substring matching
if (containsCJK(escapedTerm)) {
return escapedTerm
}
// "Whole word" here means: do not match inside a larger alphanumeric token.
// This avoids false positives like:
// - API keys: "IMr4WSMS5dwa52"