mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2026-07-04 05:00:00 +08:00
fix(search): support phrase search and fix CJK matching in history search (#14225)
### What this PR does
Before this PR:
- Searching CJK (Chinese/Japanese/Korean) text in whole-word mode
silently failed: the word-boundary lookarounds are built from
`[\p{L}\p{N}]`, and CJK characters count as letters (`\p{L}`), so the
lookbehind/lookahead rejected valid matches within continuous CJK text.
- Multi-word English queries were split into individual words with OR
logic, producing noisy results.
- Quoted phrases (e.g., `"machine learning"`) were not supported —
quotes were stripped and each word was searched independently.
After this PR:
- CJK terms automatically degrade to substring matching in whole-word
mode, so searches like "组合优于" correctly find results within longer CJK
text.
- Search filter uses AND logic (`.every()`) instead of OR (`.some()`),
so all terms must be present for a result to match.
- Quoted phrases (`"machine learning"` or `'neural network'`) are kept
as a single search term.
Fixes #14212
### Why we need it and why it was done in this way
The root cause of issue #14212 is twofold:
1. **CJK whole-word matching failure**: `buildWholeWordPattern` wraps
terms with `(?<![\p{L}\p{N}])…(?![\p{L}\p{N}])`. Since CJK characters
match `\p{L}`, searching for a CJK substring inside continuous CJK text
always fails the negative lookbehind. The fix detects CJK characters in
the search term and skips word-boundary assertions, falling back to
plain substring matching.
2. **No phrase support**: `splitKeywordsToTerms` simply split on
whitespace. The fix uses a regex that extracts quoted substrings first,
then splits remaining text on whitespace.
The following tradeoffs were made:
- Any term that contains at least one CJK character (including
mixed-script terms) always uses substring matching, even in "whole-word"
mode, since CJK languages don't have word boundaries in the same sense as
Latin scripts. This is the expected behavior for CJK users.
The following alternatives were considered:
- Using a CJK segmentation library for proper word-boundary detection —
rejected as too heavy for this use case.
### Breaking changes
None. Search behavior becomes more accurate; no API or data model
changes.
### Special notes for your reviewer
- The `SearchResults.tsx` change is a single-line `.some()` → `.every()`
switch (OR→AND logic).
- All new behavior is covered by tests in `keywordSearch.test.ts`.
- This is a minimal bug fix targeting the `main` branch via `hotfix/*` —
no refactoring included.
### Checklist
- [x] PR: The PR description is expressive enough and will help future
contributors
- [x] Code: [Write code that humans can
understand](https://en.wikiquote.org/wiki/Martin_Fowler#code-for-humans)
and [Keep it simple](https://en.wikipedia.org/wiki/KISS_principle)
- [x] Refactor: You have [left the code cleaner than you found it (Boy
Scout
Rule)](https://learning.oreilly.com/library/view/97-things-every/9780596809515/ch08.html)
- [x] Upgrade: Impact of this change on upgrade flows was considered and
addressed if required
- [x] Documentation: A [user-guide update](https://docs.cherry-ai.com)
was considered and is present (link) or not required. Check this only
when the PR introduces or changes a user-facing feature or behavior.
- [x] Self-review: I have reviewed my own code (e.g., via
[`/gh-pr-review`](/.claude/skills/gh-pr-review/SKILL.md), `gh pr diff`,
or GitHub UI) before requesting review from others
### Release note
```release-note
Fix history search failing to find CJK text and not supporting quoted phrase search.
```
Signed-off-by: raymond <raymond@qlg.me>
Co-authored-by: SuYao <sy20010504@gmail.com>
This commit is contained in:
@@ -221,7 +221,7 @@ const SearchResults: FC<Props> = ({ keywords, onMessageClick, onTopicClick, ...p
|
||||
.filter((block) => block.type === MessageBlockType.MAIN_TEXT)
|
||||
.filter((block) => {
|
||||
const searchableContent = stripMarkdownFormatting(block.content)
|
||||
return searchRegexes.some((regex) => regex.test(searchableContent))
|
||||
return searchRegexes.every((regex) => regex.test(searchableContent))
|
||||
})
|
||||
|
||||
const messages = topics?.flatMap((topic) => topic.messages)
|
||||
|
||||
@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest'
|
||||
|
||||
import {
|
||||
buildKeywordRegex,
|
||||
buildKeywordRegexes,
|
||||
buildKeywordUnionRegex,
|
||||
type KeywordMatchMode,
|
||||
splitKeywordsToTerms
|
||||
@@ -16,6 +17,55 @@ describe('keywordSearch', () => {
|
||||
it('returns empty array for empty input', () => {
|
||||
expect(splitKeywordsToTerms('')).toEqual([])
|
||||
})
|
||||
|
||||
describe('phrase search (quoted substrings)', () => {
|
||||
it('extracts double-quoted phrases as single terms', () => {
|
||||
expect(splitKeywordsToTerms('"machine learning" deep')).toEqual(['machine learning', 'deep'])
|
||||
})
|
||||
|
||||
it('extracts single-quoted phrases as single terms', () => {
|
||||
expect(splitKeywordsToTerms("'neural network' model")).toEqual(['neural network', 'model'])
|
||||
})
|
||||
|
||||
it('handles multiple quoted phrases', () => {
|
||||
expect(splitKeywordsToTerms('"hello world" "foo bar"')).toEqual(['hello world', 'foo bar'])
|
||||
})
|
||||
|
||||
it('handles mixed quoted and unquoted terms', () => {
|
||||
expect(splitKeywordsToTerms('test "some phrase" end')).toEqual(['test', 'some phrase', 'end'])
|
||||
})
|
||||
|
||||
it('handles unclosed quotes gracefully', () => {
|
||||
expect(splitKeywordsToTerms('"unclosed phrase')).toEqual(['unclosed phrase'])
|
||||
})
|
||||
|
||||
it('skips empty quotes', () => {
|
||||
expect(splitKeywordsToTerms('"" hello')).toEqual(['hello'])
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
describe('AND logic with buildKeywordRegexes', () => {
|
||||
it('every() returns true when all terms are present', () => {
|
||||
const terms = splitKeywordsToTerms('hello world')
|
||||
const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
|
||||
expect(regexes.every((r) => r.test('hello world'))).toBe(true)
|
||||
expect(regexes.every((r) => r.test('world of hello'))).toBe(true)
|
||||
})
|
||||
|
||||
it('every() returns false when only some terms are present', () => {
|
||||
const terms = splitKeywordsToTerms('hello world')
|
||||
const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
|
||||
expect(regexes.every((r) => r.test('hello only'))).toBe(false)
|
||||
expect(regexes.every((r) => r.test('world only'))).toBe(false)
|
||||
})
|
||||
|
||||
it('every() works with phrase search', () => {
|
||||
const terms = splitKeywordsToTerms('"machine learning" deep')
|
||||
const regexes = buildKeywordRegexes(terms, { matchMode: 'substring', flags: 'i' })
|
||||
expect(regexes.every((r) => r.test('deep machine learning is great'))).toBe(true)
|
||||
expect(regexes.every((r) => r.test('deep learning but not machine'))).toBe(false)
|
||||
})
|
||||
})
|
||||
|
||||
describe('buildKeywordRegex (whole-word)', () => {
|
||||
@@ -46,6 +96,17 @@ describe('keywordSearch', () => {
|
||||
expect(regex.test('mañana')).toBe(false)
|
||||
expect(regex.test('ana')).toBe(true)
|
||||
})
|
||||
|
||||
it('CJK terms degrade to substring in whole-word mode', () => {
|
||||
const regex = buildKeywordRegex('组合优于', { matchMode })
|
||||
expect(regex.test('投资组合优于其他策略')).toBe(true)
|
||||
expect(regex.test('组合优于')).toBe(true)
|
||||
})
|
||||
|
||||
it('CJK whole-word still does not match partial substring across non-CJK boundary', () => {
|
||||
const regex = buildKeywordRegex('组合优于', { matchMode })
|
||||
expect(regex.test('abc组合优于def')).toBe(true)
|
||||
})
|
||||
})
|
||||
|
||||
describe('buildKeywordRegex (substring)', () => {
|
||||
|
||||
@@ -5,13 +5,30 @@ export function escapeRegex(text: string): string {
|
||||
}
|
||||
|
||||
export function splitKeywordsToTerms(keywords: string): string[] {
|
||||
return (keywords || '')
|
||||
.toLowerCase()
|
||||
.split(/\s+/)
|
||||
.filter((term) => term.length > 0)
|
||||
const input = (keywords || '').trim()
|
||||
if (input.length === 0) return []
|
||||
|
||||
const terms: string[] = []
|
||||
const pattern = /"([^"]*)"?|'([^']*)'?|(\S+)/g
|
||||
let match: RegExpExecArray | null
|
||||
while ((match = pattern.exec(input)) !== null) {
|
||||
const term = (match[1] ?? match[2] ?? match[3]).trim()
|
||||
if (term.length > 0) {
|
||||
terms.push(term.toLowerCase())
|
||||
}
|
||||
}
|
||||
return terms
|
||||
}
|
||||
|
||||
/**
 * Reports whether `text` contains at least one CJK character
 * (Han ideographs, Hiragana, Katakana or Hangul).
 *
 * Detection relies on Unicode script properties so that characters outside
 * the original hard-coded blocks are recognised as well: supplementary-plane
 * Han (CJK Extension B and later, e.g. "𠮷"), compatibility ideographs,
 * half-width Katakana and Hangul Jamo. The explicit ranges are kept so that
 * every code point matched previously is still matched.
 *
 * @param text - The string to inspect; an empty string yields `false`.
 * @returns `true` if any CJK character is present, otherwise `false`.
 */
function containsCJK(text: string): boolean {
  // The `u` flag is required for `\p{…}` escapes and makes the regex match by
  // code point, so astral characters are not seen as two lone surrogates.
  return /[\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u.test(
    text
  )
}
|
||||
|
||||
function buildWholeWordPattern(escapedTerm: string): string {
|
||||
// CJK text has no word boundaries — degrade to substring matching
|
||||
if (containsCJK(escapedTerm)) {
|
||||
return escapedTerm
|
||||
}
|
||||
// "Whole word" here means: do not match inside a larger alphanumeric token.
|
||||
// This avoids false positives like:
|
||||
// - API keys: "IMr4WSMS5dwa52"
|
||||
|
||||
Reference in New Issue
Block a user