mirror of
https://github.com/CherryHQ/cherry-studio.git
synced 2026-07-03 12:27:41 +08:00
fix(v2): remove knowledge libsql vector indexes (#14280)
This commit is contained in:
5
.changeset/rare-schools-end.md
Normal file
5
.changeset/rare-schools-end.md
Normal file
@@ -0,0 +1,5 @@
|
||||
---
|
||||
'@vectorstores/libsql': patch
|
||||
---
|
||||
|
||||
Remove libSQL vector index and `vector_top_k` usage from the knowledge vector store query path.
|
||||
@@ -104,6 +104,25 @@ Current `KnowledgeSearchResult` includes:
|
||||
|
||||
`chunkId` is the vector row identity used for result-level attribution. `itemId` is populated from stored metadata when available.
|
||||
|
||||
### Current Retrieval Cost Assumption
|
||||
|
||||
The current v2 implementation intentionally does **not** create a libSQL vector index and does **not** use `vector_top_k`.
|
||||
Similarity search currently queries the base table directly and sorts by `vector_distance_cos(...)`.
|
||||
|
||||
This means retrieval cost scales roughly linearly with the number of vector rows in a single knowledge base.
|
||||
That tradeoff is currently accepted because it keeps the runtime path simpler and performs well enough for the expected near-term corpus sizes.
|
||||
|
||||
A local benchmark run on April 15, 2026, with 1536-dimension embeddings and `topK=10` measured approximately:
|
||||
|
||||
- `20k` rows: `~78ms` warm vector search
|
||||
- `50k` rows: `~195ms` warm vector search
|
||||
|
||||
Current guidance:
|
||||
|
||||
1. Treat the no-index design as the default for now, not as an unlimited scaling guarantee.
|
||||
2. Re-evaluate indexed search if real single-base corpora grow toward `100k+` rows, or if retrieval latency budgets can no longer tolerate a few hundred milliseconds per query.
|
||||
3. If future product requirements change, adding a vector index remains a valid follow-up optimization rather than a blocking prerequisite for the current design.
|
||||
|
||||
## Deletion
|
||||
|
||||
Deletion still requires two concerns to be handled:
|
||||
|
||||
@@ -64,7 +64,7 @@ function toInArgs(params: unknown[]): InArgs {
|
||||
|
||||
/**
|
||||
* Provides support for writing and querying vector data in libSQL/Turso.
|
||||
* Uses native libSQL vector operations for similarity search.
|
||||
* Uses native libSQL vector operations for similarity search without ANN indexes.
|
||||
*/
|
||||
export class LibSQLVectorStore extends BaseVectorStore {
|
||||
storesText: boolean = true
|
||||
@@ -184,15 +184,6 @@ export class LibSQLVectorStore extends BaseVectorStore {
|
||||
}
|
||||
await client.execute(collectionIndexStatement)
|
||||
|
||||
const vectorIndexStatement: InStatement = {
|
||||
sql: `
|
||||
CREATE INDEX IF NOT EXISTS idx_${this.tableName}_vector
|
||||
ON ${this.tableName} (libsql_vector_idx(embeddings, 'metric=cosine'))
|
||||
`,
|
||||
args: []
|
||||
}
|
||||
await client.execute(vectorIndexStatement)
|
||||
|
||||
// Create FTS5 virtual table for full-text search (bm25/hybrid modes)
|
||||
const ftsTableName = `${this.tableName}_fts`
|
||||
const ftsTableExistsResult = await client.execute({
|
||||
@@ -569,22 +560,16 @@ export class LibSQLVectorStore extends BaseVectorStore {
|
||||
|
||||
const { where, params } = this.buildWhereClause(query, 't')
|
||||
const vectorJson = `[${queryEmbedding.join(',')}]`
|
||||
const indexName = `idx_${this.tableName}_vector`
|
||||
|
||||
// Use vector_top_k for efficient ANN search with vector index
|
||||
// Fetch more candidates to account for filtering
|
||||
const prefetch = where ? max * 5 : max
|
||||
|
||||
const vectorStatement: InStatement = {
|
||||
sql: `
|
||||
SELECT t.*, vector_distance_cos(t.embeddings, vector32(?)) as distance
|
||||
FROM vector_top_k('${indexName}', vector32(?), ${prefetch}) AS v
|
||||
JOIN ${this.tableName} t ON t.rowid = v.id
|
||||
FROM ${this.tableName} t
|
||||
${where}
|
||||
ORDER BY distance
|
||||
LIMIT ${max}
|
||||
`,
|
||||
args: toInArgs([vectorJson, vectorJson, ...params])
|
||||
args: toInArgs([vectorJson, ...params])
|
||||
}
|
||||
|
||||
const vectorResults = await this.clientInstance.execute(vectorStatement)
|
||||
|
||||
@@ -366,28 +366,6 @@ describe('LibSQLVectorStore', () => {
|
||||
await expect(store.add([invalidNode])).rejects.toThrow('Invalid libSQL argument at index 0: null')
|
||||
})
|
||||
|
||||
it('should fail initialization when vector index creation fails', async () => {
|
||||
const originalExecute = client.execute.bind(client)
|
||||
const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => {
|
||||
const sql = typeof statement === 'string' ? statement : statement.sql
|
||||
if (typeof sql === 'string' && sql.includes('libsql_vector_idx')) {
|
||||
throw new Error('vector index failed')
|
||||
}
|
||||
|
||||
return await originalExecute(statement)
|
||||
})
|
||||
|
||||
const node = new TextNode({
|
||||
id_: 'chunk-hard-fail',
|
||||
text: 'Document chunk',
|
||||
embedding: [0.1, 0.2],
|
||||
metadata: { category: 'test' }
|
||||
})
|
||||
|
||||
await expect(store.add([node])).rejects.toThrow('vector index failed')
|
||||
executeSpy.mockRestore()
|
||||
})
|
||||
|
||||
it('should fail initialization when FTS schema creation fails', async () => {
|
||||
const originalExecute = client.execute.bind(client)
|
||||
const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => {
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
3. 向量迁移过程中的关键字段转换
|
||||
4. 文件安全、校验与跳过规则
|
||||
|
||||
这份文档只描述当前已经落地的迁移器行为,不展开到未来在线索引重建或最终 retrieval API 设计。
|
||||
这份文档只描述当前已经落地的迁移器行为,不展开到未来在线向量数据重建或最终 retrieval API 设计。
|
||||
|
||||
对应实现:
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
换句话说:
|
||||
|
||||
1. `KnowledgeMigrator` 负责业务主数据
|
||||
2. `KnowledgeVectorMigrator` 负责向量索引数据
|
||||
2. `KnowledgeVectorMigrator` 负责向量数据迁移
|
||||
|
||||
两者共同完成知识库的完整迁移,但 source of truth 仍然是 V2 业务表,不是向量库。
|
||||
|
||||
@@ -103,9 +103,7 @@
|
||||
2. 普通索引
|
||||
- `external_id`
|
||||
- `collection`
|
||||
3. 向量索引
|
||||
- `libsql_vector_idx(embeddings, 'metric=cosine')`
|
||||
4. FTS 表和触发器
|
||||
3. FTS 表和触发器
|
||||
|
||||
## 5. 核心转换规则
|
||||
|
||||
@@ -128,7 +126,7 @@ V2 迁移时,不保留这个旧字段作为最终业务标识,而是把它
|
||||
这里有一个重要约束:
|
||||
|
||||
1. 只有能够映射到 V2 `knowledge_item.id` 的 legacy 向量记录,才属于有效可迁移数据
|
||||
2. 无法映射到 `knowledge_item.id` 的 legacy 向量,即使仍存在于旧 `embedjs` DB 中,也视为无效索引残留
|
||||
2. 无法映射到 `knowledge_item.id` 的 legacy 向量,即使仍存在于旧 `embedjs` DB 中,也视为无效残留数据
|
||||
3. 因此迁移器的目标不是“尽量保留旧向量文件中的所有内容”,而是“只保留能被当前 V2 业务表证明合法归属的向量数据”
|
||||
|
||||
### 5.2 Chunk 内容映射
|
||||
@@ -235,11 +233,11 @@ V2 迁移时,不保留这个旧字段作为最终业务标识,而是把它
|
||||
|
||||
1. 如果某个 base 的 legacy 向量记录最终全部被跳过,则该 base 在 V2 中会被重建为空的 vector store
|
||||
2. 这不是“回滚保留旧 DB”的场景,而是预期的数据清洗结果
|
||||
3. 原因是这些被跳过的记录无法稳定关联到当前 V2 `knowledge_item`,因此不再被视为有效业务索引数据
|
||||
3. 原因是这些被跳过的记录无法稳定关联到当前 V2 `knowledge_item`,因此不再被视为有效业务向量数据
|
||||
|
||||
## 9. 当前边界与限制
|
||||
|
||||
当前迁移器只负责“向量索引重建”,不负责:
|
||||
当前迁移器只负责“向量数据重建”,不负责:
|
||||
|
||||
1. 重新切块
|
||||
2. 重新 embedding
|
||||
@@ -268,13 +266,13 @@ V2 迁移时,不保留这个旧字段作为最终业务标识,而是把它
|
||||
- `knowledge-schema.md`
|
||||
- 定义 V2 业务 schema
|
||||
- 本文档
|
||||
- 专门说明向量迁移器如何把旧向量索引接到新的 V2 业务模型上
|
||||
- 专门说明向量迁移器如何把旧向量数据接到新的 V2 业务模型上
|
||||
|
||||
三者的关系可以简化为:
|
||||
|
||||
1. schema 定义业务结构
|
||||
2. backend decisions 文档定义当前运行时边界
|
||||
3. vector migrator 文档定义旧向量索引如何迁移进新体系
|
||||
3. vector migrator 文档定义旧向量数据如何迁移进新体系
|
||||
|
||||
## 12. 与当前 Runtime 的衔接
|
||||
|
||||
|
||||
Reference in New Issue
Block a user