fix(v2): remove knowledge libsql vector indexes (#14280)

This commit is contained in:
槑囿脑袋
2026-04-15 21:42:56 +08:00
committed by GitHub
parent 4ac451e292
commit 7f5486ca51
5 changed files with 35 additions and 50 deletions

View File

@@ -0,0 +1,5 @@
---
'@vectorstores/libsql': patch
---
Remove libSQL vector index and `vector_top_k` usage from the knowledge vector store query path.

View File

@@ -104,6 +104,25 @@ Current `KnowledgeSearchResult` includes:
`chunkId` is the vector row identity used for result-level attribution. `itemId` is populated from stored metadata when available.
### Current Retrieval Cost Assumption
The current v2 implementation intentionally does **not** create a libSQL vector index and does **not** use `vector_top_k`.
Similarity search currently queries the base table directly and sorts by `vector_distance_cos(...)`.
This means retrieval cost scales roughly linearly with the number of vector rows in a single knowledge base.
That tradeoff is currently accepted because it keeps the runtime path simpler and performs well enough for the expected near-term corpus sizes.
A local benchmark run on April 15, 2026, with 1536-dimension embeddings and `topK=10` measured approximately:
- `20k` rows: `~78ms` warm vector search
- `50k` rows: `~195ms` warm vector search
Current guidance:
1. Treat the no-index design as the default for now, not as an unlimited scaling guarantee.
2. Re-evaluate indexed search if real single-base corpora grow toward `100k+` rows or retrieval latency budgets can no longer tolerate a few hundred milliseconds per query.
3. If future product requirements change, adding a vector index remains a valid follow-up optimization rather than a blocking prerequisite for the current design.
## Deletion
Deletion still requires two concerns to be handled:

View File

@@ -64,7 +64,7 @@ function toInArgs(params: unknown[]): InArgs {
/**
* Provides support for writing and querying vector data in libSQL/Turso.
* Uses native libSQL vector operations for similarity search.
* Uses native libSQL vector operations for similarity search without ANN indexes.
*/
export class LibSQLVectorStore extends BaseVectorStore {
storesText: boolean = true
@@ -184,15 +184,6 @@ export class LibSQLVectorStore extends BaseVectorStore {
}
await client.execute(collectionIndexStatement)
const vectorIndexStatement: InStatement = {
sql: `
CREATE INDEX IF NOT EXISTS idx_${this.tableName}_vector
ON ${this.tableName} (libsql_vector_idx(embeddings, 'metric=cosine'))
`,
args: []
}
await client.execute(vectorIndexStatement)
// Create FTS5 virtual table for full-text search (bm25/hybrid modes)
const ftsTableName = `${this.tableName}_fts`
const ftsTableExistsResult = await client.execute({
@@ -569,22 +560,16 @@ export class LibSQLVectorStore extends BaseVectorStore {
const { where, params } = this.buildWhereClause(query, 't')
const vectorJson = `[${queryEmbedding.join(',')}]`
const indexName = `idx_${this.tableName}_vector`
// Use vector_top_k for efficient ANN search with vector index
// Fetch more candidates to account for filtering
const prefetch = where ? max * 5 : max
const vectorStatement: InStatement = {
sql: `
SELECT t.*, vector_distance_cos(t.embeddings, vector32(?)) as distance
FROM vector_top_k('${indexName}', vector32(?), ${prefetch}) AS v
JOIN ${this.tableName} t ON t.rowid = v.id
FROM ${this.tableName} t
${where}
ORDER BY distance
LIMIT ${max}
`,
args: toInArgs([vectorJson, vectorJson, ...params])
args: toInArgs([vectorJson, ...params])
}
const vectorResults = await this.clientInstance.execute(vectorStatement)

View File

@@ -366,28 +366,6 @@ describe('LibSQLVectorStore', () => {
await expect(store.add([invalidNode])).rejects.toThrow('Invalid libSQL argument at index 0: null')
})
it('should fail initialization when vector index creation fails', async () => {
const originalExecute = client.execute.bind(client)
const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => {
const sql = typeof statement === 'string' ? statement : statement.sql
if (typeof sql === 'string' && sql.includes('libsql_vector_idx')) {
throw new Error('vector index failed')
}
return await originalExecute(statement)
})
const node = new TextNode({
id_: 'chunk-hard-fail',
text: 'Document chunk',
embedding: [0.1, 0.2],
metadata: { category: 'test' }
})
await expect(store.add([node])).rejects.toThrow('vector index failed')
executeSpy.mockRestore()
})
it('should fail initialization when FTS schema creation fails', async () => {
const originalExecute = client.execute.bind(client)
const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => {

View File

@@ -11,7 +11,7 @@
3. 向量迁移过程中的关键字段转换
4. 文件安全、校验与跳过规则
这份文档只描述当前已经落地的迁移器行为,不展开到未来在线索引重建或最终 retrieval API 设计。
这份文档只描述当前已经落地的迁移器行为,不展开到未来在线向量数据重建或最终 retrieval API 设计。
对应实现:
@@ -29,7 +29,7 @@
换句话说:
1. `KnowledgeMigrator` 负责业务主数据
2. `KnowledgeVectorMigrator` 负责向量索引数据
2. `KnowledgeVectorMigrator` 负责向量数据迁移
两者共同完成知识库的完整迁移,但 source of truth 仍然是 V2 业务表,不是向量库。
@@ -103,9 +103,7 @@
2. 普通索引
- `external_id`
- `collection`
3. 向量索引
- `libsql_vector_idx(embeddings, 'metric=cosine')`
4. FTS 表和触发器
3. FTS 表和触发器
## 5. 核心转换规则
@@ -128,7 +126,7 @@ V2 迁移时,不保留这个旧字段作为最终业务标识,而是把它
这里有一个重要约束:
1. 只有能够映射到 V2 `knowledge_item.id` 的 legacy 向量记录,才属于有效可迁移数据
2. 无法映射到 `knowledge_item.id` 的 legacy 向量,即使仍存在于旧 `embedjs` DB 中,也视为无效索引残留
2. 无法映射到 `knowledge_item.id` 的 legacy 向量,即使仍存在于旧 `embedjs` DB 中,也视为无效残留数据
3. 因此迁移器的目标不是“尽量保留旧向量文件中的所有内容”,而是“只保留能被当前 V2 业务表证明合法归属的向量数据”
### 5.2 Chunk 内容映射
@@ -235,11 +233,11 @@ V2 迁移时,不保留这个旧字段作为最终业务标识,而是把它
1. 如果某个 base 的 legacy 向量记录最终全部被跳过,则该 base 在 V2 中会被重建为空的 vector store
2. 这不是“回滚保留旧 DB”的场景,而是预期的数据清洗结果
3. 原因是这些被跳过的记录无法稳定关联到当前 V2 `knowledge_item`,因此不再被视为有效业务索引数据
3. 原因是这些被跳过的记录无法稳定关联到当前 V2 `knowledge_item`,因此不再被视为有效业务向量数据
## 9. 当前边界与限制
当前迁移器只负责“向量索引重建”,不负责:
当前迁移器只负责“向量数据重建”,不负责:
1. 重新切块
2. 重新 embedding
@@ -268,13 +266,13 @@ V2 迁移时,不保留这个旧字段作为最终业务标识,而是把它
- `knowledge-schema.md`
- 定义 V2 业务 schema
- 本文档
- 专门说明向量迁移器如何把旧向量索引接到新的 V2 业务模型上
- 专门说明向量迁移器如何把旧向量数据接到新的 V2 业务模型上
三者的关系可以简化为:
1. schema 定义业务结构
2. backend decisions 文档定义当前运行时边界
3. vector migrator 文档定义旧向量索引如何迁移进新体系
3. vector migrator 文档定义旧向量数据如何迁移进新体系
## 12. 与当前 Runtime 的衔接