fix(v2): remove knowledge libsql vector indexes (#14280)

2026-07-03 12:27:41 +08:00 · 2026-04-15 21:42:56 +08:00
parent 4ac451e292
commit 7f5486ca51
5 changed files with 35 additions and 50 deletions
--- a/.changeset/rare-schools-end.md
+++ b/.changeset/rare-schools-end.md
@@ -0,0 +1,5 @@
+---
+'@vectorstores/libsql': patch
+---
+
+Remove libSQL vector index creation and `vector_top_k` usage from the knowledge vector store; similarity search now scans the base table and orders by `vector_distance_cos`.
--- a/docs/references/knowledge/knowledge-service.md
+++ b/docs/references/knowledge/knowledge-service.md
@@ -104,6 +104,25 @@ Current `KnowledgeSearchResult` includes:

 `chunkId` is the vector row identity used for result-level attribution. `itemId` is populated from stored metadata when available.

+### Current Retrieval Cost Assumption
+
+The current v2 implementation intentionally does **not** create a libSQL vector index and does **not** use `vector_top_k`.
+Similarity search currently queries the base table directly and sorts by `vector_distance_cos(...)`.
+
+This means retrieval cost scales roughly linearly with the number of vector rows in a single knowledge base.
+That tradeoff is currently accepted because it keeps the runtime path simpler and performs well enough for the expected near-term corpus sizes.
+
+A local benchmark run on April 15, 2026 with 1536-dimension embeddings and `topK=10` measured approximately:
+
+- `20k` rows: `~78ms` warm vector search
+- `50k` rows: `~195ms` warm vector search
+
+Current guidance:
+
+1. Treat the no-index design as the default for now, not as an unlimited scaling guarantee.
+2. Re-evaluate indexed search if real single-base corpora grow toward `100k+` rows or retrieval latency budgets can no longer tolerate a few hundred milliseconds per query.
+3. If future product requirements change, adding a vector index remains a valid follow-up optimization rather than a blocking prerequisite for the current design.
+
 ## Deletion

 Deletion still requires two concerns to be handled:
--- a/packages/vectorstores/libsql/src/LibSQLVectorStore.ts
+++ b/packages/vectorstores/libsql/src/LibSQLVectorStore.ts
@@ -64,7 +64,7 @@ function toInArgs(params: unknown[]): InArgs {

 /**
 * Provides support for writing and querying vector data in libSQL/Turso.
- * Uses native libSQL vector operations for similarity search.
+ * Uses native libSQL vector operations for similarity search without ANN indexes.
 */
 export class LibSQLVectorStore extends BaseVectorStore {
  storesText: boolean = true
@@ -184,15 +184,6 @@ export class LibSQLVectorStore extends BaseVectorStore {
    }
    await client.execute(collectionIndexStatement)

-    const vectorIndexStatement: InStatement = {
-      sql: `
-          CREATE INDEX IF NOT EXISTS idx_${this.tableName}_vector
-          ON ${this.tableName} (libsql_vector_idx(embeddings, 'metric=cosine'))
-        `,
-      args: []
-    }
-    await client.execute(vectorIndexStatement)
-
    // Create FTS5 virtual table for full-text search (bm25/hybrid modes)
    const ftsTableName = `${this.tableName}_fts`
    const ftsTableExistsResult = await client.execute({
@@ -569,22 +560,16 @@ export class LibSQLVectorStore extends BaseVectorStore {

    const { where, params } = this.buildWhereClause(query, 't')
    const vectorJson = `[${queryEmbedding.join(',')}]`
-    const indexName = `idx_${this.tableName}_vector`
-
-    // Use vector_top_k for efficient ANN search with vector index
-    // Fetch more candidates to account for filtering
-    const prefetch = where ? max * 5 : max

    const vectorStatement: InStatement = {
      sql: `
        SELECT t.*, vector_distance_cos(t.embeddings, vector32(?)) as distance
-        FROM vector_top_k('${indexName}', vector32(?), ${prefetch}) AS v
-        JOIN ${this.tableName} t ON t.rowid = v.id
+        FROM ${this.tableName} t
        ${where}
        ORDER BY distance
        LIMIT ${max}
      `,
-      args: toInArgs([vectorJson, vectorJson, ...params])
+      args: toInArgs([vectorJson, ...params])
    }

    const vectorResults = await this.clientInstance.execute(vectorStatement)
--- a/packages/vectorstores/libsql/tests/LibSQLVectorStore.test.ts
+++ b/packages/vectorstores/libsql/tests/LibSQLVectorStore.test.ts
@@ -366,28 +366,6 @@ describe('LibSQLVectorStore', () => {
      await expect(store.add([invalidNode])).rejects.toThrow('Invalid libSQL argument at index 0: null')
    })

-    it('should fail initialization when vector index creation fails', async () => {
-      const originalExecute = client.execute.bind(client)
-      const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => {
-        const sql = typeof statement === 'string' ? statement : statement.sql
-        if (typeof sql === 'string' && sql.includes('libsql_vector_idx')) {
-          throw new Error('vector index failed')
-        }
-
-        return await originalExecute(statement)
-      })
-
-      const node = new TextNode({
-        id_: 'chunk-hard-fail',
-        text: 'Document chunk',
-        embedding: [0.1, 0.2],
-        metadata: { category: 'test' }
-      })
-
-      await expect(store.add([node])).rejects.toThrow('vector index failed')
-      executeSpy.mockRestore()
-    })
-
    it('should fail initialization when FTS schema creation fails', async () => {
      const originalExecute = client.execute.bind(client)
      const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => {
--- a/v2-refactor-temp/docs/knowledge/knowledge-vector-migrator.md
+++ b/v2-refactor-temp/docs/knowledge/knowledge-vector-migrator.md
@@ -11,7 +11,7 @@
 3. 向量迁移过程中的关键字段转换
 4. 文件安全、校验与跳过规则

-这份文档只描述当前已经落地的迁移器行为，不展开到未来在线索引重建或最终 retrieval API 设计。
+这份文档只描述当前已经落地的迁移器行为，不展开到未来在线向量数据重建或最终 retrieval API 设计。

 对应实现：

@@ -29,7 +29,7 @@
 换句话说：

 1. `KnowledgeMigrator` 负责业务主数据
-2. `KnowledgeVectorMigrator` 负责向量索引数据
+2. `KnowledgeVectorMigrator` 负责向量数据迁移

 两者共同完成知识库的完整迁移，但 source of truth 仍然是 V2 业务表，不是向量库。

@@ -103,9 +103,7 @@
 2. 普通索引
   - `external_id`
   - `collection`
-3. 向量索引
-   - `libsql_vector_idx(embeddings, 'metric=cosine')`
-4. FTS 表和触发器
+3. FTS 表和触发器

 ## 5. 核心转换规则

@@ -128,7 +126,7 @@ V2 迁移时，不保留这个旧字段作为最终业务标识，而是把它
 这里有一个重要约束：

 1. 只有能够映射到 V2 `knowledge_item.id` 的 legacy 向量记录，才属于有效可迁移数据
-2. 无法映射到 `knowledge_item.id` 的 legacy 向量，即使仍存在于旧 `embedjs` DB 中，也视为无效索引残留
+2. 无法映射到 `knowledge_item.id` 的 legacy 向量，即使仍存在于旧 `embedjs` DB 中，也视为无效残留数据
 3. 因此迁移器的目标不是“尽量保留旧向量文件中的所有内容”，而是“只保留能被当前 V2 业务表证明合法归属的向量数据”

 ### 5.2 Chunk 内容映射
@@ -235,11 +233,11 @@ V2 迁移时，不保留这个旧字段作为最终业务标识，而是把它

 1. 如果某个 base 的 legacy 向量记录最终全部被跳过，则该 base 在 V2 中会被重建为空的 vector store
 2. 这不是“回滚保留旧 DB”的场景，而是预期的数据清洗结果
-3. 原因是这些被跳过的记录无法稳定关联到当前 V2 `knowledge_item`，因此不再被视为有效业务索引数据
+3. 原因是这些被跳过的记录无法稳定关联到当前 V2 `knowledge_item`，因此不再被视为有效业务向量数据

 ## 9. 当前边界与限制

-当前迁移器只负责“向量索引重建”，不负责：
+当前迁移器只负责“向量数据重建”，不负责：

 1. 重新切块
 2. 重新 embedding
@@ -268,13 +266,13 @@ V2 迁移时，不保留这个旧字段作为最终业务标识，而是把它
 - `knowledge-schema.md`
  - 定义 V2 业务 schema
 - 本文档
-  - 专门说明向量迁移器如何把旧向量索引接到新的 V2 业务模型上
+  - 专门说明向量迁移器如何把旧向量数据接到新的 V2 业务模型上

 三者的关系可以简化为：

 1. schema 定义业务结构
 2. backend decisions 文档定义当前运行时边界
-3. vector migrator 文档定义旧向量索引如何迁移进新体系
+3. vector migrator 文档定义旧向量数据如何迁移进新体系

 ## 12. 与当前 Runtime 的衔接