From 798a15e919b438e0902c6c9cd4c8ae77db989dac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A7=91=E5=9B=BF=E8=84=91=E8=A2=8B?= <70054568+eeee0717@users.noreply.github.com> Date: Fri, 10 Apr 2026 21:24:06 +0800 Subject: [PATCH] feat(v2): knowledge service backend (#14090) Co-authored-by: SuYao Co-authored-by: fullex <106392080+0xfullex@users.noreply.github.com> --- .changeset/wild-dots-dance.md | 4 + .../references/knowledge/knowledge-service.md | 217 ++-- electron.vite.config.ts | 7 +- .../sqlite-drizzle/0006_silly_screwball.sql | 2 +- .../sqlite-drizzle/meta/0006_snapshot.json | 2 +- package.json | 10 +- packages/shared/IpcChannel.ts | 5 + .../__tests__/knowledge-schemas.test.ts | 94 ++ .../shared/data/api/schemas/knowledges.ts | 165 ++- packages/shared/data/types/knowledge.ts | 40 +- packages/vectorstores/libsql/CHANGELOG.md | 13 + packages/vectorstores/libsql/package.json | 49 + .../libsql/src/LibSQLVectorStore.ts | 752 +++++++++++ packages/vectorstores/libsql/src/index.ts | 4 + packages/vectorstores/libsql/src/utils.ts | 17 + .../libsql/tests/LibSQLVectorStore.test.ts | 1142 +++++++++++++++++ packages/vectorstores/libsql/tsconfig.json | 11 + packages/vectorstores/libsql/tsdown.config.ts | 12 + pnpm-lock.yaml | 659 ++++++++-- pnpm-workspace.yaml | 1 + src/main/core/application/serviceRegistry.ts | 5 + .../api/handlers/__tests__/knowledges.test.ts | 127 ++ src/main/data/db/schemas/knowledge.ts | 6 +- .../migration/v2/core/MigrationContext.ts | 3 + .../v2/migrators/KnowledgeVectorMigrator.ts | 551 ++++++++ .../README-KnowledgeVectorMigrator.md | 75 ++ .../__tests__/KnowledgeVectorMigrator.test.ts | 697 ++++++++++ src/main/data/migration/v2/migrators/index.ts | 3 + .../migrators/mappings/KnowledgeMappings.ts | 45 +- .../__tests__/KnowledgeMappings.test.ts | 4 +- .../v2/utils/KnowledgeVectorSourceReader.ts | 111 ++ .../KnowledgeVectorSourceReader.test.ts | 136 ++ .../data/services/KnowledgeBaseService.ts | 81 +- .../data/services/KnowledgeItemService.ts | 267 +++- 
.../__tests__/KnowledgeBaseService.test.ts | 68 +- .../__tests__/KnowledgeItemService.test.ts | 476 +++++-- .../__tests__/knowledgeBaseConfig.test.ts | 93 -- src/main/data/services/knowledgeBaseConfig.ts | 102 -- .../KnowledgeOrchestrationService.ts | 157 +++ .../KnowledgeOrchestrationService.test.ts | 392 ++++++ src/main/services/knowledge/index.ts | 2 + .../knowledge/readers/KnowledgeFileReader.ts | 46 + .../knowledge/readers/KnowledgeNoteReader.ts | 15 + .../knowledge/readers/KnowledgeReader.ts | 24 + .../knowledge/readers/KnowledgeUrlReader.ts | 34 + .../readers/__tests__/ReaderFactory.test.ts | 363 ++++++ .../readers/files/DraftsExportReader.ts | 37 + .../knowledge/readers/files/EpubReader.ts | 61 + .../__tests__/DraftsExportReader.test.ts | 52 + .../files/__tests__/EpubReader.test.ts | 92 ++ .../knowledge/rerank/__tests__/rerank.test.ts | 192 +++ .../services/knowledge/rerank/adapters.ts | 151 +++ src/main/services/knowledge/rerank/rerank.ts | 105 ++ src/main/services/knowledge/rerank/types.ts | 25 + .../knowledge/runtime/KnowledgeAddQueue.ts | 146 +++ .../knowledge/runtime/KnowledgeAddRuntime.ts | 123 ++ .../runtime/KnowledgeRuntimeService.ts | 108 ++ .../__tests__/KnowledgeAddQueue.test.ts | 120 ++ .../__tests__/KnowledgeRuntimeService.test.ts | 950 ++++++++++++++ src/main/services/knowledge/runtime/index.ts | 5 + .../runtime/utils/__tests__/cleanup.test.ts | 249 ++++ .../utils/__tests__/taskRuntime.test.ts | 47 + .../knowledge/runtime/utils/cleanup.ts | 137 ++ .../knowledge/runtime/utils/taskRuntime.ts | 39 + .../knowledge/utils/__tests__/chunk.test.ts | 70 + .../knowledge/utils/__tests__/config.test.ts | 19 + .../utils/__tests__/directory.test.ts | 80 ++ .../knowledge/utils/__tests__/sitemap.test.ts | 128 ++ .../knowledge/utils/__tests__/url.test.ts | 136 ++ src/main/services/knowledge/utils/chunk.ts | 32 + src/main/services/knowledge/utils/config.ts | 43 + .../services/knowledge/utils/directory.ts | 145 +++ src/main/services/knowledge/utils/embed.ts | 38 
+ src/main/services/knowledge/utils/model.ts | 24 + src/main/services/knowledge/utils/sitemap.ts | 75 ++ src/main/services/knowledge/utils/url.ts | 69 + .../KnowledgeVectorStoreService.ts | 94 ++ .../KnowledgeVectorStoreService.test.ts | 201 +++ .../providers/BaseVectorStoreProvider.ts | 8 + .../providers/LibSqlVectorStoreProvider.ts | 62 + src/preload/index.ts | 13 + .../src/windows/migrationV2/i18n/locales.ts | 6 +- tsconfig.node.json | 5 +- tsconfig.web.json | 5 +- .../knowledge/knowledge-backend-decisions.md | 596 +++++++++ .../docs/knowledge/knowledge-schema.md | 73 +- .../knowledge/knowledge-vector-migrator.md | 299 +++++ vitest.config.ts | 12 + 88 files changed, 11276 insertions(+), 685 deletions(-) create mode 100644 .changeset/wild-dots-dance.md create mode 100644 packages/shared/__tests__/knowledge-schemas.test.ts create mode 100644 packages/vectorstores/libsql/CHANGELOG.md create mode 100644 packages/vectorstores/libsql/package.json create mode 100644 packages/vectorstores/libsql/src/LibSQLVectorStore.ts create mode 100644 packages/vectorstores/libsql/src/index.ts create mode 100644 packages/vectorstores/libsql/src/utils.ts create mode 100644 packages/vectorstores/libsql/tests/LibSQLVectorStore.test.ts create mode 100644 packages/vectorstores/libsql/tsconfig.json create mode 100644 packages/vectorstores/libsql/tsdown.config.ts create mode 100644 src/main/data/migration/v2/migrators/KnowledgeVectorMigrator.ts create mode 100644 src/main/data/migration/v2/migrators/README-KnowledgeVectorMigrator.md create mode 100644 src/main/data/migration/v2/migrators/__tests__/KnowledgeVectorMigrator.test.ts create mode 100644 src/main/data/migration/v2/utils/KnowledgeVectorSourceReader.ts create mode 100644 src/main/data/migration/v2/utils/__tests__/KnowledgeVectorSourceReader.test.ts delete mode 100644 src/main/data/services/__tests__/knowledgeBaseConfig.test.ts delete mode 100644 src/main/data/services/knowledgeBaseConfig.ts create mode 100644 
src/main/services/knowledge/KnowledgeOrchestrationService.ts create mode 100644 src/main/services/knowledge/__tests__/KnowledgeOrchestrationService.test.ts create mode 100644 src/main/services/knowledge/index.ts create mode 100644 src/main/services/knowledge/readers/KnowledgeFileReader.ts create mode 100644 src/main/services/knowledge/readers/KnowledgeNoteReader.ts create mode 100644 src/main/services/knowledge/readers/KnowledgeReader.ts create mode 100644 src/main/services/knowledge/readers/KnowledgeUrlReader.ts create mode 100644 src/main/services/knowledge/readers/__tests__/ReaderFactory.test.ts create mode 100644 src/main/services/knowledge/readers/files/DraftsExportReader.ts create mode 100644 src/main/services/knowledge/readers/files/EpubReader.ts create mode 100644 src/main/services/knowledge/readers/files/__tests__/DraftsExportReader.test.ts create mode 100644 src/main/services/knowledge/readers/files/__tests__/EpubReader.test.ts create mode 100644 src/main/services/knowledge/rerank/__tests__/rerank.test.ts create mode 100644 src/main/services/knowledge/rerank/adapters.ts create mode 100644 src/main/services/knowledge/rerank/rerank.ts create mode 100644 src/main/services/knowledge/rerank/types.ts create mode 100644 src/main/services/knowledge/runtime/KnowledgeAddQueue.ts create mode 100644 src/main/services/knowledge/runtime/KnowledgeAddRuntime.ts create mode 100644 src/main/services/knowledge/runtime/KnowledgeRuntimeService.ts create mode 100644 src/main/services/knowledge/runtime/__tests__/KnowledgeAddQueue.test.ts create mode 100644 src/main/services/knowledge/runtime/__tests__/KnowledgeRuntimeService.test.ts create mode 100644 src/main/services/knowledge/runtime/index.ts create mode 100644 src/main/services/knowledge/runtime/utils/__tests__/cleanup.test.ts create mode 100644 src/main/services/knowledge/runtime/utils/__tests__/taskRuntime.test.ts create mode 100644 src/main/services/knowledge/runtime/utils/cleanup.ts create mode 100644 
src/main/services/knowledge/runtime/utils/taskRuntime.ts create mode 100644 src/main/services/knowledge/utils/__tests__/chunk.test.ts create mode 100644 src/main/services/knowledge/utils/__tests__/config.test.ts create mode 100644 src/main/services/knowledge/utils/__tests__/directory.test.ts create mode 100644 src/main/services/knowledge/utils/__tests__/sitemap.test.ts create mode 100644 src/main/services/knowledge/utils/__tests__/url.test.ts create mode 100644 src/main/services/knowledge/utils/chunk.ts create mode 100644 src/main/services/knowledge/utils/config.ts create mode 100644 src/main/services/knowledge/utils/directory.ts create mode 100644 src/main/services/knowledge/utils/embed.ts create mode 100644 src/main/services/knowledge/utils/model.ts create mode 100644 src/main/services/knowledge/utils/sitemap.ts create mode 100644 src/main/services/knowledge/utils/url.ts create mode 100644 src/main/services/knowledge/vectorstore/KnowledgeVectorStoreService.ts create mode 100644 src/main/services/knowledge/vectorstore/__tests__/KnowledgeVectorStoreService.test.ts create mode 100644 src/main/services/knowledge/vectorstore/providers/BaseVectorStoreProvider.ts create mode 100644 src/main/services/knowledge/vectorstore/providers/LibSqlVectorStoreProvider.ts create mode 100644 v2-refactor-temp/docs/knowledge/knowledge-backend-decisions.md create mode 100644 v2-refactor-temp/docs/knowledge/knowledge-vector-migrator.md diff --git a/.changeset/wild-dots-dance.md b/.changeset/wild-dots-dance.md new file mode 100644 index 0000000000..4d786de869 --- /dev/null +++ b/.changeset/wild-dots-dance.md @@ -0,0 +1,4 @@ +--- +--- + +No package release is required for this PR. 
diff --git a/docs/references/knowledge/knowledge-service.md b/docs/references/knowledge/knowledge-service.md index f203a82f08..82249cfb4c 100644 --- a/docs/references/knowledge/knowledge-service.md +++ b/docs/references/knowledge/knowledge-service.md @@ -1,170 +1,117 @@ -# KnowledgeService Concurrency Control +# Knowledge Service -This document details the concurrency control and workload management mechanism in `KnowledgeService`. +This document records the current v2 knowledge backend shape in the main process. -## Concurrency Control and Workload Management +## Overview -KnowledgeService implements a fine-grained task queue system to control the number of concurrently processed items and workload. This system is implemented through the following key components: +The current implementation is split into three layers: -### 1. Key Variables and Limits +1. `KnowledgeBaseService` / `KnowledgeItemService` + - Persist SQLite-backed knowledge base and knowledge item data. + - Validate `type` / `data` consistency. + - Persist `knowledge_item.status` and `error`. +2. `KnowledgeOrchestrationService` + - Exposes the caller-facing IPC workflow. + - Coordinates expand, create, filter, add, delete, and search flows. +3. `KnowledgeRuntimeService` + - Executes indexing and retrieval work. + - Owns the in-memory add queue, interruption handling, and vector-store coordination. 
-```typescript -private workload = 0 -private processingItemCount = 0 -private knowledgeItemProcessingQueueMappingPromise: Map void> = new Map() -private static MAXIMUM_WORKLOAD = 1024 * 1024 * 80 // ~80MB -private static MAXIMUM_PROCESSING_ITEM_COUNT = 30 +```text +caller + -> Data API + -> preload IPC + -> KnowledgeOrchestrationService + -> KnowledgeBaseService / KnowledgeItemService + -> KnowledgeRuntimeService + -> reader / chunk / embed / rerank / vector store ``` -- `workload`: Tracks the total work currently being processed (in bytes) -- `processingItemCount`: Tracks the number of items currently being processed -- `MAXIMUM_WORKLOAD`: Maximum workload set to 80MB -- `MAXIMUM_PROCESSING_ITEM_COUNT`: Maximum concurrent processing items set to 30 +## Caller Contract -### 2. Workload Estimation +The caller-facing model is now unified: -Each task has a workload estimation mechanism via the `evaluateTaskWorkload` property: +1. Create item records through Data API. +2. Call runtime IPC once with item ids. -```typescript -interface EvaluateTaskWorkload { - workload: number -} +For leaf items (`file`, `url`, `note`): + +```text +caller + -> Data API create item(s) + -> preload IPC add-items(item ids) ``` -Different task types have different workload estimations: +For container items (`directory`, `sitemap`): -- File tasks: Use file size as workload `{ workload: file.size }` -- URL tasks: Use a fixed value `{ workload: 1024 * 1024 * 2 }` (~2MB) -- Sitemap tasks: Use a fixed value `{ workload: 1024 * 1024 * 20 }` (~20MB) -- Note tasks: Use text content byte length `{ workload: contentBytes.length }` - -### 3. 
Task State Management - -Tasks track their lifecycle through a state enum: - -```typescript -enum LoaderTaskItemState { - PENDING, // Waiting to be processed - PROCESSING, // Currently being processed - DONE // Completed -} +```text +caller + -> Data API create owner item + -> preload IPC add-items(owner item ids) + -> orchestration expands owner + -> orchestration persists child items + -> orchestration filters indexable leaf items + -> runtime enqueues leaf items ``` -### 4. Core Queue Processing Logic +The caller no longer needs to invoke separate `expand*` IPC APIs. -The core queue processing logic resides in the `processingQueueHandle` method: +## IPC Surface -```typescript -private processingQueueHandle() { - const getSubtasksUntilMaximumLoad = (): QueueTaskItem[] => { - const queueTaskList: QueueTaskItem[] = [] - that: for (const [task, resolve] of this.knowledgeItemProcessingQueueMappingPromise) { - for (const item of task.loaderTasks) { - if (this.maximumLoad()) { - break that - } +`KnowledgeOrchestrationService` currently owns the public IPC entrypoints: - const { state, task: taskPromise, evaluateTaskWorkload } = item +- `knowledge-runtime:create-base` +- `knowledge-runtime:delete-base` +- `knowledge-runtime:add-items` +- `knowledge-runtime:delete-items` +- `knowledge-runtime:search` - if (state !== LoaderTaskItemState.PENDING) { - continue - } +These IPC handlers are workflow-oriented. They may call data services and runtime services internally before returning. 
- const { workload } = evaluateTaskWorkload - this.workload += workload - this.processingItemCount += 1 - item.state = LoaderTaskItemState.PROCESSING - queueTaskList.push({ - taskPromise: () => - taskPromise().then(() => { - this.workload -= workload - this.processingItemCount -= 1 - task.loaderTasks.delete(item) - if (task.loaderTasks.size === 0) { - this.knowledgeItemProcessingQueueMappingPromise.delete(task) - resolve() - } - this.processingQueueHandle() - }), - resolve: () => {}, - evaluateTaskWorkload - }) - } - } - return queueTaskList - } +## Runtime Behavior - const subTasks = getSubtasksUntilMaximumLoad() - if (subTasks.length > 0) { - const subTaskPromises = subTasks.map(({ taskPromise }) => taskPromise()) - Promise.all(subTaskPromises).then(() => { - subTasks.forEach(({ resolve }) => resolve()) - }) - } -} -``` +`KnowledgeRuntimeService` keeps a single in-memory add queue with: -This method works as follows: +- one shared queue across all knowledge bases +- fixed concurrency of `5` +- item-level deduplication for pending/running add work +- interruption support for delete and shutdown -1. Iterates through all pending task sets -2. For each subtask in a task set: - - Checks if maximum load is reached (via `maximumLoad()`) - - If task state is PENDING: - - Increases current workload and processing item count - - Updates task state to PROCESSING - - Adds task to the execution queue -3. Executes all collected subtasks -4. When a subtask completes: - - Decreases workload and processing item count - - Removes completed task from the task set - - If the task set is empty, resolves the corresponding Promise - - Recursively calls `processingQueueHandle()` to process more tasks +Current status writes are: -### 5. 
Load Check +- `pending` before enqueue +- `completed` after successful vector write +- `failed` on error or shutdown interruption -```typescript -private maximumLoad() { - return ( - this.processingItemCount >= KnowledgeService.MAXIMUM_PROCESSING_ITEM_COUNT || - this.workload >= KnowledgeService.MAXIMUM_WORKLOAD - ) -} -``` +Intermediate states such as `file_processing`, `read`, and `embed` remain reserved in schema/types, but are not written by the current runtime. -This method checks whether maximum load is reached via two conditions: +## Search -- Processing item count reaches the limit (30) -- Total workload reaches the limit (80MB) +Search is executed by `KnowledgeRuntimeService.search(base, query)`: -### 6. Task Addition and Execution Flow +1. embed query +2. query the libsql vector store +3. map nodes into `KnowledgeSearchResult` +4. rerank only when `base.rerankModelId` is configured -When adding new tasks: +Current `KnowledgeSearchResult` includes: -1. Create a task (different tasks for different types) -2. Add the task to the queue via `appendProcessingQueue` -3. Call `processingQueueHandle` to start processing queued tasks +- `pageContent` +- `score` +- `metadata` +- optional `itemId` +- required `chunkId` -```typescript -private appendProcessingQueue(task: LoaderTask): Promise { - return new Promise((resolve) => { - this.knowledgeItemProcessingQueueMappingPromise.set(loaderTaskIntoOfSet(task), () => { - resolve(task.loaderDoneReturn!) - }) - }) -} -``` +`chunkId` is the vector row identity used for result-level attribution. `itemId` is populated from stored metadata when available. -## Benefits of This Concurrency Control +## Deletion -1. **Resource Optimization**: Limits concurrent items and total workload to prevent system resource exhaustion -2. **Auto-regulation**: Automatically fetches new tasks from the queue when tasks complete, maintaining efficient resource utilization -3. 
**Flexibility**: Different task types have different workload estimations, more accurately reflecting actual resource requirements -4. **Reliability**: State management and Promise resolution mechanism ensures tasks complete correctly and notify callers +Deletion still requires two concerns to be handled: -## Real-world Use Cases +1. Runtime deletion + - interrupt queue work + - delete vectors +2. Data deletion + - remove SQLite rows through Data API -This concurrency control is especially useful when processing large amounts of data: - -- Importing large directories that may contain hundreds of files -- Processing large sitemaps with many URLs -- Handling multiple users adding knowledge base items simultaneously +The runtime layer does not delete SQLite business data by itself. diff --git a/electron.vite.config.ts b/electron.vite.config.ts index d50a6953d7..bd67a52152 100644 --- a/electron.vite.config.ts +++ b/electron.vite.config.ts @@ -16,6 +16,10 @@ const visualizerPlugin = (type: 'renderer' | 'main') => { const isDev = process.env.NODE_ENV === 'development' const isProd = process.env.NODE_ENV === 'production' +const bundledMainDependencies = new Set(['@vectorstores/libsql']) +const mainExternalDependencies = Object.keys(pkg.dependencies).filter( + (dependency) => !bundledMainDependencies.has(dependency) +) export default defineConfig({ main: { @@ -36,6 +40,7 @@ export default defineConfig({ '@logger': resolve('src/main/services/LoggerService'), '@mcp-trace/trace-core': resolve('packages/mcp-trace/trace-core'), '@mcp-trace/trace-node': resolve('packages/mcp-trace/trace-node'), + '@vectorstores/libsql': resolve('packages/vectorstores/libsql/src/index.ts'), '@cherrystudio/provider-registry/node': resolve('packages/provider-registry/src/registry-loader'), '@cherrystudio/provider-registry': resolve('packages/provider-registry/src'), '@test-mocks': resolve('tests/__mocks__') @@ -43,7 +48,7 @@ export default defineConfig({ }, build: { rollupOptions: { - external: 
['bufferutil', 'utf-8-validate', 'electron', ...Object.keys(pkg.dependencies)], + external: ['bufferutil', 'utf-8-validate', 'electron', ...mainExternalDependencies], output: { manualChunks: undefined, // 彻底禁用代码分割 - 返回 null 强制单文件打包 inlineDynamicImports: true // 内联所有动态导入,这是关键配置 diff --git a/migrations/sqlite-drizzle/0006_silly_screwball.sql b/migrations/sqlite-drizzle/0006_silly_screwball.sql index a03af4c020..2dd3260040 100644 --- a/migrations/sqlite-drizzle/0006_silly_screwball.sql +++ b/migrations/sqlite-drizzle/0006_silly_screwball.sql @@ -30,7 +30,7 @@ CREATE TABLE `knowledge_item` ( FOREIGN KEY (`base_id`) REFERENCES `knowledge_base`(`id`) ON UPDATE no action ON DELETE cascade, FOREIGN KEY (`base_id`,`group_id`) REFERENCES `knowledge_item`(`base_id`,`id`) ON UPDATE no action ON DELETE cascade, CONSTRAINT "knowledge_item_type_check" CHECK("knowledge_item"."type" IN ('file', 'url', 'note', 'sitemap', 'directory')), - CONSTRAINT "knowledge_item_status_check" CHECK("knowledge_item"."status" IN ('idle', 'pending', 'ocr', 'read', 'embed', 'completed', 'failed')) + CONSTRAINT "knowledge_item_status_check" CHECK("knowledge_item"."status" IN ('idle', 'pending', 'file_processing', 'read', 'embed', 'completed', 'failed')) ); --> statement-breakpoint CREATE INDEX `knowledge_item_base_type_created_idx` ON `knowledge_item` (`base_id`,`type`,`created_at`);--> statement-breakpoint diff --git a/migrations/sqlite-drizzle/meta/0006_snapshot.json b/migrations/sqlite-drizzle/meta/0006_snapshot.json index 150aba49af..1933129a24 100644 --- a/migrations/sqlite-drizzle/meta/0006_snapshot.json +++ b/migrations/sqlite-drizzle/meta/0006_snapshot.json @@ -342,7 +342,7 @@ }, "knowledge_item_status_check": { "name": "knowledge_item_status_check", - "value": "\"knowledge_item\".\"status\" IN ('idle', 'pending', 'ocr', 'read', 'embed', 'completed', 'failed')" + "value": "\"knowledge_item\".\"status\" IN ('idle', 'pending', 'file_processing', 'read', 'embed', 'completed', 'failed')" } } }, 
diff --git a/package.json b/package.json index 5c7461c116..dfce768a57 100644 --- a/package.json +++ b/package.json @@ -57,6 +57,7 @@ "test:aicore": "vitest run --project aiCore", "test:pkg:ui": "vitest run --project ui", "test:shared": "vitest run --project shared", + "test:vectorstores": "vitest run --project vectorstores", "test:update": "pnpm test:renderer --update", "test:coverage": "vitest run --coverage --silent", "test:ui": "vitest --ui", @@ -80,20 +81,23 @@ "changeset:status": "changeset status", "changeset:version": "changeset version && pnpm format", "changeset:publish": "changeset publish", - "packages:build": "pnpm --filter @cherrystudio/ai-sdk-provider build && pnpm --filter @cherrystudio/ai-core build && pnpm --filter @cherrystudio/extension-table-plus build", + "packages:build": "pnpm --filter @cherrystudio/ai-sdk-provider build && pnpm --filter @cherrystudio/ai-core build && pnpm --filter @cherrystudio/extension-table-plus build && pnpm --filter @vectorstores/libsql build", "packages:release": "pnpm packages:build && changeset publish", "ci:basic-check": "pnpm test:lint && pnpm format:check && pnpm typecheck && pnpm i18n:check && pnpm i18n:hardcoded:strict && pnpm openapi:check && pnpm skills:check", - "ci:test-check": "pnpm test:main && pnpm test:renderer && pnpm test:aicore && pnpm test:shared && pnpm test:scripts", + "ci:test-check": "pnpm test:main && pnpm test:renderer && pnpm test:aicore && pnpm test:shared && pnpm test:vectorstores && pnpm test:scripts", "ci": "pnpm ci:basic-check && pnpm ci:test-check" }, "dependencies": { "@anthropic-ai/claude-agent-sdk": "0.2.81", "@expo/sudo-prompt": "^9.3.2", "@larksuiteoapi/node-sdk": "^1.59.0", - "@libsql/client": "0.14.0", + "@libsql/client": "^0.15.15", "@napi-rs/canvas": "0.1.80", "@napi-rs/system-ocr": "1.0.2", "@paymoapp/electron-shutdown-handler": "1.1.2", + "@vectorstores/core": "^0.1.8", + "@vectorstores/libsql": "workspace:*", + "@vectorstores/readers": "^0.1.8", "cron-parser": "^5.0.8", 
"drizzle-zod": "^0.8.3", "express": "5.1.0", diff --git a/packages/shared/IpcChannel.ts b/packages/shared/IpcChannel.ts index 7c57816f47..c3dd153c98 100644 --- a/packages/shared/IpcChannel.ts +++ b/packages/shared/IpcChannel.ts @@ -196,6 +196,11 @@ export enum IpcChannel { KnowledgeBase_Remove = 'knowledge-base:remove', KnowledgeBase_Search = 'knowledge-base:search', KnowledgeBase_Rerank = 'knowledge-base:rerank', + KnowledgeRuntime_CreateBase = 'knowledge-runtime:create-base', + KnowledgeRuntime_DeleteBase = 'knowledge-runtime:delete-base', + KnowledgeRuntime_AddItems = 'knowledge-runtime:add-items', + KnowledgeRuntime_DeleteItems = 'knowledge-runtime:delete-items', + KnowledgeRuntime_Search = 'knowledge-runtime:search', //file File_Open = 'file:open', diff --git a/packages/shared/__tests__/knowledge-schemas.test.ts b/packages/shared/__tests__/knowledge-schemas.test.ts new file mode 100644 index 0000000000..cc4b97361c --- /dev/null +++ b/packages/shared/__tests__/knowledge-schemas.test.ts @@ -0,0 +1,94 @@ +import { describe, expect, it } from 'vitest' + +import { CreateKnowledgeBaseSchema, UpdateKnowledgeBaseSchema } from '../data/api/schemas/knowledges' +import { KnowledgeBaseSchema, KnowledgeItemSchema } from '../data/types/knowledge' + +describe('Knowledge base schemas', () => { + it('accepts valid numeric tuning fields', () => { + expect( + CreateKnowledgeBaseSchema.safeParse({ + name: 'KB', + dimensions: 1024, + embeddingModelId: 'embed-model', + chunkSize: 800, + chunkOverlap: 120, + threshold: 0.5, + documentCount: 5, + searchMode: 'hybrid', + hybridAlpha: 0.7 + }).success + ).toBe(true) + }) + + it('rejects invalid numeric tuning fields in create schema', () => { + const result = CreateKnowledgeBaseSchema.safeParse({ + name: 'KB', + dimensions: 1024, + embeddingModelId: 'embed-model', + chunkSize: 0, + chunkOverlap: -1, + threshold: 2, + documentCount: 0, + hybridAlpha: -0.1 + }) + + expect(result.success).toBe(false) + }) + + it('rejects invalid numeric 
tuning fields in update schema', () => { + const result = UpdateKnowledgeBaseSchema.safeParse({ + chunkSize: -10, + chunkOverlap: -1, + threshold: 1.1, + documentCount: 0, + hybridAlpha: 2 + }) + + expect(result.success).toBe(false) + }) + + it('rejects invalid numeric tuning fields in entity schema', () => { + const result = KnowledgeBaseSchema.safeParse({ + id: 'kb-1', + name: 'KB', + dimensions: 1024, + embeddingModelId: 'embed-model', + chunkSize: 0, + chunkOverlap: -1, + threshold: 2, + documentCount: 0, + hybridAlpha: 2, + createdAt: '2026-04-10T00:00:00.000Z', + updatedAt: '2026-04-10T00:00:00.000Z' + }) + + expect(result.success).toBe(false) + }) + + it('requires knowledge items to carry an explicit nullable error field', () => { + expect( + KnowledgeItemSchema.safeParse({ + id: 'item-1', + baseId: 'kb-1', + type: 'note', + data: { content: 'hello' }, + status: 'idle', + error: null, + createdAt: '2026-04-10T00:00:00.000Z', + updatedAt: '2026-04-10T00:00:00.000Z' + }).success + ).toBe(true) + + expect( + KnowledgeItemSchema.safeParse({ + id: 'item-1', + baseId: 'kb-1', + type: 'note', + data: { content: 'hello' }, + status: 'idle', + createdAt: '2026-04-10T00:00:00.000Z', + updatedAt: '2026-04-10T00:00:00.000Z' + }).success + ).toBe(false) + }) +}) diff --git a/packages/shared/data/api/schemas/knowledges.ts b/packages/shared/data/api/schemas/knowledges.ts index 3fda0108ac..d3c9a48ef6 100644 --- a/packages/shared/data/api/schemas/knowledges.ts +++ b/packages/shared/data/api/schemas/knowledges.ts @@ -2,23 +2,27 @@ * Knowledge API DTOs and schema contracts. 
*/ +import type { OffsetPaginationResponse } from '@shared/data/api' import { DirectoryItemDataSchema, FileItemDataSchema, FileMetadataSchema, - ItemStatusSchema, type KnowledgeBase, + KnowledgeChunkOverlapSchema, + KnowledgeChunkSizeSchema, + KnowledgeDocumentCountSchema, + KnowledgeHybridAlphaSchema, type KnowledgeItem, + KnowledgeItemStatusSchema, KnowledgeItemTypeSchema, KnowledgeSearchModeSchema, + KnowledgeThresholdSchema, NoteItemDataSchema, SitemapItemDataSchema, UrlItemDataSchema } from '@shared/data/types/knowledge' import * as z from 'zod' -import type { OffsetPaginationResponse } from '../apiTypes' - export const CreateKnowledgeBaseSchema = z.object({ name: z.string().trim().min(1), description: z.string().optional(), @@ -26,12 +30,12 @@ export const CreateKnowledgeBaseSchema = z.object({ embeddingModelId: z.string().trim().min(1), rerankModelId: z.string().optional(), fileProcessorId: z.string().optional(), - chunkSize: z.number().optional(), - chunkOverlap: z.number().optional(), - threshold: z.number().optional(), - documentCount: z.number().optional(), + chunkSize: KnowledgeChunkSizeSchema.optional(), + chunkOverlap: KnowledgeChunkOverlapSchema.optional(), + threshold: KnowledgeThresholdSchema.optional(), + documentCount: KnowledgeDocumentCountSchema.optional(), searchMode: KnowledgeSearchModeSchema.optional(), - hybridAlpha: z.number().optional() + hybridAlpha: KnowledgeHybridAlphaSchema.optional() }) export type CreateKnowledgeBaseDto = z.infer @@ -41,12 +45,12 @@ export const UpdateKnowledgeBaseSchema = z description: z.string().nullable().optional(), rerankModelId: z.string().nullable().optional(), fileProcessorId: z.string().nullable().optional(), - chunkSize: z.number().nullable().optional(), - chunkOverlap: z.number().nullable().optional(), - threshold: z.number().nullable().optional(), - documentCount: z.number().nullable().optional(), + chunkSize: KnowledgeChunkSizeSchema.nullable().optional(), + chunkOverlap: 
KnowledgeChunkOverlapSchema.nullable().optional(), + threshold: KnowledgeThresholdSchema.nullable().optional(), + documentCount: KnowledgeDocumentCountSchema.nullable().optional(), searchMode: KnowledgeSearchModeSchema.nullable().optional(), - hybridAlpha: z.number().nullable().optional() + hybridAlpha: KnowledgeHybridAlphaSchema.nullable().optional() }) .strict() export type UpdateKnowledgeBaseDto = z.infer @@ -55,7 +59,7 @@ export { DirectoryItemDataSchema, FileItemDataSchema, FileMetadataSchema, - ItemStatusSchema, + KnowledgeItemStatusSchema, KnowledgeItemTypeSchema, KnowledgeSearchModeSchema, NoteItemDataSchema, @@ -63,42 +67,83 @@ export { UrlItemDataSchema } +const CreateKnowledgeItemBaseSchema = z + .object({ + ref: z.string().trim().min(1).optional(), + groupId: z.string().nullable().optional(), + groupRef: z.string().trim().min(1).optional() + }) + .strict() + +type CreateKnowledgeItemReferenceInput = z.input + +function validateCreateKnowledgeItemReferences(item: CreateKnowledgeItemReferenceInput, ctx: z.RefinementCtx): void { + if (item.groupId != null && item.groupRef != null) { + ctx.addIssue({ + code: 'custom', + path: ['groupRef'], + message: 'Knowledge items cannot specify both groupId and groupRef' + }) + } +} + +export function getCreateKnowledgeItemsReferenceErrors( + items: CreateKnowledgeItemReferenceInput[] +): Record { + const refs = new Set() + const duplicateRefs = new Set() + const missingGroupRefs = new Set() + + for (const item of items) { + if (item.ref) { + if (refs.has(item.ref)) { + duplicateRefs.add(item.ref) + } else { + refs.add(item.ref) + } + } + } + + for (const item of items) { + if (item.groupId == null && item.groupRef && !refs.has(item.groupRef)) { + missingGroupRefs.add(item.groupRef) + } + } + + const fieldErrors: Record = {} + + if (duplicateRefs.size > 0) { + fieldErrors.ref = [`Duplicate knowledge item refs in request batch: ${[...duplicateRefs].join(', ')}`] + } + + if (missingGroupRefs.size > 0) { + 
fieldErrors.groupRef = [`Knowledge item group ref not found in request batch: ${[...missingGroupRefs].join(', ')}`] + } + + return fieldErrors +} + export const CreateKnowledgeItemSchema = z.discriminatedUnion('type', [ - z - .object({ - groupId: z.string().nullable().optional(), - type: z.literal('file'), - data: FileItemDataSchema - }) - .strict(), - z - .object({ - groupId: z.string().nullable().optional(), - type: z.literal('url'), - data: UrlItemDataSchema - }) - .strict(), - z - .object({ - groupId: z.string().nullable().optional(), - type: z.literal('note'), - data: NoteItemDataSchema - }) - .strict(), - z - .object({ - groupId: z.string().nullable().optional(), - type: z.literal('sitemap'), - data: SitemapItemDataSchema - }) - .strict(), - z - .object({ - groupId: z.string().nullable().optional(), - type: z.literal('directory'), - data: DirectoryItemDataSchema - }) - .strict() + CreateKnowledgeItemBaseSchema.extend({ + type: z.literal('file'), + data: FileItemDataSchema + }).superRefine(validateCreateKnowledgeItemReferences), + CreateKnowledgeItemBaseSchema.extend({ + type: z.literal('url'), + data: UrlItemDataSchema + }).superRefine(validateCreateKnowledgeItemReferences), + CreateKnowledgeItemBaseSchema.extend({ + type: z.literal('note'), + data: NoteItemDataSchema + }).superRefine(validateCreateKnowledgeItemReferences), + CreateKnowledgeItemBaseSchema.extend({ + type: z.literal('sitemap'), + data: SitemapItemDataSchema + }).superRefine(validateCreateKnowledgeItemReferences), + CreateKnowledgeItemBaseSchema.extend({ + type: z.literal('directory'), + data: DirectoryItemDataSchema + }).superRefine(validateCreateKnowledgeItemReferences) ]) export type CreateKnowledgeItemDto = z.infer @@ -109,9 +154,23 @@ export const KNOWLEDGE_BASES_DEFAULT_PAGE = 1 export const KNOWLEDGE_BASES_DEFAULT_LIMIT = 20 export const KNOWLEDGE_BASES_MAX_LIMIT = 100 -export const CreateKnowledgeItemsSchema = z.object({ - items: 
z.array(CreateKnowledgeItemSchema).min(1).max(KNOWLEDGE_ITEMS_MAX_LIMIT) -}) +export const CreateKnowledgeItemsSchema = z + .object({ + items: z.array(CreateKnowledgeItemSchema).min(1).max(KNOWLEDGE_ITEMS_MAX_LIMIT) + }) + .superRefine((value, ctx) => { + const fieldErrors = getCreateKnowledgeItemsReferenceErrors(value.items) + + for (const [field, messages] of Object.entries(fieldErrors)) { + for (const message of messages) { + ctx.addIssue({ + code: 'custom', + path: ['items', field], + message + }) + } + } + }) export type CreateKnowledgeItemsDto = z.infer export const UpdateKnowledgeItemDataSchema = z.union([ @@ -125,7 +184,7 @@ export const UpdateKnowledgeItemDataSchema = z.union([ export const UpdateKnowledgeItemSchema = z .object({ data: UpdateKnowledgeItemDataSchema.optional(), - status: ItemStatusSchema.optional(), + status: KnowledgeItemStatusSchema.optional(), error: z.string().nullable().optional() }) .strict() diff --git a/packages/shared/data/types/knowledge.ts b/packages/shared/data/types/knowledge.ts index 3556154b44..6f739825c2 100644 --- a/packages/shared/data/types/knowledge.ts +++ b/packages/shared/data/types/knowledge.ts @@ -13,14 +13,28 @@ export const KNOWLEDGE_ITEM_TYPES = ['file', 'url', 'note', 'sitemap', 'director export const KnowledgeItemTypeSchema = z.enum(KNOWLEDGE_ITEM_TYPES) export type KnowledgeItemType = z.infer -export const KNOWLEDGE_ITEM_STATUSES = ['idle', 'pending', 'ocr', 'read', 'embed', 'completed', 'failed'] as const -export const ItemStatusSchema = z.enum(KNOWLEDGE_ITEM_STATUSES) -export type ItemStatus = z.infer +export const KNOWLEDGE_ITEM_STATUSES = [ + 'idle', + 'pending', + 'file_processing', + 'read', + 'embed', + 'completed', + 'failed' +] as const +export const KnowledgeItemStatusSchema = z.enum(KNOWLEDGE_ITEM_STATUSES) +export type KnowledgeItemStatus = z.infer export const KNOWLEDGE_SEARCH_MODES = ['default', 'bm25', 'hybrid'] as const export const KnowledgeSearchModeSchema = z.enum(KNOWLEDGE_SEARCH_MODES) 
export type KnowledgeSearchMode = z.infer +export const KnowledgeChunkSizeSchema = z.number().int().positive() +export const KnowledgeChunkOverlapSchema = z.number().int().min(0) +export const KnowledgeThresholdSchema = z.number().min(0).max(1) +export const KnowledgeDocumentCountSchema = z.number().int().positive() +export const KnowledgeHybridAlphaSchema = z.number().min(0).max(1) + /** * Temporary schema mirroring the current FileMetadata shape. * TODO: Move to `types/file.ts` once the dedicated file domain schema is ready. @@ -78,8 +92,8 @@ export type SitemapItemData = z.infer * Directory item data. */ export const DirectoryItemDataSchema = z.object({ - path: z.string().trim().min(1), - recursive: z.boolean() + name: z.string().trim().min(1), + path: z.string().trim().min(1) }) export type DirectoryItemData = z.infer @@ -114,12 +128,12 @@ export const KnowledgeBaseSchema = z.object({ embeddingModelId: z.string().min(1), rerankModelId: z.string().optional(), fileProcessorId: z.string().optional(), - chunkSize: z.number().optional(), - chunkOverlap: z.number().optional(), - threshold: z.number().optional(), - documentCount: z.number().optional(), + chunkSize: KnowledgeChunkSizeSchema.optional(), + chunkOverlap: KnowledgeChunkOverlapSchema.optional(), + threshold: KnowledgeThresholdSchema.optional(), + documentCount: KnowledgeDocumentCountSchema.optional(), searchMode: KnowledgeSearchModeSchema.optional(), - hybridAlpha: z.number().optional(), + hybridAlpha: KnowledgeHybridAlphaSchema.optional(), createdAt: z.iso.datetime(), updatedAt: z.iso.datetime() }) @@ -129,8 +143,8 @@ const KnowledgeItemBaseSchema = z.object({ id: z.string(), baseId: z.string(), groupId: z.string().nullable().optional(), - status: ItemStatusSchema, - error: z.string().nullable().optional(), + status: KnowledgeItemStatusSchema, + error: z.string().nullable(), createdAt: z.iso.datetime(), updatedAt: z.iso.datetime() }) @@ -171,6 +185,6 @@ export const KnowledgeSearchResultSchema = z.object({ 
score: z.number(), metadata: z.record(z.string(), z.unknown()), itemId: z.string().optional(), - chunkId: z.string().optional() + chunkId: z.string() }) export type KnowledgeSearchResult = z.infer diff --git a/packages/vectorstores/libsql/CHANGELOG.md b/packages/vectorstores/libsql/CHANGELOG.md new file mode 100644 index 0000000000..f02ea1db37 --- /dev/null +++ b/packages/vectorstores/libsql/CHANGELOG.md @@ -0,0 +1,13 @@ +# @vectorstores/libsql + +## 0.1.0 + +### Minor Changes + +- fddc11c: Add LibSQL/Turso vector store support with: + + - Vector search (default mode) using native vector32() and vector_distance_cos() + - BM25 full-text search mode using FTS5 + - Hybrid search mode combining vector + FTS5 + - Metadata filtering with all standard operators + - Collection management diff --git a/packages/vectorstores/libsql/package.json b/packages/vectorstores/libsql/package.json new file mode 100644 index 0000000000..ac57e8114e --- /dev/null +++ b/packages/vectorstores/libsql/package.json @@ -0,0 +1,49 @@ +{ + "name": "@vectorstores/libsql", + "description": "libSQL/Turso Storage for vectorstores", + "version": "0.1.0", + "type": "module", + "main": "./dist/index.cjs", + "module": "./dist/index.mjs", + "types": "./dist/index.d.mts", + "exports": { + ".": { + "types": "./dist/index.d.mts", + "import": "./dist/index.mjs", + "require": "./dist/index.cjs", + "default": "./dist/index.mjs" + } + }, + "files": [ + "dist" + ], + "repository": { + "type": "git", + "url": "git+https://github.com/marcusschiesser/vectorstores.git", + "directory": "packages/vectorstores/libsql" + }, + "scripts": { + "build": "tsdown", + "dev": "tsdown --watch", + "clean": "rm -rf dist", + "test": "pnpm --dir ../../.. 
exec vitest run --project vectorstores" + }, + "devDependencies": { + "tsdown": "^0.20.3", + "typescript": "^5.8.3", + "vitest": "2.1.0", + "@vectorstores/core": "^0.1.8", + "@vectorstores/env": "^0.1.0" + }, + "peerDependencies": { + "@vectorstores/core": "^0.1.8", + "@vectorstores/env": "^0.1.0" + }, + "dependencies": { + "@libsql/client": "^0.15.15" + }, + "sideEffects": false, + "engines": { + "node": ">=18.0.0" + } +} diff --git a/packages/vectorstores/libsql/src/LibSQLVectorStore.ts b/packages/vectorstores/libsql/src/LibSQLVectorStore.ts new file mode 100644 index 0000000000..f4f20730c4 --- /dev/null +++ b/packages/vectorstores/libsql/src/LibSQLVectorStore.ts @@ -0,0 +1,752 @@ +import { + type Client, + type Config as LibSQLClientConfig, + createClient, + type InArgs, + type InStatement +} from '@libsql/client' +import { + type BaseNode, + BaseVectorStore, + combineResults, + DEFAULT_COLLECTION, + Document, + FilterCondition, + FilterOperator, + type Metadata, + type MetadataFilter, + MetadataMode, + type VectorStoreQuery, + type VectorStoreQueryResult +} from '@vectorstores/core' +import { getEnv } from '@vectorstores/env' + +export const LIBSQL_TABLE = 'libsql_vectorstores_embedding' +export const DEFAULT_DIMENSIONS = 1536 +const SAFE_METADATA_KEY_PATTERN = /^[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$/ + +type PositionalArgs = Extract + +function toError(error: unknown): Error { + return error instanceof Error ? 
/**
 * Validates a metadata filter key before it is interpolated into a
 * `json_extract(..., '$.<key>')` path expression.
 *
 * Only dotted identifier paths matching `SAFE_METADATA_KEY_PATTERN` are
 * accepted, because the key ends up inside the SQL text and cannot be bound
 * as a parameter.
 *
 * @param key - Metadata key supplied by the caller's filter.
 * @returns The same key, unchanged, when it is safe to interpolate.
 * @throws Error when the key does not match the safe pattern.
 */
function validateMetadataKey(key: string): string {
  if (SAFE_METADATA_KEY_PATTERN.test(key)) {
    return key
  }

  throw new Error(`Invalid metadata filter key: ${key}`)
}

/**
 * Type guard for values that can be bound as a positional libSQL argument.
 *
 * `null`/`undefined` are rejected on purpose: every call site in this module
 * binds concrete values. `bigint` is accepted because the libSQL client
 * supports it as a bound value (previously it was rejected, which made
 * `toInArgs` throw on a perfectly valid argument).
 *
 * @param param - Candidate argument value.
 * @returns `true` when the value is a string, number, bigint, boolean,
 *          `ArrayBuffer`, buffer view or `Date`.
 */
function isSupportedInArg(param: unknown): param is NonNullable<PositionalArgs[number]> {
  if (param == null) {
    return false
  }

  switch (typeof param) {
    case 'string':
    case 'number':
    case 'bigint':
    case 'boolean':
      return true
    default:
      return param instanceof ArrayBuffer || ArrayBuffer.isView(param) || param instanceof Date
  }
}

/**
 * Converts an untyped parameter list into libSQL positional arguments,
 * failing fast (with the offending index) instead of letting the driver
 * reject the statement later.
 *
 * @param params - Values to bind, in placeholder order.
 * @returns The same array, typed as positional libSQL arguments.
 * @throws Error when any element is not a supported bound value.
 */
function toInArgs(params: unknown[]): InArgs {
  params.forEach((param, index) => {
    if (!isSupportedInArg(param)) {
      throw new Error(`Invalid libSQL argument at index ${index}: ${String(param)}`)
    }
  })

  return params as PositionalArgs
}
/**
 * Provides support for writing and querying vector data in libSQL/Turso.
 *
 * Rows live in a single table (`tableName`) partitioned by a `collection`
 * column. Three query modes are supported:
 * - default: similarity search through `vector_top_k` on a cosine vector index
 * - `bm25`: full-text search through an FTS5 external-content table
 * - `hybrid`: both of the above, merged with `combineResults`
 *
 * The schema (table, indexes, FTS table and sync triggers) is created lazily
 * on the first operation that touches the database.
 *
 * NOTE: `tableName` and `dimensions` are interpolated into SQL text (they
 * cannot be bound as parameters), so they must come from trusted
 * configuration, never from end-user input.
 */
export class LibSQLVectorStore extends BaseVectorStore {
  storesText: boolean = true

  private collection: string = DEFAULT_COLLECTION
  private readonly tableName: string = LIBSQL_TABLE
  private readonly dimensions: number = DEFAULT_DIMENSIONS

  private clientInstance: Client
  private initialized: boolean = false
  private initializationPromise?: Promise<void>

  /**
   * @param init - Either an existing libSQL `client`, or a `clientConfig` to
   *   create one. When neither is given the client is built from the
   *   `LIBSQL_URL` / `LIBSQL_AUTH_TOKEN` environment variables, falling back
   *   to a non-persistent in-memory database.
   */
  constructor(
    init: Partial<{ client: Client }> &
      Partial<{
        tableName?: string
        dimensions?: number
        collection?: string
        clientConfig?: LibSQLClientConfig
      }>
  ) {
    super()

    this.collection = init.collection ?? DEFAULT_COLLECTION
    this.tableName = init.tableName ?? LIBSQL_TABLE
    this.dimensions = init.dimensions ?? DEFAULT_DIMENSIONS

    // getDefaultClientConfig() always yields a config, so no "missing config"
    // branch is needed here.
    this.clientInstance = init.client ?? createClient(init.clientConfig ?? this.getDefaultClientConfig())
  }

  /** Switches the collection used by all subsequent reads and writes. */
  setCollection(coll: string) {
    this.collection = coll
  }

  /** Returns the collection currently used for reads and writes. */
  getCollection(): string {
    return this.collection
  }

  /** Returns the underlying libSQL client. */
  client(): Client {
    return this.clientInstance
  }

  /**
   * Builds a client config from the environment. Warns and falls back to
   * `:memory:` when `LIBSQL_URL` is not set.
   */
  private getDefaultClientConfig(): LibSQLClientConfig {
    const envUrl = getEnv('LIBSQL_URL')
    const url = envUrl ?? ':memory:'

    if (!envUrl) {
      console.warn(
        'LIBSQL_URL not set. Falling back to in-memory libSQL (non-persistent). Set LIBSQL_URL for a persistent database.'
      )
    }

    const authToken = getEnv('LIBSQL_AUTH_TOKEN')

    return authToken ? { url, authToken } : { url }
  }

  /**
   * Creates the schema once. Concurrent callers share the same in-flight
   * promise; the promise is cleared afterwards so a failed attempt can be
   * retried by the next call.
   */
  private async ensureInitialized(): Promise<void> {
    if (this.initialized) {
      return
    }

    if (!this.initializationPromise) {
      this.initializationPromise = this.checkSchema(this.clientInstance)
        .then(() => {
          this.initialized = true
        })
        .finally(() => {
          this.initializationPromise = undefined
        })
    }

    await this.initializationPromise
  }

  /**
   * Idempotently creates the embeddings table, its lookup/vector indexes,
   * the FTS5 external-content table and the triggers that keep FTS in sync.
   * Statements run sequentially because later ones depend on earlier ones.
   */
  private async checkSchema(client: Client) {
    const table = this.tableName
    const ftsTableName = `${table}_fts`

    const createTableStatement: InStatement = {
      sql: `
        CREATE TABLE IF NOT EXISTS ${table} (
          id TEXT PRIMARY KEY,
          external_id TEXT,
          collection TEXT,
          document TEXT,
          metadata JSON DEFAULT '{}',
          embeddings F32_BLOB(${this.dimensions})
        )
      `,
      args: []
    }
    await client.execute(createTableStatement)

    const indexStatement: InStatement = {
      sql: `
        CREATE INDEX IF NOT EXISTS idx_${table}_external_id
        ON ${table} (external_id)
      `,
      args: []
    }
    await client.execute(indexStatement)

    const collectionIndexStatement: InStatement = {
      sql: `
        CREATE INDEX IF NOT EXISTS idx_${table}_collection
        ON ${table} (collection)
      `,
      args: []
    }
    await client.execute(collectionIndexStatement)

    const vectorIndexStatement: InStatement = {
      sql: `
        CREATE INDEX IF NOT EXISTS idx_${table}_vector
        ON ${table} (libsql_vector_idx(embeddings, 'metric=cosine'))
      `,
      args: []
    }
    await client.execute(vectorIndexStatement)

    // Create FTS5 virtual table for full-text search (bm25/hybrid modes).
    // Remember whether it existed BEFORE creation: a freshly created FTS
    // table must be rebuilt so rows inserted earlier become searchable.
    const ftsTableExistsResult = await client.execute({
      sql: `
        SELECT 1
        FROM sqlite_master
        WHERE type = 'table' AND name = ?
        LIMIT 1
      `,
      args: toInArgs([ftsTableName])
    })
    const shouldRebuildFts = ftsTableExistsResult.rows.length === 0

    const ftsStatement: InStatement = {
      sql: `
        CREATE VIRTUAL TABLE IF NOT EXISTS ${ftsTableName}
        USING fts5(document, content='${table}', content_rowid='rowid')
      `,
      args: []
    }
    await client.execute(ftsStatement)

    // Triggers mirroring INSERT / UPDATE OF document / DELETE into FTS.
    await client.execute({
      sql: `
        CREATE TRIGGER IF NOT EXISTS ${table}_ai
        AFTER INSERT ON ${table}
        BEGIN
          INSERT INTO ${ftsTableName}(rowid, document)
          VALUES (NEW.rowid, NEW.document);
        END
      `,
      args: []
    })

    await client.execute({
      sql: `
        CREATE TRIGGER IF NOT EXISTS ${table}_au
        AFTER UPDATE OF document ON ${table}
        BEGIN
          INSERT INTO ${ftsTableName}(${ftsTableName}, rowid, document)
          VALUES ('delete', OLD.rowid, OLD.document);
          INSERT INTO ${ftsTableName}(rowid, document)
          VALUES (NEW.rowid, NEW.document);
        END
      `,
      args: []
    })

    await client.execute({
      sql: `
        CREATE TRIGGER IF NOT EXISTS ${table}_ad
        AFTER DELETE ON ${table}
        BEGIN
          INSERT INTO ${ftsTableName}(${ftsTableName}, rowid, document)
          VALUES ('delete', OLD.rowid, OLD.document);
        END
      `,
      args: []
    })

    if (shouldRebuildFts) {
      await client.execute({
        sql: `
          INSERT INTO ${ftsTableName}(${ftsTableName})
          VALUES ('rebuild')
        `,
        args: []
      })
    }
  }

  /** Deletes every row that belongs to the current collection. */
  async clearCollection(): Promise<void> {
    const sql = `DELETE FROM ${this.tableName} WHERE collection = ?`
    await this.ensureInitialized()
    const statement: InStatement = { sql, args: toInArgs([this.collection]) }
    await this.clientInstance.execute(statement)
  }

  /**
   * Converts nodes into insert tuples:
   * `[id, external_id, collection, document, metadata JSON, embedding JSON]`.
   *
   * The node's metadata is copied before `create_date` is defaulted, so the
   * caller's node objects are not mutated.
   *
   * @throws Error when a node has an empty id, no embedding, or an embedding
   *   whose length differs from the configured dimensions.
   */
  private getDataToInsert(embeddingResults: BaseNode[]) {
    return embeddingResults.map((node) => {
      const id = node.id_
      if (!id) {
        // Fail here with a precise message instead of binding NULL as the
        // primary key and failing later with an unrelated argument error.
        throw new Error('Cannot insert a node with an empty id into LibSQLVectorStore')
      }

      const externalId = node.sourceNode?.nodeId || id
      const meta: Metadata = { ...(node.metadata ?? {}) }
      if (!meta.create_date) {
        meta.create_date = new Date()
      }

      const embedding = this.normalizeEmbeddingOrThrow(this.getNodeEmbedding(node, id), id)

      // Convert embedding to JSON string for the vector32() function
      const embeddingJson = `[${Array.from(embedding).join(',')}]`

      return [id, externalId, this.collection, node.getContent(MetadataMode.NONE), JSON.stringify(meta), embeddingJson]
    })
  }

  /** Reads a node's embedding, rewrapping any failure with the node id. */
  private getNodeEmbedding(node: BaseNode, nodeId: string): number[] | undefined {
    try {
      return node.getEmbedding()
    } catch (error) {
      throw new Error(`Missing embedding for node ${nodeId}`, { cause: toError(error) })
    }
  }

  /**
   * Upserts nodes (keyed by node id) into the current collection using a
   * single multi-row statement.
   *
   * NOTE(review): each row binds 6 parameters in one statement, so very large
   * batches may hit the engine's bound-parameter limit — confirm against the
   * expected batch sizes.
   *
   * @returns The ids of the written rows, in input order.
   */
  async add(embeddingResults: BaseNode[]): Promise<string[]> {
    if (embeddingResults.length === 0) {
      console.warn('Empty list sent to LibSQLVectorStore::add')
      return []
    }

    await this.ensureInitialized()
    const data = this.getDataToInsert(embeddingResults)

    const placeholders = data
      .map(
        (_, index) =>
          `(?${index * 6 + 1}, ?${index * 6 + 2}, ?${index * 6 + 3}, ?${index * 6 + 4}, ?${index * 6 + 5}, vector32(?${index * 6 + 6}))`
      )
      .join(', ')

    const sql = `
      INSERT INTO ${this.tableName}
        (id, external_id, collection, document, metadata, embeddings)
      VALUES ${placeholders}
      ON CONFLICT (id) DO UPDATE SET
        external_id = excluded.external_id,
        collection = excluded.collection,
        document = excluded.document,
        metadata = excluded.metadata,
        embeddings = excluded.embeddings
    `

    const statement: InStatement = { sql, args: toInArgs(data.flat()) }
    await this.clientInstance.execute(statement)
    return data.map((row) => String(row[0]))
  }

  /**
   * Deletes all rows whose `external_id` equals `refDocId`, restricted to the
   * current collection when one is set.
   */
  async delete(refDocId: string, _deleteKwargs?: object): Promise<void> {
    void _deleteKwargs
    await this.ensureInitialized()

    const scoped = this.collection.length > 0
    const sql = `DELETE FROM ${this.tableName} WHERE external_id = ? ${scoped ? 'AND collection = ?' : ''}`
    const args = scoped ? [refDocId, this.collection] : [refDocId]

    const statement: InStatement = { sql, args: toInArgs(args) }
    await this.clientInstance.execute(statement)
  }

  /**
   * Validates an embedding against the configured dimensions.
   *
   * @throws Error when the embedding is missing/empty or has the wrong length.
   */
  private normalizeEmbeddingOrThrow(embedding: number[] | undefined, nodeId: string): Float32Array {
    if (!embedding || embedding.length === 0) {
      throw new Error(`Missing embedding for node ${nodeId}`)
    }

    if (embedding.length !== this.dimensions) {
      throw new Error(
        `Embedding dimension mismatch for node ${nodeId}: expected ${this.dimensions}, got ${embedding.length}`
      )
    }

    return new Float32Array(embedding)
  }

  /**
   * Decodes the `embeddings` column value into a plain number array.
   *
   * Accepts `Float32Array`, `ArrayBuffer`, any other buffer view (its bytes
   * are reinterpreted as float32) and plain arrays.
   *
   * @throws Error when the payload is missing, of an unexpected type, or a
   *   byte view whose length is not a whole number of float32 values.
   */
  private deserializeEmbedding(raw: unknown): number[] {
    if (raw == null) {
      throw new Error('Missing embedding payload in LibSQLVectorStore.deserializeEmbedding')
    }

    if (raw instanceof Float32Array) {
      return Array.from(raw)
    }

    if (raw instanceof ArrayBuffer) {
      return Array.from(new Float32Array(raw))
    }

    if (ArrayBuffer.isView(raw)) {
      const { buffer, byteOffset, byteLength } = raw
      const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT

      if (byteLength % bytesPerFloat !== 0) {
        throw new Error(
          `Invalid embedding payload in LibSQLVectorStore.deserializeEmbedding: byte length ${byteLength} is not a multiple of ${bytesPerFloat}`
        )
      }

      // A Float32Array view must start on a 4-byte boundary. Views handed out
      // from a shared pool (e.g. Node Buffers) may not, so copy the bytes
      // into a fresh, aligned buffer in that case instead of throwing.
      const floats =
        byteOffset % bytesPerFloat === 0
          ? new Float32Array(buffer, byteOffset, byteLength / bytesPerFloat)
          : new Float32Array(buffer.slice(byteOffset, byteOffset + byteLength))

      return Array.from(floats)
    }

    if (Array.isArray(raw)) {
      return raw.map((value) => Number(value))
    }

    throw new Error(
      `Unexpected embedding payload type in LibSQLVectorStore.deserializeEmbedding: ${JSON.stringify({
        type: typeof raw,
        constructorName: raw instanceof Object ? raw.constructor?.name : undefined
      })}`
    )
  }

  /**
   * Parses a JSON column value. Non-string values are returned as-is,
   * `null`/`undefined` and unparsable strings yield `fallback` (the latter
   * with a warning that names the field and row).
   */
  private parseJson<T>(
    value: T | string | null | undefined,
    fallback: T,
    context: { field: string; rowId?: string }
  ): T {
    if (value == null) {
      return fallback
    }

    if (typeof value !== 'string') {
      return value as T
    }

    try {
      return JSON.parse(value) as T
    } catch (error) {
      console.warn(`Failed to parse ${context.field} JSON for row ${context.rowId ?? ''}`, toError(error))
      return fallback
    }
  }

  /** Maps a filter condition to its SQL keyword; anything but OR means AND. */
  private toLibSQLCondition(condition: `${FilterCondition}`) {
    return condition === FilterCondition.OR ? 'OR' : 'AND'
  }

  /**
   * Translates one metadata filter into a SQL predicate over
   * `json_extract(<alias>.metadata, '$.<key>')` plus its bound parameters.
   *
   * The key is validated before interpolation; values are always bound.
   * Operators without a dedicated case fall back to equality.
   */
  private buildFilterClause(
    filter: MetadataFilter,
    alias: string
  ): {
    clause: string
    params: unknown[]
  } {
    const key = validateMetadataKey(filter.key)
    const field = `json_extract(${alias}.metadata, '$.${key}')`
    const numericField = `CAST(${field} AS REAL)`

    // IN / NOT IN accept either a list or a single value.
    const membership = (keyword: 'IN' | 'NOT IN') => {
      const values: unknown[] = Array.isArray(filter.value) ? filter.value : [filter.value]
      const placeholders = values.map(() => '?').join(', ')
      return { clause: `${field} ${keyword} (${placeholders})`, params: values }
    }

    switch (filter.operator) {
      case FilterOperator.GT:
        return { clause: `${numericField} > ?`, params: [filter.value] }
      case FilterOperator.LT:
        return { clause: `${numericField} < ?`, params: [filter.value] }
      case FilterOperator.GTE:
        return { clause: `${numericField} >= ?`, params: [filter.value] }
      case FilterOperator.LTE:
        return { clause: `${numericField} <= ?`, params: [filter.value] }
      case FilterOperator.NE:
        return { clause: `${field} != ?`, params: [filter.value] }
      case FilterOperator.IN:
        return membership('IN')
      case FilterOperator.NIN:
        return membership('NOT IN')
      case FilterOperator.CONTAINS:
        return { clause: `${field} LIKE '%' || ? || '%'`, params: [filter.value] }
      case FilterOperator.IS_EMPTY:
        return {
          clause: `(${field} IS NULL OR ${field} = '' OR ${field} = '[]')`,
          params: []
        }
      case FilterOperator.TEXT_MATCH:
        return { clause: `LOWER(${field}) LIKE LOWER('%' || ? || '%')`, params: [filter.value] }
      case FilterOperator.EQ:
      default:
        return { clause: `${field} = ?`, params: [filter.value] }
    }
  }

  /**
   * Runs a query in the mode requested by `query.mode`
   * (`'bm25'`, `'hybrid'`, anything else → vector search).
   */
  async query(query: VectorStoreQuery, _options?: object): Promise<VectorStoreQueryResult> {
    void _options
    await this.ensureInitialized()

    switch (query.mode) {
      case 'bm25':
        return this.bm25Search(query)
      case 'hybrid':
        return this.hybridSearch(query)
      default:
        return this.vectorSearch(query)
    }
  }

  /**
   * Builds the `WHERE ...` fragment (collection scope + metadata filters)
   * for the table aliased as `alias`, with parameters in placeholder order.
   * Returns an empty `where` string when there is nothing to filter on.
   */
  private buildWhereClause(
    query: VectorStoreQuery,
    alias: string
  ): {
    where: string
    params: unknown[]
  } {
    const whereClauses: string[] = []
    const params: unknown[] = []

    if (this.collection.length) {
      whereClauses.push(`${alias}.collection = ?`)
      params.push(this.collection)
    }

    const filterClauses: string[] = []
    for (const filter of query.filters?.filters ?? []) {
      const built = this.buildFilterClause(filter, alias)
      filterClauses.push(built.clause)
      params.push(...built.params)
    }

    if (filterClauses.length > 0) {
      const condition = this.toLibSQLCondition(query.filters?.condition ?? FilterCondition.AND)
      whereClauses.push(`(${filterClauses.join(` ${condition} `)})`)
    }

    const where = whereClauses.length > 0 ? `WHERE ${whereClauses.join(' AND ')}` : ''

    return { where, params }
  }

  /**
   * Resolves a row limit that is about to be interpolated into SQL text.
   *
   * @throws Error when the value is not a non-negative safe integer, so that
   *   nothing but a plain integer can ever reach the statement.
   */
  private resolveLimit(value: number | undefined, fallback: number, label: string): number {
    const limit = value ?? fallback

    if (!Number.isSafeInteger(limit) || limit < 0) {
      throw new Error(`${label} must be a non-negative integer, got ${String(limit)}`)
    }

    return limit
  }

  /**
   * Similarity search: takes candidates from the vector index via
   * `vector_top_k`, joins them back to the table, applies the WHERE clause
   * and orders by cosine distance.
   */
  private async vectorSearch(query: VectorStoreQuery): Promise<VectorStoreQueryResult> {
    const queryEmbedding = query.queryEmbedding ?? []

    if (!queryEmbedding.length) {
      throw new Error('queryEmbedding is required for vector search')
    }

    const max = this.resolveLimit(query.similarityTopK, 2, 'similarityTopK')
    const { where, params } = this.buildWhereClause(query, 't')
    const vectorJson = `[${queryEmbedding.join(',')}]`
    const indexName = `idx_${this.tableName}_vector`

    // Use vector_top_k for efficient ANN search with vector index.
    // Filtering happens AFTER the index lookup, so fetch more candidates
    // whenever a WHERE clause may discard some of them.
    const prefetch = where ? max * 5 : max

    const vectorStatement: InStatement = {
      sql: `
        SELECT t.*, vector_distance_cos(t.embeddings, vector32(?)) as distance
        FROM vector_top_k('${indexName}', vector32(?), ${prefetch}) AS v
        JOIN ${this.tableName} t ON t.rowid = v.id
        ${where}
        ORDER BY distance
        LIMIT ${max}
      `,
      // Placeholder order: SELECT vector, vector_top_k vector, WHERE params.
      args: toInArgs([vectorJson, vectorJson, ...params])
    }

    const vectorResults = await this.clientInstance.execute(vectorStatement)
    return this.mapVectorResult(vectorResults.rows, max)
  }

  /**
   * Full-text search over the FTS5 table, ordered by `bm25()` score.
   *
   * @throws Error (`'BM25 search failed'`, with the driver error as `cause`)
   *   when the statement fails, e.g. on an invalid FTS5 query string.
   */
  private async bm25Search(query: VectorStoreQuery): Promise<VectorStoreQueryResult> {
    if (!query.queryStr) {
      throw new Error('queryStr is required for BM25 mode')
    }

    const max = this.resolveLimit(query.similarityTopK, 2, 'similarityTopK')
    const { where, params } = this.buildWhereClause(query, 'v')

    // Use FTS5 for BM25 search
    const ftsStatement: InStatement = {
      sql: `
        SELECT v.*, bm25(${this.tableName}_fts) as score
        FROM ${this.tableName}_fts fts
        JOIN ${this.tableName} v ON fts.rowid = v.rowid
        ${where}
        ${where ? 'AND' : 'WHERE'} ${this.tableName}_fts MATCH ?
        ORDER BY score
        LIMIT ${max}
      `,
      args: toInArgs([...params, query.queryStr])
    }

    try {
      const results = await this.clientInstance.execute(ftsStatement)
      return this.mapBm25Result(results.rows, max)
    } catch (error) {
      console.warn('FTS5 search failed:', toError(error))
      throw new Error('BM25 search failed', { cause: toError(error) })
    }
  }

  /**
   * Hybrid search: runs the vector search and the BM25 search with a larger
   * candidate pool (`hybridPrefetch`, default `similarityTopK * 5`) and
   * merges both result sets with `combineResults`, weighted by `alpha`.
   */
  private async hybridSearch(query: VectorStoreQuery): Promise<VectorStoreQueryResult> {
    const queryEmbedding = query.queryEmbedding ?? []

    if (!queryEmbedding.length) {
      throw new Error('queryEmbedding is required for HYBRID mode')
    }
    if (!query.queryStr) {
      throw new Error('queryStr is required for HYBRID mode')
    }

    const max = this.resolveLimit(query.similarityTopK, 2, 'similarityTopK')
    const alpha = query.alpha ?? 0.5
    const prefetch = this.resolveLimit(query.hybridPrefetch, max * 5, 'hybridPrefetch')

    // Step 1: Get vector search results
    const vectorResults = await this.vectorSearch({
      ...query,
      similarityTopK: prefetch,
      mode: 'default'
    })

    // Step 2: Get BM25 results
    const bm25Results = await this.bm25Search({
      ...query,
      similarityTopK: prefetch,
      mode: 'bm25'
    })

    // Step 3: Combine both rankings
    return combineResults(vectorResults, bm25Results, alpha, max)
  }

  /**
   * Shared row → result mapping for every search mode.
   *
   * For each of the first `max` rows it decodes the embedding, parses the
   * metadata and — when the row has an `external_id` and the metadata has no
   * `itemId` yet — exposes the external id as `metadata.itemId`.
   *
   * @param scoreOf - Derives the similarity value from a raw row.
   */
  private mapRows(
    rows: Record<string, unknown>[],
    max: number,
    scoreOf: (row: Record<string, unknown>) => number
  ): VectorStoreQueryResult {
    const nodes: Document[] = []
    const similarities: number[] = []
    const ids: string[] = []

    for (const row of rows.slice(0, max)) {
      const embedding = this.deserializeEmbedding(row.embeddings)
      const similarity = scoreOf(row)
      const parsed = this.parseJson<Metadata>(
        row.metadata as Metadata | string | null | undefined,
        {},
        {
          field: 'metadata',
          rowId: String(row.id ?? '')
        }
      )
      // Valid JSON is not necessarily an object (e.g. `5` or `[]`); only a
      // plain object can carry metadata keys.
      const metadata: Metadata = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {}

      const externalId = typeof row.external_id === 'string' && row.external_id.length > 0 ? row.external_id : undefined
      if (externalId && metadata.itemId === undefined) {
        metadata.itemId = externalId
      }

      const id = String(row.id)
      nodes.push(
        new Document({
          id_: id,
          text: String(row.document || ''),
          metadata,
          embedding
        })
      )
      similarities.push(similarity)
      ids.push(id)
    }

    return { nodes, similarities, ids }
  }

  /** Maps vector-search rows; similarity is `1 - cosine distance`. */
  private mapVectorResult(rows: Record<string, unknown>[], max: number): VectorStoreQueryResult {
    return this.mapRows(rows, max, (row) => 1 - Number(row.distance ?? 0))
  }

  /** Maps BM25 rows; similarity is the absolute value of the `bm25()` score. */
  private mapBm25Result(rows: Record<string, unknown>[], max: number): VectorStoreQueryResult {
    return this.mapRows(rows, max, (row) => Math.abs(Number(row.score ?? 0)))
  }

  /** No-op: every write is already issued directly against the database. */
  persist(_persistPath: string): Promise<void> {
    void _persistPath
    return Promise.resolve()
  }

  /**
   * Reports whether any row with `external_id = refDocId` exists, restricted
   * to the current collection when one is set.
   */
  async exists(refDocId: string): Promise<boolean> {
    await this.ensureInitialized()

    const scoped = this.collection.length > 0
    const sql = `SELECT 1 FROM ${this.tableName}
                 WHERE external_id = ? ${scoped ? 'AND collection = ?' : ''} LIMIT 1`
    const params = scoped ? [refDocId, this.collection] : [refDocId]

    const results = await this.clientInstance.execute({
      sql,
      args: toInArgs(params)
    })
    return results.rows.length > 0
  }
}
+ */ +export function fromFloat32Array(float32Array: Float32Array): number[] { + return Array.from(float32Array) +} diff --git a/packages/vectorstores/libsql/tests/LibSQLVectorStore.test.ts b/packages/vectorstores/libsql/tests/LibSQLVectorStore.test.ts new file mode 100644 index 0000000000..1ca8fe4545 --- /dev/null +++ b/packages/vectorstores/libsql/tests/LibSQLVectorStore.test.ts @@ -0,0 +1,1142 @@ +import { createClient } from '@libsql/client' +import type { BaseNode, MetadataFilters, VectorStoreQuery } from '@vectorstores/core' +import { + FilterCondition, + FilterOperator, + type Metadata, + MetadataMode, + NodeRelationship, + TextNode, + VectorStoreQueryMode +} from '@vectorstores/core' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +import { LibSQLVectorStore } from '../src/LibSQLVectorStore.js' + +describe('LibSQLVectorStore', () => { + let store: LibSQLVectorStore + let client: ReturnType + + beforeEach(() => { + // Use in-memory database for testing + client = createClient({ + url: ':memory:' + }) + + store = new LibSQLVectorStore({ + client, + tableName: 'test_embeddings', + dimensions: 2 + }) + }) + + describe('Basic Operations', () => { + it('should initialize with default configuration', () => { + const defaultStore = new LibSQLVectorStore({ + clientConfig: { url: ':memory:' } + }) + expect(defaultStore).toBeDefined() + expect(defaultStore.storesText).toBe(true) + }) + + it('should default to in-memory client when no clientConfig or client provided', () => { + const previousUrl = process.env.LIBSQL_URL + const previousAuth = process.env.LIBSQL_AUTH_TOKEN + delete process.env.LIBSQL_URL + delete process.env.LIBSQL_AUTH_TOKEN + + const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}) + const fallbackStore = new LibSQLVectorStore({}) + warnSpy.mockRestore() + + if (previousUrl) process.env.LIBSQL_URL = previousUrl + else delete process.env.LIBSQL_URL + + if (previousAuth) process.env.LIBSQL_AUTH_TOKEN = previousAuth + 
else delete process.env.LIBSQL_AUTH_TOKEN + + expect(fallbackStore.client()).toBeDefined() + }) + + it('should set and get collection', () => { + store.setCollection('test-collection') + expect(store.getCollection()).toBe('test-collection') + }) + + it('should get client connection', () => { + const db = store.client() + expect(db).toBeDefined() + }) + }) + + describe('Vector Operations', () => { + beforeEach(async () => { + // Ensure the database schema is set up + // The schema is created lazily on first operation + }) + + it('should add nodes to vector store', async () => { + const nodes: BaseNode[] = [ + new TextNode({ + embedding: [0.1, 0.2], + metadata: { category: 'test', score: 1.0 } + }), + new TextNode({ + embedding: [0.3, 0.4], + metadata: { category: 'example', score: 0.5 } + }) + ] + + const ids = await store.add(nodes) + expect(ids).toHaveLength(2) + expect(ids[0]).toBeDefined() + expect(ids[1]).toBeDefined() + }) + + it('should reject nodes with missing embeddings instead of writing zero vectors', async () => { + const node = new TextNode({ + id_: 'chunk-missing-embedding', + text: 'Document chunk without embedding', + metadata: { category: 'invalid' } + }) + + await expect(store.add([node])).rejects.toThrow('Missing embedding for node chunk-missing-embedding') + + const rows = await client.execute( + "SELECT COUNT(*) as count FROM test_embeddings WHERE id = 'chunk-missing-embedding'" + ) + expect(Number(rows.rows[0]?.count ?? 
0)).toBe(0) + }) + + it('should reject nodes with mismatched embedding dimensions', async () => { + const node = new TextNode({ + id_: 'chunk-bad-dimensions', + text: 'Document chunk with mismatched embedding dimensions', + embedding: [0.1, 0.2, 0.3], + metadata: { category: 'invalid' } + }) + + await expect(store.add([node])).rejects.toThrow( + 'Embedding dimension mismatch for node chunk-bad-dimensions: expected 2, got 3' + ) + + const rows = await client.execute( + "SELECT COUNT(*) as count FROM test_embeddings WHERE id = 'chunk-bad-dimensions'" + ) + expect(Number(rows.rows[0]?.count ?? 0)).toBe(0) + }) + + it('should persist external_id from sourceNode.nodeId', async () => { + const node = new TextNode({ + id_: 'chunk-1', + text: 'Document chunk', + embedding: [0.1, 0.2], + metadata: { category: 'test' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-1', + metadata: {} + } + } + }) + + await store.add([node]) + + const rows = await client.execute('SELECT id, external_id, collection FROM test_embeddings') + expect(rows.rows).toHaveLength(1) + expect(rows.rows[0]).toMatchObject({ + id: 'chunk-1', + external_id: 'item-1', + collection: store.getCollection() + }) + }) + + it('should fall back to node.id_ when sourceNode.nodeId is missing', async () => { + const node = new TextNode({ + id_: 'chunk-2', + text: 'Document chunk without source node', + embedding: [0.3, 0.4], + metadata: { category: 'fallback' } + }) + + await store.add([node]) + + const rows = await client.execute("SELECT id, external_id FROM test_embeddings WHERE id = 'chunk-2'") + expect(rows.rows).toHaveLength(1) + expect(rows.rows[0]).toMatchObject({ + id: 'chunk-2', + external_id: 'chunk-2' + }) + }) + + it('should query vectors by similarity', async () => { + // Add test data + const nodes: BaseNode[] = [ + new TextNode({ + text: 'First document', + embedding: [1.0, 0.0], + metadata: { category: 'doc1' } + }), + new TextNode({ + text: 'Second document', + embedding: [0.0, 
1.0], + metadata: { category: 'doc2' } + }) + ] + + await store.add(nodes) + + // Query for similar vectors + const query: VectorStoreQuery = { + queryEmbedding: [0.9, 0.1], + similarityTopK: 2, + mode: VectorStoreQueryMode.DEFAULT + } + + const result = await store.query(query) + + expect(result.nodes).toHaveLength(2) + expect(result.ids).toHaveLength(2) + expect(result.similarities).toHaveLength(2) + + // First result should be more similar (closer to [1.0, 0.0]) + expect(result.similarities[0]).toBeGreaterThan(result.similarities[1]) + }) + + it('should expose itemId from external_id in query results', async () => { + const node = new TextNode({ + id_: 'chunk-knowledge-1', + text: 'Knowledge document', + embedding: [1.0, 0.0], + metadata: { source: '/tmp/doc.md' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-knowledge-1', + metadata: {} + } + } + }) + + await store.add([node]) + + const result = await store.query({ + queryEmbedding: [1.0, 0.0], + similarityTopK: 1, + mode: VectorStoreQueryMode.DEFAULT + }) + + expect(result.nodes).toHaveLength(1) + expect(result.nodes?.[0]?.metadata).toMatchObject({ + source: '/tmp/doc.md', + itemId: 'item-knowledge-1' + }) + }) + + it('should tolerate invalid metadata JSON in vector query results', async () => { + await store.add([ + new TextNode({ + id_: 'chunk-invalid-metadata-vector', + text: 'Knowledge document', + embedding: [1.0, 0.0], + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-invalid-metadata-vector', + metadata: {} + } + } + }) + ]) + + await client.execute({ + sql: 'UPDATE test_embeddings SET metadata = ? 
WHERE id = ?', + args: ['{"itemId":', 'chunk-invalid-metadata-vector'] + }) + + const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}) + const result = await store.query({ + queryEmbedding: [1.0, 0.0], + similarityTopK: 1, + mode: VectorStoreQueryMode.DEFAULT + }) + + expect(result.nodes).toHaveLength(1) + expect(result.nodes?.[0]?.metadata).toMatchObject({ + itemId: 'item-invalid-metadata-vector' + }) + expect(warnSpy).toHaveBeenCalledWith( + 'Failed to parse metadata JSON for row chunk-invalid-metadata-vector', + expect.any(Error) + ) + warnSpy.mockRestore() + }) + + it('should tolerate invalid metadata JSON in bm25 query results', async () => { + await store.add([ + new TextNode({ + id_: 'chunk-invalid-metadata-bm25', + text: 'searchable bm25 document', + embedding: [1.0, 0.0], + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-invalid-metadata-bm25', + metadata: {} + } + } + }) + ]) + + await client.execute({ + sql: 'UPDATE test_embeddings SET metadata = ? 
WHERE id = ?', + args: ['{"itemId":', 'chunk-invalid-metadata-bm25'] + }) + + const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}) + const result = await store.query({ + queryStr: 'searchable', + similarityTopK: 1, + mode: VectorStoreQueryMode.BM25 + }) + + expect(result.nodes).toHaveLength(1) + expect(result.nodes?.[0]?.metadata).toMatchObject({ + itemId: 'item-invalid-metadata-bm25' + }) + expect(warnSpy).toHaveBeenCalledWith( + 'Failed to parse metadata JSON for row chunk-invalid-metadata-bm25', + expect.any(Error) + ) + warnSpy.mockRestore() + }) + + it('should preserve the original cause when bm25 execution fails', async () => { + await store.add([ + new TextNode({ + id_: 'chunk-bm25-failure', + text: 'searchable document', + embedding: [1.0, 0.0], + metadata: { category: 'test' } + }) + ]) + + const originalExecute = client.execute.bind(client) + const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => { + const sql = typeof statement === 'string' ? 
statement : statement.sql + if (typeof sql === 'string' && sql.includes('bm25(')) { + throw new Error('fts execution failed') + } + + return await originalExecute(statement) + }) + + const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}) + + try { + await store.query({ + queryStr: 'searchable', + similarityTopK: 1, + mode: VectorStoreQueryMode.BM25 + }) + throw new Error('Expected BM25 query to fail') + } catch (error) { + expect(error).toBeInstanceOf(Error) + expect((error as Error).message).toBe('BM25 search failed') + expect((error as Error & { cause?: unknown }).cause).toBeInstanceOf(Error) + expect(((error as Error & { cause?: Error }).cause as Error).message).toBe('fts execution failed') + } + + expect(warnSpy).toHaveBeenCalledWith('FTS5 search failed:', expect.any(Error)) + warnSpy.mockRestore() + executeSpy.mockRestore() + }) + + it('should handle empty add request', async () => { + const ids = await store.add([]) + expect(ids).toEqual([]) + }) + + it('should throw when SQL arguments would contain invalid nullish values', async () => { + const invalidNode = { + id_: '', + metadata: { category: 'test' }, + sourceNode: undefined, + getEmbedding: () => [0.1, 0.2], + getContent: () => 'Document chunk' + } as unknown as BaseNode + + await expect(store.add([invalidNode])).rejects.toThrow('Invalid libSQL argument at index 0: null') + }) + + it('should fail initialization when vector index creation fails', async () => { + const originalExecute = client.execute.bind(client) + const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => { + const sql = typeof statement === 'string' ? 
statement : statement.sql + if (typeof sql === 'string' && sql.includes('libsql_vector_idx')) { + throw new Error('vector index failed') + } + + return await originalExecute(statement) + }) + + const node = new TextNode({ + id_: 'chunk-hard-fail', + text: 'Document chunk', + embedding: [0.1, 0.2], + metadata: { category: 'test' } + }) + + await expect(store.add([node])).rejects.toThrow('vector index failed') + executeSpy.mockRestore() + }) + + it('should fail initialization when FTS schema creation fails', async () => { + const originalExecute = client.execute.bind(client) + const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => { + const sql = typeof statement === 'string' ? statement : statement.sql + if (typeof sql === 'string' && sql.includes('CREATE VIRTUAL TABLE IF NOT EXISTS test_embeddings_fts')) { + throw new Error('fts creation failed') + } + + return await originalExecute(statement) + }) + + const node = new TextNode({ + id_: 'chunk-fts-fail', + text: 'Document chunk', + embedding: [0.1, 0.2], + metadata: { category: 'test' } + }) + + await expect(store.add([node])).rejects.toThrow('fts creation failed') + executeSpy.mockRestore() + }) + + it('should only run schema initialization once for concurrent callers', async () => { + let checkSchemaCalls = 0 + let resolveInitialization!: () => void + const initializationBarrier = new Promise((resolve) => { + resolveInitialization = resolve + }) + const originalCheckSchema = (store as any).checkSchema.bind(store) as (clientArg: unknown) => Promise + + const checkSchemaSpy = vi.spyOn(store as any, 'checkSchema').mockImplementation(async (clientArg: unknown) => { + checkSchemaCalls += 1 + await initializationBarrier + return await originalCheckSchema(clientArg) + }) + + const firstAddPromise = store.add([ + new TextNode({ + id_: 'chunk-concurrent-1', + text: 'Concurrent document 1', + embedding: [0.1, 0.2], + metadata: { category: 'first' } + }) + ]) + + const secondAddPromise = 
store.add([ + new TextNode({ + id_: 'chunk-concurrent-2', + text: 'Concurrent document 2', + embedding: [0.2, 0.1], + metadata: { category: 'second' } + }) + ]) + + await vi.waitFor(() => { + expect(checkSchemaCalls).toBe(1) + }) + + resolveInitialization() + + await expect(Promise.all([firstAddPromise, secondAddPromise])).resolves.toEqual([ + ['chunk-concurrent-1'], + ['chunk-concurrent-2'] + ]) + + expect(checkSchemaCalls).toBe(1) + checkSchemaSpy.mockRestore() + }) + + it('should rebuild FTS only when the FTS table is first created', async () => { + let rebuildCount = 0 + const originalExecute = client.execute.bind(client) + const executeSpy = vi.spyOn(client, 'execute').mockImplementation(async (statement: any) => { + const sql = typeof statement === 'string' ? statement : statement.sql + if (typeof sql === 'string' && sql.includes("VALUES ('rebuild')")) { + rebuildCount += 1 + } + + return await originalExecute(statement) + }) + + await store.add([ + new TextNode({ + id_: 'chunk-first-init', + text: 'First document', + embedding: [0.1, 0.2], + metadata: { category: 'first' } + }) + ]) + + const secondStore = new LibSQLVectorStore({ + client, + tableName: 'test_embeddings', + dimensions: 2 + }) + + await secondStore.add([ + new TextNode({ + id_: 'chunk-second-init', + text: 'Second document', + embedding: [0.2, 0.1], + metadata: { category: 'second' } + }) + ]) + + expect(rebuildCount).toBe(1) + executeSpy.mockRestore() + }) + + it('should delete all nodes by external_id', async () => { + const nodeA = new TextNode({ + id_: 'chunk-1', + text: 'Document chunk A', + embedding: [0.1, 0.2], + metadata: { category: 'test' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-1', + metadata: {} + } + } + }) + + const nodeB = new TextNode({ + id_: 'chunk-2', + text: 'Document chunk B', + embedding: [0.1, 0.2], + metadata: { category: 'test' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-1', + metadata: {} + } + } + }) + + await 
store.add([nodeA, nodeB]) + + const queryBefore: VectorStoreQuery = { + queryEmbedding: [0.1, 0.2], + similarityTopK: 2, + mode: VectorStoreQueryMode.DEFAULT + } + const resultBefore = await store.query(queryBefore) + expect(resultBefore.nodes).toHaveLength(2) + + await store.delete('item-1') + + const queryAfter: VectorStoreQuery = { + queryEmbedding: [0.1, 0.2], + similarityTopK: 2, + mode: VectorStoreQueryMode.DEFAULT + } + const resultAfter = await store.query(queryAfter) + expect(resultAfter.nodes).toHaveLength(0) + }) + + it('should scope delete by collection', async () => { + const otherCollectionStore = new LibSQLVectorStore({ + client, + tableName: 'test_embeddings', + dimensions: 2, + collection: 'other' + }) + + const nodeDefault = new TextNode({ + id_: 'chunk-default', + text: 'Default collection chunk', + embedding: [0.2, 0.3], + metadata: { category: 'scope' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-shared', + metadata: {} + } + } + }) + + const nodeOther = new TextNode({ + id_: 'chunk-other', + text: 'Other collection chunk', + embedding: [0.2, 0.3], + metadata: { category: 'scope' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-shared', + metadata: {} + } + } + }) + + await store.add([nodeDefault]) + await otherCollectionStore.add([nodeOther]) + + await store.delete('item-shared') + + const rows = await client.execute( + "SELECT id, external_id, collection FROM test_embeddings WHERE external_id = 'item-shared' ORDER BY id" + ) + expect(rows.rows).toHaveLength(1) + expect(rows.rows[0]).toMatchObject({ + id: 'chunk-other', + external_id: 'item-shared', + collection: 'other' + }) + }) + }) + + describe('Metadata Filtering', () => { + const filterCases: Array<{ + title: string + filters: MetadataFilters + queryEmbedding?: number[] + expectedCount: number + assert?: (nodes: BaseNode[]) => void + }> = [ + { + title: 'metadata equality', + filters: { + filters: [ + { + key: 'category', + value: 'technology', 
+ operator: FilterOperator.EQ + } + ] + }, + expectedCount: 2, + assert: (nodes) => nodes.forEach((node) => expect(node.metadata?.category).toBe('technology')) + }, + { + title: 'numeric comparison', + filters: { + filters: [{ key: 'rating', value: 4, operator: FilterOperator.GTE }] + }, + expectedCount: 2, + assert: (nodes) => nodes.forEach((node) => expect(node.metadata?.rating).toBeGreaterThanOrEqual(4)) + }, + { + title: 'combined AND', + filters: { + filters: [ + { + key: 'category', + value: 'technology', + operator: FilterOperator.EQ + }, + { key: 'rating', value: 4, operator: FilterOperator.GTE } + ], + condition: FilterCondition.AND + }, + expectedCount: 2, + assert: (nodes) => { + const ratings = nodes.map((node) => node.metadata?.rating) + expect(ratings).toContain(4) + expect(ratings).toContain(5) + nodes.forEach((node) => expect(node.metadata?.category).toBe('technology')) + } + }, + { + title: 'text match', + filters: { + filters: [{ key: 'tags', value: 'ai', operator: FilterOperator.TEXT_MATCH }] + }, + queryEmbedding: [1.0, 0.0], + expectedCount: 1, + assert: (nodes) => { + expect(nodes[0].metadata?.tags).toContain('ai') + } + } + ] + + beforeEach(async () => { + // Add test data with metadata + const nodes: BaseNode[] = [ + new TextNode({ + text: 'Document about AI', + embedding: [1.0, 0.0], + metadata: { category: 'technology', rating: 5, tags: ['ai', 'ml'] } + }), + new TextNode({ + text: 'Document about cooking', + embedding: [0.0, 1.0], + metadata: { + category: 'food', + rating: 3, + tags: ['cooking', 'recipes'] + } + }), + new TextNode({ + text: 'Another tech document', + embedding: [0.5, 0.5], + metadata: { + category: 'technology', + rating: 4, + tags: ['programming'] + } + }) + ] + + await store.add(nodes) + }) + + filterCases.forEach(({ title, filters, queryEmbedding, expectedCount, assert }) => { + it(`should filter by ${title}`, async () => { + const query: VectorStoreQuery = { + queryEmbedding: queryEmbedding ?? 
[0.5, 0.5], + similarityTopK: 5, + filters, + mode: VectorStoreQueryMode.DEFAULT + } + + const result = await store.query(query) + expect(result.nodes).toHaveLength(expectedCount) + assert?.(result.nodes as BaseNode[]) + }) + }) + + it('should reject invalid metadata filter keys', async () => { + const query: VectorStoreQuery = { + queryEmbedding: [0.5, 0.5], + similarityTopK: 5, + filters: { + filters: [ + { + key: "category') = 'technology' OR 1=1 --", + value: 'technology', + operator: FilterOperator.EQ + } + ] + }, + mode: VectorStoreQueryMode.DEFAULT + } + + await expect(store.query(query)).rejects.toThrow( + "Invalid metadata filter key: category') = 'technology' OR 1=1 --" + ) + }) + }) + + describe('Collection Management', () => { + beforeEach(async () => { + // Add data to default collection + const nodes: BaseNode[] = [ + new TextNode({ + embedding: [0.1, 0.2], + metadata: { collection: 'default' } + }) + ] + + await store.add(nodes) + }) + + it('should clear collection', async () => { + // Verify data exists + const query: VectorStoreQuery = { + queryEmbedding: [0.1, 0.2], + similarityTopK: 1, + mode: VectorStoreQueryMode.DEFAULT + } + let result = await store.query(query) + expect(result.nodes).toHaveLength(1) + + // Clear collection + await store.clearCollection() + + // Verify data is gone + result = await store.query(query) + expect(result.nodes).toHaveLength(0) + }) + + it('should isolate data by collection', async () => { + const originalCollection = store.getCollection() + // Add data to different collection + store.setCollection('test-collection') + + const newNodes: BaseNode[] = [ + new TextNode({ + embedding: [0.3, 0.4], + metadata: { collection: 'test' } + }) + ] + + await store.add(newNodes) + + // Query in test-collection should find data + let query: VectorStoreQuery = { + queryEmbedding: [0.3, 0.4], + similarityTopK: 1, + mode: VectorStoreQueryMode.DEFAULT + } + let result = await store.query(query) + expect(result.nodes).toHaveLength(1) + 
+ // Switch back to default collection and query + store.setCollection(originalCollection) + query = { + queryEmbedding: [0.1, 0.2], + similarityTopK: 1, + mode: VectorStoreQueryMode.DEFAULT + } + result = await store.query(query) + expect(result.nodes).toHaveLength(1) + }) + }) + + describe('Utility Functions', () => { + it('should convert to Float32Array', async () => { + const { toFloat32Array } = await import('../src/utils.js') + const array = [0.1, 0.2, 0.3] + const result = toFloat32Array(array) + expect(result).toBeInstanceOf(Float32Array) + Array.from(result).forEach((value, idx) => { + expect(value).toBeCloseTo(array[idx], 6) + }) + }) + + it('should convert from Float32Array', async () => { + const { fromFloat32Array } = await import('../src/utils.js') + const float32Array = new Float32Array([0.1, 0.2, 0.3]) + const result = fromFloat32Array(float32Array) + result.forEach((value, idx) => { + expect(value).toBeCloseTo([0.1, 0.2, 0.3][idx], 6) + }) + }) + + it('should throw when deserializeEmbedding receives an unsupported payload type', () => { + expect(() => (store as any).deserializeEmbedding('not-an-embedding')).toThrow( + 'Unexpected embedding payload type in LibSQLVectorStore.deserializeEmbedding' + ) + }) + + it('should throw when deserializeEmbedding receives a missing payload', () => { + expect(() => (store as any).deserializeEmbedding(null)).toThrow( + 'Missing embedding payload in LibSQLVectorStore.deserializeEmbedding' + ) + }) + }) + + describe('Error Handling', () => { + it('should reject nodes with missing embeddings', async () => { + const nodeWithoutEmbedding = new TextNode({ + text: 'Test node', + metadata: { category: 'test' } + }) + + await expect(store.add([nodeWithoutEmbedding])).rejects.toThrow('Missing embedding for node') + }) + + it('should reject query with null embedding', async () => { + const query: VectorStoreQuery = { + queryEmbedding: undefined, + similarityTopK: 1, + mode: VectorStoreQueryMode.DEFAULT + } + + await 
expect(store.query(query)).rejects.toThrow('queryEmbedding is required for vector search') + }) + }) + + describe('Configuration Options', () => { + it('should work with pre-configured client', async () => { + const customClient = createClient({ url: ':memory:' }) + const customStore = new LibSQLVectorStore({ + client: customClient, + tableName: 'custom_table', + dimensions: 4 + }) + + expect(customStore).toBeDefined() + + const nodes: BaseNode[] = [ + new TextNode({ + embedding: [0.1, 0.2, 0.3, 0.4], + metadata: { custom: true } + }) + ] + + const ids = await customStore.add(nodes) + expect(ids).toHaveLength(1) + }) + + it('should work with client configuration', async () => { + const configStore = new LibSQLVectorStore({ + clientConfig: { + url: ':memory:' + }, + tableName: 'config_table', + dimensions: 3 + }) + + expect(configStore).toBeDefined() + + const db = configStore.client() + expect(db).toBeDefined() + }) + }) + + describe('Query Modes', () => { + beforeEach(async () => { + // Add test data with text content for FTS + const nodes: BaseNode[] = [ + new TextNode({ + text: 'Machine learning and artificial intelligence are transforming technology', + embedding: [1.0, 0.0], + metadata: { category: 'technology', topic: 'ai' } + }), + new TextNode({ + text: 'Cooking recipes and food preparation techniques', + embedding: [0.0, 1.0], + metadata: { category: 'food', topic: 'cooking' } + }), + new TextNode({ + text: 'Deep learning neural networks for artificial intelligence', + embedding: [0.8, 0.2], + metadata: { category: 'technology', topic: 'ai' } + }) + ] + + await store.add(nodes) + }) + + it('should query using default mode (vector search)', async () => { + const query: VectorStoreQuery = { + queryEmbedding: [0.9, 0.1], + similarityTopK: 2, + mode: VectorStoreQueryMode.DEFAULT + } + + const result = await store.query(query) + + expect(result.nodes).toHaveLength(2) + expect(result.similarities).toHaveLength(2) + expect(result.ids).toHaveLength(2) + // First 
result should be more similar (closer to [1.0, 0.0]) + expect(result.similarities[0]).toBeGreaterThan(result.similarities[1]) + }) + + it('should query using bm25 mode (full-text search)', async () => { + const query: VectorStoreQuery = { + queryStr: 'artificial intelligence', + similarityTopK: 2, + mode: 'bm25' as VectorStoreQueryMode + } + + const result = await store.query(query) + const nodes = result.nodes ?? [] + + expect(nodes).toHaveLength(2) + expect(result.similarities).toHaveLength(2) + expect(result.ids).toHaveLength(2) + nodes.forEach((node) => { + const text = node.getContent(MetadataMode.NONE).toLowerCase() + expect(text.includes('artificial') || text.includes('intelligence')).toBe(true) + }) + }) + + it('should throw error for bm25 mode without queryStr', async () => { + const query: VectorStoreQuery = { + queryEmbedding: [0.5, 0.5], + similarityTopK: 2, + mode: 'bm25' as VectorStoreQueryMode + } + + await expect(store.query(query)).rejects.toThrow('queryStr is required for BM25 mode') + }) + + it('should query using hybrid mode (vector + FTS)', async () => { + const query: VectorStoreQuery = { + queryEmbedding: [0.9, 0.1], + queryStr: 'artificial intelligence', + similarityTopK: 2, + mode: 'hybrid' as VectorStoreQueryMode, + alpha: 0.5 + } + + const result = await store.query(query) + const nodes = result.nodes ?? 
[] + + expect(nodes).toHaveLength(2) + expect(result.similarities).toHaveLength(2) + expect(result.ids).toHaveLength(2) + nodes.forEach((node) => { + const text = node.getContent(MetadataMode.NONE).toLowerCase() + expect(text.includes('artificial') || text.includes('intelligence') || text.includes('learning')).toBe(true) + }) + }) + + it('should throw error for hybrid mode without queryEmbedding', async () => { + const query: VectorStoreQuery = { + queryStr: 'artificial intelligence', + similarityTopK: 2, + mode: 'hybrid' as VectorStoreQueryMode + } + + await expect(store.query(query)).rejects.toThrow('queryEmbedding is required for HYBRID mode') + }) + + it('should throw error for hybrid mode without queryStr', async () => { + const query: VectorStoreQuery = { + queryEmbedding: [0.5, 0.5], + similarityTopK: 2, + mode: 'hybrid' as VectorStoreQueryMode + } + + await expect(store.query(query)).rejects.toThrow('queryStr is required for HYBRID mode') + }) + + it('should fallback to vector search for unknown query mode', async () => { + const query: VectorStoreQuery = { + queryEmbedding: [0.5, 0.5], + similarityTopK: 2, + mode: 'unknown_mode' as VectorStoreQueryMode + } + + const result = await store.query(query) + + // Should fallback to vector search and return results + expect(result.nodes).toBeDefined() + expect(result.similarities).toBeDefined() + expect(result.ids).toBeDefined() + }) + + it('should update bm25 index after upsert', async () => { + const node = new TextNode({ + id_: 'upsert-doc', + text: 'legacy keyword content', + embedding: [0.6, 0.4], + metadata: { category: 'technology' } + }) + + await store.add([node]) + + let result = await store.query({ + queryStr: 'legacy', + similarityTopK: 5, + mode: 'bm25' as VectorStoreQueryMode + }) + expect(result.ids).toContain('upsert-doc') + + await store.add([ + new TextNode({ + id_: 'upsert-doc', + text: 'fresh keyword content', + embedding: [0.6, 0.4], + metadata: { category: 'technology' } + }) + ]) + + result 
= await store.query({ + queryStr: 'legacy', + similarityTopK: 5, + mode: 'bm25' as VectorStoreQueryMode + }) + expect(result.ids).not.toContain('upsert-doc') + + result = await store.query({ + queryStr: 'fresh', + similarityTopK: 5, + mode: 'bm25' as VectorStoreQueryMode + }) + expect(result.ids).toContain('upsert-doc') + }) + + it('should remove deleted documents from bm25 index', async () => { + const node = new TextNode({ + id_: 'delete-doc', + text: 'remove me from bm25', + embedding: [0.4, 0.6], + metadata: { category: 'technology' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-delete', + metadata: {} + } + } + }) + + await store.add([node]) + + let result = await store.query({ + queryStr: 'remove', + similarityTopK: 5, + mode: 'bm25' as VectorStoreQueryMode + }) + expect(result.ids).toContain('delete-doc') + + await store.delete('item-delete') + + result = await store.query({ + queryStr: 'remove', + similarityTopK: 5, + mode: 'bm25' as VectorStoreQueryMode + }) + expect(result.ids).not.toContain('delete-doc') + }) + }) + + describe('exists', () => { + it('should return true for existing external_id', async () => { + const nodes: BaseNode[] = [ + new TextNode({ + id_: 'doc-123', + embedding: [0.1, 0.2], + metadata: { category: 'exists' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-1', + metadata: {} + } + } + }) + ] + + await store.add(nodes) + + const exists = await store.exists('item-1') + expect(exists).toBe(true) + }) + + it('should return false for non-existing document', async () => { + const exists = await store.exists('non-existent-ref') + expect(exists).toBe(false) + }) + + it('should respect collection when checking existence', async () => { + store.setCollection('collection-a') + + const nodes: BaseNode[] = [ + new TextNode({ + embedding: [0.1, 0.2], + metadata: { category: 'exists' }, + relationships: { + [NodeRelationship.SOURCE]: { + nodeId: 'item-collection', + metadata: {} + } + } + }) + ] + + await 
store.add(nodes) + + // Should find in same collection + expect(await store.exists('item-collection')).toBe(true) + + // Should not find in different collection + store.setCollection('collection-b') + expect(await store.exists('item-collection')).toBe(false) + }) + }) +}) diff --git a/packages/vectorstores/libsql/tsconfig.json b/packages/vectorstores/libsql/tsconfig.json new file mode 100644 index 0000000000..3e198c4c26 --- /dev/null +++ b/packages/vectorstores/libsql/tsconfig.json @@ -0,0 +1,11 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "bundler", + "rootDir": "./src", + "outDir": "./dist", + "tsBuildInfoFile": "./dist/.tsbuildinfo" + }, + "include": ["./src"] +} diff --git a/packages/vectorstores/libsql/tsdown.config.ts b/packages/vectorstores/libsql/tsdown.config.ts new file mode 100644 index 0000000000..0e07d34cac --- /dev/null +++ b/packages/vectorstores/libsql/tsdown.config.ts @@ -0,0 +1,12 @@ +import { defineConfig } from 'tsdown' + +export default defineConfig({ + entry: { + index: 'src/index.ts' + }, + outDir: 'dist', + format: ['esm', 'cjs'], + clean: true, + dts: true, + tsconfig: 'tsconfig.json' +}) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d441561040..7b30ebb22b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -96,8 +96,8 @@ importers: specifier: ^1.59.0 version: 1.60.0 '@libsql/client': - specifier: 0.14.0 - version: 0.14.0 + specifier: ^0.15.15 + version: 0.15.15 '@napi-rs/canvas': specifier: 0.1.97 version: 0.1.97 @@ -107,12 +107,21 @@ importers: '@paymoapp/electron-shutdown-handler': specifier: 1.1.2 version: 1.1.2 + '@vectorstores/core': + specifier: ^0.1.8 + version: 0.1.8 + '@vectorstores/libsql': + specifier: workspace:* + version: link:packages/vectorstores/libsql + '@vectorstores/readers': + specifier: ^0.1.8 + version: 0.1.8(@vectorstores/core@0.1.8)(@vectorstores/env@0.1.0)(encoding@0.1.13) cron-parser: specifier: ^5.0.8 version: 5.5.0 drizzle-zod: specifier: ^0.8.3 - version: 
0.8.3(drizzle-orm@0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0))(zod@4.3.4) + version: 0.8.3(drizzle-orm@0.44.7(@libsql/client@0.15.15)(@opentelemetry/api@1.9.0))(zod@4.3.4) express: specifier: 5.1.0 version: 5.1.0 @@ -383,7 +392,7 @@ importers: version: 0.3.4 '@langchain/community': specifier: ^1.0.0 - version: 1.1.1(3bba964449060bd176fc73da062d305c) + version: 1.1.1(9b32df8fea10b7369f157d4e11bd59c1) '@langchain/core': specifier: 1.0.2 version: 1.0.2(patch_hash=8dc787a82cebafe8b23c8826f25f29aca64fc8b43a0a1878e0010782e4da96ed)(@opentelemetry/api@1.9.0)(@opentelemetry/sdk-trace-base@2.2.0(@opentelemetry/api@1.9.0))(openai@6.15.0(ws@8.20.0)(zod@4.3.4)) @@ -785,7 +794,7 @@ importers: version: 0.31.8 drizzle-orm: specifier: ^0.44.5 - version: 0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0) + version: 0.44.7(@libsql/client@0.15.15)(@opentelemetry/api@1.9.0) electron: specifier: 40.8.0 version: 40.8.0 @@ -1626,6 +1635,28 @@ importers: specifier: ^3.2.4 version: 3.2.4(@types/debug@4.1.13)(@types/node@24.10.4)(@vitest/browser@3.2.4)(@vitest/ui@3.2.4)(esbuild@0.25.12)(jiti@2.6.1)(jsdom@26.1.0)(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(tsx@4.21.0)(yaml@2.8.2) + packages/vectorstores/libsql: + dependencies: + '@libsql/client': + specifier: ^0.15.15 + version: 0.15.15 + devDependencies: + '@vectorstores/core': + specifier: ^0.1.8 + version: 0.1.8 + '@vectorstores/env': + specifier: ^0.1.0 + version: 0.1.0 + tsdown: + specifier: ^0.20.3 + version: 0.20.3(@typescript/native-preview@7.0.0-dev.20260204.1)(typescript@5.8.3) + typescript: + specifier: ^5.8.3 + version: 5.8.3 + vitest: + specifier: 2.1.0 + version: 
2.1.0(@types/node@24.10.4)(@vitest/browser@3.2.4(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(playwright@1.57.0)(rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2))(vitest@3.2.4))(@vitest/ui@3.2.4(vitest@3.2.4))(esbuild@0.25.12)(jiti@2.6.1)(jsdom@26.1.0)(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(tsx@4.21.0)(yaml@2.8.2) + packages: 7zip-bin@5.2.0: @@ -2152,28 +2183,24 @@ packages: engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] - libc: [musl] '@biomejs/cli-linux-arm64@2.2.4': resolution: {integrity: sha512-M/Iz48p4NAzMXOuH+tsn5BvG/Jb07KOMTdSVwJpicmhN309BeEyRyQX+n1XDF0JVSlu28+hiTQ2L4rZPvu7nMw==} engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] - libc: [glibc] '@biomejs/cli-linux-x64-musl@2.2.4': resolution: {integrity: sha512-m41nFDS0ksXK2gwXL6W6yZTYPMH0LughqbsxInSKetoH6morVj43szqKx79Iudkp8WRT5SxSh7qVb8KCUiewGg==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] - libc: [musl] '@biomejs/cli-linux-x64@2.2.4': resolution: {integrity: sha512-orr3nnf2Dpb2ssl6aihQtvcKtLySLta4E2UcXdp7+RTa7mfJjBgIsbS0B9GC8gVu0hjOu021aU8b3/I1tn+pVQ==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] - libc: [glibc] '@biomejs/cli-win32-arm64@2.2.4': resolution: {integrity: sha512-NXnfTeKHDFUWfxAefa57DiGmu9VyKi0cDqFpdI+1hJWQjGJhJutHPX0b5m+eXvTKOaf+brU+P0JrQAZMb5yYaQ==} @@ -2485,6 +2512,10 @@ packages: resolution: {integrity: sha512-0cp4PsWQ/9avqTVMCtZ+GirikIA36ikvjtHweU4/j8yLtgObI0+JUPhYFScgwlteveGB1rt3Cm8UhN04XayDig==} engines: {node: '>= 8.9.0'} + '@discoveryjs/json-ext@0.6.3': + resolution: {integrity: sha512-4B4OijXeVNOPZlYA2oEwWOTkzyltLao+xbotHQeqN++Rv27Y6s818+n2Qkp8q+Fxhn0t/5lA5X1Mxktud8eayQ==} + engines: {node: '>=14.17.0'} + '@dnd-kit/accessibility@3.1.1': resolution: {integrity: sha512-2P+YgaXF+gRsIihwwY1gCsQSYnu9Zyj2py8kY5fFvUM1qm2WA2u639R6YNVfU4GWr+ZM5mqEsfHZZLoRONbemw==} peerDependencies: @@ -2961,92 +2992,78 @@ packages: resolution: {integrity: 
sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.0': resolution: {integrity: sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.0': resolution: {integrity: sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.0': resolution: {integrity: sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.0': resolution: {integrity: sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.0': resolution: {integrity: sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.0': resolution: {integrity: sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-linux-arm64@0.34.3': resolution: {integrity: sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-linux-arm@0.34.3': resolution: {integrity: sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-linux-ppc64@0.34.3': resolution: {integrity: sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==} engines: {node: 
^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-linux-s390x@0.34.3': resolution: {integrity: sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-linux-x64@0.34.3': resolution: {integrity: sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-linuxmusl-arm64@0.34.3': resolution: {integrity: sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-linuxmusl-x64@0.34.3': resolution: {integrity: sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-wasm32@0.34.3': resolution: {integrity: sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==} @@ -3647,19 +3664,35 @@ packages: '@libsql/client@0.14.0': resolution: {integrity: sha512-/9HEKfn6fwXB5aTEEoMeFh4CtG0ZzbncBb1e++OCdVpgKZ/xyMsIVYXm0w7Pv4RUel803vE6LwniB3PqD72R0Q==} + '@libsql/client@0.15.15': + resolution: {integrity: sha512-twC0hQxPNHPKfeOv3sNT6u2pturQjLcI+CnpTM0SjRpocEGgfiZ7DWKXLNnsothjyJmDqEsBQJ5ztq9Wlu470w==} + '@libsql/core@0.14.0': resolution: {integrity: sha512-nhbuXf7GP3PSZgdCY2Ecj8vz187ptHlZQ0VRc751oB2C1W8jQUXKKklvt7t1LJiUTQBVJuadF628eUk+3cRi4Q==} + '@libsql/core@0.15.15': + resolution: {integrity: sha512-C88Z6UKl+OyuKKPwz224riz02ih/zHYI3Ho/LAcVOgjsunIRZoBw7fjRfaH9oPMmSNeQfhGklSG2il1URoOIsA==} + '@libsql/darwin-arm64@0.4.7': resolution: {integrity: sha512-yOL742IfWUlUevnI5PdnIT4fryY3LYTdLm56bnY0wXBw7dhFcnjuA7jrH3oSVz2mjZTHujxoITgAE7V6Z+eAbg==} cpu: [arm64] os: 
[darwin] + '@libsql/darwin-arm64@0.5.29': + resolution: {integrity: sha512-K+2RIB1OGFPYQbfay48GakLhqf3ArcbHqPFu7EZiaUcRgFcdw8RoltsMyvbj5ix2fY0HV3Q3Ioa/ByvQdaSM0A==} + cpu: [arm64] + os: [darwin] + '@libsql/darwin-x64@0.4.7': resolution: {integrity: sha512-ezc7V75+eoyyH07BO9tIyJdqXXcRfZMbKcLCeF8+qWK5nP8wWuMcfOVywecsXGRbT99zc5eNra4NEx6z5PkSsA==} cpu: [x64] os: [darwin] + '@libsql/darwin-x64@0.5.29': + resolution: {integrity: sha512-OtT+KFHsKFy1R5FVadr8FJ2Bb1mghtXTyJkxv0trocq7NuHntSki1eUbxpO5ezJesDvBlqFjnWaYYY516QNLhQ==} + cpu: [x64] + os: [darwin] + '@libsql/hrana-client@0.7.0': resolution: {integrity: sha512-OF8fFQSkbL7vJY9rfuegK1R7sPgQ6kFMkDamiEccNUvieQ+3urzfDFI616oPl8V7T9zRmnTkSjMOImYCAVRVuw==} @@ -3670,31 +3703,66 @@ packages: '@libsql/isomorphic-ws@0.1.5': resolution: {integrity: sha512-DtLWIH29onUYR00i0GlQ3UdcTRC6EP4u9w/h9LxpUZJWRMARk6dQwZ6Jkd+QdwVpuAOrdxt18v0K2uIYR3fwFg==} + '@libsql/linux-arm-gnueabihf@0.5.29': + resolution: {integrity: sha512-CD4n4zj7SJTHso4nf5cuMoWoMSS7asn5hHygsDuhRl8jjjCTT3yE+xdUvI4J7zsyb53VO5ISh4cwwOtf6k2UhQ==} + cpu: [arm] + os: [linux] + + '@libsql/linux-arm-musleabihf@0.5.29': + resolution: {integrity: sha512-2Z9qBVpEJV7OeflzIR3+l5yAd4uTOLxklScYTwpZnkm2vDSGlC1PRlueLaufc4EFITkLKXK2MWBpexuNJfMVcg==} + cpu: [arm] + os: [linux] + '@libsql/linux-arm64-gnu@0.4.7': resolution: {integrity: sha512-WlX2VYB5diM4kFfNaYcyhw5y+UJAI3xcMkEUJZPtRDEIu85SsSFrQ+gvoKfcVh76B//ztSeEX2wl9yrjF7BBCA==} cpu: [arm64] os: [linux] + '@libsql/linux-arm64-gnu@0.5.29': + resolution: {integrity: sha512-gURBqaiXIGGwFNEaUj8Ldk7Hps4STtG+31aEidCk5evMMdtsdfL3HPCpvys+ZF/tkOs2MWlRWoSq7SOuCE9k3w==} + cpu: [arm64] + os: [linux] + '@libsql/linux-arm64-musl@0.4.7': resolution: {integrity: sha512-6kK9xAArVRlTCpWeqnNMCoXW1pe7WITI378n4NpvU5EJ0Ok3aNTIC2nRPRjhro90QcnmLL1jPcrVwO4WD1U0xw==} cpu: [arm64] os: [linux] + '@libsql/linux-arm64-musl@0.5.29': + resolution: {integrity: sha512-fwgYZ0H8mUkyVqXZHF3mT/92iIh1N94Owi/f66cPVNsk9BdGKq5gVpoKO+7UxaNzuEH1roJp2QEwsCZMvBLpqg==} + cpu: [arm64] 
+ os: [linux] + '@libsql/linux-x64-gnu@0.4.7': resolution: {integrity: sha512-CMnNRCmlWQqqzlTw6NeaZXzLWI8bydaXDke63JTUCvu8R+fj/ENsLrVBtPDlxQ0wGsYdXGlrUCH8Qi9gJep0yQ==} cpu: [x64] os: [linux] + '@libsql/linux-x64-gnu@0.5.29': + resolution: {integrity: sha512-y14V0vY0nmMC6G0pHeJcEarcnGU2H6cm21ZceRkacWHvQAEhAG0latQkCtoS2njFOXiYIg+JYPfAoWKbi82rkg==} + cpu: [x64] + os: [linux] + '@libsql/linux-x64-musl@0.4.7': resolution: {integrity: sha512-nI6tpS1t6WzGAt1Kx1n1HsvtBbZ+jHn0m7ogNNT6pQHZQj7AFFTIMeDQw/i/Nt5H38np1GVRNsFe99eSIMs9XA==} cpu: [x64] os: [linux] + '@libsql/linux-x64-musl@0.5.29': + resolution: {integrity: sha512-gquqwA/39tH4pFl+J9n3SOMSymjX+6kZ3kWgY3b94nXFTwac9bnFNMffIomgvlFaC4ArVqMnOZD3nuJ3H3VO1w==} + cpu: [x64] + os: [linux] + '@libsql/win32-x64-msvc@0.4.7': resolution: {integrity: sha512-7pJzOWzPm6oJUxml+PCDRzYQ4A1hTMHAciTAHfFK4fkbDZX33nWPVG7Y3vqdKtslcwAzwmrNDc6sXy2nwWnbiw==} cpu: [x64] os: [win32] + '@libsql/win32-x64-msvc@0.5.29': + resolution: {integrity: sha512-4/0CvEdhi6+KjMxMaVbFM2n2Z44escBRoEYpR+gZg64DdetzGnYm8mcNLcoySaDJZNaBd6wz5DNdgRmcI4hXcg==} + cpu: [x64] + os: [win32] + '@malept/cross-spawn-promise@2.0.0': resolution: {integrity: sha512-1DpKU0Z5ThltBwjNySMC14g0CkbyhCaz9FkhxqNsZI6uAPJXFS8cMXlBKo26FJ8ZuW6S9GCMcR9IO5k2X5/9Fg==} engines: {node: '>= 12.13.0'} @@ -3709,6 +3777,10 @@ packages: '@manypkg/get-packages@1.1.3': resolution: {integrity: sha512-fo+QhuU3qE/2TQMQmbVMqaQ6EWbMhi4ABWP+O4AM1NqPBuy0OrApV5LO6BrrgnhtAHS2NH6RrVk9OL181tTi8A==} + '@mapbox/node-pre-gyp@1.0.11': + resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==} + hasBin: true + '@marijn/find-cluster-break@1.0.2': resolution: {integrity: sha512-l0h88YhZFyKdXIFNfSWpyjStDjGHwZ/U7iobcK1cQQD8sejsONdQtTVU+1wVN1PBw40PiiHB1vA5S7VTfQiP9g==} @@ -3799,35 +3871,30 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [glibc] '@napi-rs/canvas-linux-arm64-musl@0.1.97': resolution: {integrity: 
sha512-kKmSkQVnWeqg7qdsiXvYxKhAFuHz3tkBjW/zyQv5YKUPhotpaVhpBGv5LqCngzyuRV85SXoe+OFj+Tv0a0QXkQ==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [musl] '@napi-rs/canvas-linux-riscv64-gnu@0.1.97': resolution: {integrity: sha512-Jc7I3A51jnEOIAXeLsN/M/+Z28LUeakcsXs07FLq9prXc0eYOtVwsDEv913Gr+06IRo34gJJVgT0TXvmz+N2VA==} engines: {node: '>= 10'} cpu: [riscv64] os: [linux] - libc: [glibc] '@napi-rs/canvas-linux-x64-gnu@0.1.97': resolution: {integrity: sha512-iDUBe7AilfuBSRbSa8/IGX38Mf+iCSBqoVKLSQ5XaY2JLOaqz1TVyPFEyIck7wT6mRQhQt5sN6ogfjIDfi74tg==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [glibc] '@napi-rs/canvas-linux-x64-musl@0.1.97': resolution: {integrity: sha512-AKLFd/v0Z5fvgqBDqhvqtAdx+fHMJ5t9JcUNKq4FIZ5WH+iegGm8HPdj00NFlCSnm83Fp3Ln8I2f7uq1aIiWaA==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [musl] '@napi-rs/canvas-win32-arm64-msvc@0.1.97': resolution: {integrity: sha512-u883Yr6A6fO7Vpsy9YE4FVCIxzzo5sO+7pIUjjoDLjS3vQaNMkVzx5bdIpEL+ob+gU88WDK4VcxYMZ6nmnoX9A==} @@ -3920,28 +3987,24 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [glibc] '@neplex/vectorizer-linux-arm64-musl@0.0.5': resolution: {integrity: sha512-r2a85bAkgwSxAbQTSHnzXaDZCyABgVTYf6f0OSh1oGHHIc9pC97VUZbmQLtGFeIQLQR9j4nKjF1MlOHmnV4EDA==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [musl] '@neplex/vectorizer-linux-x64-gnu@0.0.5': resolution: {integrity: sha512-8pdPe27RNXHwkvYiK3vj5b3/Yi8rWgJzUsBdT/Jm2bjk5c32wiV454yT0fLZQjRB1DCAK2DvyHjf6eZ0R9HaJg==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [glibc] '@neplex/vectorizer-linux-x64-musl@0.0.5': resolution: {integrity: sha512-VP/DHuX40I/9KzSFRctxksXzJBGwbPE/E30NCAcPA1mS6iApovWsZe3la5dA9A5kStaKh9wTJcZuVEGL8tGIMg==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [musl] '@neplex/vectorizer-win32-arm64-msvc@0.0.5': resolution: {integrity: sha512-VfQRITnqvjABiIcnx5b/9XjyktTbpDHzY2nVt5wplOqGM88f6fPn2JYiia7IEdv2BA/1+oN/Bcw75eq12mW8Ug==} @@ -4211,56 +4274,48 @@ packages: 
engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [glibc] '@oxlint/binding-linux-arm64-musl@1.56.0': resolution: {integrity: sha512-rkTZkBfJ4TYLjansjSzL6mgZOdN5IvUnSq3oNJSLwBcNvy3dlgQtpHPrRxrCEbbcp7oQ6If0tkNaqfOsphYZ9g==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [musl] '@oxlint/binding-linux-ppc64-gnu@1.56.0': resolution: {integrity: sha512-uqL1kMH3u69/e1CH2EJhP3CP28jw2ExLsku4o8RVAZ7fySo9zOyI2fy9pVlTAp4voBLVgzndXi3SgtdyCTa2aA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [ppc64] os: [linux] - libc: [glibc] '@oxlint/binding-linux-riscv64-gnu@1.56.0': resolution: {integrity: sha512-j0CcMBOgV6KsRaBdsebIeiy7hCjEvq2KdEsiULf2LZqAq0v1M1lWjelhCV57LxsqaIGChXFuFJ0RiFrSRHPhSg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [riscv64] os: [linux] - libc: [glibc] '@oxlint/binding-linux-riscv64-musl@1.56.0': resolution: {integrity: sha512-7VDOiL8cDG3DQ/CY3yKjbV1c4YPvc4vH8qW09Vv+5ukq3l/Kcyr6XGCd5NvxUmxqDb2vjMpM+eW/4JrEEsUetA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [riscv64] os: [linux] - libc: [musl] '@oxlint/binding-linux-s390x-gnu@1.56.0': resolution: {integrity: sha512-JGRpX0M+ikD3WpwJ7vKcHKV6Kg0dT52BW2Eu2BupXotYeqGXBrbY+QPkAyKO6MNgKozyTNaRh3r7g+VWgyAQYQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [s390x] os: [linux] - libc: [glibc] '@oxlint/binding-linux-x64-gnu@1.56.0': resolution: {integrity: sha512-dNaICPvtmuxFP/VbqdofrLqdS3bM/AKJN3LMJD52si44ea7Be1cBk6NpfIahaysG9Uo+L98QKddU9CD5L8UHnQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [glibc] '@oxlint/binding-linux-x64-musl@1.56.0': resolution: {integrity: sha512-pF1vOtM+GuXmbklM1hV8WMsn6tCNPvkUzklj/Ej98JhlanbmA2RB1BILgOpwSuCTRTIYx2MXssmEyQQ90QF5aA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [musl] '@oxlint/binding-openharmony-arm64@1.56.0': resolution: {integrity: sha512-bp8NQ4RE6fDIFLa4bdBiOA+TAvkNkg+rslR+AvvjlLTYXLy9/uKAYLQudaQouWihLD/hgkrXIKKzXi5IXOewwg==} @@ -5118,154 +5173,132 @@ packages: engines: {node: 
^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-arm64-gnu@1.0.0-beta.52': resolution: {integrity: sha512-V48oDR84feRU2KRuzpALp594Uqlx27+zFsT6+BgTcXOtu7dWy350J1G28ydoCwKB+oxwsRPx2e7aeQnmd3YJbQ==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-arm64-gnu@1.0.0-beta.53': resolution: {integrity: sha512-bpIGX+ov9PhJYV+wHNXl9rzq4F0QvILiURn0y0oepbQx+7stmQsKA0DhPGwmhfvF856wq+gbM8L92SAa/CBcLg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-arm64-gnu@1.0.0-rc.12': resolution: {integrity: sha512-/I5AS4cIroLpslsmzXfwbe5OmWvSsrFuEw3mwvbQ1kDxJ822hFHIx+vsN/TAzNVyepI/j/GSzrtCIwQPeKCLIg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-arm64-gnu@1.0.0-rc.3': resolution: {integrity: sha512-kWXkoxxarYISBJ4bLNf5vFkEbb4JvccOwxWDxuK9yee8lg5XA7OpvlTptfRuwEvYcOZf+7VS69Uenpmpyo5Bjw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-arm64-musl@1.0.0-beta.45': resolution: {integrity: sha512-tdy8ThO/fPp40B81v0YK3QC+KODOmzJzSUOO37DinQxzlTJ026gqUSOM8tzlVixRbQJltgVDCTYF8HNPRErQTA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [musl] '@rolldown/binding-linux-arm64-musl@1.0.0-beta.52': resolution: {integrity: sha512-ENLmSQCWqSA/+YN45V2FqTIemg7QspaiTjlm327eUAMeOLdqmSOVVyrQexJGNTQ5M8sDYCgVAig2Kk01Ggmqaw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [musl] '@rolldown/binding-linux-arm64-musl@1.0.0-beta.53': resolution: {integrity: sha512-bGe5EBB8FVjHBR1mOLOPEFg1Lp3//7geqWkU5NIhxe+yH0W8FVrQ6WRYOap4SUTKdklD/dC4qPLREkMMQ855FA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [musl] '@rolldown/binding-linux-arm64-musl@1.0.0-rc.12': resolution: {integrity: sha512-V6/wZztnBqlx5hJQqNWwFdxIKN0m38p8Jas+VoSfgH54HSj9tKTt1dZvG6JRHcjh6D7TvrJPWFGaY9UBVOaWPw==} 
engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [musl] '@rolldown/binding-linux-arm64-musl@1.0.0-rc.3': resolution: {integrity: sha512-Z03/wrqau9Bicfgb3Dbs6SYTHliELk2PM2LpG2nFd+cGupTMF5kanLEcj2vuuJLLhptNyS61rtk7SOZ+lPsTUA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [arm64] os: [linux] - libc: [musl] '@rolldown/binding-linux-ppc64-gnu@1.0.0-rc.12': resolution: {integrity: sha512-AP3E9BpcUYliZCxa3w5Kwj9OtEVDYK6sVoUzy4vTOJsjPOgdaJZKFmN4oOlX0Wp0RPV2ETfmIra9x1xuayFB7g==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [ppc64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-s390x-gnu@1.0.0-rc.12': resolution: {integrity: sha512-nWwpvUSPkoFmZo0kQazZYOrT7J5DGOJ/+QHHzjvNlooDZED8oH82Yg67HvehPPLAg5fUff7TfWFHQS8IV1n3og==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [s390x] os: [linux] - libc: [glibc] '@rolldown/binding-linux-x64-gnu@1.0.0-beta.45': resolution: {integrity: sha512-lS082ROBWdmOyVY/0YB3JmsiClaWoxvC+dA8/rbhyB9VLkvVEaihLEOr4CYmrMse151C4+S6hCw6oa1iewox7g==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-x64-gnu@1.0.0-beta.52': resolution: {integrity: sha512-klahlb2EIFltSUubn/VLjuc3qxp1E7th8ukayPfdkcKvvYcQ5rJztgx8JsJSuAKVzKtNTqUGOhy4On71BuyV8g==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-x64-gnu@1.0.0-beta.53': resolution: {integrity: sha512-qL+63WKVQs1CMvFedlPt0U9PiEKJOAL/bsHMKUDS6Vp2Q+YAv/QLPu8rcvkfIMvQ0FPU2WL0aX4eWwF6e/GAnA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-x64-gnu@1.0.0-rc.12': resolution: {integrity: sha512-RNrafz5bcwRy+O9e6P8Z/OCAJW/A+qtBczIqVYwTs14pf4iV1/+eKEjdOUta93q2TsT/FI0XYDP3TCky38LMAg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-x64-gnu@1.0.0-rc.3': resolution: {integrity: sha512-iSXXZsQp08CSilff/DCTFZHSVEpEwdicV3W8idHyrByrcsRDVh9sGC3sev6d8BygSGj3vt8GvUKBPCoyMA4tgQ==} engines: 
{node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [glibc] '@rolldown/binding-linux-x64-musl@1.0.0-beta.45': resolution: {integrity: sha512-Hi73aYY0cBkr1/SvNQqH8Cd+rSV6S9RB5izCv0ySBcRnd/Wfn5plguUoGYwBnhHgFbh6cPw9m2dUVBR6BG1gxA==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [musl] '@rolldown/binding-linux-x64-musl@1.0.0-beta.52': resolution: {integrity: sha512-UuA+JqQIgqtkgGN2c/AQ5wi8M6mJHrahz/wciENPTeI6zEIbbLGoth5XN+sQe2pJDejEVofN9aOAp0kaazwnVg==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [musl] '@rolldown/binding-linux-x64-musl@1.0.0-beta.53': resolution: {integrity: sha512-VGl9JIGjoJh3H8Mb+7xnVqODajBmrdOOb9lxWXdcmxyI+zjB2sux69br0hZJDTyLJfvBoYm439zPACYbCjGRmw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [musl] '@rolldown/binding-linux-x64-musl@1.0.0-rc.12': resolution: {integrity: sha512-Jpw/0iwoKWx3LJ2rc1yjFrj+T7iHZn2JDg1Yny1ma0luviFS4mhAIcd1LFNxK3EYu3DHWCps0ydXQ5i/rrJ2ig==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [musl] '@rolldown/binding-linux-x64-musl@1.0.0-rc.3': resolution: {integrity: sha512-qaj+MFudtdCv9xZo9znFvkgoajLdc+vwf0Kz5N44g+LU5XMe+IsACgn3UG7uTRlCCvhMAGXm1XlpEA5bZBrOcw==} engines: {node: ^20.19.0 || >=22.12.0} cpu: [x64] os: [linux] - libc: [musl] '@rolldown/binding-openharmony-arm64@1.0.0-beta.45': resolution: {integrity: sha512-fljEqbO7RHHogNDxYtTzr+GNjlfOx21RUyGmF+NrkebZ8emYYiIqzPxsaMZuRx0rgZmVmliOzEp86/CQFDKhJQ==} @@ -5455,85 +5488,71 @@ packages: resolution: {integrity: sha512-Rn3n+FUk2J5VWx+ywrG/HGPTD9jXNbicRtTM11e/uorplArnXZYsVifnPPqNNP5BsO3roI4n8332ukpY/zN7rQ==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.55.1': resolution: {integrity: sha512-grPNWydeKtc1aEdrJDWk4opD7nFtQbMmV7769hiAaYyUKCT1faPRm2av8CX1YJsZ4TLAZcg9gTR1KvEzoLjXkg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.55.1': resolution: {integrity: 
sha512-a59mwd1k6x8tXKcUxSyISiquLwB5pX+fJW9TkWU46lCqD/GRDe9uDN31jrMmVP3feI3mhAdvcCClhV8V5MhJFQ==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.55.1': resolution: {integrity: sha512-puS1MEgWX5GsHSoiAsF0TYrpomdvkaXm0CofIMG5uVkP6IBV+ZO9xhC5YEN49nsgYo1DuuMquF9+7EDBVYu4uA==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-loong64-gnu@4.55.1': resolution: {integrity: sha512-r3Wv40in+lTsULSb6nnoudVbARdOwb2u5fpeoOAZjFLznp6tDU8kd+GTHmJoqZ9lt6/Sys33KdIHUaQihFcu7g==} cpu: [loong64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-loong64-musl@4.55.1': resolution: {integrity: sha512-MR8c0+UxAlB22Fq4R+aQSPBayvYa3+9DrwG/i1TKQXFYEaoW3B5b/rkSRIypcZDdWjWnpcvxbNaAJDcSbJU3Lw==} cpu: [loong64] os: [linux] - libc: [musl] '@rollup/rollup-linux-ppc64-gnu@4.55.1': resolution: {integrity: sha512-3KhoECe1BRlSYpMTeVrD4sh2Pw2xgt4jzNSZIIPLFEsnQn9gAnZagW9+VqDqAHgm1Xc77LzJOo2LdigS5qZ+gw==} cpu: [ppc64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-ppc64-musl@4.55.1': resolution: {integrity: sha512-ziR1OuZx0vdYZZ30vueNZTg73alF59DicYrPViG0NEgDVN8/Jl87zkAPu4u6VjZST2llgEUjaiNl9JM6HH1Vdw==} cpu: [ppc64] os: [linux] - libc: [musl] '@rollup/rollup-linux-riscv64-gnu@4.55.1': resolution: {integrity: sha512-uW0Y12ih2XJRERZ4jAfKamTyIHVMPQnTZcQjme2HMVDAHY4amf5u414OqNYC+x+LzRdRcnIG1YodLrrtA8xsxw==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.55.1': resolution: {integrity: sha512-u9yZ0jUkOED1BFrqu3BwMQoixvGHGZ+JhJNkNKY/hyoEgOwlqKb62qu+7UjbPSHYjiVy8kKJHvXKv5coH4wDeg==} cpu: [riscv64] os: [linux] - libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.55.1': resolution: {integrity: sha512-/0PenBCmqM4ZUd0190j7J0UsQ/1nsi735iPRakO8iPciE7BQ495Y6msPzaOmvx0/pn+eJVVlZrNrSh4WSYLxNg==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.45.1': resolution: {integrity: sha512-+E/lYl6qu1zqgPEnTrs4WysQtvc/Sh4fC2nByfFExqgYrqkKWp1tWIbe+ELhixnenSpBbLXNi6vbEEJ8M7fiHw==} cpu: [x64] os: [linux] - libc: [glibc] 
'@rollup/rollup-linux-x64-gnu@4.55.1': resolution: {integrity: sha512-a8G4wiQxQG2BAvo+gU6XrReRRqj+pLS2NGXKm8io19goR+K8lw269eTrPkSdDTALwMmJp4th2Uh0D8J9bEV1vg==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-musl@4.55.1': resolution: {integrity: sha512-bD+zjpFrMpP/hqkfEcnjXWHMw5BIghGisOKPj+2NaNDuVT+8Ds4mPf3XcPHuat1tz89WRL+1wbcxKY3WSbiT7w==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-openbsd-x64@4.55.1': resolution: {integrity: sha512-eLXw0dOiqE4QmvikfQ6yjgkg/xDM+MdU9YJuP4ySTibXU0oAvnEWXt7UDJmD4UkYialMfOGFPJnIHSe/kdzPxg==} @@ -6023,28 +6042,24 @@ packages: engines: {node: '>=10'} cpu: [arm64] os: [linux] - libc: [glibc] '@swc/core-linux-arm64-musl@1.15.8': resolution: {integrity: sha512-koiCqL09EwOP1S2RShCI7NbsQuG6r2brTqUYE7pV7kZm9O17wZ0LSz22m6gVibpwEnw8jI3IE1yYsQTVpluALw==} engines: {node: '>=10'} cpu: [arm64] os: [linux] - libc: [musl] '@swc/core-linux-x64-gnu@1.15.8': resolution: {integrity: sha512-4p6lOMU3bC+Vd5ARtKJ/FxpIC5G8v3XLoPEZ5s7mLR8h7411HWC/LmTXDHcrSXRC55zvAVia1eldy6zDLz8iFQ==} engines: {node: '>=10'} cpu: [x64] os: [linux] - libc: [glibc] '@swc/core-linux-x64-musl@1.15.8': resolution: {integrity: sha512-z3XBnbrZAL+6xDGAhJoN4lOueIxC/8rGrJ9tg+fEaeqLEuAtHSW2QHDHxDwkxZMjuF/pZ6MUTjHjbp8wLbuRLA==} engines: {node: '>=10'} cpu: [x64] os: [linux] - libc: [musl] '@swc/core-win32-arm64-msvc@1.15.8': resolution: {integrity: sha512-djQPJ9Rh9vP8GTS/Df3hcc6XP6xnG5c8qsngWId/BLA9oX6C7UzCPAn74BG/wGb9a6j4w3RINuoaieJB3t+7iQ==} @@ -6131,28 +6146,24 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [glibc] '@tailwindcss/oxide-linux-arm64-musl@4.1.18': resolution: {integrity: sha512-1px92582HkPQlaaCkdRcio71p8bc8i/ap5807tPRDK/uw953cauQBT8c5tVGkOwrHMfc2Yh6UuxaH4vtTjGvHg==} engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [musl] '@tailwindcss/oxide-linux-x64-gnu@4.1.18': resolution: {integrity: sha512-v3gyT0ivkfBLoZGF9LyHmts0Isc8jHZyVcbzio6Wpzifg/+5ZJpDiRiUhDLkcr7f/r38SWNe7ucxmGW3j3Kb/g==} engines: {node: '>= 10'} cpu: 
[x64] os: [linux] - libc: [glibc] '@tailwindcss/oxide-linux-x64-musl@4.1.18': resolution: {integrity: sha512-bhJ2y2OQNlcRwwgOAGMY0xTFStt4/wyU6pvI6LSuZpRgKQwxTec0/3Scu91O8ir7qCR3AuepQKLU/kX99FouqQ==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [musl] '@tailwindcss/oxide-wasm32-wasi@4.1.18': resolution: {integrity: sha512-LffYTvPjODiP6PT16oNeUQJzNVyJl1cjIebq/rWWBF+3eDst5JGEFSc5cWxyRCJ0Mxl+KyIkqRxk1XPEs9x8TA==} @@ -7143,6 +7154,18 @@ packages: '@upsetjs/venn.js@2.0.0': resolution: {integrity: sha512-WbBhLrooyePuQ1VZxrJjtLvTc4NVfpOyKx0sKqioq9bX1C1m7Jgykkn8gLrtwumBioXIqam8DLxp88Adbue6Hw==} + '@vectorstores/core@0.1.8': + resolution: {integrity: sha512-9Hvtoq9A3Hkp0U8+J5Zn9CimkSXVL2DHeZzICG5uSIHZsRVoIzRJ7qV5c3TpRl2uMU/1RzNbhH0w64DFqlDjyg==} + + '@vectorstores/env@0.1.0': + resolution: {integrity: sha512-nB7lPVizwDD5fEtMDsXnrml9re+8QorqdNBtuFDDq94WrKBDeM3b0z7UNC41eukkTzM9mdHPlw15OHNxchM5pA==} + + '@vectorstores/readers@0.1.8': + resolution: {integrity: sha512-OI0kpund7io7h9F0T3juf9SqDFQn84uYOTBDnv+LCW62h9PFyY31iMYf5r/1T4CJhkd5j3jiXW6AykQxSJOvJQ==} + peerDependencies: + '@vectorstores/core': 0.1.8 + '@vectorstores/env': 0.1.0 + '@vercel/oidc@3.1.0': resolution: {integrity: sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w==} engines: {node: '>= 20'} @@ -7179,12 +7202,27 @@ packages: '@vitest/browser': optional: true + '@vitest/expect@2.1.0': + resolution: {integrity: sha512-N3/xR4fSu0+6sVZETEtPT1orUs2+Y477JOXTcU3xKuu3uBlsgbD7/7Mz2LZ1Jr1XjwilEWlrIgSCj4N1+5ZmsQ==} + '@vitest/expect@3.2.4': resolution: {integrity: sha512-Io0yyORnB6sikFlt8QW5K7slY4OjqNX9jmJQ02QDda8lyM6B5oNgVWoSoKPac8/kgnCUzuHQKrSLtu/uOqqrig==} '@vitest/expect@4.1.2': resolution: {integrity: sha512-gbu+7B0YgUJ2nkdsRJrFFW6X7NTP44WlhiclHniUhxADQJH5Szt9mZ9hWnJPJ8YwOK5zUOSSlSvyzRf0u1DSBQ==} + '@vitest/mocker@2.1.0': + resolution: {integrity: sha512-ZxENovUqhzl+QiOFpagiHUNUuZ1qPd5yYTCYHomGIZOFArzn4mgX2oxZmiAItJWAaXHG6bbpb/DpSPhlk5DgtA==} + peerDependencies: + 
'@vitest/spy': 2.1.0 + msw: ^2.3.5 + vite: ^5.0.0 + peerDependenciesMeta: + msw: + optional: true + vite: + optional: true + '@vitest/mocker@3.2.4': resolution: {integrity: sha512-46ryTE9RZO/rfDd7pEqFl7etuyzekzEhUbTW3BvmeO/BcCMEgq59BKhek3dXDWgAj4oMK6OZi+vRr1wPW6qjEQ==} peerDependencies: @@ -7207,24 +7245,39 @@ packages: vite: optional: true + '@vitest/pretty-format@2.1.0': + resolution: {integrity: sha512-7sxf2F3DNYatgmzXXcTh6cq+/fxwB47RIQqZJFoSH883wnVAoccSRT6g+dTKemUBo8Q5N4OYYj1EBXLuRKvp3Q==} + + '@vitest/pretty-format@2.1.9': + resolution: {integrity: sha512-KhRIdGV2U9HOUzxfiHmY8IFHTdqtOhIzCpd8WRdJiE7D/HUcZVD0EgQCVjm+Q9gkUXWgBvMmTtZgIG48wq7sOQ==} + '@vitest/pretty-format@3.2.4': resolution: {integrity: sha512-IVNZik8IVRJRTr9fxlitMKeJeXFFFN0JaB9PHPGQ8NKQbGpfjlTx9zO4RefN8gp7eqjNy8nyK3NZmBzOPeIxtA==} '@vitest/pretty-format@4.1.2': resolution: {integrity: sha512-dwQga8aejqeuB+TvXCMzSQemvV9hNEtDDpgUKDzOmNQayl2OG241PSWeJwKRH3CiC+sESrmoFd49rfnq7T4RnA==} + '@vitest/runner@2.1.0': + resolution: {integrity: sha512-D9+ZiB8MbMt7qWDRJc4CRNNUlne/8E1X7dcKhZVAbcOKG58MGGYVDqAq19xlhNfMFZsW0bpVKgztBwks38Ko0w==} + '@vitest/runner@3.2.4': resolution: {integrity: sha512-oukfKT9Mk41LreEW09vt45f8wx7DordoWUZMYdY/cyAk7w5TWkTRCNZYF7sX7n2wB7jyGAl74OxgwhPgKaqDMQ==} '@vitest/runner@4.1.2': resolution: {integrity: sha512-Gr+FQan34CdiYAwpGJmQG8PgkyFVmARK8/xSijia3eTFgVfpcpztWLuP6FttGNfPLJhaZVP/euvujeNYar36OQ==} + '@vitest/snapshot@2.1.0': + resolution: {integrity: sha512-x69CygGMzt9VCO283K2/FYQ+nBrOj66OTKpsPykjCR4Ac3lLV+m85hj9reaIGmjBSsKzVvbxWmjWE3kF5ha3uQ==} + '@vitest/snapshot@3.2.4': resolution: {integrity: sha512-dEYtS7qQP2CjU27QBC5oUOxLE/v5eLkGqPE0ZKEIDGMs4vKWe7IjgLOeauHsR0D5YuuycGRO5oSRXnwnmA78fQ==} '@vitest/snapshot@4.1.2': resolution: {integrity: sha512-g7yfUmxYS4mNxk31qbOYsSt2F4m1E02LFqO53Xpzg3zKMhLAPZAjjfyl9e6z7HrW6LvUdTwAQR3HHfLjpko16A==} + '@vitest/spy@2.1.0': + resolution: {integrity: sha512-IXX5NkbdgTYTog3F14i2LgnBc+20YmkXMx0IWai84mcxySUDRgm0ihbOfR4L0EVRBDFG85GjmQQEZNNKVVpkZw==} + 
'@vitest/spy@3.2.4': resolution: {integrity: sha512-vAfasCOe6AIK70iP5UD11Ac4siNUNJ9i/9PZ3NKx07sG6sUxeag1LWdNrMWeKKYBLlzuK+Gn65Yd5nyL6ds+nw==} @@ -7236,6 +7289,9 @@ packages: peerDependencies: vitest: 3.2.4 + '@vitest/utils@2.1.0': + resolution: {integrity: sha512-rreyfVe0PuNqJfKYUwfPDfi6rrp0VSu0Wgvp5WBqJonP+4NvXHk48X6oBam1Lj47Hy6jbJtnMj3OcRdrkTP0tA==} + '@vitest/utils@3.2.4': resolution: {integrity: sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==} @@ -7284,6 +7340,9 @@ packages: '@xyflow/system@0.0.74': resolution: {integrity: sha512-7v7B/PkiVrkdZzSbL+inGAo6tkR/WQHHG0/jhSvLQToCsfa8YubOGmBYd1s08tpKpihdHDZFwzQZeR69QSBb4Q==} + abbrev@1.1.1: + resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==} + abbrev@3.0.1: resolution: {integrity: sha512-AO2ac6pjRB3SJmGJo+v5/aK6Omggp6fsLrs6wN9bd35ulu4cCwaAU9+7ZhXjeqHVkaHThLuzH0nZr0YpCDhygg==} engines: {node: ^18.17.0 || >=20.5.0} @@ -7310,6 +7369,10 @@ packages: resolution: {integrity: sha512-TFi4HBKSGfIKsK5YCkKaaFG2m4PEDyViZmEwof3MTIgzimHLto6muaHVpbrljdIvIrFZzEq/p4nafOeLcYegrg==} engines: {node: '>=0.3.0'} + agent-base@6.0.2: + resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==} + engines: {node: '>= 6.0.0'} + agent-base@7.1.4: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} @@ -7445,6 +7508,11 @@ packages: resolution: {integrity: sha512-nxwy40TuMiUGqMyRHgCSWZ9FM4VAoRP4xUYSTv5ImRog+h9yISPbVH7H8fASCIzYn9wlEv4zvFL7uKDMCFQm3g==} deprecated: This package is no longer supported. + are-we-there-yet@2.0.0: + resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. 
+ argparse@1.0.10: resolution: {integrity: sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==} @@ -7804,6 +7872,10 @@ packages: caniuse-lite@1.0.30001762: resolution: {integrity: sha512-PxZwGNvH7Ak8WX5iXzoK1KPZttBXNPuaOvI2ZYU7NrlM+d9Ov+TUvlLOBNGzVXAntMSMMlJPd+jY6ovrVjSmUw==} + canvas@2.11.2: + resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==} + engines: {node: '>=6'} + caseless@0.12.0: resolution: {integrity: sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw==} @@ -8039,6 +8111,10 @@ packages: resolution: {integrity: sha512-Bb6Cq8oq0IjDOe8wJmi4JeNn763Xs9cfrBcaylK1tPypWzyoy2G3l90v9k64kjphl/ZJjPIShFztenRomi8WTg==} engines: {node: '>=18'} + color-support@1.1.3: + resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==} + hasBin: true + color@4.2.3: resolution: {integrity: sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==} engines: {node: '>=12.5.0'} @@ -9594,6 +9670,11 @@ packages: resolution: {integrity: sha512-14x4kjc6lkD3ltw589k0NrPD6cCNTD6CWoVUNpB85+DrtONoZn+Rug6xZU5RvSC4+TZPxA5AnBibQYAvZn41Hg==} deprecated: This package is no longer supported. + gauge@3.0.2: + resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==} + engines: {node: '>=10'} + deprecated: This package is no longer supported. 
+ gaxios@6.7.1: resolution: {integrity: sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==} engines: {node: '>=14'} @@ -9943,6 +10024,10 @@ packages: resolution: {integrity: sha512-V5nVw1PAOgfI3Lmeaj2Exmeg7fenjhRUgz1lPSezy1CuhPYbgQtbQj4jZfEAEMlaL+vupsvhjqCyjzob0yxsmQ==} engines: {node: '>=10.19.0'} + https-proxy-agent@5.0.1: + resolution: {integrity: sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==} + engines: {node: '>= 6'} + https-proxy-agent@7.0.6: resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==} engines: {node: '>= 14'} @@ -10576,6 +10661,11 @@ packages: cpu: [x64, arm64, wasm32] os: [darwin, linux, win32] + libsql@0.5.29: + resolution: {integrity: sha512-8lMP8iMgiBzzoNbAPQ59qdVcj6UaE/Vnm+fiwX4doX4Narook0a4GPKWBEv+CR8a1OwbfkgL18uBfBjWdF0Fzg==} + cpu: [x64, arm64, wasm32, arm] + os: [darwin, linux, win32] + lie@3.1.1: resolution: {integrity: sha512-RiNhHysUjhrDQntfYSfY4MU24coXXdEOgw9WGcKHNeEwffDYbF//u87M1EWaMGzuFoSbqW0C9C6lEEhDOAswfw==} @@ -10647,56 +10737,48 @@ packages: engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] - libc: [glibc] lightningcss-linux-arm64-gnu@1.32.0: resolution: {integrity: sha512-0nnMyoyOLRJXfbMOilaSRcLH3Jw5z9HDNGfT/gwCPgaDjnx0i8w7vBzFLFR1f6CMLKF8gVbebmkUN3fa/kQJpQ==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] - libc: [glibc] lightningcss-linux-arm64-musl@1.30.2: resolution: {integrity: sha512-5Vh9dGeblpTxWHpOx8iauV02popZDsCYMPIgiuw97OJ5uaDsL86cnqSFs5LZkG3ghHoX5isLgWzMs+eD1YzrnA==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] - libc: [musl] lightningcss-linux-arm64-musl@1.32.0: resolution: {integrity: sha512-UpQkoenr4UJEzgVIYpI80lDFvRmPVg6oqboNHfoH4CQIfNA+HOrZ7Mo7KZP02dC6LjghPQJeBsvXhJod/wnIBg==} engines: {node: '>= 12.0.0'} cpu: [arm64] os: [linux] - libc: [musl] lightningcss-linux-x64-gnu@1.30.2: resolution: {integrity: 
sha512-Cfd46gdmj1vQ+lR6VRTTadNHu6ALuw2pKR9lYq4FnhvgBc4zWY1EtZcAc6EffShbb1MFrIPfLDXD6Xprbnni4w==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] - libc: [glibc] lightningcss-linux-x64-gnu@1.32.0: resolution: {integrity: sha512-V7Qr52IhZmdKPVr+Vtw8o+WLsQJYCTd8loIfpDaMRWGUZfBOYEJeyJIkqGIDMZPwPx24pUMfwSxxI8phr/MbOA==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] - libc: [glibc] lightningcss-linux-x64-musl@1.30.2: resolution: {integrity: sha512-XJaLUUFXb6/QG2lGIW6aIk6jKdtjtcffUT0NKvIqhSBY3hh9Ch+1LCeH80dR9q9LBjG3ewbDjnumefsLsP6aiA==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] - libc: [musl] lightningcss-linux-x64-musl@1.32.0: resolution: {integrity: sha512-bYcLp+Vb0awsiXg/80uCRezCYHNg1/l3mt0gzHnWV9XP1W5sKa5/TCdGWaR/zBM2PeF/HbsQv/j2URNOiVuxWg==} engines: {node: '>= 12.0.0'} cpu: [x64] os: [linux] - libc: [musl] lightningcss-win32-arm64-msvc@1.30.2: resolution: {integrity: sha512-FZn+vaj7zLv//D/192WFFVA0RgHawIcHqLX9xuWiQt7P0PtdFEVaxgF9rjM/IRYHQXNnk61/H/gb2Ei+kUQ4xQ==} @@ -10930,6 +11012,10 @@ packages: resolution: {integrity: sha512-2w31R7SJtieJJnQtGc7RVL2StM2vGYVfqUOvUDxH6bC6aJTxPxTF0GnIgCyu7tjockiUWAYQRbxa7vKn34s5sQ==} engines: {node: '>=4'} + make-dir@3.1.0: + resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} + engines: {node: '>=8'} + make-dir@4.0.0: resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==} engines: {node: '>=10'} @@ -10938,6 +11024,11 @@ packages: resolution: {integrity: sha512-QMjGbFTP0blj97EeidG5hk/QhKQ3T4ICckQGLgz38QF7Vgbk6e6FTARN8KhKxyBbWn8R0HU+bnw8aSoFPD4qtQ==} engines: {node: ^18.17.0 || >=20.5.0} + mammoth@1.12.0: + resolution: {integrity: sha512-cwnK1RIcRdDMi2HRx2EXGYlxqIEh0Oo3bLhorgnsVJi2UkbX1+jKxuBNR9PC5+JaX7EkmJxFPmo6mjLpqShI2w==} + engines: {node: '>=12.0.0'} + hasBin: true + mammoth@1.6.0: resolution: {integrity: 
sha512-jOwbj6BwJzxCf6jr2l1zmSemniIkLnchvELXnDJCANlJawhzyIKObIq48B8kWEPLgUUh57k7FtEO3DHFQMnjMg==} engines: {node: '>=12.0.0'} @@ -11445,6 +11536,9 @@ packages: n-gram@2.0.2: resolution: {integrity: sha512-S24aGsn+HLBxUGVAUFOwGpKs7LBcG4RudKU//eWzt/mQ97/NMKQxDWHyHx63UNWk/OOdihgmzoETn1tf5nQDzQ==} + nan@2.26.2: + resolution: {integrity: sha512-0tTvBTYkt3tdGw22nrAy50x7gpbGCCFH3AFcyS5WiUu7Eu4vWlri1woE6qHBSfy11vksDqkiwjOnlR7WV8G1Hw==} + nanoid@3.3.11: resolution: {integrity: sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==} engines: {node: ^10 || ^12 || ^13.7 || ^14 || >=15.0.1} @@ -11544,6 +11638,11 @@ packages: noop-logger@0.1.1: resolution: {integrity: sha512-6kM8CLXvuW5crTxsAtva2YLrRrDaiTIkIePWs9moLHqbFWT94WpNFjwS/5dfLfECg5i/lkmw3aoqVidxt23TEQ==} + nopt@5.0.0: + resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==} + engines: {node: '>=6'} + hasBin: true + nopt@8.1.0: resolution: {integrity: sha512-ieGu42u/Qsa4TFktmaKEwM6MQH0pOWnaB3htzh0JRtx84+Mebc0cbZYN5bC+6WTZ4+77xrL9Pn5m7CV6VIkV7A==} engines: {node: ^18.17.0 || >=20.5.0} @@ -11569,6 +11668,10 @@ packages: resolution: {integrity: sha512-2uUqazuKlTaSI/dC8AzicUck7+IrEaOnN/e0jd3Xtt1KcGpwx30v50mL7oPyr/h9bL3E4aZccVwpwP+5W9Vjkg==} deprecated: This package is no longer supported. + npmlog@5.0.1: + resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==} + deprecated: This package is no longer supported. 
+ npx-scope-finder@1.3.0: resolution: {integrity: sha512-ho4pakGCvM0VEj3u3yj9V71uUzf8EggPYaoXAPnvkyRAAzOG3aM2xeeZV6N4eOZBZBXq9xIZI1Jfd++XNB498g==} engines: {node: '>=18'} @@ -11892,6 +11995,9 @@ packages: resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==} engines: {node: '>=8'} + pathe@1.1.2: + resolution: {integrity: sha512-whLdWMYL2TwI08hn8/ZqAbrVemu0LNaNNJZX73O6qaIdCTfXutsLhMkjdENX0qhsQ9uIimo4/aQOmXkoon2nDQ==} + pathe@2.0.3: resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==} @@ -13688,6 +13794,10 @@ packages: resolution: {integrity: sha512-Zba82s87IFq9A9XmjiX5uZA/ARWDrB03OHlq+Vw1fSdt0I+4/Kutwy8BP4Y/y/aORMo61FQ0vIb5j44vSo5Pkg==} engines: {node: ^18.0.0 || >=20.0.0} + tinyrainbow@1.2.0: + resolution: {integrity: sha512-weEDEq7Z5eTHPDh4xjX789+fHfF+P8boiFB+0vbWzpbnbsEr/GRaohi/uMKxg8RZMXnl1ItAi/IUHWMsjDV7kQ==} + engines: {node: '>=14.0.0'} + tinyrainbow@2.0.0: resolution: {integrity: sha512-op4nsTR47R6p0vMUUoYl/a+ljLFVtlfaXkLQmqfLR1qHma1h/ysYk4hEXZ880bf2CYgTskvTa/e196Vd5dDQXw==} engines: {node: '>=14.0.0'} @@ -13696,6 +13806,10 @@ packages: resolution: {integrity: sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==} engines: {node: '>=14.0.0'} + tinyspy@3.0.2: + resolution: {integrity: sha512-n1cw8k1k0x4pgA2+9XrOkFydTerNcJ1zWCO5Nn9scWHTD+5tp8dghT2x1uduQePZTZgd3Tupf+x9BxJjeJi77Q==} + engines: {node: '>=14.0.0'} + tinyspy@4.0.4: resolution: {integrity: sha512-azl+t0z7pw/z958Gy9svOTuzqIk6xq+NSheJzn5MMWtWTFywIacg2wUlzKFGtt3cthx0r2SxMK0yzJOR0IES7Q==} engines: {node: '>=14.0.0'} @@ -13999,9 +14113,6 @@ packages: unbzip2-stream@1.4.3: resolution: {integrity: sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==} - unconfig-core@7.4.2: - resolution: {integrity: 
sha512-VgPCvLWugINbXvMQDf8Jh0mlbvNjNC6eSUziHsBCMpxR05OPrNrvDnyatdMjRgcHaaNsCqz+wjNXxNw1kRLHUg==} - unconfig-core@7.5.0: resolution: {integrity: sha512-Su3FauozOGP44ZmKdHy2oE6LPjk51M/TRRjHv2HNCWiDvfvCoxC2lno6jevMA91MYAdCdwP05QnWdWpSbncX/w==} @@ -14083,6 +14194,9 @@ packages: resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==} engines: {node: '>= 10.0.0'} + unpdf@0.12.2: + resolution: {integrity: sha512-3eyDFfayk+Sf5+inJ4OyhecR2BtRFEeZqUfGPdq2O8aBLau9MYL9lAP+GEcSAaVd2JWqde8Dnz38z0x7KRglaA==} + unpipe@1.0.0: resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==} engines: {node: '>= 0.8'} @@ -14256,6 +14370,11 @@ packages: vite-code-inspector-plugin@0.20.17: resolution: {integrity: sha512-WdAEvVZCtvJR/xFGaObdI23ic9umqf8BIAaROetsICsJaXnS0AqvtbAONGgfQ7zLXOBv9PJAuIgICkIDFsCnZA==} + vite-node@2.1.0: + resolution: {integrity: sha512-+ybYqBVUjYyIscoLzMWodus2enQDZOpGhcU6HdOVD6n8WZdk12w1GFL3mbnxLs7hPtRtqs1Wo5YF6/Tsr6fmhg==} + engines: {node: ^18.0.0 || >=20.0.0} + hasBin: true + vite-node@3.2.4: resolution: {integrity: sha512-EbKSKh+bh1E1IFxeO0pg1n4dvoOTt0UDiXMd/qn++r98+jPO1xtJilvXldeuQ8giIB5IkpjCgMleHMNEsGH6pg==} engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} @@ -14304,6 +14423,31 @@ packages: yaml: optional: true + vitest@2.1.0: + resolution: {integrity: sha512-XuuEeyNkqbfr0FtAvd9vFbInSSNY1ykCQTYQ0sj9wPy4hx+1gR7gqVNdW0AX2wrrM1wWlN5fnJDjF9xG6mYRSQ==} + engines: {node: ^18.0.0 || >=20.0.0} + hasBin: true + peerDependencies: + '@edge-runtime/vm': '*' + '@types/node': ^18.0.0 || >=20.0.0 + '@vitest/browser': 2.1.0 + '@vitest/ui': 2.1.0 + happy-dom: '*' + jsdom: '*' + peerDependenciesMeta: + '@edge-runtime/vm': + optional: true + '@types/node': + optional: true + '@vitest/browser': + optional: true + '@vitest/ui': + optional: true + happy-dom: + optional: true + jsdom: + optional: true + vitest@3.2.4: resolution: {integrity: 
sha512-LUCP5ev3GURDysTWiP47wRRUpLKMOfPh+yKTx3kVIEiu5KOMeqzpnYNsKyOoVrULivR8tLcks4+lga33Whn90A==} engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} @@ -16435,6 +16579,8 @@ snapshots: ajv: 6.12.6 ajv-keywords: 3.5.2(ajv@6.12.6) + '@discoveryjs/json-ext@0.6.3': {} + '@dnd-kit/accessibility@3.1.1(react@19.2.3)': dependencies: react: 19.2.3 @@ -16566,7 +16712,7 @@ snapshots: node-gyp: 11.5.0 ora: 5.4.1 read-binary-file-arch: 1.0.6 - semver: 7.7.2 + semver: 7.7.1 tar: 6.2.1 yargs: 17.7.2 transitivePeerDependencies: @@ -16655,7 +16801,7 @@ snapshots: '@esbuild-kit/esm-loader@2.6.5': dependencies: '@esbuild-kit/core-utils': 3.3.2 - get-tsconfig: 4.13.0 + get-tsconfig: 4.13.6 '@esbuild/aix-ppc64@0.25.12': optional: true @@ -17162,7 +17308,7 @@ snapshots: - openai - ws - '@langchain/community@1.1.1(3bba964449060bd176fc73da062d305c)': + '@langchain/community@1.1.1(9b32df8fea10b7369f157d4e11bd59c1)': dependencies: '@browserbasehq/stagehand': 1.14.0(@playwright/test@1.57.0)(deepmerge@4.3.1)(dotenv@16.6.1)(encoding@0.1.13)(openai@6.15.0(ws@8.20.0)(zod@4.3.4))(zod@4.3.4) '@ibm-cloud/watsonx-ai': 1.7.5 @@ -17182,7 +17328,7 @@ snapshots: '@aws-sdk/client-s3': 3.998.0 '@aws-sdk/credential-provider-node': 3.972.13 '@browserbasehq/sdk': 2.10.0(encoding@0.1.13) - '@libsql/client': 0.14.0 + '@libsql/client': 0.15.15 '@mozilla/readability': 0.6.0 '@notionhq/client': 2.3.0(encoding@0.1.13) '@smithy/util-utf8': 2.3.0 @@ -17194,6 +17340,7 @@ snapshots: jsdom: 26.1.0 jsonwebtoken: 9.0.3 lodash: 4.17.21 + mammoth: 1.12.0 officeparser: 4.2.0 pdf-parse: 2.4.5 playwright: 1.57.0 @@ -17404,16 +17551,37 @@ snapshots: - bufferutil - utf-8-validate + '@libsql/client@0.15.15': + dependencies: + '@libsql/core': 0.15.15 + '@libsql/hrana-client': 0.7.0 + js-base64: 3.7.7 + libsql: 0.5.29 + promise-limit: 2.7.0 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + '@libsql/core@0.14.0': dependencies: js-base64: 3.7.7 + '@libsql/core@0.15.15': + dependencies: + js-base64: 3.7.7 + 
'@libsql/darwin-arm64@0.4.7': optional: true + '@libsql/darwin-arm64@0.5.29': + optional: true + '@libsql/darwin-x64@0.4.7': optional: true + '@libsql/darwin-x64@0.5.29': + optional: true + '@libsql/hrana-client@0.7.0': dependencies: '@libsql/isomorphic-fetch': 0.3.1 @@ -17434,21 +17602,42 @@ snapshots: - bufferutil - utf-8-validate + '@libsql/linux-arm-gnueabihf@0.5.29': + optional: true + + '@libsql/linux-arm-musleabihf@0.5.29': + optional: true + '@libsql/linux-arm64-gnu@0.4.7': optional: true + '@libsql/linux-arm64-gnu@0.5.29': + optional: true + '@libsql/linux-arm64-musl@0.4.7': optional: true + '@libsql/linux-arm64-musl@0.5.29': + optional: true + '@libsql/linux-x64-gnu@0.4.7': optional: true + '@libsql/linux-x64-gnu@0.5.29': + optional: true + '@libsql/linux-x64-musl@0.4.7': optional: true + '@libsql/linux-x64-musl@0.5.29': + optional: true + '@libsql/win32-x64-msvc@0.4.7': optional: true + '@libsql/win32-x64-msvc@0.5.29': + optional: true + '@malept/cross-spawn-promise@2.0.0': dependencies: cross-spawn: 7.0.6 @@ -17478,6 +17667,22 @@ snapshots: globby: 11.1.0 read-yaml-file: 1.1.0 + '@mapbox/node-pre-gyp@1.0.11(encoding@0.1.13)': + dependencies: + detect-libc: 2.1.2 + https-proxy-agent: 5.0.1 + make-dir: 3.1.0 + node-fetch: 2.7.0(encoding@0.1.13) + nopt: 5.0.0 + npmlog: 5.0.1 + rimraf: 3.0.2 + semver: 7.7.1 + tar: 6.2.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + '@marijn/find-cluster-break@1.0.2': {} '@mdx-js/react@3.1.1(@types/react@19.2.7)(react@19.2.3)': @@ -21118,6 +21323,28 @@ snapshots: d3-selection: 3.0.0 d3-transition: 3.0.1(d3-selection@3.0.0) + '@vectorstores/core@0.1.8': + dependencies: + '@vectorstores/env': 0.1.0 + + '@vectorstores/env@0.1.0': + dependencies: + '@aws-crypto/sha256-js': 5.2.0 + pathe: 1.1.2 + + '@vectorstores/readers@0.1.8(@vectorstores/core@0.1.8)(@vectorstores/env@0.1.0)(encoding@0.1.13)': + dependencies: + '@discoveryjs/json-ext': 0.6.3 + '@vectorstores/core': 0.1.8 + '@vectorstores/env': 
0.1.0 + '@xmldom/xmldom': 0.9.8 + csv-parse: 5.6.0 + mammoth: 1.12.0 + unpdf: 0.12.2(encoding@0.1.13) + transitivePeerDependencies: + - encoding + - supports-color + '@vercel/oidc@3.1.0': {} '@vimeo/player@2.29.0': @@ -21173,6 +21400,13 @@ snapshots: transitivePeerDependencies: - supports-color + '@vitest/expect@2.1.0': + dependencies: + '@vitest/spy': 2.1.0 + '@vitest/utils': 2.1.0 + chai: 5.3.3 + tinyrainbow: 1.2.0 + '@vitest/expect@3.2.4': dependencies: '@types/chai': 5.2.3 @@ -21190,6 +21424,15 @@ snapshots: chai: 6.2.2 tinyrainbow: 3.1.0 + '@vitest/mocker@2.1.0(@vitest/spy@2.1.0)(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2))': + dependencies: + '@vitest/spy': 2.1.0 + estree-walker: 3.0.3 + magic-string: 0.30.21 + optionalDependencies: + msw: 2.12.7(@types/node@24.10.4)(typescript@5.8.3) + vite: rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2) + '@vitest/mocker@3.2.4(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2))': dependencies: '@vitest/spy': 3.2.4 @@ -21208,6 +21451,14 @@ snapshots: msw: 2.12.7(@types/node@24.10.4)(typescript@5.9.3) vite: 8.0.5(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2) + '@vitest/pretty-format@2.1.0': + dependencies: + tinyrainbow: 1.2.0 + + '@vitest/pretty-format@2.1.9': + dependencies: + tinyrainbow: 1.2.0 + '@vitest/pretty-format@3.2.4': dependencies: tinyrainbow: 2.0.0 @@ -21216,6 +21467,11 @@ snapshots: dependencies: tinyrainbow: 3.1.0 + '@vitest/runner@2.1.0': + dependencies: + '@vitest/utils': 2.1.0 + pathe: 1.1.2 + '@vitest/runner@3.2.4': dependencies: '@vitest/utils': 3.2.4 @@ -21227,6 +21483,12 @@ snapshots: '@vitest/utils': 4.1.2 pathe: 2.0.3 + '@vitest/snapshot@2.1.0': + dependencies: + '@vitest/pretty-format': 2.1.0 + magic-string: 0.30.21 + pathe: 1.1.2 + 
'@vitest/snapshot@3.2.4': dependencies: '@vitest/pretty-format': 3.2.4 @@ -21240,6 +21502,10 @@ snapshots: magic-string: 0.30.21 pathe: 2.0.3 + '@vitest/spy@2.1.0': + dependencies: + tinyspy: 3.0.2 + '@vitest/spy@3.2.4': dependencies: tinyspy: 4.0.4 @@ -21257,6 +21523,12 @@ snapshots: tinyrainbow: 2.0.0 vitest: 3.2.4(@types/debug@4.1.13)(@types/node@24.10.4)(@vitest/browser@3.2.4)(@vitest/ui@3.2.4)(esbuild@0.25.12)(jiti@2.6.1)(jsdom@26.1.0)(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(tsx@4.21.0)(yaml@2.8.2) + '@vitest/utils@2.1.0': + dependencies: + '@vitest/pretty-format': 2.1.0 + loupe: 3.2.1 + tinyrainbow: 1.2.0 + '@vitest/utils@3.2.4': dependencies: '@vitest/pretty-format': 3.2.4 @@ -21330,6 +21602,9 @@ snapshots: d3-selection: 3.0.0 d3-zoom: 3.0.0 + abbrev@1.1.1: + optional: true + abbrev@3.0.1: {} abort-controller@3.0.0: @@ -21349,6 +21624,13 @@ snapshots: adm-zip@0.4.16: {} + agent-base@6.0.2: + dependencies: + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + optional: true + agent-base@7.1.4: {} agentkeepalive@4.6.0: @@ -21602,6 +21884,12 @@ snapshots: delegates: 1.0.0 readable-stream: 2.3.8 + are-we-there-yet@2.0.0: + dependencies: + delegates: 1.0.0 + readable-stream: 3.6.2 + optional: true + argparse@1.0.10: dependencies: sprintf-js: 1.0.3 @@ -22022,6 +22310,16 @@ snapshots: caniuse-lite@1.0.30001762: {} + canvas@2.11.2(encoding@0.1.13): + dependencies: + '@mapbox/node-pre-gyp': 1.0.11(encoding@0.1.13) + nan: 2.26.2 + simple-get: 3.1.1 + transitivePeerDependencies: + - encoding + - supports-color + optional: true + caseless@0.12.0: {} castable-video@1.1.11: @@ -22289,6 +22587,9 @@ snapshots: dependencies: color-name: 2.1.0 + color-support@1.1.3: + optional: true + color@4.2.3: dependencies: color-convert: 2.0.1 @@ -23004,14 +23305,14 @@ snapshots: transitivePeerDependencies: - supports-color - drizzle-orm@0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0): + drizzle-orm@0.44.7(@libsql/client@0.15.15)(@opentelemetry/api@1.9.0): 
optionalDependencies: - '@libsql/client': 0.14.0 + '@libsql/client': 0.15.15 '@opentelemetry/api': 1.9.0 - drizzle-zod@0.8.3(drizzle-orm@0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0))(zod@4.3.4): + drizzle-zod@0.8.3(drizzle-orm@0.44.7(@libsql/client@0.15.15)(@opentelemetry/api@1.9.0))(zod@4.3.4): dependencies: - drizzle-orm: 0.44.7(@libsql/client@0.14.0)(@opentelemetry/api@1.9.0) + drizzle-orm: 0.44.7(@libsql/client@0.15.15)(@opentelemetry/api@1.9.0) zod: 4.3.4 dts-resolver@2.1.3: {} @@ -24018,6 +24319,19 @@ snapshots: strip-ansi: 3.0.1 wide-align: 1.1.5 + gauge@3.0.2: + dependencies: + aproba: 1.2.0 + color-support: 1.1.3 + console-control-strings: 1.1.0 + has-unicode: 2.0.1 + object-assign: 4.1.1 + signal-exit: 3.0.7 + string-width: 4.2.3 + strip-ansi: 6.0.1 + wide-align: 1.1.5 + optional: true + gaxios@6.7.1(encoding@0.1.13): dependencies: extend: 3.0.2 @@ -24557,6 +24871,14 @@ snapshots: quick-lru: 5.1.1 resolve-alpn: 1.2.1 + https-proxy-agent@5.0.1: + dependencies: + agent-base: 6.0.2 + debug: 4.4.3 + transitivePeerDependencies: + - supports-color + optional: true + https-proxy-agent@7.0.6: dependencies: agent-base: 7.1.4 @@ -25139,6 +25461,21 @@ snapshots: '@libsql/linux-x64-musl': 0.4.7 '@libsql/win32-x64-msvc': 0.4.7 + libsql@0.5.29: + dependencies: + '@neon-rs/load': 0.0.4 + detect-libc: 2.0.2 + optionalDependencies: + '@libsql/darwin-arm64': 0.5.29 + '@libsql/darwin-x64': 0.5.29 + '@libsql/linux-arm-gnueabihf': 0.5.29 + '@libsql/linux-arm-musleabihf': 0.5.29 + '@libsql/linux-arm64-gnu': 0.5.29 + '@libsql/linux-arm64-musl': 0.5.29 + '@libsql/linux-x64-gnu': 0.5.29 + '@libsql/linux-x64-musl': 0.5.29 + '@libsql/win32-x64-msvc': 0.5.29 + lie@3.1.1: dependencies: immediate: 3.0.6 @@ -25412,6 +25749,11 @@ snapshots: dependencies: pify: 3.0.0 + make-dir@3.1.0: + dependencies: + semver: 6.3.1 + optional: true + make-dir@4.0.0: dependencies: semver: 7.7.1 @@ -25432,6 +25774,19 @@ snapshots: transitivePeerDependencies: - supports-color + mammoth@1.12.0: + 
dependencies: + '@xmldom/xmldom': 0.8.11 + argparse: 1.0.10 + base64-js: 1.5.1 + bluebird: 3.4.7 + dingbat-to-unicode: 1.0.1 + jszip: 3.10.1 + lop: 0.4.2 + path-is-absolute: 1.0.1 + underscore: 1.13.7 + xmlbuilder: 10.1.1 + mammoth@1.6.0: dependencies: '@xmldom/xmldom': 0.8.11 @@ -26287,6 +26642,9 @@ snapshots: n-gram@2.0.2: {} + nan@2.26.2: + optional: true + nanoid@3.3.11: {} nanoid@5.1.6: {} @@ -26368,6 +26726,11 @@ snapshots: noop-logger@0.1.1: {} + nopt@5.0.0: + dependencies: + abbrev: 1.1.1 + optional: true + nopt@8.1.0: dependencies: abbrev: 3.0.1 @@ -26387,6 +26750,14 @@ snapshots: gauge: 2.7.4 set-blocking: 2.0.0 + npmlog@5.0.1: + dependencies: + are-we-there-yet: 2.0.0 + console-control-strings: 1.1.0 + gauge: 3.0.2 + set-blocking: 2.0.0 + optional: true + npx-scope-finder@1.3.0: {} nth-check@2.1.1: @@ -26734,6 +27105,8 @@ snapshots: path-type@4.0.0: {} + pathe@1.1.2: {} + pathe@2.0.3: {} pathval@2.0.1: {} @@ -29027,10 +29400,14 @@ snapshots: tinypool@1.1.1: {} + tinyrainbow@1.2.0: {} + tinyrainbow@2.0.0: {} tinyrainbow@3.1.0: {} + tinyspy@3.0.2: {} + tinyspy@4.0.4: {} tldts-core@6.1.86: {} @@ -29221,7 +29598,7 @@ snapshots: tinyexec: 1.0.2 tinyglobby: 0.2.15 tree-kill: 1.2.2 - unconfig-core: 7.4.2 + unconfig-core: 7.5.0 unrun: 0.2.27 optionalDependencies: typescript: 5.8.3 @@ -29248,7 +29625,7 @@ snapshots: tinyexec: 1.0.2 tinyglobby: 0.2.15 tree-kill: 1.2.2 - unconfig-core: 7.4.2 + unconfig-core: 7.5.0 unrun: 0.2.27 optionalDependencies: typescript: 5.9.3 @@ -29348,11 +29725,6 @@ snapshots: buffer: 5.7.1 through: 2.3.8 - unconfig-core@7.4.2: - dependencies: - '@quansync/fs': 1.0.0 - quansync: 1.0.0 - unconfig-core@7.5.0: dependencies: '@quansync/fs': 1.0.0 @@ -29462,6 +29834,13 @@ snapshots: universalify@2.0.1: {} + unpdf@0.12.2(encoding@0.1.13): + optionalDependencies: + canvas: 2.11.2(encoding@0.1.13) + transitivePeerDependencies: + - encoding + - supports-color + unpipe@1.0.0: {} unplugin@2.3.11: @@ -29623,6 +30002,26 @@ snapshots: 
transitivePeerDependencies: - supports-color + vite-node@2.1.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2): + dependencies: + cac: 6.7.14 + debug: 4.4.3 + pathe: 1.1.2 + vite: rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2) + transitivePeerDependencies: + - '@types/node' + - esbuild + - jiti + - less + - sass + - sass-embedded + - stylus + - sugarss + - supports-color + - terser + - tsx + - yaml + vite-node@3.2.4(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2): dependencies: cac: 6.7.14 @@ -29659,6 +30058,46 @@ snapshots: tsx: 4.21.0 yaml: 2.8.2 + vitest@2.1.0(@types/node@24.10.4)(@vitest/browser@3.2.4(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(playwright@1.57.0)(rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2))(vitest@3.2.4))(@vitest/ui@3.2.4(vitest@3.2.4))(esbuild@0.25.12)(jiti@2.6.1)(jsdom@26.1.0)(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(tsx@4.21.0)(yaml@2.8.2): + dependencies: + '@vitest/expect': 2.1.0 + '@vitest/mocker': 2.1.0(@vitest/spy@2.1.0)(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2)) + '@vitest/pretty-format': 2.1.9 + '@vitest/runner': 2.1.0 + '@vitest/snapshot': 2.1.0 + '@vitest/spy': 2.1.0 + '@vitest/utils': 2.1.0 + chai: 5.3.3 + debug: 4.4.3 + magic-string: 0.30.21 + pathe: 1.1.2 + std-env: 3.10.0 + tinybench: 2.9.0 + tinyexec: 0.3.2 + tinypool: 1.1.1 + tinyrainbow: 1.2.0 + vite: rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2) + vite-node: 2.1.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2) + why-is-node-running: 2.3.0 + optionalDependencies: + '@types/node': 24.10.4 + '@vitest/browser': 
3.2.4(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(playwright@1.57.0)(rolldown-vite@7.3.0(@types/node@24.10.4)(esbuild@0.25.12)(jiti@2.6.1)(tsx@4.21.0)(yaml@2.8.2))(vitest@3.2.4) + '@vitest/ui': 3.2.4(vitest@3.2.4) + jsdom: 26.1.0 + transitivePeerDependencies: + - esbuild + - jiti + - less + - msw + - sass + - sass-embedded + - stylus + - sugarss + - supports-color + - terser + - tsx + - yaml + vitest@3.2.4(@types/debug@4.1.13)(@types/node@24.10.4)(@vitest/browser@3.2.4)(@vitest/ui@3.2.4)(esbuild@0.25.12)(jiti@2.6.1)(jsdom@26.1.0)(msw@2.12.7(@types/node@24.10.4)(typescript@5.8.3))(tsx@4.21.0)(yaml@2.8.2): dependencies: '@types/chai': 5.2.3 diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index 486c912434..a96e2d6839 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -1,5 +1,6 @@ packages: - 'packages/*' + - 'packages/vectorstores/*' supportedArchitectures: os: diff --git a/src/main/core/application/serviceRegistry.ts b/src/main/core/application/serviceRegistry.ts index 7566dafc6f..bc110a94c4 100644 --- a/src/main/core/application/serviceRegistry.ts +++ b/src/main/core/application/serviceRegistry.ts @@ -9,6 +9,8 @@ import { AppMenuService } from '@main/services/AppMenuService' import { AppUpdaterService } from '@main/services/AppUpdaterService' import { CodeCliService } from '@main/services/CodeCliService' import { DetachedWindowManager } from '@main/services/DetachedWindowManager' +import { KnowledgeOrchestrationService, KnowledgeRuntimeService } from '@main/services/knowledge' +import { KnowledgeVectorStoreService } from '@main/services/knowledge/vectorstore/KnowledgeVectorStoreService' import { LanTransferService } from '@main/services/lanTransfer' import { MCPService } from '@main/services/MCPService' import { NodeTraceService } from '@main/services/NodeTraceService' @@ -80,6 +82,9 @@ export const services = { MCPService, OpenClawService, SearchService, + KnowledgeOrchestrationService, + KnowledgeVectorStoreService, + 
KnowledgeRuntimeService, AgentBootstrapService, ApiServerService, AppUpdaterService diff --git a/src/main/data/api/handlers/__tests__/knowledges.test.ts b/src/main/data/api/handlers/__tests__/knowledges.test.ts index d3cefaccb8..cdeb75d0df 100644 --- a/src/main/data/api/handlers/__tests__/knowledges.test.ts +++ b/src/main/data/api/handlers/__tests__/knowledges.test.ts @@ -363,6 +363,67 @@ describe('knowledgeHandlers', () => { }) }) + it('should accept sitemap owner items with grouped url children', async () => { + const body: CreateKnowledgeItemsDto = { + items: [ + { + ref: 'sitemap-root', + type: 'sitemap', + data: { + url: 'https://example.com/sitemap.xml', + name: 'Example Sitemap' + } + }, + { + groupRef: 'sitemap-root', + type: 'url', + data: { + url: 'https://example.com/page-a', + name: 'Page A' + } + } + ] + } + + createKnowledgeItemsMock.mockResolvedValueOnce({ + items: [ + { + id: 'sitemap-1', + baseId: 'kb-1', + groupId: null, + type: 'sitemap', + data: { + url: 'https://example.com/sitemap.xml', + name: 'Example Sitemap' + } + }, + { + id: 'url-1', + baseId: 'kb-1', + groupId: 'sitemap-1', + type: 'url', + data: { + url: 'https://example.com/page-a', + name: 'Page A' + } + } + ] + }) + + const result = await knowledgeHandlers['/knowledge-bases/:id/items'].POST({ + params: { id: 'kb-1' }, + body + }) + + expect(createKnowledgeItemsMock).toHaveBeenCalledWith('kb-1', body) + expect(result).toMatchObject({ + items: [ + { id: 'sitemap-1', type: 'sitemap' }, + { id: 'url-1', type: 'url' } + ] + }) + }) + it('should reject invalid POST bodies before calling the service', async () => { await expect( knowledgeHandlers['/knowledge-bases/:id/items'].POST({ @@ -394,6 +455,72 @@ describe('knowledgeHandlers', () => { expect(createKnowledgeItemsMock).not.toHaveBeenCalled() }) + + it('should reject POST bodies that specify both groupId and groupRef', async () => { + await expect( + knowledgeHandlers['/knowledge-bases/:id/items'].POST({ + params: { id: 'kb-1' }, + 
body: { + items: [ + { + groupId: 'group-1', + groupRef: 'root', + type: 'note', + data: { content: 'hello world' } + } + ] + } + } as never) + ).rejects.toHaveProperty('name', 'ZodError') + + expect(createKnowledgeItemsMock).not.toHaveBeenCalled() + }) + + it('should reject POST bodies with duplicate refs', async () => { + await expect( + knowledgeHandlers['/knowledge-bases/:id/items'].POST({ + params: { id: 'kb-1' }, + body: { + items: [ + { + ref: 'duplicate', + type: 'directory', + data: { name: 'files', path: '/tmp/files' } + }, + { + ref: 'duplicate', + type: 'note', + data: { content: 'hello world' } + } + ] + } + } as never) + ).rejects.toHaveProperty('name', 'ZodError') + + expect(createKnowledgeItemsMock).not.toHaveBeenCalled() + }) + + it('should reject POST bodies with missing groupRef targets', async () => { + await expect( + knowledgeHandlers['/knowledge-bases/:id/items'].POST({ + params: { id: 'kb-1' }, + body: { + items: [ + { + groupRef: 'missing-root', + type: 'url', + data: { + url: 'https://example.com/page-a', + name: 'Page A' + } + } + ] + } + } as never) + ).rejects.toHaveProperty('name', 'ZodError') + + expect(createKnowledgeItemsMock).not.toHaveBeenCalled() + }) }) describe('/knowledge-items/:id', () => { diff --git a/src/main/data/db/schemas/knowledge.ts b/src/main/data/db/schemas/knowledge.ts index 1c3c474fee..e538fdeac1 100644 --- a/src/main/data/db/schemas/knowledge.ts +++ b/src/main/data/db/schemas/knowledge.ts @@ -1,6 +1,6 @@ import type { - ItemStatus, KnowledgeItemData, + KnowledgeItemStatus, KnowledgeItemType, KnowledgeSearchMode } from '@shared/data/types/knowledge' @@ -72,7 +72,7 @@ export const knowledgeItemTable = sqliteTable( data: text({ mode: 'json' }).$type().notNull(), // Processing status - status: text().$type().notNull().default('idle'), + status: text().$type().notNull().default('idle'), error: text(), ...createUpdateTimestamps @@ -81,7 +81,7 @@ export const knowledgeItemTable = sqliteTable( 
check('knowledge_item_type_check', sql`${t.type} IN ('file', 'url', 'note', 'sitemap', 'directory')`), check( 'knowledge_item_status_check', - sql`${t.status} IN ('idle', 'pending', 'ocr', 'read', 'embed', 'completed', 'failed')` + sql`${t.status} IN ('idle', 'pending', 'file_processing', 'read', 'embed', 'completed', 'failed')` ), // Enforce that group owners live inside the same knowledge base. foreignKey({ columns: [t.baseId, t.groupId], foreignColumns: [t.baseId, t.id] }).onDelete('cascade'), diff --git a/src/main/data/migration/v2/core/MigrationContext.ts b/src/main/data/migration/v2/core/MigrationContext.ts index 15d352e3d8..1ce70ec33c 100644 --- a/src/main/data/migration/v2/core/MigrationContext.ts +++ b/src/main/data/migration/v2/core/MigrationContext.ts @@ -10,6 +10,7 @@ import fs from 'fs/promises' import { DexieFileReader } from '../utils/DexieFileReader' import { DexieSettingsReader, type DexieSettingsRecord } from '../utils/DexieSettingsReader' +import { KnowledgeVectorSourceReader } from '../utils/KnowledgeVectorSourceReader' import { LegacyHomeConfigReader } from '../utils/LegacyHomeConfigReader' import { LocalStorageReader } from '../utils/LocalStorageReader' import { ReduxStateReader } from '../utils/ReduxStateReader' @@ -32,6 +33,7 @@ export interface MigrationContext { dexieExport: DexieFileReader dexieSettings: DexieSettingsReader localStorage: LocalStorageReader + knowledgeVectorSource: KnowledgeVectorSourceReader legacyHomeConfig: LegacyHomeConfigReader } @@ -98,6 +100,7 @@ export async function createMigrationContext( dexieExport: dexieFileReader, dexieSettings: new DexieSettingsReader(dexieSettingsRecords), localStorage: new LocalStorageReader(localStorageRecords), + knowledgeVectorSource: new KnowledgeVectorSourceReader(), legacyHomeConfig: new LegacyHomeConfigReader(paths.legacyConfigFile) }, db, diff --git a/src/main/data/migration/v2/migrators/KnowledgeVectorMigrator.ts b/src/main/data/migration/v2/migrators/KnowledgeVectorMigrator.ts 
new file mode 100644 index 0000000000..9cfa4eeefd --- /dev/null +++ b/src/main/data/migration/v2/migrators/KnowledgeVectorMigrator.ts @@ -0,0 +1,551 @@ +import fs from 'node:fs' +import { pathToFileURL } from 'node:url' + +import { knowledgeBaseTable, knowledgeItemTable } from '@data/db/schemas/knowledge' +import { type Client, createClient } from '@libsql/client' +import { loggerService } from '@logger' +import type { ExecuteResult, PrepareResult, ValidateResult, ValidationError } from '@shared/data/migration/v2/types' +import { v4 as uuidv4 } from 'uuid' + +import type { MigrationContext } from '../core/MigrationContext' +import { BaseMigrator } from './BaseMigrator' + +const logger = loggerService.withContext('KnowledgeVectorMigrator') + +const VECTORSTORE_TABLE_NAME = 'libsql_vectorstores_embedding' +const INSERT_BATCH_SIZE = 100 + +function yieldToEventLoop(): Promise { + return new Promise((resolve) => { + setImmediate(resolve) + }) +} + +interface LegacyKnowledgeItemWithLoaders { + id?: string + uniqueId?: string + uniqueIds?: string[] +} + +interface LegacyKnowledgeBaseWithLoaders { + id?: string + items?: LegacyKnowledgeItemWithLoaders[] +} + +interface LegacyKnowledgeStateWithLoaders { + bases?: LegacyKnowledgeBaseWithLoaders[] +} + +interface PreparedVectorRow { + document: string + externalId: string + source: string + embedding: number[] +} + +interface PreparedBasePlan { + baseId: string + dbPath: string + dimensions: number + rows: PreparedVectorRow[] + sourceRowCount: number +} + +export class KnowledgeVectorMigrator extends BaseMigrator { + readonly id = 'knowledge_vector' + readonly name = 'KnowledgeVector' + readonly description = 'Rebuild legacy knowledge vectors into vectorstores libsql' + readonly order = 3.5 + + private sourceCount = 0 + private skippedCount = 0 + private warnings: string[] = [] + private preparedBasePlans: PreparedBasePlan[] = [] + private successfulBaseIds = new Set() + private targetCountByBaseId = new Map() + private 
executionErrors: string[] = [] + + override reset(): void { + this.sourceCount = 0 + this.skippedCount = 0 + this.warnings = [] + this.preparedBasePlans = [] + this.successfulBaseIds = new Set() + this.targetCountByBaseId = new Map() + this.executionErrors = [] + } + + private getTempVectorStorePath(dbPath: string): string { + return `${dbPath}.vectorstore.tmp` + } + + private async ensureVectorStoreSchema(client: Client, dimensions: number): Promise { + await client.execute({ + sql: ` + CREATE TABLE IF NOT EXISTS ${VECTORSTORE_TABLE_NAME} ( + id TEXT PRIMARY KEY, + external_id TEXT, + collection TEXT, + document TEXT, + metadata JSON DEFAULT '{}', + embeddings F32_BLOB(${dimensions}) + ) + `, + args: [] + }) + + const indexStatements = [ + ` + CREATE INDEX IF NOT EXISTS idx_${VECTORSTORE_TABLE_NAME}_external_id + ON ${VECTORSTORE_TABLE_NAME} (external_id) + `, + ` + CREATE INDEX IF NOT EXISTS idx_${VECTORSTORE_TABLE_NAME}_collection + ON ${VECTORSTORE_TABLE_NAME} (collection) + `, + ` + CREATE INDEX IF NOT EXISTS idx_${VECTORSTORE_TABLE_NAME}_vector + ON ${VECTORSTORE_TABLE_NAME} (libsql_vector_idx(embeddings, 'metric=cosine')) + ` + ] + + for (const statement of indexStatements) { + await client.execute({ sql: statement, args: [] }) + } + + const ftsTableName = `${VECTORSTORE_TABLE_NAME}_fts` + await client.execute({ + sql: ` + CREATE VIRTUAL TABLE IF NOT EXISTS ${ftsTableName} + USING fts5(document, content='${VECTORSTORE_TABLE_NAME}', content_rowid='rowid') + `, + args: [] + }) + + await client.execute({ + sql: ` + CREATE TRIGGER IF NOT EXISTS ${VECTORSTORE_TABLE_NAME}_ai + AFTER INSERT ON ${VECTORSTORE_TABLE_NAME} + BEGIN + INSERT INTO ${ftsTableName}(rowid, document) + VALUES (NEW.rowid, NEW.document); + END + `, + args: [] + }) + + await client.execute({ + sql: ` + CREATE TRIGGER IF NOT EXISTS ${VECTORSTORE_TABLE_NAME}_au + AFTER UPDATE OF document ON ${VECTORSTORE_TABLE_NAME} + BEGIN + INSERT INTO ${ftsTableName}(${ftsTableName}, rowid, document) + VALUES 
('delete', OLD.rowid, OLD.document); + INSERT INTO ${ftsTableName}(rowid, document) + VALUES (NEW.rowid, NEW.document); + END + `, + args: [] + }) + + await client.execute({ + sql: ` + CREATE TRIGGER IF NOT EXISTS ${VECTORSTORE_TABLE_NAME}_ad + AFTER DELETE ON ${VECTORSTORE_TABLE_NAME} + BEGIN + INSERT INTO ${ftsTableName}(${ftsTableName}, rowid, document) + VALUES ('delete', OLD.rowid, OLD.document); + END + `, + args: [] + }) + } + + private async insertVectorRows( + client: Client, + rows: Array, + collection: string + ): Promise { + if (rows.length === 0) { + return + } + + const placeholders = rows + .map( + (_, index) => + `(?${index * 6 + 1}, ?${index * 6 + 2}, ?${index * 6 + 3}, ?${index * 6 + 4}, ?${index * 6 + 5}, vector32(?${index * 6 + 6}))` + ) + .join(', ') + + const args = rows.flatMap((row) => [ + row.id, + row.externalId, + collection, + row.document, + JSON.stringify({ + itemId: row.externalId, + ...(row.source.trim() !== '' ? { source: row.source } : {}) + }), + `[${row.embedding.join(',')}]` + ]) + + await client.execute({ + sql: ` + INSERT INTO ${VECTORSTORE_TABLE_NAME} + (id, external_id, collection, document, metadata, embeddings) + VALUES ${placeholders} + `, + args + }) + } + + private buildLoaderKeyMap( + legacyBase: LegacyKnowledgeBaseWithLoaders | undefined, + migratedItemIds: Set + ): Map { + const map = new Map() + if (!legacyBase || !Array.isArray(legacyBase.items)) { + return map + } + + for (const item of legacyBase.items) { + if (!item.id || !migratedItemIds.has(item.id)) { + continue + } + + if (Array.isArray(item.uniqueIds) && item.uniqueIds.length > 0) { + for (const uniqueId of item.uniqueIds) { + if (typeof uniqueId === 'string' && uniqueId.trim() !== '') { + map.set(uniqueId, item.id) + } + } + continue + } + + if (typeof item.uniqueId === 'string' && item.uniqueId.trim() !== '') { + map.set(item.uniqueId, item.id) + } + } + + return map + } + + async prepare(ctx: MigrationContext): Promise { + try { + const knowledgeState = 
ctx.sources.reduxState.getCategory('knowledge') + const migratedBases = await ctx.db.select().from(knowledgeBaseTable) + + if (!knowledgeState?.bases || knowledgeState.bases.length === 0 || migratedBases.length === 0) { + return { + success: true, + itemCount: 0 + } + } + + const migratedItems = await ctx.db + .select({ id: knowledgeItemTable.id, baseId: knowledgeItemTable.baseId }) + .from(knowledgeItemTable) + + const migratedItemIdsByBaseId = new Map>() + for (const item of migratedItems) { + const bucket = migratedItemIdsByBaseId.get(item.baseId) ?? new Set() + bucket.add(item.id) + migratedItemIdsByBaseId.set(item.baseId, bucket) + } + + const legacyBasesById = new Map( + knowledgeState.bases + .filter((base): base is LegacyKnowledgeBaseWithLoaders & { id: string } => typeof base.id === 'string') + .map((base) => [base.id, base]) + ) + + for (const base of migratedBases) { + const legacyBase = legacyBasesById.get(base.id) + if (!legacyBase) { + const warningMessage = `Skipped knowledge vector base ${base.id}: legacy knowledge base not found` + logger.warn(warningMessage) + this.warnings.push(warningMessage) + continue + } + + const source = await ctx.sources.knowledgeVectorSource.loadBase(base.id) + switch (source.status) { + case 'invalid_path': { + const warningMessage = `Skipped knowledge vector base ${base.id}: invalid legacy vector DB path` + logger.warn(warningMessage) + this.warnings.push(warningMessage) + continue + } + case 'missing': { + const warningMessage = `Skipped knowledge vector base ${base.id}: legacy vector DB missing` + logger.warn(warningMessage) + this.warnings.push(warningMessage) + continue + } + case 'directory': { + const warningMessage = `Skipped knowledge vector base ${base.id}: legacy vector DB path is a directory` + logger.warn(warningMessage) + this.warnings.push(warningMessage) + continue + } + case 'not_embedjs': { + const warningMessage = `Skipped knowledge vector base ${base.id}: legacy DB is not embedjs format` + 
logger.warn(warningMessage) + this.warnings.push(warningMessage) + continue + } + } + + const vectorRows = source.rows + this.sourceCount += vectorRows.length + + const loaderKeyMap = this.buildLoaderKeyMap( + legacyBase, + migratedItemIdsByBaseId.get(base.id) ?? new Set() + ) + const rows: PreparedVectorRow[] = [] + + for (const row of vectorRows) { + // V2 only keeps vectors that can be proven to belong to an existing + // migrated knowledge_item row. Unmapped legacy vectors are treated + // as invalid index residue and are intentionally dropped. + const externalId = loaderKeyMap.get(row.uniqueLoaderId) + if (!externalId) { + this.skippedCount += 1 + const warningMessage = `Skipped knowledge vector row in base ${base.id}: uniqueLoaderId '${row.uniqueLoaderId}' cannot be mapped to item.id` + logger.warn(warningMessage) + this.warnings.push(warningMessage) + continue + } + + if (!row.vector || row.vector.length === 0) { + this.skippedCount += 1 + const warningMessage = `Skipped knowledge vector row in base ${base.id}: vector payload missing for uniqueLoaderId '${row.uniqueLoaderId}'` + logger.warn(warningMessage) + this.warnings.push(warningMessage) + continue + } + + rows.push({ + document: row.pageContent, + externalId, + source: row.source, + embedding: row.vector + }) + } + + // A base is still planned even when rows.length === 0. In that case the + // rebuilt V2 vector store is intentionally empty because none of the + // legacy vectors can be associated with valid migrated knowledge_item rows. + this.preparedBasePlans.push({ + baseId: base.id, + dbPath: source.dbPath, + dimensions: base.dimensions, + rows, + sourceRowCount: vectorRows.length + }) + } + + return { + success: true, + itemCount: this.sourceCount, + warnings: this.warnings.length > 0 ? 
this.warnings : undefined + } + } catch (error) { + logger.error('KnowledgeVectorMigrator.prepare failed', error as Error) + return { + success: false, + itemCount: this.sourceCount, + warnings: [error instanceof Error ? error.message : String(error)] + } + } + } + + async execute(): Promise { + if (this.preparedBasePlans.length === 0) { + return { + success: true, + processedCount: 0 + } + } + + const totalWork = this.preparedBasePlans.reduce((sum, plan) => sum + Math.max(plan.rows.length, 1), 0) + let processedWork = 0 + let processedCount = 0 + + for (const plan of this.preparedBasePlans) { + const tempPath = this.getTempVectorStorePath(plan.dbPath) + + try { + const rebuiltRows: Array = plan.rows.map((row) => ({ + ...row, + id: uuidv4() + })) + + await fs.promises.rm(tempPath, { force: true }) + + const targetClient = createClient({ url: pathToFileURL(tempPath).toString() }) + try { + await this.ensureVectorStoreSchema(targetClient, plan.dimensions) + + for (let i = 0; i < rebuiltRows.length; i += INSERT_BATCH_SIZE) { + const batch = rebuiltRows.slice(i, i + INSERT_BATCH_SIZE) + await this.insertVectorRows(targetClient, batch, plan.baseId) + processedWork += batch.length + this.reportProgress( + Math.round((processedWork / totalWork) * 100), + `Migrated ${processedWork}/${totalWork} knowledge vector work units`, + { + key: 'migration.progress.migrated_knowledge_vectors', + params: { processed: processedWork, total: totalWork } + } + ) + await yieldToEventLoop() + } + } finally { + targetClient.close() + } + + if (rebuiltRows.length === 0) { + processedWork += 1 + this.reportProgress( + Math.round((processedWork / totalWork) * 100), + `Migrated ${processedWork}/${totalWork} knowledge vector work units`, + { + key: 'migration.progress.migrated_knowledge_vectors', + params: { processed: processedWork, total: totalWork } + } + ) + await yieldToEventLoop() + } + + await fs.promises.rm(plan.dbPath, { force: true }) + await fs.promises.rename(tempPath, plan.dbPath) + 
+ this.successfulBaseIds.add(plan.baseId) + this.targetCountByBaseId.set(plan.baseId, rebuiltRows.length) + processedCount += rebuiltRows.length + } catch (error) { + const errorMessage = `Knowledge vector base ${plan.baseId} execution failed: ${error instanceof Error ? error.message : String(error)}` + logger.error(errorMessage, error instanceof Error ? error : new Error(String(error))) + this.executionErrors.push(errorMessage) + + await fs.promises.rm(tempPath, { force: true }) + + return { + success: false, + processedCount, + error: errorMessage + } + } + } + + logger.info('KnowledgeVectorMigrator.execute completed', { + processedCount, + successfulBaseCount: this.successfulBaseIds.size, + warningCount: this.warnings.length, + executionErrorCount: this.executionErrors.length + }) + + return { + success: true, + processedCount + } + } + + async validate(): Promise { + const errors: ValidationError[] = [] + let targetCount = 0 + + try { + for (const plan of this.preparedBasePlans) { + if (!this.successfulBaseIds.has(plan.baseId)) { + continue + } + + const client = createClient({ url: pathToFileURL(plan.dbPath).toString() }) + try { + const expectedCount = this.targetCountByBaseId.get(plan.baseId) ?? 0 + const countResult = await client.execute({ + sql: `SELECT count(*) AS count FROM ${VECTORSTORE_TABLE_NAME}`, + args: [] + }) + const actualCount = Number(countResult.rows[0]?.count ?? 0) + targetCount += actualCount + + if (actualCount !== expectedCount) { + errors.push({ + key: `knowledge_vector_count_mismatch_${plan.baseId}`, + expected: expectedCount, + actual: actualCount, + message: `Knowledge vector count mismatch for base ${plan.baseId}: expected ${expectedCount}, got ${actualCount}` + }) + } + + const missingExternalIdResult = await client.execute({ + sql: `SELECT count(*) AS count FROM ${VECTORSTORE_TABLE_NAME} WHERE external_id IS NULL OR external_id = ''`, + args: [] + }) + const missingExternalIdCount = Number(missingExternalIdResult.rows[0]?.count ?? 
0) + if (missingExternalIdCount > 0) { + errors.push({ + key: `knowledge_vector_missing_external_id_${plan.baseId}`, + expected: 0, + actual: missingExternalIdCount, + message: `Found ${missingExternalIdCount} knowledge vector rows without external_id in base ${plan.baseId}` + }) + } + + const missingOrMismatchedItemIdResult = await client.execute({ + sql: `SELECT count(*) AS count FROM ${VECTORSTORE_TABLE_NAME} WHERE json_extract(metadata, '$.itemId') IS NULL OR json_extract(metadata, '$.itemId') = '' OR json_extract(metadata, '$.itemId') != external_id`, + args: [] + }) + const missingOrMismatchedItemIdCount = Number(missingOrMismatchedItemIdResult.rows[0]?.count ?? 0) + if (missingOrMismatchedItemIdCount > 0) { + errors.push({ + key: `knowledge_vector_missing_item_id_${plan.baseId}`, + expected: 0, + actual: missingOrMismatchedItemIdCount, + message: `Found ${missingOrMismatchedItemIdCount} knowledge vector rows without matching metadata.itemId in base ${plan.baseId}` + }) + } + } finally { + client.close() + } + } + + logger.info('KnowledgeVectorMigrator.validate completed', { + sourceCount: this.sourceCount, + targetCount, + skippedCount: this.skippedCount, + errors: errors.length + }) + + return { + success: errors.length === 0, + errors, + stats: { + sourceCount: this.sourceCount, + targetCount, + skippedCount: this.skippedCount + } + } + } catch (error) { + logger.error('KnowledgeVectorMigrator.validate failed', error as Error) + return { + success: false, + errors: [ + { + key: 'validation', + message: error instanceof Error ? 
error.message : String(error) + } + ], + stats: { + sourceCount: this.sourceCount, + targetCount, + skippedCount: this.skippedCount + } + } + } + } +} diff --git a/src/main/data/migration/v2/migrators/README-KnowledgeVectorMigrator.md b/src/main/data/migration/v2/migrators/README-KnowledgeVectorMigrator.md new file mode 100644 index 0000000000..5cdbe7b4f3 --- /dev/null +++ b/src/main/data/migration/v2/migrators/README-KnowledgeVectorMigrator.md @@ -0,0 +1,75 @@ +# KnowledgeVectorMigrator + +`KnowledgeVectorMigrator` migrates legacy per-base `embedjs` vector databases into the new libsql-backed `vectorstores` layout. + +## Data Sources + +| Data | Source | File/Path | +|------|--------|-----------| +| Migrated knowledge base identities and dimensions | SQLite `knowledge_base` | `knowledge_base` table | +| Migrated knowledge item identities | SQLite `knowledge_item` | `knowledge_item` table | +| Legacy loader metadata | Redux `knowledge.bases[].items[]` | `ReduxStateReader.getCategory('knowledge')` | +| Legacy chunk vectors | Per-base legacy vector DB | `application.getPath('feature.knowledgebase.data', )` | + +## Target Storage + +- Per-base libsql vector store file at the existing knowledge DB path +- Table: `libsql_vectorstores_embedding` + +## Key Transformations + +1. Loader identity remapping + - `uniqueLoaderId` is not kept as a persisted field. + - It is resolved back to `knowledge_item.id` and written into `external_id`. + - `uniqueIds[]` takes precedence over legacy `uniqueId`. + - A legacy vector row is considered valid only if it can be mapped to an existing V2 `knowledge_item.id`. + - Unmapped legacy rows are treated as invalid index residue, not as business data that must be preserved. + +2. Chunk payload migration + - `pageContent` -> `document` + - `knowledge_item.id` -> `metadata.itemId` + - `source` -> optional `metadata.source` + - Other legacy metadata fields are dropped. + +3. 
Embedding reuse + - Legacy `vector` payloads are decoded from `F32_BLOB` and written directly to `embeddings`. + - Existing chunk embeddings are reused; this migrator does not re-embed content. + +4. Chunk identity regeneration + - Legacy chunk IDs are not reused. + - Every migrated vector row gets a new UUID v4 `id`. + +5. Schema bootstrap + - Creates `external_id`, `collection`, vector index, and FTS schema needed by `@vectorstores/libsql`. + - Migrated rows use `collection = base.id` so runtime reads and deletes match the same per-base store contract. + +## File-Safety Contract + +- The migrator writes each rebuilt vector store to a temporary sibling file first. +- The original embedjs DB stays untouched until the temporary file has been written successfully. +- Once the temp file is ready, the migrator replaces the original DB in place. +- The migration flow relies on the user-completed pre-migration v1 backup; it does not keep an additional in-place rollback copy. + +## IMPORTANT: Current Limitations + +- Base-level execution failures are treated as migration failures, not as skippable data warnings. If rebuilding or replacing one base fails, `execute()` returns `success: false`. +- The current implementation does **not** preserve a retryable in-place copy of the original embedjs DB. It does not keep `.bak` files or other retry artifacts beside the knowledge DB path. +- Because the replacement is in-place, a failure that happens after the original DB has been removed but before the new file is fully placed may leave the base without a usable legacy source file on disk. +- Therefore, retry semantics currently depend on the user restoring the pre-migration v1 backup before running migration again. The migrator itself does not guarantee that a failed run leaves the knowledge vector source in a reusable retry state. 
+- This limitation is intentional for the current implementation, but it is **important** and may need follow-up design discussion or future changes if the project later wants first-class retry support without requiring manual restore. + +## Validation + +- Per-base row count must equal the prepared row count. +- `external_id` must be non-empty for every migrated row. +- `metadata.itemId` must be present and match `external_id` for every migrated row. +- `metadata.source` is optional and is only preserved when the legacy row has a non-empty `source`. + +## Skipped Data + +- Bases missing from migrated `knowledge_base`, and migrated bases that have no matching legacy knowledge base in the Redux `knowledge.bases` state +- Bases whose legacy DB file is missing, resolves to a directory, or does not contain a `vectors` table +- Vector rows whose `uniqueLoaderId` cannot be mapped to a migrated `knowledge_item.id` +- Vector rows with missing or empty `vector` payloads + +If every legacy vector row under one base is skipped, the rebuilt V2 vector store for that base is expected to be empty. This is intentional: only vectors that can be proven to belong to migrated `knowledge_item` rows remain valid in V2. 
diff --git a/src/main/data/migration/v2/migrators/__tests__/KnowledgeVectorMigrator.test.ts b/src/main/data/migration/v2/migrators/__tests__/KnowledgeVectorMigrator.test.ts new file mode 100644 index 0000000000..617c896102 --- /dev/null +++ b/src/main/data/migration/v2/migrators/__tests__/KnowledgeVectorMigrator.test.ts @@ -0,0 +1,697 @@ +import * as fs from 'node:fs' +import * as os from 'node:os' +import path from 'node:path' +import { pathToFileURL } from 'node:url' + +import type { DbType } from '@data/db/types' +import { createClient } from '@libsql/client' +import { sql } from 'drizzle-orm' +import { drizzle } from 'drizzle-orm/libsql' +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' + +import { KnowledgeVectorSourceReader } from '../../utils/KnowledgeVectorSourceReader' +import { ReduxStateReader } from '../../utils/ReduxStateReader' + +const { loggerWarnMock, setKnowledgeBaseRoot, getPathMock } = vi.hoisted(() => { + let currentKnowledgeBaseRoot = '' + + return { + loggerWarnMock: vi.fn(), + setKnowledgeBaseRoot: (nextPath: string) => { + currentKnowledgeBaseRoot = nextPath + }, + getPathMock: vi.fn((key: string, filename?: string) => { + if (key !== 'feature.knowledgebase.data') { + throw new Error(`Unexpected path key: ${key}`) + } + + return filename ? 
/**
 * Creates an in-memory main database containing only the `knowledge_base` and
 * `knowledge_item` tables that the migrator reads.
 *
 * @returns the drizzle handle plus a `close` callback for the underlying client.
 */
async function createMainDb(): Promise<{ db: DbType; close: () => void }> {
  const client = createClient({ url: 'file::memory:' })
  const db = drizzle(client)

  // Table definitions, executed in order.
  const schemaStatements = [
    `
      CREATE TABLE knowledge_base (
        id TEXT PRIMARY KEY,
        name TEXT NOT NULL,
        description TEXT,
        dimensions INTEGER NOT NULL,
        embeddingModelId TEXT NOT NULL,
        rerankModelId TEXT,
        fileProcessorId TEXT,
        chunkSize INTEGER,
        chunkOverlap INTEGER,
        threshold REAL,
        documentCount INTEGER,
        searchMode TEXT,
        hybridAlpha REAL,
        createdAt INTEGER,
        updatedAt INTEGER
      )
    `,
    `
      CREATE TABLE knowledge_item (
        id TEXT PRIMARY KEY,
        baseId TEXT NOT NULL,
        groupId TEXT,
        type TEXT NOT NULL,
        data TEXT NOT NULL,
        status TEXT NOT NULL,
        error TEXT,
        createdAt INTEGER,
        updatedAt INTEGER
      )
    `
  ]

  for (const statement of schemaStatements) {
    await db.run(sql.raw(statement))
  }

  const close = () => client.close()
  return { db, close }
}
/**
 * Creates a legacy embedjs-style vector database at `dbPath` with a `vectors`
 * table and inserts the given rows.
 *
 * The client is closed even when a statement fails, so a failing test does not
 * keep the file open and block the temp-directory cleanup in `afterEach`.
 *
 * @param dbPath absolute path of the database file to create
 * @param rows legacy rows; `vector` is stored through `vector32()`
 */
async function createLegacyVectorDb(
  dbPath: string,
  rows: Array<{
    id: string
    pageContent: string
    uniqueLoaderId: string
    source: string
    vector: number[]
  }>
): Promise<void> {
  const client = createClient({ url: pathToFileURL(dbPath).toString() })

  try {
    await client.execute(`
      CREATE TABLE vectors (
        id TEXT PRIMARY KEY,
        pageContent TEXT UNIQUE,
        uniqueLoaderId TEXT NOT NULL,
        source TEXT NOT NULL,
        vector F32_BLOB(2),
        metadata TEXT
      )
    `)

    for (const row of rows) {
      await client.execute({
        sql: `
          INSERT INTO vectors (id, pageContent, uniqueLoaderId, source, vector, metadata)
          VALUES (?, ?, ?, ?, vector32(?), '{}')
        `,
        args: [row.id, row.pageContent, row.uniqueLoaderId, row.source, `[${row.vector.join(',')}]`]
      })
    }
  } finally {
    client.close()
  }
}
setKnowledgeBaseRoot(knowledgeBaseDir) + + const mainDb = await createMainDb() + db = mainDb.db + closeDb = mainDb.close + }) + + afterEach(() => { + closeDb?.() + closeDb = undefined + fs.rmSync(tempRoot, { recursive: true, force: true }) + }) + + it('prepare uses uniqueIds first, falls back to uniqueId, and records warnings for unmapped vectors', async () => { + await insertKnowledgeBaseRow(db, { + id: 'kb-1', + name: 'Base 1', + dimensions: 2, + embeddingModelId: 'openai::text-embedding-3-small' + }) + await insertKnowledgeItemRow(db, { + id: 'item-file', + baseId: 'kb-1', + type: 'file', + data: { + file: { + id: 'file-1', + name: 'file-1.md', + origin_name: 'file-1.md', + path: '/tmp/file-1.md', + size: 1, + ext: '.md', + type: 'text', + created_at: '2024-01-01T00:00:00.000Z', + count: 1 + } + }, + status: 'completed' + }) + await insertKnowledgeItemRow(db, { + id: 'item-directory', + baseId: 'kb-1', + type: 'directory', + data: { name: 'dir', path: '/tmp/dir' }, + status: 'completed' + }) + + await createLegacyVectorDb(path.join(knowledgeBaseDir, 'kb-1'), [ + { + id: 'legacy-file-0', + pageContent: 'file chunk', + uniqueLoaderId: 'loader-file', + source: '/tmp/file-1.md', + vector: [1, 2] + }, + { + id: 'legacy-dir-0', + pageContent: 'dir chunk', + uniqueLoaderId: 'loader-dir-a', + source: '/tmp/dir/a.md', + vector: [3, 4] + }, + { + id: 'legacy-missing-0', + pageContent: 'missing chunk', + uniqueLoaderId: 'loader-missing', + source: '/tmp/missing.md', + vector: [5, 6] + } + ]) + + const migrationCtx = createMigrationCtx(db, { + knowledge: { + bases: [ + { + id: 'kb-1', + name: 'Base 1', + items: [ + { + id: 'item-file', + type: 'file', + uniqueId: 'loader-file' + }, + { + id: 'item-directory', + type: 'directory', + uniqueId: 'DirectoryLoader_ignore', + uniqueIds: ['loader-dir-a'] + } + ] + } + ] + } + }) + + const migrator = new KnowledgeVectorMigrator() as any + const result = await migrator.prepare(migrationCtx as any) + + 
expect(result.success).toBe(true) + expect(result.itemCount).toBe(3) + expect(migrator.preparedBasePlans).toHaveLength(1) + expect(migrator.preparedBasePlans[0].rows).toHaveLength(2) + expect(migrator.preparedBasePlans[0].rows.map((row: any) => row.externalId)).toEqual([ + 'item-file', + 'item-directory' + ]) + expect(migrator.skippedCount).toBe(1) + expect(result.warnings?.some((warning) => warning.includes('loader-missing'))).toBe(true) + }) + + it('hard fails when vector index schema creation fails', async () => { + const migrator = new KnowledgeVectorMigrator() + const client = { + execute: vi.fn(async ({ sql: statement }: { sql: string }) => { + if (statement.includes('libsql_vector_idx')) { + throw new Error('vector index failed') + } + }) + } + + await expect((migrator as any).ensureVectorStoreSchema(client, 2)).rejects.toThrow('vector index failed') + }) + + it('hard fails when FTS schema creation fails', async () => { + const migrator = new KnowledgeVectorMigrator() + const client = { + execute: vi.fn(async ({ sql: statement }: { sql: string }) => { + if (statement.includes('CREATE VIRTUAL TABLE IF NOT EXISTS libsql_vectorstores_embedding_fts')) { + throw new Error('fts creation failed') + } + }) + } + + await expect((migrator as any).ensureVectorStoreSchema(client, 2)).rejects.toThrow('fts creation failed') + }) + + it('execute rebuilds vector rows with uuid v4 ids, externalId item ids, and metadata.itemId/source', async () => { + await insertKnowledgeBaseRow(db, { + id: 'kb-1', + name: 'Base 1', + dimensions: 2, + embeddingModelId: 'openai::text-embedding-3-small' + }) + await insertKnowledgeItemRow(db, { + id: 'item-file', + baseId: 'kb-1', + type: 'file', + data: { + file: { + id: 'file-1', + name: 'file-1.md', + origin_name: 'file-1.md', + path: '/tmp/file-1.md', + size: 1, + ext: '.md', + type: 'text', + created_at: '2024-01-01T00:00:00.000Z', + count: 1 + } + }, + status: 'completed' + }) + + const dbPath = path.join(knowledgeBaseDir, 'kb-1') + 
await createLegacyVectorDb(dbPath, [ + { + id: 'legacy-file-0', + pageContent: 'file chunk', + uniqueLoaderId: 'loader-file', + source: '/tmp/file-1.md', + vector: [1, 2] + } + ]) + + const migrationCtx = createMigrationCtx(db, { + knowledge: { + bases: [ + { + id: 'kb-1', + name: 'Base 1', + items: [ + { + id: 'item-file', + type: 'file', + uniqueId: 'loader-file' + } + ] + } + ] + } + }) + + const migrator = new KnowledgeVectorMigrator() as any + const prepareResult = await migrator.prepare(migrationCtx as any) + expect(prepareResult.success).toBe(true) + + const executeResult = await migrator.execute(migrationCtx as any) + expect(executeResult.success).toBe(true) + expect(executeResult.processedCount).toBe(1) + + const targetClient = createClient({ url: pathToFileURL(dbPath).toString() }) + const rows = await targetClient.execute( + 'SELECT id, external_id, collection, document, metadata, length(embeddings) AS bytes FROM libsql_vectorstores_embedding' + ) + targetClient.close() + + expect(rows.rows).toHaveLength(1) + const row = rows.rows[0] as Record + expect(String(row.id)).toMatch(/^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i) + expect(String(row.id)).not.toBe('legacy-file-0') + expect(row.external_id).toBe('item-file') + expect(row.collection).toBe('kb-1') + expect(row.document).toBe('file chunk') + expect(JSON.parse(String(row.metadata))).toEqual({ + itemId: 'item-file', + source: '/tmp/file-1.md' + }) + expect(Number(row.bytes)).toBeGreaterThan(0) + + const validateResult = await migrator.validate(migrationCtx as any) + expect(validateResult.success).toBe(true) + expect(validateResult.errors).toStrictEqual([]) + expect(validateResult.stats).toMatchObject({ + sourceCount: 1, + targetCount: 1, + skippedCount: 0 + }) + + expect(fs.existsSync(`${dbPath}.vectorstore.tmp`)).toBe(false) + }) + + it('reports knowledge vector migration progress for each inserted batch', async () => { + const migrator = new KnowledgeVectorMigrator() as any 
+ const dbPath = path.join(knowledgeBaseDir, 'kb-progress') + const reportedProgress: number[] = [] + + migrator.preparedBasePlans = [ + { + baseId: 'kb-progress', + dbPath, + dimensions: 2, + rows: Array.from({ length: 250 }, (_, index) => ({ + document: `doc-${index}`, + externalId: `item-${index}`, + source: `/tmp/doc-${index}.md`, + embedding: [index, index + 1] + })), + sourceRowCount: 250 + } + ] + + migrator.setProgressCallback((progress: number) => { + reportedProgress.push(progress) + }) + + await expect(migrator.execute()).resolves.toMatchObject({ + success: true, + processedCount: 250 + }) + + expect(reportedProgress).toEqual([40, 80, 100]) + expect(fs.existsSync(dbPath)).toBe(true) + expect(fs.existsSync(`${dbPath}.vectorstore.tmp`)).toBe(false) + }) + + it('execute allows missing legacy source and omits metadata.source', async () => { + await insertKnowledgeBaseRow(db, { + id: 'kb-1', + name: 'Base 1', + dimensions: 2, + embeddingModelId: 'openai::text-embedding-3-small' + }) + await insertKnowledgeItemRow(db, { + id: 'item-file', + baseId: 'kb-1', + type: 'file', + data: { + file: { + id: 'file-1', + name: 'file-1.md', + origin_name: 'file-1.md', + path: '/tmp/file-1.md', + size: 1, + ext: '.md', + type: 'text', + created_at: '2024-01-01T00:00:00.000Z', + count: 1 + } + }, + status: 'completed' + }) + + const dbPath = path.join(knowledgeBaseDir, 'kb-1') + await createLegacyVectorDb(dbPath, [ + { + id: 'legacy-file-0', + pageContent: 'file chunk', + uniqueLoaderId: 'loader-file', + source: '', + vector: [1, 2] + } + ]) + + const migrationCtx = createMigrationCtx(db, { + knowledge: { + bases: [ + { + id: 'kb-1', + name: 'Base 1', + items: [ + { + id: 'item-file', + type: 'file', + uniqueId: 'loader-file' + } + ] + } + ] + } + }) + + const migrator = new KnowledgeVectorMigrator() as any + expect((await migrator.prepare(migrationCtx as any)).success).toBe(true) + expect((await migrator.execute(migrationCtx as any)).success).toBe(true) + + const 
targetClient = createClient({ url: pathToFileURL(dbPath).toString() }) + const rows = await targetClient.execute('SELECT metadata FROM libsql_vectorstores_embedding') + targetClient.close() + + expect(rows.rows).toHaveLength(1) + expect(JSON.parse(String((rows.rows[0] as Record).metadata))).toEqual({ + itemId: 'item-file' + }) + + const validateResult = await migrator.validate(migrationCtx as any) + expect(validateResult.success).toBe(true) + expect(validateResult.errors).toStrictEqual([]) + }) + + it('execute fails when rebuilding a base fails and does not count it as skipped', async () => { + await insertKnowledgeBaseRow(db, { + id: 'kb-1', + name: 'Base 1', + dimensions: 2, + embeddingModelId: 'openai::text-embedding-3-small' + }) + await insertKnowledgeItemRow(db, { + id: 'item-file', + baseId: 'kb-1', + type: 'file', + data: { + file: { + id: 'file-1', + name: 'file-1.md', + origin_name: 'file-1.md', + path: '/tmp/file-1.md', + size: 1, + ext: '.md', + type: 'text', + created_at: '2024-01-01T00:00:00.000Z', + count: 1 + } + }, + status: 'completed' + }) + + await createLegacyVectorDb(path.join(knowledgeBaseDir, 'kb-1'), [ + { + id: 'legacy-file-0', + pageContent: 'file chunk', + uniqueLoaderId: 'loader-file', + source: '/tmp/file-1.md', + vector: [1, 2] + } + ]) + + const migrationCtx = createMigrationCtx(db, { + knowledge: { + bases: [ + { + id: 'kb-1', + name: 'Base 1', + items: [ + { + id: 'item-file', + type: 'file', + uniqueId: 'loader-file' + } + ] + } + ] + } + }) + + const migrator = new KnowledgeVectorMigrator() as any + const prepareResult = await migrator.prepare(migrationCtx as any) + expect(prepareResult.success).toBe(true) + + vi.spyOn(migrator, 'insertVectorRows').mockRejectedValueOnce(new Error('insert failed')) + + const executeResult = await migrator.execute(migrationCtx as any) + expect(executeResult.success).toBe(false) + expect(executeResult.processedCount).toBe(0) + expect(executeResult.error).toContain('kb-1') + 
expect(executeResult.error).toContain('insert failed') + expect(migrator.skippedCount).toBe(0) + }) + + it('validate fails when migrated metadata.itemId is missing or mismatched', async () => { + await insertKnowledgeBaseRow(db, { + id: 'kb-1', + name: 'Base 1', + dimensions: 2, + embeddingModelId: 'openai::text-embedding-3-small' + }) + await insertKnowledgeItemRow(db, { + id: 'item-file', + baseId: 'kb-1', + type: 'file', + data: { + file: { + id: 'file-1', + name: 'file-1.md', + origin_name: 'file-1.md', + path: '/tmp/file-1.md', + size: 1, + ext: '.md', + type: 'text', + created_at: '2024-01-01T00:00:00.000Z', + count: 1 + } + }, + status: 'completed' + }) + + const dbPath = path.join(knowledgeBaseDir, 'kb-1') + await createLegacyVectorDb(dbPath, [ + { + id: 'legacy-file-0', + pageContent: 'file chunk', + uniqueLoaderId: 'loader-file', + source: '/tmp/file-1.md', + vector: [1, 2] + } + ]) + + const migrationCtx = createMigrationCtx(db, { + knowledge: { + bases: [ + { + id: 'kb-1', + name: 'Base 1', + items: [ + { + id: 'item-file', + type: 'file', + uniqueId: 'loader-file' + } + ] + } + ] + } + }) + + const migrator = new KnowledgeVectorMigrator() as any + await expect(migrator.prepare(migrationCtx as any)).resolves.toMatchObject({ success: true }) + await expect(migrator.execute(migrationCtx as any)).resolves.toMatchObject({ success: true, processedCount: 1 }) + + const targetClient = createClient({ url: pathToFileURL(dbPath).toString() }) + await targetClient.execute({ + sql: `UPDATE libsql_vectorstores_embedding SET metadata = ? 
WHERE external_id = ?`, + args: [JSON.stringify({ source: '/tmp/file-1.md' }), 'item-file'] + }) + targetClient.close() + + const validateResult = await migrator.validate(migrationCtx as any) + expect(validateResult.success).toBe(false) + expect(validateResult.errors).toContainEqual( + expect.objectContaining({ + key: 'knowledge_vector_missing_item_id_kb-1' + }) + ) + }) +}) diff --git a/src/main/data/migration/v2/migrators/index.ts b/src/main/data/migration/v2/migrators/index.ts index b710b43583..110fc381cd 100644 --- a/src/main/data/migration/v2/migrators/index.ts +++ b/src/main/data/migration/v2/migrators/index.ts @@ -9,6 +9,7 @@ import { AssistantMigrator } from './AssistantMigrator' import { BootConfigMigrator } from './BootConfigMigrator' import { ChatMigrator } from './ChatMigrator' import { KnowledgeMigrator } from './KnowledgeMigrator' +import { KnowledgeVectorMigrator } from './KnowledgeVectorMigrator' import { McpServerMigrator } from './McpServerMigrator' import { MiniAppMigrator } from './MiniAppMigrator' import { PreferencesMigrator } from './PreferencesMigrator' @@ -21,6 +22,7 @@ export { BootConfigMigrator, ChatMigrator, KnowledgeMigrator, + KnowledgeVectorMigrator, McpServerMigrator, MiniAppMigrator, PreferencesMigrator, @@ -40,6 +42,7 @@ export function getAllMigrators() { new ProviderModelMigrator(), new AssistantMigrator(), new KnowledgeMigrator(), + new KnowledgeVectorMigrator(), new ChatMigrator(), new TranslateMigrator() ] diff --git a/src/main/data/migration/v2/migrators/mappings/KnowledgeMappings.ts b/src/main/data/migration/v2/migrators/mappings/KnowledgeMappings.ts index fc29a43173..c6b2600117 100644 --- a/src/main/data/migration/v2/migrators/mappings/KnowledgeMappings.ts +++ b/src/main/data/migration/v2/migrators/mappings/KnowledgeMappings.ts @@ -1,7 +1,8 @@ +import path from 'node:path' + import type { knowledgeBaseTable, knowledgeItemTable } from '@data/db/schemas/knowledge' -import { normalizeKnowledgeBaseConfig } from 
/**
 * Returns a shallow copy of a migrated knowledge base config in which every
 * out-of-range tuning value is reset to `undefined`.
 *
 * Rules, applied in this order:
 * - `chunkSize` must be greater than 0.
 * - `chunkOverlap` must be non-negative and smaller than the remaining `chunkSize`.
 * - `threshold` must be within 0..1.
 * - `documentCount` must be greater than 0.
 * - `hybridAlpha` must be within 0..1 and is only kept when `searchMode` is `'hybrid'`.
 *
 * The input object is not modified.
 */
function normalizeMigratedKnowledgeBaseConfig<
  T extends {
    chunkSize?: number | null
    chunkOverlap?: number | null
    threshold?: number | null
    documentCount?: number | null
    searchMode?: string | null
    hybridAlpha?: number | null
  }
>(config: T): T {
  const result = { ...config }

  // Resets a field while keeping its key on the object.
  const reset = <K extends keyof T>(field: K): void => {
    result[field] = undefined as T[K]
  }
  const isOutsideUnitInterval = (value: number): boolean => value < 0 || value > 1

  if (result.chunkSize != null && result.chunkSize <= 0) {
    reset('chunkSize')
  }

  // Read chunkSize only after the check above so a reset size also drops the overlap.
  const overlap = result.chunkOverlap
  const chunkSize = result.chunkSize
  if (overlap != null && (overlap < 0 || chunkSize == null || overlap >= chunkSize)) {
    reset('chunkOverlap')
  }

  if (result.threshold != null && isOutsideUnitInterval(result.threshold)) {
    reset('threshold')
  }

  if (result.documentCount != null && result.documentCount <= 0) {
    reset('documentCount')
  }

  const alpha = result.hybridAlpha
  if (alpha != null && (isOutsideUnitInterval(alpha) || result.searchMode !== 'hybrid')) {
    reset('hybridAlpha')
  }

  return result
}
/**
 * Reads the rows of a legacy per-base vector database (a libsql file that
 * contains a `vectors` table) so they can be migrated.
 */
export class KnowledgeVectorSourceReader {
  /**
   * Resolves the on-disk path of the legacy vector DB for a base.
   *
   * @param baseId knowledge base id; it is sanitized before being used as a file name
   * @returns the path under `feature.knowledgebase.data`
   */
  getLegacyDbPath(baseId: string): string | null {
    return application.getPath('feature.knowledgebase.data', sanitizeFilename(baseId, '_'))
  }

  /**
   * Loads all legacy vector rows of one base.
   *
   * @param baseId knowledge base id
   * @returns `status: 'ok'` with the rows, or a non-ok status when the path is
   *   unavailable, missing, a directory, or has no `vectors` table
   */
  async loadBase(baseId: string): Promise<LegacyKnowledgeVectorLoadResult> {
    const dbPath = this.getLegacyDbPath(baseId)
    if (!dbPath) {
      return { status: 'invalid_path' }
    }

    if (!fs.existsSync(dbPath)) {
      return { status: 'missing', dbPath }
    }

    const stat = fs.statSync(dbPath)
    if (stat.isDirectory()) {
      return { status: 'directory', dbPath }
    }

    const client = createClient({ url: pathToFileURL(dbPath).toString() })
    try {
      const isEmbedjs = await this.isEmbedjsDatabase(client)
      if (!isEmbedjs) {
        return { status: 'not_embedjs', dbPath }
      }

      return {
        status: 'ok',
        dbPath,
        rows: await this.readLegacyVectorRows(client)
      }
    } finally {
      client.close()
    }
  }

  /** Returns true when the database contains the legacy `vectors` table. */
  private async isEmbedjsDatabase(client: Client): Promise<boolean> {
    const result = await client.execute({
      sql: "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
      args: [LEGACY_VECTOR_TABLE_NAME]
    })

    return result.rows.length > 0
  }

  /** Reads every legacy row; missing text columns become empty strings. */
  private async readLegacyVectorRows(client: Client): Promise<LegacyKnowledgeVectorRow[]> {
    const result = await client.execute({
      sql: `SELECT pageContent, uniqueLoaderId, source, vector FROM ${LEGACY_VECTOR_TABLE_NAME}`,
      args: []
    })

    return result.rows.map((row) => ({
      pageContent: String(row.pageContent ?? ''),
      uniqueLoaderId: String(row.uniqueLoaderId ?? ''),
      source: String(row.source ?? ''),
      vector: this.deserializeLegacyVector(row.vector)
    }))
  }

  // libsql F32_BLOB values are not decoded to one stable JS type across
  // client/runtime combinations. In local verification on macOS this returns
  // ArrayBuffer, but other environments may expose Float32Array or another
  // ArrayBufferView, so keep the decoder intentionally permissive.
  /**
   * Converts a raw `vector` column value into a plain number array.
   *
   * @param raw value returned by the libsql client
   * @returns the decoded floats, or `null` when the value is absent, of an
   *   unsupported type, or a byte payload whose length is not a whole number
   *   of 32-bit floats
   */
  private deserializeLegacyVector(raw: LibsqlValue): number[] | null {
    if (raw === null || raw === undefined) {
      return null
    }

    if (raw instanceof Float32Array) {
      return Array.from(raw)
    }

    if (raw instanceof ArrayBuffer) {
      return this.decodeFloat32Bytes(new Uint8Array(raw))
    }

    if (ArrayBuffer.isView(raw)) {
      const view = raw as ArrayBufferView
      return this.decodeFloat32Bytes(new Uint8Array(view.buffer, view.byteOffset, view.byteLength))
    }

    if (Array.isArray(raw)) {
      return raw.map((value) => Number(value))
    }

    return null
  }

  /**
   * Decodes raw bytes as 32-bit floats in platform byte order.
   *
   * A Float32Array can only be created over a buffer at a byte offset that is
   * a multiple of 4, which a view into a larger buffer does not guarantee, so
   * the bytes are copied into a fresh buffer first.
   */
  private decodeFloat32Bytes(bytes: Uint8Array): number[] | null {
    if (bytes.byteLength % Float32Array.BYTES_PER_ELEMENT !== 0) {
      return null
    }

    const aligned = new Uint8Array(bytes.byteLength)
    aligned.set(bytes)
    return Array.from(new Float32Array(aligned.buffer))
  }
}
path.join(currentKnowledgeBaseRoot, filename) : currentKnowledgeBaseRoot + }) + } +}) + +vi.mock('@main/core/application', () => ({ + application: { + getPath: getPathMock + } +})) + +vi.mock('@main/utils/file', () => ({ + sanitizeFilename: (value: string) => value +})) + +vi.mock('node:fs', async (importOriginal) => { + return (await importOriginal()) as any +}) + +vi.mock('node:os', async (importOriginal) => { + return (await importOriginal()) as any +}) + +const { KnowledgeVectorSourceReader } = await import('../KnowledgeVectorSourceReader') + +async function createLegacyVectorDb( + dbPath: string, + rows: Array<{ + id: string + pageContent: string + uniqueLoaderId: string + source: string + vector: number[] + }> +) { + const client = createClient({ url: pathToFileURL(dbPath).toString() }) + + await client.execute(` + CREATE TABLE vectors ( + id TEXT PRIMARY KEY, + pageContent TEXT UNIQUE, + uniqueLoaderId TEXT NOT NULL, + source TEXT NOT NULL, + vector F32_BLOB(2), + metadata TEXT + ) + `) + + for (const row of rows) { + await client.execute({ + sql: ` + INSERT INTO vectors (id, pageContent, uniqueLoaderId, source, vector, metadata) + VALUES (?, ?, ?, ?, vector32(?), '{}') + `, + args: [row.id, row.pageContent, row.uniqueLoaderId, row.source, `[${row.vector.join(',')}]`] + }) + } + + client.close() +} + +describe('KnowledgeVectorSourceReader', () => { + let tempRoot: string + + beforeEach(() => { + tempRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'knowledge-vector-source-reader-')) + fs.mkdirSync(path.join(tempRoot, 'KnowledgeBase'), { recursive: true }) + setKnowledgeBaseRoot(path.join(tempRoot, 'KnowledgeBase')) + }) + + afterEach(() => { + fs.rmSync(tempRoot, { recursive: true, force: true }) + }) + + it('loads legacy embedjs rows from the knowledge base path', async () => { + const reader = new KnowledgeVectorSourceReader() + const dbPath = path.join(tempRoot, 'KnowledgeBase', 'kb-1') + + await createLegacyVectorDb(dbPath, [ + { + id: 'legacy-row-1', + 
pageContent: 'hello vector', + uniqueLoaderId: 'loader-1', + source: '/tmp/file.md', + vector: [1, 2] + } + ]) + + await expect(reader.loadBase('kb-1')).resolves.toEqual({ + status: 'ok', + dbPath, + rows: [ + { + pageContent: 'hello vector', + uniqueLoaderId: 'loader-1', + source: '/tmp/file.md', + vector: [1, 2] + } + ] + }) + }) + + it('returns not_embedjs for non-embedjs sqlite files', async () => { + const reader = new KnowledgeVectorSourceReader() + const dbPath = path.join(tempRoot, 'KnowledgeBase', 'kb-1') + const client = createClient({ url: pathToFileURL(dbPath).toString() }) + + await client.execute(`CREATE TABLE something_else (id TEXT PRIMARY KEY)`) + client.close() + + await expect(reader.loadBase('kb-1')).resolves.toEqual({ + status: 'not_embedjs', + dbPath + }) + }) +}) diff --git a/src/main/data/services/KnowledgeBaseService.ts b/src/main/data/services/KnowledgeBaseService.ts index 90d102a488..f39c217c6d 100644 --- a/src/main/data/services/KnowledgeBaseService.ts +++ b/src/main/data/services/KnowledgeBaseService.ts @@ -14,13 +14,88 @@ import { type KnowledgeBaseListQuery, type UpdateKnowledgeBaseDto } from '@shared/data/api/schemas/knowledges' -import type { KnowledgeBase } from '@shared/data/types/knowledge' +import type { KnowledgeBase, KnowledgeSearchMode } from '@shared/data/types/knowledge' import { desc, eq, sql } from 'drizzle-orm' -import { normalizeKnowledgeBaseConfigDependencies, validateKnowledgeBaseConfig } from './knowledgeBaseConfig' - const logger = loggerService.withContext('DataApi:KnowledgeBaseService') +export interface KnowledgeBaseConfigInput { + chunkSize?: number | null + chunkOverlap?: number | null + threshold?: number | null + documentCount?: number | null + searchMode?: KnowledgeSearchMode | null + hybridAlpha?: number | null +} + +function addFieldError( + fieldErrors: Record, + field: keyof KnowledgeBaseConfigInput, + message: string +): void { + if (!fieldErrors[field]) { + fieldErrors[field] = [] + } + + 
fieldErrors[field].push(message) +} + +export function normalizeKnowledgeBaseConfigDependencies(config: T): T { + const normalized = { ...config } + + if (normalized.chunkOverlap != null) { + if (normalized.chunkSize == null || normalized.chunkOverlap >= normalized.chunkSize) { + normalized.chunkOverlap = undefined as T['chunkOverlap'] + } + } + + if (normalized.hybridAlpha != null && normalized.searchMode !== 'hybrid') { + normalized.hybridAlpha = undefined as T['hybridAlpha'] + } + + return normalized +} + +export function validateKnowledgeBaseConfig(config: KnowledgeBaseConfigInput): Record { + const fieldErrors: Record = {} + + if (config.chunkSize != null && config.chunkSize <= 0) { + addFieldError(fieldErrors, 'chunkSize', 'Chunk size must be greater than 0') + } + + if (config.chunkOverlap != null && config.chunkOverlap < 0) { + addFieldError(fieldErrors, 'chunkOverlap', 'Chunk overlap must be greater than or equal to 0') + } + + if (config.threshold != null && (config.threshold < 0 || config.threshold > 1)) { + addFieldError(fieldErrors, 'threshold', 'Threshold must be between 0 and 1') + } + + if (config.documentCount != null && config.documentCount <= 0) { + addFieldError(fieldErrors, 'documentCount', 'Document count must be greater than 0') + } + + const hybridAlphaIsInRange = config.hybridAlpha == null || (config.hybridAlpha >= 0 && config.hybridAlpha <= 1) + if (!hybridAlphaIsInRange) { + addFieldError(fieldErrors, 'hybridAlpha', 'Hybrid alpha must be between 0 and 1') + } + + const chunkOverlap = config.chunkOverlap + if (chunkOverlap != null && chunkOverlap >= 0) { + if (config.chunkSize == null) { + addFieldError(fieldErrors, 'chunkOverlap', 'Chunk overlap requires chunk size') + } else if (chunkOverlap >= config.chunkSize) { + addFieldError(fieldErrors, 'chunkOverlap', 'Chunk overlap must be smaller than chunk size') + } + } + + if (config.hybridAlpha != null && hybridAlphaIsInRange && config.searchMode !== 'hybrid') { + addFieldError(fieldErrors, 
'hybridAlpha', 'Hybrid alpha requires hybrid search mode') + } + + return fieldErrors +} + function rowToKnowledgeBase(row: typeof knowledgeBaseTable.$inferSelect): KnowledgeBase { return { id: row.id, diff --git a/src/main/data/services/KnowledgeItemService.ts b/src/main/data/services/KnowledgeItemService.ts index aedf5c03ec..14c214fe69 100644 --- a/src/main/data/services/KnowledgeItemService.ts +++ b/src/main/data/services/KnowledgeItemService.ts @@ -7,13 +7,14 @@ import { knowledgeItemTable } from '@data/db/schemas/knowledge' import { loggerService } from '@logger' import { application } from '@main/core/application' +import type { OffsetPaginationResponse } from '@shared/data/api' import { DataApiErrorFactory } from '@shared/data/api' -import type { OffsetPaginationResponse } from '@shared/data/api/apiTypes' import type { CreateKnowledgeItemsDto, KnowledgeItemsQuery, UpdateKnowledgeItemDto } from '@shared/data/api/schemas/knowledges' +import { getCreateKnowledgeItemsReferenceErrors } from '@shared/data/api/schemas/knowledges' import { DirectoryItemDataSchema, FileItemDataSchema, @@ -36,6 +37,61 @@ const KNOWLEDGE_ITEM_DATA_SCHEMAS = { directory: DirectoryItemDataSchema } as const +type PlannedKnowledgeItemInsert = CreateKnowledgeItemsDto['items'][number] & { + parsedData: CreateKnowledgeItemsDto['items'][number]['data'] + index: number +} + +function getCreateKnowledgeItemGroupingErrors( + itemsToCreate: CreateKnowledgeItemsDto['items'] +): Record { + const itemsByRef = new Map( + itemsToCreate + .filter((item): item is (typeof itemsToCreate)[number] & { ref: string } => typeof item.ref === 'string') + .map((item) => [item.ref, item] as const) + ) + + for (const item of itemsToCreate) { + if (item.ref && item.groupRef === item.ref) { + return { + groupRef: ['Knowledge item cannot reference itself as group owner'] + } + } + } + + const visitState = new Map() + + const hasCycle = (ref: string): boolean => { + const state = visitState.get(ref) + if (state === 
'visiting') { + return true + } + if (state === 'visited') { + return false + } + + visitState.set(ref, 'visiting') + + const targetRef = itemsByRef.get(ref)?.groupRef + if (targetRef && itemsByRef.has(targetRef) && hasCycle(targetRef)) { + return true + } + + visitState.set(ref, 'visited') + return false + } + + for (const ref of itemsByRef.keys()) { + if (hasCycle(ref)) { + return { + groupRef: ['Knowledge item grouping cannot contain cycles within one request batch'] + } + } + } + + return {} +} + function rowToKnowledgeItem(row: typeof knowledgeItemTable.$inferSelect): KnowledgeItem { const parseJson = (value: T | string | null | undefined, context?: string): T | null => { if (value == null) return null @@ -72,8 +128,12 @@ function rowToKnowledgeItem(row: typeof knowledgeItemTable.$inferSelect): Knowle } export class KnowledgeItemService { + private get db() { + const dbService = application.get('DbService') + return dbService.getDb() + } + async list(baseId: string, query: KnowledgeItemsQuery): Promise> { - const db = application.get('DbService').getDb() await knowledgeBaseService.getById(baseId) const { page, limit, type, groupId } = query const offset = (page - 1) * limit @@ -87,29 +147,38 @@ export class KnowledgeItemService { } const where = conditions.length === 1 ? 
conditions[0] : and(...conditions) - const [rows, [{ count }]] = await Promise.all([ - db + this.db .select() .from(knowledgeItemTable) .where(where) .orderBy(desc(knowledgeItemTable.createdAt), desc(knowledgeItemTable.id)) .limit(limit) .offset(offset), - db.select({ count: sql`count(*)` }).from(knowledgeItemTable).where(where) + this.db.select({ count: sql`count(*)` }).from(knowledgeItemTable).where(where) ]) return { items: rows.map((row) => rowToKnowledgeItem(row)), total: count, - page + page: query.page } } async createMany(baseId: string, dto: CreateKnowledgeItemsDto): Promise<{ items: KnowledgeItem[] }> { - const db = application.get('DbService').getDb() await knowledgeBaseService.getById(baseId) - const values: Array = dto.items.map((item, index) => { + + const referenceErrors = getCreateKnowledgeItemsReferenceErrors(dto.items) + if (Object.keys(referenceErrors).length > 0) { + throw DataApiErrorFactory.validation(referenceErrors) + } + + const groupingErrors = getCreateKnowledgeItemGroupingErrors(dto.items) + if (Object.keys(groupingErrors).length > 0) { + throw DataApiErrorFactory.validation(groupingErrors) + } + + const itemsToCreate = dto.items.map((item, index) => { const parsed = KNOWLEDGE_ITEM_DATA_SCHEMAS[item.type].safeParse(item.data) if (!parsed.success) { throw DataApiErrorFactory.validation({ @@ -118,41 +187,44 @@ export class KnowledgeItemService { } return { - baseId, - groupId: item.groupId ?? 
null, - type: item.type, - data: parsed.data, - status: 'idle', - error: null + ...item, + parsedData: parsed.data, + index } }) - const requestedGroupIds = [...new Set(dto.items.map((item) => item.groupId).filter((groupId) => groupId != null))] - if (requestedGroupIds.length > 0) { - const existingGroupRows = await db - .select({ id: knowledgeItemTable.id }) - .from(knowledgeItemTable) - .where(and(eq(knowledgeItemTable.baseId, baseId), inArray(knowledgeItemTable.id, requestedGroupIds))) - const existingGroupIds = new Set(existingGroupRows.map((row) => row.id)) - const missingGroupIds = requestedGroupIds.filter((groupId) => !existingGroupIds.has(groupId)) + const requestedGroupIds = [ + ...new Set(itemsToCreate.flatMap((item) => (item.groupId != null ? [item.groupId] : []))) + ] + const existingGroupIds = await this.getExistingGroupIdsInBase(baseId, requestedGroupIds) + const missingGroupIds = requestedGroupIds.filter((groupId) => !existingGroupIds.has(groupId)) - if (missingGroupIds.length > 0) { - throw DataApiErrorFactory.validation({ - groupId: [`Knowledge item group owner not found in base '${baseId}': ${missingGroupIds.join(', ')}`] - }) - } + if (missingGroupIds.length > 0) { + throw DataApiErrorFactory.validation({ + groupId: [`Knowledge item group owner not found in base '${baseId}': ${missingGroupIds.join(', ')}`] + }) } - const rows = await db.insert(knowledgeItemTable).values(values).returning() - const items = rows.map((row) => rowToKnowledgeItem(row)) + const createdRows = await this.createBatch(baseId, itemsToCreate) + + const items = itemsToCreate.map((item) => { + const createdRow = createdRows[item.index] + if (!createdRow) { + throw DataApiErrorFactory.dataInconsistent( + 'KnowledgeItem', + `Knowledge item create result missing for index '${item.index}'` + ) + } + + return rowToKnowledgeItem(createdRow) + }) logger.info('Created knowledge items', { baseId, count: items.length }) return { items } } async getById(id: string): Promise { - const db 
= application.get('DbService').getDb() - const [row] = await db.select().from(knowledgeItemTable).where(eq(knowledgeItemTable.id, id)).limit(1) + const [row] = await this.db.select().from(knowledgeItemTable).where(eq(knowledgeItemTable.id, id)).limit(1) if (!row) { throw DataApiErrorFactory.notFound('KnowledgeItem', id) @@ -161,8 +233,63 @@ export class KnowledgeItemService { return rowToKnowledgeItem(row) } + async getByIdsInBase(baseId: string, itemIds: string[]): Promise { + const uniqueItemIds = [...new Set(itemIds)] + + if (uniqueItemIds.length === 0) { + return [] + } + + const rows = await this.db + .select() + .from(knowledgeItemTable) + .where(and(eq(knowledgeItemTable.baseId, baseId), inArray(knowledgeItemTable.id, uniqueItemIds))) + + const itemsById = new Map(rows.map((row) => [row.id, rowToKnowledgeItem(row)])) + + for (const itemId of uniqueItemIds) { + if (!itemsById.has(itemId)) { + throw DataApiErrorFactory.notFound('KnowledgeItem', itemId) + } + } + + return uniqueItemIds.map((itemId) => itemsById.get(itemId)!) 
+ } + + async getCascadeIdsInBase(baseId: string, rootIds: string[]): Promise { + const uniqueRootIds = [...new Set(rootIds)] + + if (uniqueRootIds.length === 0) { + return [] + } + + await this.getByIdsInBase(baseId, uniqueRootIds) + const descendantRows = await this.db.all<{ id: string }>(sql` + WITH RECURSIVE descendants AS ( + SELECT id + FROM knowledge_item + WHERE base_id = ${baseId} + AND group_id IN (${sql.join( + uniqueRootIds.map((id) => sql`${id}`), + sql`, ` + )}) + + UNION ALL + + SELECT child.id + FROM knowledge_item child + INNER JOIN descendants parent ON child.group_id = parent.id + WHERE child.base_id = ${baseId} + ) + SELECT DISTINCT id FROM descendants + `) + const descendantIds = descendantRows.map((row) => row.id) + + const rootIdSet = new Set(uniqueRootIds) + return [...uniqueRootIds, ...descendantIds.filter((id) => !rootIdSet.has(id))] + } + async update(id: string, dto: UpdateKnowledgeItemDto): Promise { - const db = application.get('DbService').getDb() const existing = await this.getById(id) const updates: Partial = {} @@ -182,17 +309,87 @@ export class KnowledgeItemService { return existing } - const [row] = await db.update(knowledgeItemTable).set(updates).where(eq(knowledgeItemTable.id, id)).returning() + const [row] = await this.db.update(knowledgeItemTable).set(updates).where(eq(knowledgeItemTable.id, id)).returning() + if (!row) { + throw DataApiErrorFactory.dataInconsistent('KnowledgeItem', `Knowledge item update result missing for id '${id}'`) + } logger.info('Updated knowledge item', { id, changes: Object.keys(dto) }) return rowToKnowledgeItem(row) } async delete(id: string): Promise { - const db = application.get('DbService').getDb() await this.getById(id) - await db.delete(knowledgeItemTable).where(eq(knowledgeItemTable.id, id)) + await this.db.delete(knowledgeItemTable).where(eq(knowledgeItemTable.id, id)) logger.info('Deleted knowledge item', { id }) } + + private async createBatch( + baseId: string, + itemsToCreate: 
PlannedKnowledgeItemInsert[] + ): Promise> { + const rowsByIndex = new Map() + const itemsByRef = new Map() + + await this.db.transaction(async (tx) => { + const pendingItems = [...itemsToCreate] + + while (pendingItems.length > 0) { + const readyItems = pendingItems.filter((item) => item.groupRef == null || itemsByRef.has(item.groupRef)) + + if (readyItems.length === 0) { + throw DataApiErrorFactory.dataInconsistent( + 'KnowledgeItem', + `Unable to resolve knowledge item grouping in base '${baseId}'` + ) + } + + for (const item of readyItems) { + const groupId = item.groupRef ? (itemsByRef.get(item.groupRef)?.id ?? null) : (item.groupId ?? null) + const [row] = await tx + .insert(knowledgeItemTable) + .values({ + baseId, + groupId, + type: item.type, + data: item.parsedData, + status: 'idle', + error: null + }) + .returning() + + rowsByIndex.set(item.index, row) + + if (item.ref) { + itemsByRef.set(item.ref, row) + } + } + + const readyIndices = new Set(readyItems.map((item) => item.index)) + for (let index = pendingItems.length - 1; index >= 0; index -= 1) { + if (readyIndices.has(pendingItems[index].index)) { + pendingItems.splice(index, 1) + } + } + } + }) + + return itemsToCreate.map((item) => rowsByIndex.get(item.index)) + } + + private async getExistingGroupIdsInBase(baseId: string, groupIds: string[]): Promise> { + const uniqueGroupIds = [...new Set(groupIds)] + + if (uniqueGroupIds.length === 0) { + return new Set() + } + + const rows = await this.db + .select({ id: knowledgeItemTable.id }) + .from(knowledgeItemTable) + .where(and(eq(knowledgeItemTable.baseId, baseId), inArray(knowledgeItemTable.id, uniqueGroupIds))) + + return new Set(rows.map((row) => row.id)) + } } export const knowledgeItemService = new KnowledgeItemService() diff --git a/src/main/data/services/__tests__/KnowledgeBaseService.test.ts b/src/main/data/services/__tests__/KnowledgeBaseService.test.ts index 92086c1791..52dd1d34d5 100644 --- 
a/src/main/data/services/__tests__/KnowledgeBaseService.test.ts +++ b/src/main/data/services/__tests__/KnowledgeBaseService.test.ts @@ -20,7 +20,9 @@ vi.mock('@main/core/application', () => ({ } })) -const { KnowledgeBaseService } = await import('../KnowledgeBaseService') +const { KnowledgeBaseService, normalizeKnowledgeBaseConfigDependencies, validateKnowledgeBaseConfig } = await import( + '../KnowledgeBaseService' +) function createMockRow(overrides: Record = {}) { return { @@ -423,4 +425,68 @@ describe('KnowledgeBaseService', () => { }) }) }) + + describe('config helpers', () => { + describe('normalizeKnowledgeBaseConfigDependencies', () => { + it('should clear stale dependent fields after primary config changes', () => { + expect( + normalizeKnowledgeBaseConfigDependencies({ + chunkSize: 100, + chunkOverlap: 120, + searchMode: 'default' as const, + hybridAlpha: 0.6 + }) + ).toEqual({ + chunkSize: 100, + chunkOverlap: undefined, + searchMode: 'default', + hybridAlpha: undefined + }) + }) + }) + + describe('validateKnowledgeBaseConfig', () => { + it('should return field errors for invalid runtime config combinations', () => { + expect( + validateKnowledgeBaseConfig({ + chunkSize: null, + chunkOverlap: 64, + threshold: 1.5, + documentCount: 0, + searchMode: 'default', + hybridAlpha: 2 + }) + ).toEqual({ + chunkOverlap: ['Chunk overlap requires chunk size'], + threshold: ['Threshold must be between 0 and 1'], + documentCount: ['Document count must be greater than 0'], + hybridAlpha: ['Hybrid alpha must be between 0 and 1'] + }) + }) + + it('should reject hybridAlpha when searchMode is not hybrid', () => { + expect( + validateKnowledgeBaseConfig({ + searchMode: 'bm25', + hybridAlpha: 0.7 + }) + ).toEqual({ + hybridAlpha: ['Hybrid alpha requires hybrid search mode'] + }) + }) + + it('should accept valid config', () => { + expect( + validateKnowledgeBaseConfig({ + chunkSize: 512, + chunkOverlap: 64, + threshold: 0.5, + documentCount: 5, + searchMode: 'hybrid', + 
hybridAlpha: 0.7 + }) + ).toEqual({}) + }) + }) + }) }) diff --git a/src/main/data/services/__tests__/KnowledgeItemService.test.ts b/src/main/data/services/__tests__/KnowledgeItemService.test.ts index 7c63ea0b1c..140acab10f 100644 --- a/src/main/data/services/__tests__/KnowledgeItemService.test.ts +++ b/src/main/data/services/__tests__/KnowledgeItemService.test.ts @@ -19,7 +19,9 @@ const mockDb = { select: mockSelect, insert: mockInsert, update: mockUpdate, - delete: mockDelete + delete: mockDelete, + all: vi.fn(), + transaction: vi.fn(async (callback: (tx: typeof mockDb) => Promise) => await callback(mockDb)) } let realDb: DbType | null = null @@ -56,6 +58,52 @@ function createMockRow(overrides: Record = {}) { } } +async function initializeKnowledgeTables(db: DbType) { + await db.run(sql`PRAGMA foreign_keys = ON`) + await db.run( + sql.raw(` + CREATE TABLE knowledge_base ( + id TEXT PRIMARY KEY NOT NULL, + name TEXT NOT NULL, + description TEXT, + dimensions INTEGER NOT NULL, + embedding_model_id TEXT NOT NULL, + rerank_model_id TEXT, + file_processor_id TEXT, + chunk_size INTEGER, + chunk_overlap INTEGER, + threshold REAL, + document_count INTEGER, + search_mode TEXT, + hybrid_alpha REAL, + created_at INTEGER, + updated_at INTEGER, + CONSTRAINT knowledge_base_search_mode_check CHECK (search_mode IN ('default', 'bm25', 'hybrid') OR search_mode IS NULL) + ) + `) + ) + await db.run( + sql.raw(` + CREATE TABLE knowledge_item ( + id TEXT PRIMARY KEY NOT NULL, + base_id TEXT NOT NULL, + group_id TEXT, + type TEXT NOT NULL, + data TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'idle', + error TEXT, + created_at INTEGER, + updated_at INTEGER, + CONSTRAINT knowledge_item_type_check CHECK (type IN ('file', 'url', 'note', 'sitemap', 'directory')), + CONSTRAINT knowledge_item_status_check CHECK (status IN ('idle', 'pending', 'file_processing', 'read', 'embed', 'completed', 'failed')), + FOREIGN KEY (base_id) REFERENCES knowledge_base(id) ON DELETE CASCADE, + FOREIGN KEY 
(base_id, group_id) REFERENCES knowledge_item(base_id, id) ON DELETE CASCADE, + CONSTRAINT knowledge_item_baseId_id_unique UNIQUE (base_id, id) + ) + `) + ) +} + describe('KnowledgeItemService', () => { let service: InstanceType @@ -64,6 +112,8 @@ describe('KnowledgeItemService', () => { mockInsert.mockReset() mockUpdate.mockReset() mockDelete.mockReset() + mockDb.all.mockReset() + mockDb.transaction.mockClear() getKnowledgeBaseByIdMock.mockReset() getKnowledgeBaseByIdMock.mockResolvedValue({ id: 'kb-1' }) realDb = null @@ -146,13 +196,17 @@ describe('KnowledgeItemService', () => { describe('createMany', () => { it('should create and return knowledge items', async () => { - const values = vi.fn().mockReturnValue({ + const valuesFirst = vi.fn().mockReturnValue({ returning: vi.fn().mockResolvedValue([ createMockRow({ id: 'item-1', type: 'directory', - data: { path: '/tmp/files', recursive: true } - }), + data: { name: 'files', path: '/tmp/files' } + }) + ]) + }) + const valuesSecond = vi.fn().mockReturnValue({ + returning: vi.fn().mockResolvedValue([ createMockRow({ id: 'item-2', type: 'note', @@ -160,13 +214,13 @@ describe('KnowledgeItemService', () => { }) ]) }) - mockInsert.mockReturnValue({ values }) + mockInsert.mockReturnValueOnce({ values: valuesFirst }).mockReturnValueOnce({ values: valuesSecond }) const dto: CreateKnowledgeItemsDto = { items: [ { type: 'directory', - data: { path: '/tmp/files', recursive: true } + data: { name: 'files', path: '/tmp/files' } }, { type: 'note', @@ -177,30 +231,85 @@ describe('KnowledgeItemService', () => { const result = await service.createMany('kb-1', dto) - expect(values).toHaveBeenCalledWith([ - { - baseId: 'kb-1', - groupId: null, - type: 'directory', - data: { path: '/tmp/files', recursive: true }, - status: 'idle', - error: null - }, - { - baseId: 'kb-1', - groupId: null, - type: 'note', - data: { content: 'child note' }, - status: 'idle', - error: null - } - ]) + expect(valuesFirst).toHaveBeenCalledWith({ + baseId: 
'kb-1', + groupId: null, + type: 'directory', + data: { name: 'files', path: '/tmp/files' }, + status: 'idle', + error: null + }) + expect(valuesSecond).toHaveBeenCalledWith({ + baseId: 'kb-1', + groupId: null, + type: 'note', + data: { content: 'child note' }, + status: 'idle', + error: null + }) expect(result.items).toHaveLength(2) expect(result.items[0]).toMatchObject({ id: 'item-1' }) }) + it('should create grouped items that reference a batch-local parent by groupRef', async () => { + const valuesFirst = vi.fn().mockReturnValue({ + returning: vi.fn().mockResolvedValue([ + createMockRow({ + id: 'generated-dir', + groupId: null, + type: 'directory', + data: { name: 'files', path: '/tmp/files' } + }) + ]) + }) + const valuesSecond = vi.fn().mockReturnValue({ + returning: vi.fn().mockResolvedValue([ + createMockRow({ + id: 'generated-note', + groupId: 'generated-dir', + type: 'note', + data: { content: 'child note' } + }) + ]) + }) + mockInsert.mockReturnValueOnce({ values: valuesFirst }).mockReturnValueOnce({ values: valuesSecond }) + + const result = await service.createMany('kb-1', { + items: [ + { + ref: 'root', + type: 'directory', + data: { name: 'files', path: '/tmp/files' } + }, + { + groupRef: 'root', + type: 'note', + data: { content: 'child note' } + } + ] + }) + + expect(valuesFirst).toHaveBeenCalledWith({ + baseId: 'kb-1', + groupId: null, + type: 'directory', + data: { name: 'files', path: '/tmp/files' }, + status: 'idle', + error: null + }) + expect(valuesSecond).toHaveBeenCalledWith({ + baseId: 'kb-1', + groupId: 'generated-dir', + type: 'note', + data: { content: 'child note' }, + status: 'idle', + error: null + }) + expect(result.items).toHaveLength(2) + }) + it('should reject invalid item data with validation error before insert', async () => { await expect( service.createMany('kb-1', { @@ -263,49 +372,7 @@ describe('KnowledgeItemService', () => { }) const db = realDb - await db.run(sql`PRAGMA foreign_keys = ON`) - await db.run( - sql.raw(` - 
CREATE TABLE knowledge_base ( - id TEXT PRIMARY KEY NOT NULL, - name TEXT NOT NULL, - description TEXT, - dimensions INTEGER NOT NULL, - embedding_model_id TEXT NOT NULL, - rerank_model_id TEXT, - file_processor_id TEXT, - chunk_size INTEGER, - chunk_overlap INTEGER, - threshold REAL, - document_count INTEGER, - search_mode TEXT, - hybrid_alpha REAL, - created_at INTEGER, - updated_at INTEGER, - CONSTRAINT knowledge_base_search_mode_check CHECK (search_mode IN ('default', 'bm25', 'hybrid') OR search_mode IS NULL) - ) - `) - ) - await db.run( - sql.raw(` - CREATE TABLE knowledge_item ( - id TEXT PRIMARY KEY NOT NULL, - base_id TEXT NOT NULL, - group_id TEXT, - type TEXT NOT NULL, - data TEXT NOT NULL, - status TEXT NOT NULL DEFAULT 'idle', - error TEXT, - created_at INTEGER, - updated_at INTEGER, - CONSTRAINT knowledge_item_type_check CHECK (type IN ('file', 'url', 'note', 'sitemap', 'directory')), - CONSTRAINT knowledge_item_status_check CHECK (status IN ('idle', 'pending', 'ocr', 'read', 'embed', 'completed', 'failed')), - FOREIGN KEY (base_id) REFERENCES knowledge_base(id) ON DELETE CASCADE, - FOREIGN KEY (base_id, group_id) REFERENCES knowledge_item(base_id, id) ON DELETE CASCADE, - CONSTRAINT knowledge_item_baseId_id_unique UNIQUE (base_id, id) - ) - `) - ) + await initializeKnowledgeTables(db) await db.insert(knowledgeBaseTable).values({ id: 'kb-1', @@ -320,7 +387,7 @@ describe('KnowledgeItemService', () => { baseId: 'kb-1', groupId: null, type: 'directory', - data: { path: '/a', recursive: true }, + data: { name: 'a', path: '/a' }, status: 'idle', error: null, createdAt: 100 @@ -330,7 +397,7 @@ describe('KnowledgeItemService', () => { baseId: 'kb-1', groupId: null, type: 'directory', - data: { path: '/b', recursive: true }, + data: { name: 'b', path: '/b' }, status: 'idle', error: null, createdAt: 90 @@ -400,6 +467,72 @@ describe('KnowledgeItemService', () => { expect(result.items.map((item) => item.id)).toEqual(['note-group-a']) }) + it('getCascadeIdsInBase 
returns root ids with recursive descendants', async () => { + const db = realDb! + + await db.insert(knowledgeItemTable).values({ + id: 'note-grandchild', + baseId: 'kb-1', + groupId: 'note-group-a', + type: 'note', + data: { content: 'grandchild note' }, + status: 'idle', + error: null, + createdAt: 50 + }) + + const result = await service.getCascadeIdsInBase('kb-1', ['dir-a']) + + expect(result).toEqual(['dir-a', 'note-group-a', 'note-grandchild']) + }) + + it('getByIdsInBase returns items in input order for one base', async () => { + const result = await service.getByIdsInBase('kb-1', ['note-plain', 'dir-a']) + + expect(result.map((item) => item.id)).toEqual(['note-plain', 'dir-a']) + expect(result.map((item) => item.data)).toEqual([{ content: 'plain note' }, { name: 'a', path: '/a' }]) + }) + + it('getByIdsInBase throws when any requested item is outside the base or missing', async () => { + await expect(service.getByIdsInBase('kb-1', ['note-plain', 'missing-item'])).rejects.toMatchObject({ + code: ErrorCode.NOT_FOUND, + status: 404 + }) + }) + + it('getCascadeIdsInBase preserves root order and deduplicates repeated root ids', async () => { + const db = realDb! + + await db.insert(knowledgeItemTable).values([ + { + id: 'child-root-1', + baseId: 'kb-1', + groupId: 'dir-a', + type: 'note', + data: { content: 'child a' }, + status: 'idle', + error: null, + createdAt: 40 + }, + { + id: 'child-root-2', + baseId: 'kb-1', + groupId: 'dir-b', + type: 'note', + data: { content: 'child b' }, + status: 'idle', + error: null, + createdAt: 30 + } + ]) + + const result = await service.getCascadeIdsInBase('kb-1', ['dir-a', 'dir-b', 'dir-a']) + + expect(result.slice(0, 2)).toEqual(['dir-a', 'dir-b']) + expect(result.slice(2)).toEqual(expect.arrayContaining(['note-group-a', 'child-root-1', 'child-root-2'])) + expect(result).toHaveLength(5) + }) + it('db check constraints reject invalid knowledge enums', async () => { const db = realDb! 
@@ -471,6 +604,163 @@ describe('KnowledgeItemService', () => { data: { content: 'new grouped note' } }) }) + + it('createMany accepts multi-level groupRef trees in one batch', async () => { + const result = await service.createMany('kb-1', { + items: [ + { + ref: 'dir-a', + type: 'directory', + data: { name: 'a', path: '/a' } + }, + { + ref: 'dir-b', + groupRef: 'dir-a', + type: 'directory', + data: { name: 'b', path: '/a/b' } + }, + { + groupRef: 'dir-b', + type: 'note', + data: { content: 'nested note' } + } + ] + }) + + expect(result.items).toHaveLength(3) + + const dirA = result.items.find((item) => item.type === 'directory' && item.data.path === '/a') + const dirB = result.items.find((item) => item.type === 'directory' && item.data.path === '/a/b') + const note = result.items.find((item) => item.type === 'note') + + expect(dirA?.groupId).toBeNull() + expect(dirB?.groupId).toBe(dirA?.id) + expect(note?.groupId).toBe(dirB?.id) + }) + + it('createMany accepts sitemap owner items with grouped url children in one batch', async () => { + const result = await service.createMany('kb-1', { + items: [ + { + ref: 'sitemap-root', + type: 'sitemap', + data: { + url: 'https://example.com/sitemap.xml', + name: 'Example Sitemap' + } + }, + { + groupRef: 'sitemap-root', + type: 'url', + data: { + url: 'https://example.com/page-a', + name: 'Page A' + } + }, + { + groupRef: 'sitemap-root', + type: 'url', + data: { + url: 'https://example.com/page-b', + name: 'Page B' + } + } + ] + }) + + expect(result.items).toHaveLength(3) + + const sitemap = result.items.find((item) => item.type === 'sitemap') + const urlItems = result.items.filter((item) => item.type === 'url') + + expect(sitemap?.groupId).toBeNull() + expect(urlItems).toHaveLength(2) + expect(urlItems.every((item) => item.groupId === sitemap?.id)).toBe(true) + }) + + it('createMany rejects self-referencing groupRef items', async () => { + await expect( + service.createMany('kb-1', { + items: [ + { + ref: 'self', + groupRef: 
'self', + type: 'note', + data: { content: 'self ref' } + } + ] + }) + ).rejects.toMatchObject({ + code: ErrorCode.VALIDATION_ERROR, + details: { + fieldErrors: { + groupRef: ['Knowledge item cannot reference itself as group owner'] + } + } + }) + }) + + it('createMany rejects two-node groupRef cycles', async () => { + await expect( + service.createMany('kb-1', { + items: [ + { + ref: 'a', + groupRef: 'b', + type: 'note', + data: { content: 'A' } + }, + { + ref: 'b', + groupRef: 'a', + type: 'note', + data: { content: 'B' } + } + ] + }) + ).rejects.toMatchObject({ + code: ErrorCode.VALIDATION_ERROR, + details: { + fieldErrors: { + groupRef: ['Knowledge item grouping cannot contain cycles within one request batch'] + } + } + }) + }) + + it('createMany rejects longer groupRef cycles', async () => { + await expect( + service.createMany('kb-1', { + items: [ + { + ref: 'a', + groupRef: 'c', + type: 'directory', + data: { name: 'a', path: '/a' } + }, + { + ref: 'b', + groupRef: 'a', + type: 'directory', + data: { name: 'b', path: '/b' } + }, + { + ref: 'c', + groupRef: 'b', + type: 'note', + data: { content: 'cycle' } + } + ] + }) + ).rejects.toMatchObject({ + code: ErrorCode.VALIDATION_ERROR, + details: { + fieldErrors: { + groupRef: ['Knowledge item grouping cannot contain cycles within one request batch'] + } + } + }) + }) }) describe('getById', () => { @@ -568,7 +858,7 @@ describe('KnowledgeItemService', () => { await expect( service.update('item-1', { - data: { path: '/tmp/files', recursive: true } + data: { name: 'files', path: '/tmp/files' } }) ).rejects.toMatchObject({ code: ErrorCode.VALIDATION_ERROR, @@ -652,49 +942,7 @@ describe('KnowledgeItemService', () => { }) const db = realDb - await db.run(sql`PRAGMA foreign_keys = ON`) - await db.run( - sql.raw(` - CREATE TABLE knowledge_base ( - id TEXT PRIMARY KEY NOT NULL, - name TEXT NOT NULL, - description TEXT, - dimensions INTEGER NOT NULL, - embedding_model_id TEXT NOT NULL, - rerank_model_id TEXT, - 
file_processor_id TEXT, - chunk_size INTEGER, - chunk_overlap INTEGER, - threshold REAL, - document_count INTEGER, - search_mode TEXT, - hybrid_alpha REAL, - created_at INTEGER, - updated_at INTEGER, - CONSTRAINT knowledge_base_search_mode_check CHECK (search_mode IN ('default', 'bm25', 'hybrid') OR search_mode IS NULL) - ) - `) - ) - await db.run( - sql.raw(` - CREATE TABLE knowledge_item ( - id TEXT PRIMARY KEY NOT NULL, - base_id TEXT NOT NULL, - group_id TEXT, - type TEXT NOT NULL, - data TEXT NOT NULL, - status TEXT NOT NULL DEFAULT 'idle', - error TEXT, - created_at INTEGER, - updated_at INTEGER, - CONSTRAINT knowledge_item_type_check CHECK (type IN ('file', 'url', 'note', 'sitemap', 'directory')), - CONSTRAINT knowledge_item_status_check CHECK (status IN ('idle', 'pending', 'ocr', 'read', 'embed', 'completed', 'failed')), - FOREIGN KEY (base_id) REFERENCES knowledge_base(id) ON DELETE CASCADE, - FOREIGN KEY (base_id, group_id) REFERENCES knowledge_item(base_id, id) ON DELETE CASCADE, - CONSTRAINT knowledge_item_baseId_id_unique UNIQUE (base_id, id) - ) - `) - ) + await initializeKnowledgeTables(db) await db.insert(knowledgeBaseTable).values({ id: 'kb-delete', @@ -709,7 +957,7 @@ describe('KnowledgeItemService', () => { baseId: 'kb-delete', groupId: null, type: 'directory', - data: { path: '/docs', recursive: true }, + data: { name: 'docs', path: '/docs' }, status: 'idle', error: null, createdAt: 100 diff --git a/src/main/data/services/__tests__/knowledgeBaseConfig.test.ts b/src/main/data/services/__tests__/knowledgeBaseConfig.test.ts deleted file mode 100644 index b1605a5f34..0000000000 --- a/src/main/data/services/__tests__/knowledgeBaseConfig.test.ts +++ /dev/null @@ -1,93 +0,0 @@ -import { describe, expect, it } from 'vitest' - -import { - normalizeKnowledgeBaseConfig, - normalizeKnowledgeBaseConfigDependencies, - validateKnowledgeBaseConfig -} from '../knowledgeBaseConfig' - -describe('knowledgeBaseConfig', () => { - 
describe('normalizeKnowledgeBaseConfig', () => { - it('should clear invalid tuning fields for migration inputs', () => { - expect( - normalizeKnowledgeBaseConfig({ - chunkSize: 200, - chunkOverlap: 200, - threshold: 2, - documentCount: 0, - searchMode: 'default' as const, - hybridAlpha: 0.6 - }) - ).toEqual({ - chunkSize: 200, - chunkOverlap: undefined, - threshold: undefined, - documentCount: undefined, - searchMode: 'default', - hybridAlpha: undefined - }) - }) - }) - - describe('normalizeKnowledgeBaseConfigDependencies', () => { - it('should clear stale dependent fields after primary config changes', () => { - expect( - normalizeKnowledgeBaseConfigDependencies({ - chunkSize: 100, - chunkOverlap: 120, - searchMode: 'default' as const, - hybridAlpha: 0.6 - }) - ).toEqual({ - chunkSize: 100, - chunkOverlap: undefined, - searchMode: 'default', - hybridAlpha: undefined - }) - }) - }) - - describe('validateKnowledgeBaseConfig', () => { - it('should return field errors for invalid runtime config combinations', () => { - expect( - validateKnowledgeBaseConfig({ - chunkSize: null, - chunkOverlap: 64, - threshold: 1.5, - documentCount: 0, - searchMode: 'default', - hybridAlpha: 2 - }) - ).toEqual({ - chunkOverlap: ['Chunk overlap requires chunk size'], - threshold: ['Threshold must be between 0 and 1'], - documentCount: ['Document count must be greater than 0'], - hybridAlpha: ['Hybrid alpha must be between 0 and 1'] - }) - }) - - it('should reject hybridAlpha when searchMode is not hybrid', () => { - expect( - validateKnowledgeBaseConfig({ - searchMode: 'bm25', - hybridAlpha: 0.7 - }) - ).toEqual({ - hybridAlpha: ['Hybrid alpha requires hybrid search mode'] - }) - }) - - it('should accept valid config', () => { - expect( - validateKnowledgeBaseConfig({ - chunkSize: 512, - chunkOverlap: 64, - threshold: 0.5, - documentCount: 5, - searchMode: 'hybrid', - hybridAlpha: 0.7 - }) - ).toEqual({}) - }) - }) -}) diff --git a/src/main/data/services/knowledgeBaseConfig.ts 
b/src/main/data/services/knowledgeBaseConfig.ts deleted file mode 100644 index 9b7f7ce8f7..0000000000 --- a/src/main/data/services/knowledgeBaseConfig.ts +++ /dev/null @@ -1,102 +0,0 @@ -import type { KnowledgeSearchMode } from '@shared/data/types/knowledge' - -export interface KnowledgeBaseConfigInput { - chunkSize?: number | null - chunkOverlap?: number | null - threshold?: number | null - documentCount?: number | null - searchMode?: KnowledgeSearchMode | null - hybridAlpha?: number | null -} - -type FieldErrors = Record - -function addFieldError(fieldErrors: FieldErrors, field: keyof KnowledgeBaseConfigInput, message: string): void { - if (!fieldErrors[field]) { - fieldErrors[field] = [] - } - - fieldErrors[field].push(message) -} - -export function normalizeKnowledgeBaseConfig(config: T): T { - const normalized = { ...config } - - if (normalized.chunkSize != null && normalized.chunkSize <= 0) { - normalized.chunkSize = undefined as T['chunkSize'] - } - - if (normalized.chunkOverlap != null && normalized.chunkOverlap < 0) { - normalized.chunkOverlap = undefined as T['chunkOverlap'] - } - - if (normalized.threshold != null && (normalized.threshold < 0 || normalized.threshold > 1)) { - normalized.threshold = undefined as T['threshold'] - } - - if (normalized.documentCount != null && normalized.documentCount <= 0) { - normalized.documentCount = undefined as T['documentCount'] - } - - if (normalized.hybridAlpha != null && (normalized.hybridAlpha < 0 || normalized.hybridAlpha > 1)) { - normalized.hybridAlpha = undefined as T['hybridAlpha'] - } - - return normalizeKnowledgeBaseConfigDependencies(normalized) -} - -export function normalizeKnowledgeBaseConfigDependencies(config: T): T { - const normalized = { ...config } - - if (normalized.chunkOverlap != null) { - if (normalized.chunkSize == null || normalized.chunkOverlap >= normalized.chunkSize) { - normalized.chunkOverlap = undefined as T['chunkOverlap'] - } - } - - if (normalized.hybridAlpha != null && 
normalized.searchMode !== 'hybrid') { - normalized.hybridAlpha = undefined as T['hybridAlpha'] - } - - return normalized -} - -export function validateKnowledgeBaseConfig(config: KnowledgeBaseConfigInput): FieldErrors { - const fieldErrors: FieldErrors = {} - - if (config.chunkSize != null && config.chunkSize <= 0) { - addFieldError(fieldErrors, 'chunkSize', 'Chunk size must be greater than 0') - } - - if (config.chunkOverlap != null && config.chunkOverlap < 0) { - addFieldError(fieldErrors, 'chunkOverlap', 'Chunk overlap must be greater than or equal to 0') - } - - if (config.threshold != null && (config.threshold < 0 || config.threshold > 1)) { - addFieldError(fieldErrors, 'threshold', 'Threshold must be between 0 and 1') - } - - if (config.documentCount != null && config.documentCount <= 0) { - addFieldError(fieldErrors, 'documentCount', 'Document count must be greater than 0') - } - - const hybridAlphaIsInRange = config.hybridAlpha == null || (config.hybridAlpha >= 0 && config.hybridAlpha <= 1) - if (!hybridAlphaIsInRange) { - addFieldError(fieldErrors, 'hybridAlpha', 'Hybrid alpha must be between 0 and 1') - } - - const chunkOverlap = config.chunkOverlap - if (chunkOverlap != null && chunkOverlap >= 0) { - if (config.chunkSize == null) { - addFieldError(fieldErrors, 'chunkOverlap', 'Chunk overlap requires chunk size') - } else if (chunkOverlap >= config.chunkSize) { - addFieldError(fieldErrors, 'chunkOverlap', 'Chunk overlap must be smaller than chunk size') - } - } - - if (config.hybridAlpha != null && hybridAlphaIsInRange && config.searchMode !== 'hybrid') { - addFieldError(fieldErrors, 'hybridAlpha', 'Hybrid alpha requires hybrid search mode') - } - - return fieldErrors -} diff --git a/src/main/services/knowledge/KnowledgeOrchestrationService.ts b/src/main/services/knowledge/KnowledgeOrchestrationService.ts new file mode 100644 index 0000000000..3d3dc7212e --- /dev/null +++ b/src/main/services/knowledge/KnowledgeOrchestrationService.ts @@ -0,0 +1,157 @@ 
+import { knowledgeBaseService } from '@data/services/KnowledgeBaseService' +import { knowledgeItemService } from '@data/services/KnowledgeItemService' +import { application } from '@main/core/application' +import { BaseService, DependsOn, Injectable, Phase, ServicePhase } from '@main/core/lifecycle' +import type { CreateKnowledgeItemsDto } from '@shared/data/api/schemas/knowledges' +import type { KnowledgeItem, KnowledgeSearchResult } from '@shared/data/types/knowledge' +import { IpcChannel } from '@shared/IpcChannel' +import * as z from 'zod' + +import { expandDirectoryOwnerToCreateItems } from './utils/directory' +import { expandSitemapOwnerToCreateItems } from './utils/sitemap' + +/** IPC payload for base-level calls: a baseId that is non-empty after trimming; .strict() rejects unknown keys. */ const KnowledgeRuntimeBasePayloadSchema = z + .object({ + baseId: z.string().trim().min(1) + }) + .strict() + +/** IPC payload for item-level calls: a baseId plus at least one item id, each non-empty after trimming. */ const KnowledgeRuntimeItemsPayloadSchema = z + .object({ + baseId: z.string().trim().min(1), + itemIds: z.array(z.string().trim().min(1)).min(1) + }) + .strict() + +/** IPC payload for search: a baseId plus a trimmed query of 1 to 1000 characters. */ const KnowledgeRuntimeSearchPayloadSchema = z + .object({ + baseId: z.string().trim().min(1), + query: z.string().trim().min(1).max(1000) + }) + .strict() + +/** Service that registers the knowledge-runtime IPC handlers and resolves persisted bases/items before delegating to KnowledgeRuntimeService. */ @Injectable('KnowledgeOrchestrationService') +@ServicePhase(Phase.WhenReady) +@DependsOn(['KnowledgeRuntimeService']) +export class KnowledgeOrchestrationService extends BaseService { + /** Registers the IPC handlers when the service is initialized. */ protected onInit(): void { + this.registerIpcHandlers() + } + + /** Loads the base record by id and passes it to the runtime's createBase. */ async createBase(baseId: string): Promise { + const base = await knowledgeBaseService.getById(baseId) + const runtime = application.get('KnowledgeRuntimeService') + await runtime.createBase(base) + } + + /** Forwards the base id to the runtime's deleteBase; the base record is not loaded here. */ async deleteBase(baseId: string): Promise { + const runtime = application.get('KnowledgeRuntimeService') + await runtime.deleteBase(baseId) + } + + /** Loads the base and the requested items, expands directory/sitemap owners into child create inputs, persists those children, and passes only file/url/note items to the runtime. Returns [] when nothing indexable remains. */ async addItems(baseId: string, itemIds: string[]): Promise { + const [base, items] = await Promise.all([ + knowledgeBaseService.getById(baseId), + knowledgeItemService.getByIdsInBase(baseId, itemIds) + ]) + + const expandedItems = await 
this.expandItemsToCreateInputs(items) + const expandedLeafItems = + /* createMany is skipped entirely when no owner produced child inputs */ expandedItems.length === 0 + ? [] + : this.collectIndexableItems( + ( + await knowledgeItemService.createMany(baseId, { + items: expandedItems + }) + ).items + ) + + /* requested leaves first, then newly created leaves; duplicates by id are collapsed */ const allLeafItems = this.collectIndexableItems([...items, ...expandedLeafItems]) + + if (allLeafItems.length === 0) { + return [] + } + + const runtime = application.get('KnowledgeRuntimeService') + return await runtime.addItems(base, allLeafItems) + } + + /** Loads the base and the requested items within that base, then passes both to the runtime's deleteItems. */ async deleteItems(baseId: string, itemIds: string[]): Promise { + const [base, items] = await Promise.all([ + knowledgeBaseService.getById(baseId), + knowledgeItemService.getByIdsInBase(baseId, itemIds) + ]) + + const runtime = application.get('KnowledgeRuntimeService') + await runtime.deleteItems(base, items) + } + + /** Resolves the base by id and returns the runtime's search result for the query. */ async search(baseId: string, query: string): Promise { + const base = await knowledgeBaseService.getById(baseId) + const runtime = application.get('KnowledgeRuntimeService') + return await runtime.search(base, query) + } + + /** Binds the five KnowledgeRuntime_* channels; each handler parses the raw payload with its zod schema before calling the matching method, so an invalid payload throws before any service is touched. */ private registerIpcHandlers(): void { + this.ipcHandle(IpcChannel.KnowledgeRuntime_CreateBase, async (_, payload: unknown) => { + const { baseId } = KnowledgeRuntimeBasePayloadSchema.parse(payload) + return await this.createBase(baseId) + }) + this.ipcHandle(IpcChannel.KnowledgeRuntime_DeleteBase, async (_, payload: unknown) => { + const { baseId } = KnowledgeRuntimeBasePayloadSchema.parse(payload) + return await this.deleteBase(baseId) + }) + this.ipcHandle(IpcChannel.KnowledgeRuntime_AddItems, async (_, payload: unknown) => { + const { baseId, itemIds } = KnowledgeRuntimeItemsPayloadSchema.parse(payload) + return await this.addItems(baseId, itemIds) + }) + this.ipcHandle(IpcChannel.KnowledgeRuntime_DeleteItems, async (_, payload: unknown) => { + const { baseId, itemIds } = KnowledgeRuntimeItemsPayloadSchema.parse(payload) + return await this.deleteItems(baseId, itemIds) + }) + this.ipcHandle(IpcChannel.KnowledgeRuntime_Search, async (_, payload: 
unknown) => { + const { baseId, query } = KnowledgeRuntimeSearchPayloadSchema.parse(payload) + return await this.search(baseId, query) + }) + } + + /** Expands the items one at a time, in order, and concatenates their create inputs; a rejection from any expansion propagates to the caller. */ private async expandItemsToCreateInputs(items: KnowledgeItem[]): Promise { + const expandedItems: CreateKnowledgeItemsDto['items'] = [] + + for (const item of items) { + const itemCreateInputs = await this.expandItemToCreateInputs(item) + if (itemCreateInputs.length === 0) { + continue + } + + expandedItems.push(...itemCreateInputs) + } + + return expandedItems + } + + /** Directory and sitemap items are expanded by their helpers; every other item type yields no create inputs. */ private async expandItemToCreateInputs(item: KnowledgeItem): Promise { + if (item.type === 'directory') { + return await expandDirectoryOwnerToCreateItems(item) + } + + if (item.type === 'sitemap') { + return await expandSitemapOwnerToCreateItems(item) + } + + return [] + } + + /** Keeps only file, url and note items, keyed by id in a Map so each id appears once in the result. */ private collectIndexableItems(items: KnowledgeItem[]): KnowledgeItem[] { + const leafItems = new Map() + + for (const item of items) { + if (item.type === 'file' || item.type === 'url' || item.type === 'note') { + leafItems.set(item.id, item) + } + } + + return [...leafItems.values()] + } +} diff --git a/src/main/services/knowledge/__tests__/KnowledgeOrchestrationService.test.ts b/src/main/services/knowledge/__tests__/KnowledgeOrchestrationService.test.ts new file mode 100644 index 0000000000..620cb2e95b --- /dev/null +++ b/src/main/services/knowledge/__tests__/KnowledgeOrchestrationService.test.ts @@ -0,0 +1,392 @@ +import type * as LifecycleModule from '@main/core/lifecycle' +import { getDependencies, getPhase } from '@main/core/lifecycle/decorators' +import { Phase } from '@main/core/lifecycle/types' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const { + appGetMock, + createBaseMock, + deleteBaseMock, + runtimeAddItemsMock, + runtimeDeleteItemsMock, + runtimeSearchMock, + expandDirectoryOwnerToCreateItemsMock, + expandSitemapOwnerToCreateItemsMock, + knowledgeBaseGetByIdMock, + knowledgeItemCreateManyMock, + knowledgeItemGetByIdsInBaseMock +} = vi.hoisted(() 
=> ({ + appGetMock: vi.fn(), + createBaseMock: vi.fn(), + deleteBaseMock: vi.fn(), + runtimeAddItemsMock: vi.fn(), + runtimeDeleteItemsMock: vi.fn(), + runtimeSearchMock: vi.fn(), + expandDirectoryOwnerToCreateItemsMock: vi.fn(), + expandSitemapOwnerToCreateItemsMock: vi.fn(), + knowledgeBaseGetByIdMock: vi.fn(), + knowledgeItemCreateManyMock: vi.fn(), + knowledgeItemGetByIdsInBaseMock: vi.fn() +})) + +vi.mock('@main/core/application', () => ({ + application: { + get: appGetMock + } +})) + +vi.mock('@main/core/lifecycle', async (importOriginal) => { + const actual = await importOriginal() + + class MockBaseService { + ipcHandle = vi.fn() + } + + return { + ...actual, + BaseService: MockBaseService + } +}) + +vi.mock('@data/services/KnowledgeBaseService', () => ({ + knowledgeBaseService: { + getById: knowledgeBaseGetByIdMock + } +})) + +vi.mock('@data/services/KnowledgeItemService', () => ({ + knowledgeItemService: { + createMany: knowledgeItemCreateManyMock, + getByIdsInBase: knowledgeItemGetByIdsInBaseMock + } +})) + +vi.mock('../utils/directory', () => ({ + expandDirectoryOwnerToCreateItems: expandDirectoryOwnerToCreateItemsMock +})) + +vi.mock('../utils/sitemap', () => ({ + expandSitemapOwnerToCreateItems: expandSitemapOwnerToCreateItemsMock +})) + +const { KnowledgeOrchestrationService } = await import('../KnowledgeOrchestrationService') + +function createBase() { + return { + id: 'kb-1', + name: 'KB', + dimensions: 1024, + embeddingModelId: 'ollama::nomic-embed-text', + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createDirectoryItem() { + return { + id: 'dir-1', + baseId: 'kb-1', + groupId: null, + type: 'directory' as const, + data: { name: 'docs', path: '/docs' }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createSitemapItem() { + return { + id: 'sitemap-1', + baseId: 'kb-1', + groupId: null, + type: 
'sitemap' as const, + data: { url: 'https://example.com/sitemap.xml', name: 'Example Sitemap' }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createNoteItem(id = 'note-1') { + return { + id, + baseId: 'kb-1', + groupId: null, + type: 'note' as const, + data: { content: `hello ${id}` }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createFileItem(id = 'file-1', groupId: string | null = null) { + return { + id, + baseId: 'kb-1', + groupId, + type: 'file' as const, + data: { + file: { + id: `${id}-meta`, + name: `${id}.md`, + origin_name: `${id}.md`, + path: `/docs/${id}.md`, + created_at: '2026-04-08T00:00:00.000Z', + size: 10, + ext: '.md', + type: 'text', + count: 1 + } + }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +describe('KnowledgeOrchestrationService', () => { + beforeEach(() => { + vi.clearAllMocks() + + appGetMock.mockImplementation((serviceName: string) => { + if (serviceName === 'KnowledgeRuntimeService') { + return { + createBase: createBaseMock, + deleteBase: deleteBaseMock, + addItems: runtimeAddItemsMock, + deleteItems: runtimeDeleteItemsMock, + search: runtimeSearchMock + } + } + + throw new Error(`Unexpected application.get(${serviceName}) in test`) + }) + + knowledgeBaseGetByIdMock.mockResolvedValue(createBase()) + knowledgeItemGetByIdsInBaseMock.mockResolvedValue([createNoteItem()]) + knowledgeItemCreateManyMock.mockResolvedValue({ items: [] }) + expandDirectoryOwnerToCreateItemsMock.mockResolvedValue([]) + expandSitemapOwnerToCreateItemsMock.mockResolvedValue([]) + createBaseMock.mockResolvedValue(undefined) + deleteBaseMock.mockResolvedValue(undefined) + runtimeAddItemsMock.mockResolvedValue([undefined]) + runtimeDeleteItemsMock.mockResolvedValue(undefined) + 
runtimeSearchMock.mockResolvedValue([]) + }) + + it('uses WhenReady phase and depends on KnowledgeRuntimeService', () => { + expect(getPhase(KnowledgeOrchestrationService)).toBe(Phase.WhenReady) + expect(getDependencies(KnowledgeOrchestrationService)).toEqual(['KnowledgeRuntimeService']) + }) + + it('registers only the five caller-facing knowledge runtime IPC handlers', async () => { + const service = new KnowledgeOrchestrationService() + ;(service as any).onInit() + + const handlerCalls = ((service as any).ipcHandle as ReturnType).mock.calls.map((call) => call[0]) + expect(handlerCalls).toEqual([ + 'knowledge-runtime:create-base', + 'knowledge-runtime:delete-base', + 'knowledge-runtime:add-items', + 'knowledge-runtime:delete-items', + 'knowledge-runtime:search' + ]) + }) + + it('rejects invalid create-base IPC payloads before touching services', async () => { + const service = new KnowledgeOrchestrationService() + ;(service as any).onInit() + + const createBaseHandlerCall = ((service as any).ipcHandle as ReturnType).mock.calls.find( + (call) => call[0] === 'knowledge-runtime:create-base' + ) + expect(createBaseHandlerCall).toBeDefined() + const createBaseHandler = createBaseHandlerCall?.[1] as (_event: unknown, payload: unknown) => Promise + + await expect(createBaseHandler({}, { baseId: '' })).rejects.toThrow() + await expect(createBaseHandler({}, { baseId: 'kb-1', extra: true })).rejects.toThrow() + + expect(knowledgeBaseGetByIdMock).not.toHaveBeenCalled() + expect(createBaseMock).not.toHaveBeenCalled() + }) + + it('rejects invalid add-items IPC payloads before touching services', async () => { + const service = new KnowledgeOrchestrationService() + ;(service as any).onInit() + + const addItemsHandlerCall = ((service as any).ipcHandle as ReturnType).mock.calls.find( + (call) => call[0] === 'knowledge-runtime:add-items' + ) + expect(addItemsHandlerCall).toBeDefined() + const addItemsHandler = addItemsHandlerCall?.[1] as (_event: unknown, payload: unknown) => 
Promise + + await expect(addItemsHandler({}, { baseId: 'kb-1', itemIds: [] })).rejects.toThrow() + await expect(addItemsHandler({}, { baseId: 'kb-1', itemIds: ['note-1', ''] })).rejects.toThrow() + + expect(knowledgeItemGetByIdsInBaseMock).not.toHaveBeenCalled() + expect(runtimeAddItemsMock).not.toHaveBeenCalled() + }) + + it('rejects invalid search IPC payloads before touching services', async () => { + const service = new KnowledgeOrchestrationService() + ;(service as any).onInit() + + const searchHandlerCall = ((service as any).ipcHandle as ReturnType).mock.calls.find( + (call) => call[0] === 'knowledge-runtime:search' + ) + expect(searchHandlerCall).toBeDefined() + const searchHandler = searchHandlerCall?.[1] as (_event: unknown, payload: unknown) => Promise + + await expect(searchHandler({}, { baseId: 'kb-1', query: '' })).rejects.toThrow() + await expect(searchHandler({}, { baseId: 'kb-1', query: 'hello', extra: true })).rejects.toThrow() + + expect(knowledgeBaseGetByIdMock).not.toHaveBeenCalled() + expect(runtimeSearchMock).not.toHaveBeenCalled() + }) + + it('forwards base lifecycle operations to runtime', async () => { + const service = new KnowledgeOrchestrationService() + const base = createBase() + knowledgeBaseGetByIdMock.mockResolvedValue(base) + + await expect(service.createBase(base.id)).resolves.toBeUndefined() + await expect(service.deleteBase(base.id)).resolves.toBeUndefined() + + expect(createBaseMock).toHaveBeenCalledWith(base) + expect(deleteBaseMock).toHaveBeenCalledWith(base.id) + }) + + it('expands container items, persists children, and only enqueues leaf items', async () => { + const service = new KnowledgeOrchestrationService() + const base = createBase() + const directoryItem = createDirectoryItem() + const noteItem = createNoteItem('note-leaf') + const createdDirectoryItem = { + ...createDirectoryItem(), + id: 'dir-child', + groupId: directoryItem.id, + data: { name: 'nested', path: '/docs/nested' } + } + const createdFileItem = 
createFileItem('file-child', directoryItem.id) + + knowledgeBaseGetByIdMock.mockResolvedValue(base) + knowledgeItemGetByIdsInBaseMock.mockResolvedValue([directoryItem, noteItem]) + expandDirectoryOwnerToCreateItemsMock.mockResolvedValue([ + { + groupId: directoryItem.id, + type: 'directory', + data: { name: 'nested', path: '/docs/nested' } + }, + { + groupId: directoryItem.id, + type: 'file', + data: createdFileItem.data + } + ]) + knowledgeItemCreateManyMock.mockResolvedValue({ + items: [createdDirectoryItem, createdFileItem] + }) + + await expect(service.addItems(base.id, [directoryItem.id, noteItem.id])).resolves.toEqual([undefined]) + + expect(expandDirectoryOwnerToCreateItemsMock).toHaveBeenCalledWith(directoryItem) + expect(knowledgeItemCreateManyMock).toHaveBeenCalledWith(base.id, { + items: [ + { + groupId: directoryItem.id, + type: 'directory', + data: { name: 'nested', path: '/docs/nested' } + }, + { + groupId: directoryItem.id, + type: 'file', + data: createdFileItem.data + } + ] + }) + expect(runtimeAddItemsMock).toHaveBeenCalledWith(base, [noteItem, createdFileItem]) + }) + + it('searches through runtime after resolving the base', async () => { + const service = new KnowledgeOrchestrationService() + const base = createBase() + const results = [ + { + pageContent: 'hello', + score: 0.9, + metadata: { itemId: 'note-1' }, + itemId: 'note-1', + chunkId: 'chunk-1' + } + ] + knowledgeBaseGetByIdMock.mockResolvedValue(base) + runtimeSearchMock.mockResolvedValue(results) + + await expect(service.search(base.id, 'hello')).resolves.toEqual(results) + expect(runtimeSearchMock).toHaveBeenCalledWith(base, 'hello') + }) + + it('expands sitemap items into urls before enqueueing leaf items', async () => { + const service = new KnowledgeOrchestrationService() + const base = createBase() + const sitemapItem = createSitemapItem() + const createdUrlItem = { + id: 'url-child', + baseId: base.id, + groupId: sitemapItem.id, + type: 'url' as const, + data: { url: 
'https://example.com/page-1', name: 'https://example.com/page-1' }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } + + knowledgeBaseGetByIdMock.mockResolvedValue(base) + knowledgeItemGetByIdsInBaseMock.mockResolvedValue([sitemapItem]) + expandSitemapOwnerToCreateItemsMock.mockResolvedValue([ + { + groupId: sitemapItem.id, + type: 'url', + data: { url: 'https://example.com/page-1', name: 'https://example.com/page-1' } + } + ]) + knowledgeItemCreateManyMock.mockResolvedValue({ items: [createdUrlItem] }) + + await expect(service.addItems(base.id, [sitemapItem.id])).resolves.toEqual([undefined]) + + expect(expandSitemapOwnerToCreateItemsMock).toHaveBeenCalledWith(sitemapItem) + expect(runtimeAddItemsMock).toHaveBeenCalledWith(base, [createdUrlItem]) + }) + + it('does not create expanded child items until all expansions succeed', async () => { + const service = new KnowledgeOrchestrationService() + const base = createBase() + const directoryItem = createDirectoryItem() + const sitemapItem = createSitemapItem() + + knowledgeBaseGetByIdMock.mockResolvedValue(base) + knowledgeItemGetByIdsInBaseMock.mockResolvedValue([directoryItem, sitemapItem]) + expandDirectoryOwnerToCreateItemsMock.mockResolvedValue([ + { + groupId: directoryItem.id, + type: 'file', + data: createFileItem('file-child', directoryItem.id).data + } + ]) + expandSitemapOwnerToCreateItemsMock.mockRejectedValue(new Error('sitemap expansion failed')) + + await expect(service.addItems(base.id, [directoryItem.id, sitemapItem.id])).rejects.toThrow( + 'sitemap expansion failed' + ) + + expect(knowledgeItemCreateManyMock).not.toHaveBeenCalled() + expect(runtimeAddItemsMock).not.toHaveBeenCalled() + }) +}) diff --git a/src/main/services/knowledge/index.ts b/src/main/services/knowledge/index.ts new file mode 100644 index 0000000000..45c2412050 --- /dev/null +++ b/src/main/services/knowledge/index.ts @@ -0,0 +1,2 @@ +export { 
KnowledgeOrchestrationService } from './KnowledgeOrchestrationService' +export { KnowledgeRuntimeService } from './runtime' diff --git a/src/main/services/knowledge/readers/KnowledgeFileReader.ts b/src/main/services/knowledge/readers/KnowledgeFileReader.ts new file mode 100644 index 0000000000..e103920de2 --- /dev/null +++ b/src/main/services/knowledge/readers/KnowledgeFileReader.ts @@ -0,0 +1,46 @@ +import { getFileExt } from '@main/utils/file' +import type { FileMetadata } from '@shared/data/types/file' +import type { KnowledgeItemOf } from '@shared/data/types/knowledge' +import { type Document, type FileReader as VectorStoreFileReader } from '@vectorstores/core' +import { CSVReader } from '@vectorstores/readers/csv' +import { DocxReader } from '@vectorstores/readers/docx' +import { JSONReader } from '@vectorstores/readers/json' +import { MarkdownReader } from '@vectorstores/readers/markdown' +import { PDFReader } from '@vectorstores/readers/pdf' +import { TextFileReader } from '@vectorstores/readers/text' + +import { DraftsExportReader } from './files/DraftsExportReader' +import { EpubReader } from './files/EpubReader' + +/** Chooses a reader from the lower-cased extension of file.path; extensions without a dedicated case fall back to TextFileReader. */ export function createSupportedFileReader(file: FileMetadata): VectorStoreFileReader { + const extension = getFileExt(file.path).toLowerCase() + + switch (extension) { + case '.pdf': + return new PDFReader() + case '.csv': + return new CSVReader() + case '.docx': + return new DocxReader() + case '.epub': + return new EpubReader() + case '.json': + return new JSONReader() + case '.md': + return new MarkdownReader() + case '.draftsexport': + return new DraftsExportReader() + default: + return new TextFileReader() + } +} + +/** Loads documents for a file item with the reader chosen for its path. Throws an Error when file.path is falsy. */ export async function loadFileDocuments(item: KnowledgeItemOf<'file'>): Promise { + const file = item.data.file + if (!file.path) { + throw new Error(`Knowledge file ${file.id} is missing file.path`) + } + + const reader = createSupportedFileReader(file) + return await reader.loadData(file.path) +} diff --git 
a/src/main/services/knowledge/readers/KnowledgeNoteReader.ts b/src/main/services/knowledge/readers/KnowledgeNoteReader.ts new file mode 100644 index 0000000000..0e3dd65ebd --- /dev/null +++ b/src/main/services/knowledge/readers/KnowledgeNoteReader.ts @@ -0,0 +1,15 @@ +import type { KnowledgeItemOf } from '@shared/data/types/knowledge' +import { Document } from '@vectorstores/core' + +/** Wraps a note's content in a single Document whose metadata carries the item id, item type and the note's sourceUrl. */ export async function loadNoteDocuments(item: KnowledgeItemOf<'note'>): Promise { + return [ + new Document({ + text: item.data.content, + metadata: { + itemId: item.id, + itemType: item.type, + sourceUrl: item.data.sourceUrl + } + }) + ] +} diff --git a/src/main/services/knowledge/readers/KnowledgeReader.ts b/src/main/services/knowledge/readers/KnowledgeReader.ts new file mode 100644 index 0000000000..15c232d7ba --- /dev/null +++ b/src/main/services/knowledge/readers/KnowledgeReader.ts @@ -0,0 +1,24 @@ +import type { KnowledgeItem, KnowledgeItemOf } from '@shared/data/types/knowledge' +import type { Document } from '@vectorstores/core' + +import { loadFileDocuments } from './KnowledgeFileReader' +import { loadNoteDocuments } from './KnowledgeNoteReader' +import { loadUrlDocuments } from './KnowledgeUrlReader' + +/** The item types that loadKnowledgeItemDocuments accepts: file, url and note. */ export type ReadableKnowledgeItem = KnowledgeItemOf<'file'> | KnowledgeItemOf<'url'> | KnowledgeItemOf<'note'> + +/** Dispatches to the loader matching item.type; only the url loader receives the abort signal. The default branch throws for any type outside the declared union that reaches it at runtime. */ export async function loadKnowledgeItemDocuments( + item: ReadableKnowledgeItem, + signal?: AbortSignal +): Promise { + switch (item.type) { + case 'file': + return await loadFileDocuments(item) + case 'url': + return await loadUrlDocuments(item, signal) + case 'note': + return await loadNoteDocuments(item) + default: + throw new Error(`Unsupported knowledge item type: ${(item as KnowledgeItem).type}`) + } +} diff --git a/src/main/services/knowledge/readers/KnowledgeUrlReader.ts b/src/main/services/knowledge/readers/KnowledgeUrlReader.ts new file mode 100644 index 0000000000..08783866ec --- /dev/null +++ b/src/main/services/knowledge/readers/KnowledgeUrlReader.ts 
@@ -0,0 +1,34 @@ +import { loggerService } from '@logger' +import type { KnowledgeItemOf } from '@shared/data/types/knowledge' +import { Document, type Document as VectorStoreDocument } from '@vectorstores/core' + +import { fetchKnowledgeWebPage } from '../utils/url' + +const logger = loggerService.withContext('KnowledgeUrlReader') + +export async function loadUrlDocuments( + item: KnowledgeItemOf<'url'>, + signal?: AbortSignal +): Promise { + const markdown = await fetchKnowledgeWebPage(item.data.url, signal) + if (!markdown) { + logger.warn('Knowledge URL reader received empty markdown', { + itemId: item.id, + sourceUrl: item.data.url, + name: item.data.name + }) + throw new Error(`Knowledge URL returned empty markdown: ${item.data.url}`) + } + + return [ + new Document({ + text: markdown, + metadata: { + itemId: item.id, + itemType: item.type, + sourceUrl: item.data.url, + name: item.data.name + } + }) + ] +} diff --git a/src/main/services/knowledge/readers/__tests__/ReaderFactory.test.ts b/src/main/services/knowledge/readers/__tests__/ReaderFactory.test.ts new file mode 100644 index 0000000000..aadd140fc1 --- /dev/null +++ b/src/main/services/knowledge/readers/__tests__/ReaderFactory.test.ts @@ -0,0 +1,363 @@ +import type { KnowledgeItemOf } from '@shared/data/types/knowledge' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const fetchMock = vi.hoisted(() => vi.fn()) +const loggerWarnMock = vi.hoisted(() => vi.fn()) +const customReaderSpies = vi.hoisted(() => ({ + drafts: vi.fn(async (item: KnowledgeItemOf<'file'>) => [{ metadata: { reader: 'drafts', itemId: item.id } }]), + epub: vi.fn(async (item: KnowledgeItemOf<'file'>) => [{ metadata: { reader: 'epub', itemId: item.id } }]) +})) +const readerSpies = vi.hoisted(() => ({ + csv: vi.fn(async (filePath: string) => [{ metadata: { reader: 'csv', filePath } }]), + docx: vi.fn(async (filePath: string) => [{ metadata: { reader: 'docx', filePath } }]), + json: vi.fn(async (filePath: string) => [{ 
metadata: { reader: 'json', filePath } }]), + markdown: vi.fn(async (filePath: string) => [{ metadata: { reader: 'markdown', filePath } }]), + pdf: vi.fn(async (filePath: string) => [{ metadata: { reader: 'pdf', filePath } }]), + text: vi.fn(async (filePath: string) => [{ metadata: { reader: 'text', filePath } }]) +})) + +vi.mock('@logger', () => ({ + loggerService: { + withContext: () => ({ + debug: vi.fn(), + info: vi.fn(), + warn: loggerWarnMock, + error: vi.fn() + }) + } +})) + +vi.mock('electron', () => ({ + net: { + fetch: fetchMock + } +})) + +vi.mock('@vectorstores/readers/csv', () => ({ + CSVReader: class { + loadData = readerSpies.csv + } +})) + +vi.mock('@vectorstores/readers/docx', () => ({ + DocxReader: class { + loadData = readerSpies.docx + } +})) + +vi.mock('@vectorstores/readers/json', () => ({ + JSONReader: class { + loadData = readerSpies.json + } +})) + +vi.mock('@vectorstores/readers/markdown', () => ({ + MarkdownReader: class { + loadData = readerSpies.markdown + } +})) + +vi.mock('@vectorstores/readers/pdf', () => ({ + PDFReader: class { + loadData = readerSpies.pdf + } +})) + +vi.mock('@vectorstores/readers/text', () => ({ + TextFileReader: class { + loadData = readerSpies.text + } +})) + +vi.mock('../files/DraftsExportReader', () => ({ + DraftsExportReader: class { + loadData = (filePath: string) => + customReaderSpies.drafts({ + id: 'item-1', + baseId: 'base-1', + groupId: null, + type: 'file', + status: 'idle', + error: null, + createdAt: '2026-04-03T00:00:00.000Z', + updatedAt: '2026-04-03T00:00:00.000Z', + data: { + file: { + id: 'file-1', + name: filePath.split('/').pop() || filePath, + origin_name: filePath.split('/').pop() || filePath, + path: filePath, + size: 1, + ext: '.draftsexport', + type: 'document', + created_at: '2026-04-03T00:00:00.000Z', + count: 1 + } + } + } as KnowledgeItemOf<'file'>) + } +})) + +vi.mock('../files/EpubReader', () => ({ + EpubReader: class { + loadData = (filePath: string) => + customReaderSpies.epub({ + 
id: 'item-1', + baseId: 'base-1', + groupId: null, + type: 'file', + status: 'idle', + error: null, + createdAt: '2026-04-03T00:00:00.000Z', + updatedAt: '2026-04-03T00:00:00.000Z', + data: { + file: { + id: 'file-1', + name: filePath.split('/').pop() || filePath, + origin_name: filePath.split('/').pop() || filePath, + path: filePath, + size: 1, + ext: '.epub', + type: 'document', + created_at: '2026-04-03T00:00:00.000Z', + count: 1 + } + } + } as KnowledgeItemOf<'file'>) + } +})) + +const { loadKnowledgeItemDocuments } = await import('../KnowledgeReader') + +function createFileItem(ext: string, filePath?: string): KnowledgeItemOf<'file'> { + return { + id: 'item-1', + baseId: 'base-1', + groupId: null, + type: 'file', + status: 'idle', + error: null, + createdAt: '2026-04-03T00:00:00.000Z', + updatedAt: '2026-04-03T00:00:00.000Z', + data: { + file: { + id: 'file-1', + name: `sample${ext}`, + origin_name: `sample${ext}`, + path: filePath ?? `/tmp/sample${ext}`, + size: 1, + ext, + type: 'document', + created_at: '2026-04-03T00:00:00.000Z', + count: 1 + } + } + } +} + +function createNoteItem(content: string, sourceUrl?: string): KnowledgeItemOf<'note'> { + return { + id: 'note-1', + baseId: 'base-1', + groupId: null, + type: 'note', + status: 'idle', + error: null, + createdAt: '2026-04-03T00:00:00.000Z', + updatedAt: '2026-04-03T00:00:00.000Z', + data: { + content, + sourceUrl + } + } +} + +function createUrlItem(): KnowledgeItemOf<'url'> { + return { + id: 'url-1', + baseId: 'base-1', + groupId: null, + type: 'url', + status: 'idle', + error: null, + createdAt: '2026-04-03T00:00:00.000Z', + updatedAt: '2026-04-03T00:00:00.000Z', + data: { + url: 'https://example.com', + name: 'Example' + } + } +} + +function createSitemapItem(): KnowledgeItemOf<'sitemap'> { + return { + id: 'sitemap-1', + baseId: 'base-1', + groupId: null, + type: 'sitemap', + status: 'idle', + error: null, + createdAt: '2026-04-03T00:00:00.000Z', + updatedAt: '2026-04-03T00:00:00.000Z', + data: 
{ + url: 'https://example.com/sitemap.xml', + name: 'Example Sitemap' + } + } +} + +function createDirectoryItem(): KnowledgeItemOf<'directory'> { + return { + id: 'directory-1', + baseId: 'base-1', + groupId: null, + type: 'directory', + status: 'idle', + error: null, + createdAt: '2026-04-03T00:00:00.000Z', + updatedAt: '2026-04-03T00:00:00.000Z', + data: { + name: 'example-directory', + path: '/tmp/example-directory' + } + } +} + +describe('loadKnowledgeItemDocuments', () => { + beforeEach(() => { + fetchMock.mockReset() + loggerWarnMock.mockReset() + }) + + it.each([ + ['.pdf', 'pdf'], + ['.csv', 'csv'], + ['.docx', 'docx'], + ['.json', 'json'], + ['.md', 'markdown'] + ])('maps %s files to the %s reader', async (ext, expectedReader) => { + const item = createFileItem(ext) + const docs = await loadKnowledgeItemDocuments(item) + + expect(docs[0]).toMatchObject({ + metadata: { + reader: expectedReader, + filePath: `/tmp/sample${ext}` + } + }) + }) + + it('falls back to TextFileReader for unmatched file extensions', async () => { + const item = createFileItem('.log') + const docs = await loadKnowledgeItemDocuments(item) + + expect(docs[0]).toMatchObject({ + metadata: { + reader: 'text', + filePath: '/tmp/sample.log' + } + }) + }) + + it('uses the drafts export reader for .draftsexport files', async () => { + const item = createFileItem('.draftsexport') + + const docs = await loadKnowledgeItemDocuments(item) + + expect(customReaderSpies.drafts).toHaveBeenCalled() + expect(docs[0]).toMatchObject({ + metadata: { + reader: 'drafts', + itemId: 'item-1' + } + }) + }) + + it('uses the epub reader for .epub files', async () => { + const item = createFileItem('.epub') + + const docs = await loadKnowledgeItemDocuments(item) + + expect(customReaderSpies.epub).toHaveBeenCalled() + expect(docs[0]).toMatchObject({ + metadata: { + reader: 'epub', + itemId: 'item-1' + } + }) + }) + + it('throws when a file item is missing file.path at load time', async () => { + const item = 
createFileItem('.txt', '') + + await expect(loadKnowledgeItemDocuments(item)).rejects.toThrow('Knowledge file file-1 is missing file.path') + }) + + it('creates a note reader that returns a single Document', async () => { + const item = createNoteItem('hello world', 'https://example.com/note') + const docs = await loadKnowledgeItemDocuments(item) + + expect(docs).toHaveLength(1) + expect(docs[0]).toMatchObject({ + text: 'hello world', + metadata: { + itemId: 'note-1', + itemType: 'note', + sourceUrl: 'https://example.com/note' + } + }) + }) + + it('fetches markdown from the local knowledge web provider and splits it into documents', async () => { + fetchMock.mockResolvedValue(new Response('# Example Page\n\nHello knowledge', { status: 200 })) + + const item = createUrlItem() + const docs = await loadKnowledgeItemDocuments(item) + + expect(fetchMock).toHaveBeenCalledWith( + 'https://r.jina.ai/https://example.com', + expect.objectContaining({ + signal: expect.any(AbortSignal), + headers: { + 'X-Retain-Images': 'none', + 'X-Return-Format': 'markdown' + } + }) + ) + expect(docs).toHaveLength(1) + expect(docs[0]).toMatchObject({ + text: '# Example Page\n\nHello knowledge', + metadata: { + itemId: 'url-1', + itemType: 'url', + sourceUrl: 'https://example.com', + name: 'Example' + } + }) + }) + + it('throws when the knowledge web provider returns empty markdown', async () => { + fetchMock.mockResolvedValue(new Response(' ', { status: 200 })) + + const item = createUrlItem() + + await expect(loadKnowledgeItemDocuments(item)).rejects.toThrow( + 'Knowledge URL returned empty markdown: https://example.com' + ) + expect(loggerWarnMock).toHaveBeenCalledWith('Knowledge URL reader received empty markdown', { + itemId: 'url-1', + sourceUrl: 'https://example.com', + name: 'Example' + }) + }) + + it.each([ + ['directory', createDirectoryItem()], + ['sitemap', createSitemapItem()] + ])('throws for unsupported %s items', async (_type, item) => { + await expect( + 
loadKnowledgeItemDocuments(item as unknown as Parameters[0]) + ).rejects.toThrow(`Unsupported knowledge item type: ${item.type}`) + expect(fetchMock).not.toHaveBeenCalled() + }) +}) diff --git a/src/main/services/knowledge/readers/files/DraftsExportReader.ts b/src/main/services/knowledge/readers/files/DraftsExportReader.ts new file mode 100644 index 0000000000..d0d34b2306 --- /dev/null +++ b/src/main/services/knowledge/readers/files/DraftsExportReader.ts @@ -0,0 +1,37 @@ +import { Document, FileReader, type Metadata } from '@vectorstores/core' + +type DraftsExportItem = { + content?: string + tags?: string[] + created_at?: string + modified_at?: string +} + +export class DraftsExportReader extends FileReader> { + async loadDataAsContent(fileContent: Uint8Array): Promise[]> { + const text = new TextDecoder('utf-8').decode(fileContent) + let rawJson: DraftsExportItem[] + + try { + rawJson = JSON.parse(text) as DraftsExportItem[] + } catch (error) { + throw new Error('Failed to parse Drafts export JSON', { + cause: error instanceof Error ? error : new Error(String(error)) + }) + } + + return rawJson + .filter((entry) => typeof entry.content === 'string' && entry.content.trim().length > 0) + .map( + (entry, index) => + new Document({ + text: entry.content!.trim(), + metadata: { + draftIndex: index, + tags: entry.tags ?? [], + modifiedAt: entry.modified_at ?? 
entry.created_at + } + }) + ) + } +} diff --git a/src/main/services/knowledge/readers/files/EpubReader.ts b/src/main/services/knowledge/readers/files/EpubReader.ts new file mode 100644 index 0000000000..79b82a0f1e --- /dev/null +++ b/src/main/services/knowledge/readers/files/EpubReader.ts @@ -0,0 +1,61 @@ +import { loggerService } from '@logger' +import { Document, FileReader, type Metadata } from '@vectorstores/core' +import EPub from 'epub' + +const logger = loggerService.withContext('KnowledgeEpubReader') + +function stripHtml(html: string): string { + return html + .replace(/<[^>]*>/g, ' ') + .replace(/\s+/g, ' ') + .trim() +} + +export class EpubReader extends FileReader> { + async loadDataAsContent(fileContent: Uint8Array, filename?: string): Promise[]> { + const epub = new EPub(Buffer.from(fileContent)) + await epub.parse() + + const chapters = epub.flow ?? [] + const documents: Document[] = [] + const failedChapterIds: string[] = [] + + for (const [index, chapter] of chapters.entries()) { + try { + const content = await epub.getChapter(chapter.id) + const text = stripHtml(content) + + if (!text) { + continue + } + + documents.push( + new Document({ + text, + metadata: { + source: filename, + title: epub.metadata.title || filename || '', + creator: epub.metadata.creator || '', + language: epub.metadata.language || '', + chapterId: chapter.id, + chapterTitle: chapter.title || `Chapter ${index + 1}`, + chapterOrder: index + 1 + } + }) + ) + } catch (error) { + failedChapterIds.push(chapter.id) + logger.error('Failed to read epub chapter', error as Error, { + filename, + chapterId: chapter.id + }) + } + } + + if (failedChapterIds.length > 0) { + throw new Error(`Failed to read epub chapters: ${failedChapterIds.join(', ')}`) + } + + return documents + } +} diff --git a/src/main/services/knowledge/readers/files/__tests__/DraftsExportReader.test.ts b/src/main/services/knowledge/readers/files/__tests__/DraftsExportReader.test.ts new file mode 100644 index 
0000000000..9145e9750d --- /dev/null +++ b/src/main/services/knowledge/readers/files/__tests__/DraftsExportReader.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from 'vitest' + +import { DraftsExportReader } from '../DraftsExportReader' + +describe('DraftsExportReader', () => { + it('parses draft export entries into documents', async () => { + const reader = new DraftsExportReader() + const fileContent = new TextEncoder().encode( + JSON.stringify([ + { + content: ' first draft ', + tags: ['work'], + created_at: '2026-04-01T00:00:00.000Z' + }, + { + content: ' ' + }, + { + content: 'second draft', + modified_at: '2026-04-02T00:00:00.000Z' + } + ]) + ) + + const documents = await reader.loadDataAsContent(fileContent) + + expect(documents).toHaveLength(2) + expect(documents[0]).toMatchObject({ + text: 'first draft', + metadata: { + draftIndex: 0, + tags: ['work'], + modifiedAt: '2026-04-01T00:00:00.000Z' + } + }) + expect(documents[1]).toMatchObject({ + text: 'second draft', + metadata: { + draftIndex: 1, + tags: [], + modifiedAt: '2026-04-02T00:00:00.000Z' + } + }) + }) + + it('throws a readable error when the export JSON is invalid', async () => { + const reader = new DraftsExportReader() + const fileContent = new TextEncoder().encode('{invalid json') + + await expect(reader.loadDataAsContent(fileContent)).rejects.toThrow('Failed to parse Drafts export JSON') + }) +}) diff --git a/src/main/services/knowledge/readers/files/__tests__/EpubReader.test.ts b/src/main/services/knowledge/readers/files/__tests__/EpubReader.test.ts new file mode 100644 index 0000000000..4fc92f7de5 --- /dev/null +++ b/src/main/services/knowledge/readers/files/__tests__/EpubReader.test.ts @@ -0,0 +1,92 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const { loggerErrorMock, parseMock, getChapterMock } = vi.hoisted(() => ({ + loggerErrorMock: vi.fn(), + parseMock: vi.fn(), + getChapterMock: vi.fn() +})) + +vi.mock('@logger', () => ({ + loggerService: { + 
withContext: () => ({ + error: loggerErrorMock + }) + } +})) + +vi.mock('epub', () => ({ + default: class MockEpub { + flow = [ + { id: 'chapter-1', title: 'Chapter 1' }, + { id: 'chapter-2', title: 'Chapter 2' } + ] + + metadata = { + title: 'Test EPUB', + creator: 'Author', + language: 'en' + } + + constructor(buffer: Buffer) { + void buffer + } + + async parse() { + return await parseMock() + } + + async getChapter(id: string) { + return await getChapterMock(id) + } + } +})) + +const { EpubReader } = await import('../EpubReader') + +describe('EpubReader', () => { + beforeEach(() => { + vi.clearAllMocks() + parseMock.mockResolvedValue(undefined) + }) + + it('returns chapter documents when all chapters are readable', async () => { + getChapterMock.mockImplementation(async (id: string) => `

${id} content

`) + + const reader = new EpubReader() + const docs = await reader.loadDataAsContent(new Uint8Array([1, 2, 3]), 'book.epub') + + expect(docs).toHaveLength(2) + expect(docs[0]?.text).toBe('chapter-1 content') + expect(docs[0]?.metadata).toMatchObject({ + source: 'book.epub', + title: 'Test EPUB', + creator: 'Author', + language: 'en', + chapterId: 'chapter-1', + chapterTitle: 'Chapter 1', + chapterOrder: 1 + }) + expect(loggerErrorMock).not.toHaveBeenCalled() + }) + + it('rejects the epub when any chapter fails instead of silently returning partial content', async () => { + const chapterError = new Error('chapter read failed') + getChapterMock.mockImplementation(async (id: string) => { + if (id === 'chapter-2') { + throw chapterError + } + + return '

chapter-1 content

' + }) + + const reader = new EpubReader() + + await expect(reader.loadDataAsContent(new Uint8Array([1, 2, 3]), 'book.epub')).rejects.toThrow( + 'Failed to read epub chapters: chapter-2' + ) + expect(loggerErrorMock).toHaveBeenCalledWith('Failed to read epub chapter', chapterError, { + filename: 'book.epub', + chapterId: 'chapter-2' + }) + }) +}) diff --git a/src/main/services/knowledge/rerank/__tests__/rerank.test.ts b/src/main/services/knowledge/rerank/__tests__/rerank.test.ts new file mode 100644 index 0000000000..4049306a65 --- /dev/null +++ b/src/main/services/knowledge/rerank/__tests__/rerank.test.ts @@ -0,0 +1,192 @@ +import type { KnowledgeBase, KnowledgeSearchResult } from '@shared/data/types/knowledge' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const fetchMock = vi.hoisted(() => vi.fn()) + +vi.mock('@logger', () => ({ + loggerService: { + withContext: () => ({ + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + error: vi.fn() + }) + } +})) + +vi.mock('electron', () => ({ + net: { + fetch: fetchMock + } +})) + +const { getRerankAdapter } = await import('../adapters') +const { executeRerankRequest, rerankKnowledgeSearchResults, resolveRerankRuntime } = await import('../rerank') + +function createKnowledgeBase(overrides: Partial = {}): KnowledgeBase { + return { + id: 'kb-1', + name: 'Knowledge Base', + dimensions: 1024, + embeddingModelId: 'ollama::nomic-embed-text', + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + ...overrides + } +} + +function createSearchResults(): KnowledgeSearchResult[] { + return [ + { + pageContent: 'alpha', + score: 0.1, + metadata: { type: 'text' }, + chunkId: 'chunk-1' + }, + { + pageContent: 'beta', + score: 0.2, + metadata: { type: 'text' }, + chunkId: 'chunk-2' + } + ] +} + +describe('knowledge rerank adapters', () => { + it('passes text documents to jina rerank requests', () => { + const adapter = getRerankAdapter('jina') + + expect( + adapter.buildBody({ + modelId: 
'jina-reranker-m0', + query: 'hello', + documents: ['alpha', 'beta'], + topN: 3 + }) + ).toEqual({ + model: 'jina-reranker-m0', + query: 'hello', + documents: ['alpha', 'beta'], + top_n: 3 + }) + }) + + it('maps tei-style providers to the tei request shape', () => { + const adapter = getRerankAdapter('tei-local') + + expect( + adapter.buildBody({ + modelId: 'ignored', + query: 'hello', + documents: ['alpha', 'beta'], + topN: 3 + }) + ).toEqual({ + query: 'hello', + texts: ['alpha', 'beta'], + return_text: true + }) + }) + + it('uses the bailian fixed rerank endpoint', () => { + const adapter = getRerankAdapter('bailian') + + expect(adapter.buildUrl('https://example.com/ignored')).toBe( + 'https://dashscope.aliyuncs.com/api/v1/services/rerank/text-rerank/text-rerank' + ) + }) + + it('throws when object-based rerank payloads do not contain an array of results', () => { + const adapter = getRerankAdapter('jina') + + expect(() => adapter.parseResponse({ results: 'bad-payload' })).toThrow() + }) + + it('throws when array-based rerank payloads are malformed', () => { + const adapter = getRerankAdapter('tei-local') + + expect(() => adapter.parseResponse({ results: [] })).toThrow() + }) +}) + +describe('knowledge rerank runtime', () => { + beforeEach(() => { + fetchMock.mockReset() + }) + + it('returns null runtime config until provider runtime integration lands', async () => { + await expect( + resolveRerankRuntime(createKnowledgeBase({ rerankModelId: 'jina::jina-reranker-v2-base-multilingual' })) + ).resolves.toBeNull() + }) + + it('skips rerank when the base has no rerank model id', async () => { + const searchResults = createSearchResults() + + await expect(rerankKnowledgeSearchResults(createKnowledgeBase(), 'hello', searchResults)).resolves.toBe( + searchResults + ) + expect(fetchMock).not.toHaveBeenCalled() + }) + + it('executes rerank requests and sorts by rerank score', async () => { + fetchMock.mockResolvedValue( + new Response( + JSON.stringify({ + results: [ + { 
index: 0, relevance_score: 0.2 }, + { index: 1, relevance_score: 0.9 } + ] + }), + { status: 200 } + ) + ) + + const result = await executeRerankRequest( + { + providerId: 'jina', + modelId: 'jina-reranker-v2-base-multilingual', + baseUrl: 'https://api.jina.ai', + apiKey: 'secret' + }, + 'hello', + createSearchResults(), + 2 + ) + + expect(fetchMock).toHaveBeenCalledWith( + 'https://api.jina.ai/v1/rerank', + expect.objectContaining({ + method: 'POST', + headers: { + Authorization: 'Bearer secret', + 'Content-Type': 'application/json' + } + }) + ) + expect(result.map((item) => ({ chunkId: item.chunkId, score: item.score }))).toEqual([ + { chunkId: 'chunk-2', score: 0.9 }, + { chunkId: 'chunk-1', score: 0.2 } + ]) + }) + + it('throws when rerank upstream responds with a non-ok status', async () => { + fetchMock.mockResolvedValue( + new Response(JSON.stringify({ error: 'bad request' }), { status: 400, statusText: 'Bad Request' }) + ) + + await expect( + executeRerankRequest( + { + providerId: 'jina', + modelId: 'jina-reranker-v2-base-multilingual', + baseUrl: 'https://api.jina.ai', + apiKey: 'secret' + }, + 'hello', + createSearchResults(), + 2 + ) + ).rejects.toThrow('HTTP 400: Bad Request') + }) +}) diff --git a/src/main/services/knowledge/rerank/adapters.ts b/src/main/services/knowledge/rerank/adapters.ts new file mode 100644 index 0000000000..c4d2634bb9 --- /dev/null +++ b/src/main/services/knowledge/rerank/adapters.ts @@ -0,0 +1,151 @@ +import * as z from 'zod' + +import type { RerankAdapter, RerankRequestInput, RerankResult } from './types' + +const OPENAI_COMPATIBLE_RERANK_SUFFIX = '/rerank' +const OPENAI_COMPATIBLE_V1_SUFFIX = '/v1' +const BAILIAN_RERANK_URL = 'https://dashscope.aliyuncs.com/api/v1/services/rerank/text-rerank/text-rerank' + +const RerankResultItemSchema = z.object({ + index: z.number(), + relevance_score: z.number().optional(), + score: z.number().optional() +}) + +const OpenAiCompatibleRerankResponseSchema = z.object({ + results: 
z.array(RerankResultItemSchema) +}) + +const VoyageRerankResponseSchema = z.object({ + data: z.array(RerankResultItemSchema) +}) + +const TeiRerankResponseSchema = z.array(RerankResultItemSchema) + +const BailianRerankResponseSchema = z.object({ + output: z.object({ + results: z.array(RerankResultItemSchema) + }) +}) + +function parseResults(items: z.infer[]) { + return items.map((item) => ({ + index: item.index, + relevanceScore: item.relevance_score ?? item.score ?? 0 + })) +} + +function buildOpenAiCompatibleUrl(baseUrl: string): string { + if (baseUrl.endsWith('/')) { + return `${baseUrl}rerank` + } + + if (!baseUrl.endsWith(OPENAI_COMPATIBLE_V1_SUFFIX)) { + return `${baseUrl}${OPENAI_COMPATIBLE_V1_SUFFIX}${OPENAI_COMPATIBLE_RERANK_SUFFIX}` + } + + return `${baseUrl}${OPENAI_COMPATIBLE_RERANK_SUFFIX}` +} + +function defaultHeaders(apiKey: string): Record { + return { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json' + } +} + +const defaultAdapter: RerankAdapter = { + buildUrl: buildOpenAiCompatibleUrl, + buildHeaders: defaultHeaders, + buildBody({ modelId, query, documents, topN }: RerankRequestInput) { + return { + model: modelId, + query, + documents, + top_n: topN + } + }, + parseResponse(data: unknown): RerankResult[] { + return parseResults(OpenAiCompatibleRerankResponseSchema.parse(data).results) + } +} + +const jinaAdapter: RerankAdapter = { + ...defaultAdapter, + buildBody({ modelId, query, documents, topN }: RerankRequestInput) { + return { + model: modelId, + query, + documents, + top_n: topN + } + } +} + +const voyageAdapter: RerankAdapter = { + ...defaultAdapter, + buildBody({ modelId, query, documents, topN }: RerankRequestInput) { + return { + model: modelId, + query, + documents, + top_k: topN + } + }, + parseResponse(data: unknown): RerankResult[] { + return parseResults(VoyageRerankResponseSchema.parse(data).data) + } +} + +const teiAdapter: RerankAdapter = { + ...defaultAdapter, + buildBody({ query, documents }: 
RerankRequestInput) { + return { + query, + texts: documents, + return_text: true + } + }, + parseResponse(data: unknown): RerankResult[] { + return parseResults(TeiRerankResponseSchema.parse(data)) + } +} + +const bailianAdapter: RerankAdapter = { + ...defaultAdapter, + buildUrl() { + return BAILIAN_RERANK_URL + }, + buildBody({ modelId, query, documents, topN }: RerankRequestInput) { + return { + model: modelId, + input: { + query, + documents + }, + parameters: { + top_n: topN + } + } + }, + parseResponse(data: unknown): RerankResult[] { + return parseResults(BailianRerankResponseSchema.parse(data).output.results) + } +} + +export function getRerankAdapter(providerId: string): RerankAdapter { + switch (providerId) { + case 'jina': + return jinaAdapter + case 'voyageai': + return voyageAdapter + case 'bailian': + return bailianAdapter + default: + if (providerId.includes('tei')) { + return teiAdapter + } + + return defaultAdapter + } +} diff --git a/src/main/services/knowledge/rerank/rerank.ts b/src/main/services/knowledge/rerank/rerank.ts new file mode 100644 index 0000000000..262643acac --- /dev/null +++ b/src/main/services/knowledge/rerank/rerank.ts @@ -0,0 +1,105 @@ +import { loggerService } from '@logger' +import { DEFAULT_DOCUMENT_COUNT, DEFAULT_RELEVANT_SCORE } from '@main/utils/knowledge' +import type { KnowledgeBase, KnowledgeSearchResult } from '@shared/data/types/knowledge' +import { net } from 'electron' + +import { parseCompositeModelId } from '../utils/config' +import { getRerankAdapter } from './adapters' +import type { ResolvedRerankRuntime } from './types' + +const logger = loggerService.withContext('KnowledgeRerank') + +function mergeRerankResults( + searchResults: KnowledgeSearchResult[], + rerankResults: Array<{ index: number; relevanceScore: number }> +): KnowledgeSearchResult[] { + const resultMap = new Map( + rerankResults.map((result) => [result.index, result.relevanceScore || DEFAULT_RELEVANT_SCORE]) + ) + + return searchResults + 
.map((result, index) => { + const score = resultMap.get(index) + if (score === undefined) { + return undefined + } + + return { ...result, score } + }) + .filter((result): result is KnowledgeSearchResult => result !== undefined) + .sort((a, b) => b.score - a.score) +} + +export async function resolveRerankRuntime(base: KnowledgeBase): Promise { + if (!base.rerankModelId) { + return null + } + + const { providerId, modelId } = parseCompositeModelId(base.rerankModelId) + + // TODO(v2): Read provider runtime config from the model/provider domain after the + // pending provider/model PR lands. + // const { baseUrl, apiKey } = modelProviderService.getRuntimeConfig(providerId) + void providerId + void modelId + return null +} + +export async function executeRerankRequest( + runtime: ResolvedRerankRuntime, + query: string, + searchResults: KnowledgeSearchResult[], + topN: number +): Promise { + const adapter = getRerankAdapter(runtime.providerId) + const requestBody = adapter.buildBody({ + modelId: runtime.modelId, + query, + documents: searchResults.map((result) => result.pageContent), + topN + }) + const url = adapter.buildUrl(runtime.baseUrl) + + try { + const response = await net.fetch(url, { + method: 'POST', + headers: adapter.buildHeaders(runtime.apiKey), + body: JSON.stringify(requestBody) + }) + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`) + } + + return mergeRerankResults(searchResults, adapter.parseResponse(await response.json())) + } catch (error) { + const normalizedError = error instanceof Error ? 
error : new Error(String(error)) + logger.error('Knowledge rerank request failed', normalizedError, { + providerId: runtime.providerId, + modelId: runtime.modelId, + topN + }) + throw normalizedError + } +} + +export async function rerankKnowledgeSearchResults( + base: KnowledgeBase, + query: string, + searchResults: KnowledgeSearchResult[] +): Promise { + if (!base.rerankModelId || searchResults.length === 0) { + return searchResults + } + + const runtime = await resolveRerankRuntime(base) + if (!runtime) { + logger.debug('Skipping knowledge rerank until provider runtime config is available', { + baseId: base.id, + rerankModelId: base.rerankModelId + }) + return searchResults + } + + return await executeRerankRequest(runtime, query, searchResults, base.documentCount ?? DEFAULT_DOCUMENT_COUNT) +} diff --git a/src/main/services/knowledge/rerank/types.ts b/src/main/services/knowledge/rerank/types.ts new file mode 100644 index 0000000000..2105611779 --- /dev/null +++ b/src/main/services/knowledge/rerank/types.ts @@ -0,0 +1,25 @@ +export interface RerankRequestInput { + modelId: string + query: string + documents: string[] + topN: number +} + +export interface RerankResult { + index: number + relevanceScore: number +} + +export interface ResolvedRerankRuntime { + providerId: string + modelId: string + baseUrl: string + apiKey: string +} + +export interface RerankAdapter { + buildUrl(baseUrl: string): string + buildHeaders(apiKey: string): Record + buildBody(input: RerankRequestInput): unknown + parseResponse(data: unknown): RerankResult[] +} diff --git a/src/main/services/knowledge/runtime/KnowledgeAddQueue.ts b/src/main/services/knowledge/runtime/KnowledgeAddQueue.ts new file mode 100644 index 0000000000..28f460a75f --- /dev/null +++ b/src/main/services/knowledge/runtime/KnowledgeAddQueue.ts @@ -0,0 +1,146 @@ +import type { KnowledgeBase, KnowledgeItem } from '@shared/data/types/knowledge' +import PQueue from 'p-queue' + +export interface AddTaskEntry { + base: 
/** Inputs of one add task: the knowledge base and the item to be indexed into it. */
export interface AddTaskEntry {
  base: KnowledgeBase
  item: KnowledgeItem
}

/** Task entry handed to the executor, carrying its abort controller and interruption cause. */
export interface AddTaskContext extends AddTaskEntry {
  controller: AbortController
  /** Written by `interrupt()` on entries that were already running when interrupted. */
  interruptedBy?: 'delete' | 'stop'
}

/** Internal record: the task context plus its scheduling state and caller-facing promise. */
type QueueEntry = AddTaskContext & {
  status: 'pending' | 'running'
  promise: Promise<void>
}

/**
 * Concurrency-limited queue of "add item" tasks, keyed by item id.
 *
 * At most one entry exists per item id: enqueueing an id that is already
 * tracked returns the existing promise instead of scheduling new work.
 */
export class KnowledgeAddQueue {
  private readonly concurrency: number
  private readonly executeAdd: (entry: AddTaskContext) => Promise<void>
  private queue: PQueue
  private entries = new Map<string, QueueEntry>()

  /**
   * @param concurrency - Passed to the underlying PQueue as its `concurrency` option.
   * @param executeAdd - Callback that performs the actual work for one entry.
   */
  constructor(concurrency: number, executeAdd: (entry: AddTaskContext) => Promise<void>) {
    this.concurrency = concurrency
    this.executeAdd = executeAdd
    this.queue = this.createQueue()
  }

  /**
   * Clears the current queue, swaps in a fresh one and forgets all tracked entries.
   * Abort controllers of existing entries are not triggered here.
   * NOTE(review): promises of tasks dropped by `clear()` may never settle — confirm against p-queue docs.
   */
  reset(): void {
    this.queue.clear()
    this.queue = this.createQueue()
    this.entries.clear()
  }

  /**
   * Schedules an add task for `item`, or returns the promise of the entry
   * already tracked under the same item id.
   */
  enqueue(base: KnowledgeBase, item: KnowledgeItem): Promise<void> {
    const tracked = this.entries.get(item.id)
    if (tracked) {
      return tracked.promise
    }

    const entry = this.createEntry(base, item)
    this.entries.set(item.id, entry)
    this.schedule(entry)

    return entry.promise
  }

  /**
   * Aborts the tracked entries for the given item ids.
   *
   * Pending entries are aborted and untracked immediately. Running entries get
   * `interruptedBy` set before the abort fires and stay tracked until their
   * promise settles.
   *
   * @returns The entries that were tracked for the given ids (unknown ids are ignored).
   */
  interrupt(itemIds: string[], interruptedBy: 'delete' | 'stop', reason: string): AddTaskEntry[] {
    const interruptedEntries = this.getEntriesByIds(itemIds)

    for (const entry of interruptedEntries) {
      if (entry.status === 'running') {
        // Mark first so the executor can see the cause when the abort surfaces.
        entry.interruptedBy = interruptedBy
        entry.controller.abort(reason)
      } else {
        entry.controller.abort(reason)
        this.deleteEntry(entry)
      }
    }

    return interruptedEntries
  }

  /** Interrupts every tracked entry whose base id equals `baseId`. */
  interruptBase(baseId: string, interruptedBy: 'delete' | 'stop', reason: string): AddTaskEntry[] {
    const itemIds = this.getEntriesForBase(baseId).map((entry) => entry.item.id)
    return this.interrupt(itemIds, interruptedBy, reason)
  }

  /** Interrupts every tracked entry. */
  interruptAll(interruptedBy: 'delete' | 'stop', reason: string): AddTaskEntry[] {
    return this.interrupt([...this.entries.keys()], interruptedBy, reason)
  }

  /**
   * Waits until the promises of the currently running entries among `itemIds`
   * have settled. Rejections are absorbed; pending entries are not awaited.
   */
  async waitForRunning(itemIds: string[]): Promise<void> {
    const executions = this.getEntriesByIds(itemIds)
      .filter((entry): entry is QueueEntry & { status: 'running' } => entry.status === 'running')
      .map((entry) => entry.promise)

    if (executions.length === 0) {
      return
    }

    await Promise.allSettled(executions)
  }

  private createQueue(): PQueue {
    return new PQueue({ concurrency: this.concurrency })
  }

  private createEntry(base: KnowledgeBase, item: KnowledgeItem): QueueEntry {
    return {
      base,
      item,
      // Placeholder; replaced with the real task promise in schedule().
      promise: Promise.resolve(),
      controller: new AbortController(),
      status: 'pending',
      interruptedBy: undefined
    }
  }

  private schedule(entry: QueueEntry): void {
    const run = async (): Promise<void> => {
      // The entry was untracked (interrupted) or replaced while waiting: skip the work.
      if (this.entries.get(entry.item.id) !== entry) {
        return
      }

      entry.status = 'running'
      await this.executeAdd(entry)
    }

    entry.promise = this.queue.add(run, { signal: entry.controller.signal }).finally(() => {
      this.deleteEntry(entry)
    })
  }

  /** Tracked entries for the given ids, de-duplicated, in first-occurrence order. */
  private getEntriesByIds(itemIds: string[]): QueueEntry[] {
    const matched: QueueEntry[] = []

    for (const itemId of new Set(itemIds)) {
      const entry = this.entries.get(itemId)
      if (entry) {
        matched.push(entry)
      }
    }

    return matched
  }

  private getEntriesForBase(baseId: string): QueueEntry[] {
    return [...this.entries.values()].filter((entry) => entry.base.id === baseId)
  }

  /** Untracks `entry` only if it is still the one registered under its item id. */
  private deleteEntry(entry: QueueEntry): void {
    if (this.entries.get(entry.item.id) === entry) {
      this.entries.delete(entry.item.id)
    }
  }
}
const logger = loggerService.withContext('KnowledgeAddRuntime')
const CONTAINER_ITEM_INDEXING_UNSUPPORTED_REASON =
  'Container knowledge items must be expanded into child items before indexing'

/**
 * Executes a single "add item" task: load → chunk → embed → store, with the
 * item status persisted along the way. Every step is wrapped in `runAbortable`.
 */
export class KnowledgeAddRuntime {
  /** @param isStopping - Forwarded to `runAbortable` for every step. */
  constructor(private readonly isStopping: () => boolean) {}

  /**
   * Runs the add pipeline for one queue entry.
   *
   * The item is set to `pending` first and to `completed` after its nodes were
   * added to the base's vector store.
   *
   * @throws The normalized error, unchanged, when the task was interrupted
   *   (`entry.interruptedBy` is set, or the message equals one of the
   *   interruption reasons); no failure state is persisted in that case.
   * @throws The normalized error after `handleAddItemFailure` ran, for every other failure.
   */
  async executeAdd(entry: AddTaskContext): Promise<void> {
    const { base, item, controller } = entry
    const ctx: RuntimeTaskContext = {
      itemId: item.id,
      signal: controller.signal
    }
    let vectorStore: BaseVectorStore | null = null

    try {
      await runAbortable(this.isStopping, ctx, () =>
        knowledgeItemService.update(item.id, {
          status: 'pending',
          error: null
        })
      )

      const nodes = await this.indexItem(ctx, base, item)
      const vectorStoreService = application.get('KnowledgeVectorStoreService')
      vectorStore = await runAbortable(this.isStopping, ctx, () => vectorStoreService.createStore(base))
      // Const alias so the closures below see a non-null store.
      const activeVectorStore = vectorStore
      await runAbortable(this.isStopping, ctx, () => activeVectorStore.add(nodes))
      await runAbortable(this.isStopping, ctx, () =>
        knowledgeItemService.update(item.id, {
          status: 'completed',
          error: null
        })
      )
    } catch (error) {
      const normalizedError = this.toError(error)

      const wasInterrupted =
        Boolean(entry.interruptedBy) ||
        normalizedError.message === DELETE_INTERRUPTED_REASON ||
        normalizedError.message === SHUTDOWN_INTERRUPTED_REASON
      if (wasInterrupted) {
        throw normalizedError
      }

      throw await this.handleAddItemFailure(base, item, vectorStore, normalizedError)
    }
  }

  /**
   * Loads, chunks and embeds one item.
   * `directory` and `sitemap` items are rejected before anything is loaded.
   */
  private async indexItem(ctx: RuntimeTaskContext, base: KnowledgeBase, item: KnowledgeItem) {
    if (item.type === 'directory' || item.type === 'sitemap') {
      throw new Error(CONTAINER_ITEM_INDEXING_UNSUPPORTED_REASON)
    }

    const documents = await runAbortable(this.isStopping, ctx, () => loadKnowledgeItemDocuments(item, ctx.signal))
    const chunks = await runAbortable(this.isStopping, ctx, () => chunkDocuments(base, item, documents))
    const embeddingModel = await runAbortable(this.isStopping, ctx, () => getEmbedModel(base))
    return await runAbortable(this.isStopping, ctx, () => embedDocuments(embeddingModel, chunks, ctx.signal))
  }

  /**
   * Best-effort failure handling: logs the error, persists `failed` status and
   * removes the item's vectors if a store was already opened. Problems in the
   * persistence or cleanup steps are logged and never replace the original error.
   *
   * @returns The original `error`, for the caller to throw.
   */
  private async handleAddItemFailure(
    base: KnowledgeBase,
    item: KnowledgeItem,
    vectorStore: BaseVectorStore | null,
    error: Error
  ): Promise<Error> {
    logger.error('Failed to add knowledge item', error, {
      baseId: base.id,
      itemId: item.id,
      itemType: item.type
    })

    try {
      await knowledgeItemService.update(item.id, {
        status: 'failed',
        error: error.message
      })
    } catch (persistError) {
      logger.error('Failed to persist knowledge item failure state', this.toError(persistError), {
        baseId: base.id,
        itemId: item.id,
        itemType: item.type,
        originalError: error.message
      })
    }

    if (vectorStore) {
      try {
        await vectorStore.delete(item.id)
      } catch (cleanupError) {
        logger.warn('Failed to cleanup knowledge item vectors after add failure', {
          baseId: base.id,
          itemId: item.id,
          cleanupError: cleanupError instanceof Error ? cleanupError.message : String(cleanupError)
        })
      }
    }

    return error
  }

  /** Returns `value` when it is already an Error, otherwise wraps its string form. */
  private toError(value: unknown): Error {
    return value instanceof Error ? value : new Error(String(value))
  }
}
/** Number of add tasks the runtime queue executes concurrently. */
const ADD_QUEUE_CONCURRENCY = 5
/** `similarityTopK` used when the base does not define `documentCount`. */
const DEFAULT_SIMILARITY_TOP_K = 10

/**
 * Lifecycle service that owns the add queue and exposes the runtime
 * operations for knowledge bases: create/delete stores, add/delete items and
 * search.
 */
@Injectable('KnowledgeRuntimeService')
@ServicePhase(Phase.WhenReady)
@DependsOn(['KnowledgeVectorStoreService'])
export class KnowledgeRuntimeService extends BaseService {
  private isStopping = false
  private addRuntime = new KnowledgeAddRuntime(() => this.isStopping)
  private addQueue = new KnowledgeAddQueue(ADD_QUEUE_CONCURRENCY, (entry) => {
    // Tasks that reach the front of the queue during shutdown must not start.
    if (this.isStopping) {
      throw new Error(SHUTDOWN_INTERRUPTED_REASON)
    }

    return this.addRuntime.executeAdd(entry)
  })

  protected onInit(): void {
    this.isStopping = false
    this.addQueue.reset()
  }

  /**
   * Interrupts all queued work, waits for running tasks to settle, then removes
   * the interrupted items' vectors (continuing past individual failures) and
   * passes the item ids to `failItems` with the shutdown reason.
   */
  protected async onStop(): Promise<void> {
    this.isStopping = true

    const interruptedEntries = this.addQueue.interruptAll('stop', SHUTDOWN_INTERRUPTED_REASON)
    const interruptedItemIds = interruptedEntries.map((entry) => entry.item.id)

    await this.addQueue.waitForRunning(interruptedItemIds)
    await deleteVectorsForEntries(interruptedEntries, { continueOnError: true })
    await failItems(interruptedItemIds, SHUTDOWN_INTERRUPTED_REASON)
  }

  /** Calls `createStore` on the vector store service for `base`. */
  async createBase(base: KnowledgeBase): Promise<void> {
    const vectorStoreService = application.get('KnowledgeVectorStoreService')
    await vectorStoreService.createStore(base)
  }

  /** Interrupts the base's add tasks, waits for running ones, then deletes its store. */
  async deleteBase(baseId: string): Promise<void> {
    const interruptedEntries = this.addQueue.interruptBase(baseId, 'delete', DELETE_INTERRUPTED_REASON)
    const interruptedItemIds = interruptedEntries.map((entry) => entry.item.id)

    await this.addQueue.waitForRunning(interruptedItemIds)

    const vectorStoreService = application.get('KnowledgeVectorStoreService')
    await vectorStoreService.deleteStore(baseId)
  }

  /**
   * Enqueues every item and resolves once all of them finished.
   * Rejects as soon as one task rejects; the remaining tasks keep running.
   */
  async addItems(base: KnowledgeBase, items: KnowledgeItem[]): Promise<void[]> {
    return await Promise.all(items.map((item) => this.addQueue.enqueue(base, item)))
  }

  /**
   * Deletes the vectors of the given items and of the ids returned by
   * `getCascadeIdsInBase`, interrupting any add work for them first.
   */
  async deleteItems(base: KnowledgeBase, items: KnowledgeItem[]): Promise<void> {
    const rootIds = [...new Set(items.map((item) => item.id))]
    const itemIds = await knowledgeItemService.getCascadeIdsInBase(base.id, rootIds)

    this.addQueue.interrupt(itemIds, 'delete', DELETE_INTERRUPTED_REASON)
    await this.addQueue.waitForRunning(itemIds)
    await deleteItemVectors(base, itemIds)
  }

  /**
   * Embeds `query`, queries the base's vector store and maps the hits to
   * search results; reranks them when the base defines `rerankModelId`.
   *
   * @throws Error when the embedding model returns no (or an empty) embedding.
   */
  async search(base: KnowledgeBase, query: string): Promise<KnowledgeSearchResult[]> {
    const model = getEmbedModel(base)
    const embedResult = await embedMany({ model, values: [query] })
    const queryEmbedding = embedResult.embeddings[0]

    if (!queryEmbedding?.length) {
      throw new Error('Failed to embed search query: model returned empty result')
    }

    const vectorStoreService = application.get('KnowledgeVectorStoreService')
    const vectorStore = await vectorStoreService.createStore(base)
    const results = await vectorStore.query({
      queryStr: query,
      queryEmbedding,
      mode: base.searchMode ?? 'default',
      similarityTopK: base.documentCount ?? DEFAULT_SIMILARITY_TOP_K,
      alpha: base.hybridAlpha
    })

    const searchResults = (results.nodes ?? []).map((node, index) => {
      const metadata = node.metadata ?? {}
      // Only a non-empty string counts as an item id.
      const hasItemId = typeof metadata.itemId === 'string' && metadata.itemId.length > 0

      return {
        pageContent: node.getContent(MetadataMode.NONE),
        score: results.similarities[index] ?? 0,
        metadata,
        itemId: hasItemId ? metadata.itemId : undefined,
        chunkId: node.id_
      }
    })

    if (base.rerankModelId) {
      return await rerankKnowledgeSearchResults(base, query, searchResults)
    }
    return searchResults
  }
}
await deferred.promise + }) + const queue = new KnowledgeAddQueue(1, executeAdd) + const base = createBase() + const item = createItem('item-1') + + const firstPromise = queue.enqueue(base, item) + const secondPromise = queue.enqueue(base, item) + + await vi.waitFor(() => { + expect(executeAdd).toHaveBeenCalledTimes(1) + }) + + deferred.resolve() + + await expect(Promise.all([firstPromise, secondPromise])).resolves.toEqual([undefined, undefined]) + }) + + it('interrupts pending and running items and returns their entries', async () => { + const deferred = createDeferred() + const executeAdd = vi.fn(async (entry) => { + if (entry.item.id === runningItem.id) { + await deferred.promise + } + + if (entry.interruptedBy) { + throw new Error('Knowledge task interrupted by item deletion') + } + }) + const queue = new KnowledgeAddQueue(1, executeAdd) + const base = createBase() + const runningItem = createItem('item-running') + const pendingItem = createItem('item-pending') + + const runningPromise = queue.enqueue(base, runningItem) + const pendingPromise = queue.enqueue(base, pendingItem) + + await vi.waitFor(() => { + expect(executeAdd).toHaveBeenCalledTimes(1) + }) + + const interruptedEntries = queue.interrupt( + [runningItem.id, pendingItem.id], + 'delete', + 'Knowledge task interrupted by item deletion' + ) + + expect(interruptedEntries.map((entry) => entry.item.id)).toEqual([runningItem.id, pendingItem.id]) + expect(executeAdd.mock.calls[0][0].interruptedBy).toBe('delete') + + deferred.resolve() + + await queue.waitForRunning([runningItem.id, pendingItem.id]) + + await expect(runningPromise).rejects.toThrow('Knowledge task interrupted by item deletion') + await expect(pendingPromise).rejects.toThrow('Knowledge task interrupted by item deletion') + }) + + it('rejects the public promise when executeAdd throws and continues with later work', async () => { + const queue = new KnowledgeAddQueue(1, async (entry) => { + if (entry.item.id === firstItem.id) { + throw new 
Error('execute failed') + } + }) + const base = createBase() + const firstItem = createItem('item-failed') + const secondItem = createItem('item-next') + + const firstPromise = queue.enqueue(base, firstItem) + const secondPromise = queue.enqueue(base, secondItem) + + await expect(firstPromise).rejects.toThrow('execute failed') + await expect(secondPromise).resolves.toBeUndefined() + }) +}) diff --git a/src/main/services/knowledge/runtime/__tests__/KnowledgeRuntimeService.test.ts b/src/main/services/knowledge/runtime/__tests__/KnowledgeRuntimeService.test.ts new file mode 100644 index 0000000000..a6bc37617c --- /dev/null +++ b/src/main/services/knowledge/runtime/__tests__/KnowledgeRuntimeService.test.ts @@ -0,0 +1,950 @@ +import type * as LifecycleModule from '@main/core/lifecycle' +import { getDependencies, getPhase } from '@main/core/lifecycle/decorators' +import { Phase } from '@main/core/lifecycle/types' +import { beforeEach, describe, expect, it, vi } from 'vitest' + +import { KnowledgeAddQueue } from '../KnowledgeAddQueue' +import { DELETE_INTERRUPTED_REASON, SHUTDOWN_INTERRUPTED_REASON } from '../utils/taskRuntime' + +const { + appGetMock, + createVectorStoreMock, + deleteVectorStoreMock, + embedManyMock, + getEmbedModelMock, + knowledgeItemGetCascadeIdsInBaseMock, + knowledgeItemUpdateMock, + loadKnowledgeItemDocumentsMock, + loggerErrorMock, + loggerWarnMock, + rerankKnowledgeSearchResultsMock, + vectorStoreAddMock, + vectorStoreDeleteMock, + vectorStoreQueryMock +} = vi.hoisted(() => ({ + appGetMock: vi.fn(), + createVectorStoreMock: vi.fn(), + deleteVectorStoreMock: vi.fn(), + embedManyMock: vi.fn(), + getEmbedModelMock: vi.fn(), + knowledgeItemGetCascadeIdsInBaseMock: vi.fn(), + knowledgeItemUpdateMock: vi.fn(), + loadKnowledgeItemDocumentsMock: vi.fn(), + loggerErrorMock: vi.fn(), + loggerWarnMock: vi.fn(), + rerankKnowledgeSearchResultsMock: vi.fn(), + vectorStoreAddMock: vi.fn(), + vectorStoreDeleteMock: vi.fn(), + vectorStoreQueryMock: vi.fn() +})) + 
+vi.mock('@main/core/application', () => ({ + application: { + get: appGetMock + } +})) + +vi.mock('@logger', () => ({ + loggerService: { + withContext: () => ({ + info: vi.fn(), + warn: loggerWarnMock, + error: loggerErrorMock, + debug: vi.fn() + }) + } +})) + +vi.mock('@main/core/lifecycle', async (importOriginal) => { + const actual = await importOriginal() + + class MockBaseService { + ipcHandle = vi.fn() + } + + return { + ...actual, + BaseService: MockBaseService + } +}) + +vi.mock('@data/services/KnowledgeItemService', () => ({ + knowledgeItemService: { + getCascadeIdsInBase: knowledgeItemGetCascadeIdsInBaseMock, + update: knowledgeItemUpdateMock + } +})) + +vi.mock('ai', () => ({ + embedMany: embedManyMock +})) + +vi.mock('../../readers/KnowledgeReader', () => ({ + loadKnowledgeItemDocuments: loadKnowledgeItemDocumentsMock +})) + +vi.mock('../../rerank/rerank', () => ({ + rerankKnowledgeSearchResults: rerankKnowledgeSearchResultsMock +})) + +vi.mock('../../utils/chunk', () => ({ + chunkDocuments: vi.fn((_, __, documents) => documents) +})) + +vi.mock('../../utils/embed', () => ({ + embedDocuments: vi.fn((_, chunks) => chunks) +})) + +vi.mock('../../utils/model', () => ({ + getEmbedModel: getEmbedModelMock +})) + +const { KnowledgeRuntimeService } = await import('..') + +function createBase() { + return { + id: 'kb-1', + name: 'KB', + dimensions: 1024, + embeddingModelId: 'ollama::nomic-embed-text', + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createDirectoryItem() { + return { + id: 'dir-1', + baseId: 'kb-1', + groupId: null, + type: 'directory' as const, + data: { name: 'docs', path: '/docs' }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createSitemapItem() { + return { + id: 'sitemap-1', + baseId: 'kb-1', + groupId: null, + type: 'sitemap' as const, + data: { url: 'https://example.com/sitemap.xml', name: 
'Example Sitemap' }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createNoteItem(id = 'note-1') { + return { + id, + baseId: 'kb-1', + groupId: null, + type: 'note' as const, + data: { content: `hello ${id}` }, + status: 'idle' as const, + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + } +} + +function createDeferred() { + let resolve!: (value: T | PromiseLike) => void + let reject!: (reason?: unknown) => void + const promise = new Promise((res, rej) => { + resolve = res + reject = rej + }) + + return { promise, resolve, reject } +} + +function createSingleConcurrencyQueue(service: InstanceType) { + return new KnowledgeAddQueue(1, (entry) => { + if ((service as any).isStopping) { + throw new Error(SHUTDOWN_INTERRUPTED_REASON) + } + + return (service as any).addRuntime.executeAdd(entry) + }) +} + +describe('KnowledgeRuntimeService', () => { + beforeEach(() => { + vi.clearAllMocks() + + appGetMock.mockImplementation((serviceName: string) => { + if (serviceName === 'KnowledgeVectorStoreService') { + return { + createStore: createVectorStoreMock, + deleteStore: deleteVectorStoreMock, + getStoreIfExists: createVectorStoreMock + } + } + + throw new Error(`Unexpected application.get(${serviceName}) in test`) + }) + createVectorStoreMock.mockResolvedValue({ + add: vectorStoreAddMock, + delete: vectorStoreDeleteMock, + query: vectorStoreQueryMock + }) + deleteVectorStoreMock.mockResolvedValue(undefined) + vectorStoreAddMock.mockResolvedValue(undefined) + vectorStoreDeleteMock.mockResolvedValue(undefined) + vectorStoreQueryMock.mockResolvedValue({ + nodes: [], + similarities: [], + ids: [] + }) + knowledgeItemGetCascadeIdsInBaseMock.mockImplementation(async (_baseId, itemIds: string[]) => [...new Set(itemIds)]) + knowledgeItemUpdateMock.mockImplementation(async (_id, dto) => dto) + 
loadKnowledgeItemDocumentsMock.mockImplementation(async (item) => [ + { text: item.id, metadata: { itemId: item.id } } + ]) + getEmbedModelMock.mockReturnValue({ provider: 'mock' }) + embedManyMock.mockResolvedValue({ embeddings: [[0.1, 0.2]] }) + rerankKnowledgeSearchResultsMock.mockImplementation(async (_base, _query, results) => results) + }) + + it('uses WhenReady phase and depends on KnowledgeVectorStoreService', () => { + expect(getPhase(KnowledgeRuntimeService)).toBe(Phase.WhenReady) + expect(getDependencies(KnowledgeRuntimeService)).toEqual(['KnowledgeVectorStoreService']) + }) + + it('maps vector search results into knowledge search results with metadata and chunk ids', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const query = 'hello' + const firstNode = { + id_: 'chunk-1', + metadata: { + itemId: 'item-1', + sourceUrl: 'https://example.com/1' + }, + getContent: vi.fn(() => 'page one') + } + const secondNode = { + id_: 'chunk-2', + metadata: { + itemId: '', + sourceUrl: 'https://example.com/2' + }, + getContent: vi.fn(() => 'page two') + } + + embedManyMock.mockResolvedValueOnce({ embeddings: [[0.9, 0.1]] }) + vectorStoreQueryMock.mockResolvedValueOnce({ + nodes: [firstNode, secondNode], + similarities: [0.8, 0.6], + ids: ['chunk-1', 'chunk-2'] + }) + + await expect(service.search(base, query)).resolves.toEqual([ + { + pageContent: 'page one', + score: 0.8, + metadata: { + itemId: 'item-1', + sourceUrl: 'https://example.com/1' + }, + itemId: 'item-1', + chunkId: 'chunk-1' + }, + { + pageContent: 'page two', + score: 0.6, + metadata: { + itemId: '', + sourceUrl: 'https://example.com/2' + }, + itemId: undefined, + chunkId: 'chunk-2' + } + ]) + + expect(getEmbedModelMock).toHaveBeenCalledWith(base) + expect(embedManyMock).toHaveBeenCalledWith({ + model: { provider: 'mock' }, + values: [query] + }) + expect(vectorStoreQueryMock).toHaveBeenCalledWith({ + queryStr: query, + queryEmbedding: [0.9, 0.1], + mode: 
'default', + similarityTopK: 10, + alpha: undefined + }) + expect(rerankKnowledgeSearchResultsMock).not.toHaveBeenCalled() + }) + + it('fails search when embedMany returns an empty embedding result', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + + embedManyMock.mockResolvedValueOnce({ embeddings: [] }) + + await expect(service.search(base, 'hello')).rejects.toThrow( + 'Failed to embed search query: model returned empty result' + ) + + expect(createVectorStoreMock).not.toHaveBeenCalled() + expect(vectorStoreQueryMock).not.toHaveBeenCalled() + }) + + it('marks directory items as failed instead of completed', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createDirectoryItem() + + await expect(service.addItems(base, [item])).rejects.toThrow( + 'Container knowledge items must be expanded into child items before indexing' + ) + + expect(loadKnowledgeItemDocumentsMock).not.toHaveBeenCalled() + expect(createVectorStoreMock).not.toHaveBeenCalled() + expect(vectorStoreAddMock).not.toHaveBeenCalled() + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'pending', + error: null + }) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'failed', + error: 'Container knowledge items must be expanded into child items before indexing' + }) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + }) + + it('marks sitemap items as failed instead of completed', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createSitemapItem() + + await expect(service.addItems(base, [item])).rejects.toThrow( + 'Container knowledge items must be expanded into child items before indexing' + ) + + expect(loadKnowledgeItemDocumentsMock).not.toHaveBeenCalled() + expect(createVectorStoreMock).not.toHaveBeenCalled() + 
expect(vectorStoreAddMock).not.toHaveBeenCalled() + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'pending', + error: null + }) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'failed', + error: 'Container knowledge items must be expanded into child items before indexing' + }) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + }) + + it('deduplicates add work for the same item', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('note-dup') + const loadDeferred = createDeferred>() + + loadKnowledgeItemDocumentsMock.mockImplementation(async (currentItem) => { + if (currentItem.id === item.id) { + return await loadDeferred.promise + } + + return [{ text: currentItem.id, metadata: { itemId: currentItem.id } }] + }) + + const firstAddPromise = service.addItems(base, [item]) + const secondAddPromise = service.addItems(base, [item]) + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledTimes(1) + }) + + loadDeferred.resolve([{ text: item.id, metadata: { itemId: item.id } }]) + + await expect(Promise.all([firstAddPromise, secondAddPromise])).resolves.toEqual([[undefined], [undefined]]) + expect(vectorStoreAddMock).toHaveBeenCalledTimes(1) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + }) + + it('keeps add startup atomic per item when another item fails before enqueue', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const startedItem = createNoteItem('note-started') + const failedItem = createNoteItem('note-failed') + const loadDeferred = createDeferred>() + + knowledgeItemUpdateMock.mockImplementation(async (id, dto) => { + if (id === failedItem.id && dto.status === 'pending') { + throw new Error('pending write failed') + } + + return dto + }) + 
loadKnowledgeItemDocumentsMock.mockImplementation(async (item) => { + if (item.id === startedItem.id) { + return await loadDeferred.promise + } + + return [{ text: item.id, metadata: { itemId: item.id } }] + }) + + const addPromise = service.addItems(base, [startedItem, failedItem]) + const addPromiseAssertion = expect(addPromise).rejects.toThrow('pending write failed') + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledWith(startedItem, expect.any(AbortSignal)) + }) + + await addPromiseAssertion + + expect(loadKnowledgeItemDocumentsMock).not.toHaveBeenCalledWith(failedItem) + + loadDeferred.resolve([{ text: startedItem.id, metadata: { itemId: startedItem.id } }]) + + await vi.waitFor(() => { + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(startedItem.id, { + status: 'completed', + error: null + }) + }) + }) + + it('removes pending items from the add queue before they start', async () => { + const service = new KnowledgeRuntimeService() + ;(service as any).addQueue = createSingleConcurrencyQueue(service) + + const base = createBase() + const runningItem = createNoteItem('note-running') + const pendingItem = createNoteItem('note-pending') + const loadDeferred = createDeferred>() + + loadKnowledgeItemDocumentsMock.mockImplementation(async (item) => { + if (item.id === runningItem.id) { + return await loadDeferred.promise + } + + return [{ text: item.id, metadata: { itemId: item.id } }] + }) + + const addPromise = service.addItems(base, [runningItem, pendingItem]) + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledWith(runningItem, expect.any(AbortSignal)) + }) + + await expect(service.deleteItems(base, [pendingItem])).resolves.toBeUndefined() + + loadDeferred.resolve([{ text: runningItem.id, metadata: { itemId: runningItem.id } }]) + + await expect(addPromise).rejects.toThrow(DELETE_INTERRUPTED_REASON) + expect(loadKnowledgeItemDocumentsMock).not.toHaveBeenCalledWith(pendingItem) + 
expect(vectorStoreDeleteMock).toHaveBeenCalledWith(pendingItem.id) + }) + + it('interrupts running add work before deleting vectors', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('note-delete') + const loadDeferred = createDeferred>() + + loadKnowledgeItemDocumentsMock.mockImplementation(async (currentItem) => { + if (currentItem.id === item.id) { + return await loadDeferred.promise + } + + return [{ text: currentItem.id, metadata: { itemId: currentItem.id } }] + }) + + const addPromise = service.addItems(base, [item]) + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledWith(item, expect.any(AbortSignal)) + }) + + let deleteResolved = false + const deletePromise = service.deleteItems(base, [item]).then(() => { + deleteResolved = true + }) + + expect(deleteResolved).toBe(false) + + loadDeferred.resolve([{ text: item.id, metadata: { itemId: item.id } }]) + + await deletePromise + + await expect(addPromise).rejects.toThrow(DELETE_INTERRUPTED_REASON) + expect(deleteResolved).toBe(true) + expect(vectorStoreAddMock).not.toHaveBeenCalled() + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(item.id) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(item.id, { + status: 'failed', + error: DELETE_INTERRUPTED_REASON + }) + }) + + it('interrupts running add work before deleting the base store', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('note-delete-base') + const loadDeferred = createDeferred>() + + loadKnowledgeItemDocumentsMock.mockImplementation(async (currentItem) => { + if (currentItem.id === item.id) { + return await loadDeferred.promise + } + + return [{ text: currentItem.id, metadata: { itemId: currentItem.id } }] + }) + + const addPromise = service.addItems(base, 
[item]) + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledWith(item, expect.any(AbortSignal)) + }) + + let deleteResolved = false + const deletePromise = service.deleteBase(base.id).then(() => { + deleteResolved = true + }) + + expect(deleteResolved).toBe(false) + + loadDeferred.resolve([{ text: item.id, metadata: { itemId: item.id } }]) + + await deletePromise + + await expect(addPromise).rejects.toThrow(DELETE_INTERRUPTED_REASON) + expect(deleteResolved).toBe(true) + expect(vectorStoreAddMock).not.toHaveBeenCalled() + expect(deleteVectorStoreMock).toHaveBeenCalledWith(base.id) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(item.id, { + status: 'failed', + error: DELETE_INTERRUPTED_REASON + }) + }) + + it('deletes vectors for cascade descendants when only the owner is passed in', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const ownerItem = createDirectoryItem() + const childItem = { + ...createNoteItem('note-child'), + groupId: ownerItem.id + } + + knowledgeItemGetCascadeIdsInBaseMock.mockResolvedValue([ownerItem.id, childItem.id]) + + await expect(service.deleteItems(base, [ownerItem])).resolves.toBeUndefined() + + expect(knowledgeItemGetCascadeIdsInBaseMock).toHaveBeenCalledWith(base.id, [ownerItem.id]) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(ownerItem.id) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(childItem.id) + }) + + it('deletes vectors when add already succeeded but completed status is still pending during delete', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('note-delete-after-add') + const completedUpdateDeferred = createDeferred() + + knowledgeItemUpdateMock.mockImplementation(async (_id, dto) => { + if (dto.status === 'completed') { + return await 
completedUpdateDeferred.promise + } + + return dto + }) + + const addPromise = service.addItems(base, [item]) + + await vi.waitFor(() => { + expect(vectorStoreAddMock).toHaveBeenCalledTimes(1) + }) + await vi.waitFor(() => { + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + }) + + let deleteResolved = false + const deletePromise = service.deleteItems(base, [item]).then(() => { + deleteResolved = true + }) + + expect(deleteResolved).toBe(false) + + completedUpdateDeferred.resolve({ + status: 'completed', + error: null + }) + + await deletePromise + + await expect(addPromise).rejects.toThrow(DELETE_INTERRUPTED_REASON) + expect(deleteResolved).toBe(true) + expect(vectorStoreAddMock).toHaveBeenCalledTimes(1) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(item.id) + }) + + it('interrupts mixed running and pending add work before deleting vectors in one batch', async () => { + const service = new KnowledgeRuntimeService() + ;(service as any).addQueue = createSingleConcurrencyQueue(service) + + const base = createBase() + const runningItem = createNoteItem('note-delete-running') + const pendingItem = createNoteItem('note-delete-pending') + const loadDeferred = createDeferred>() + + loadKnowledgeItemDocumentsMock.mockImplementation(async (item) => { + if (item.id === runningItem.id) { + return await loadDeferred.promise + } + + return [{ text: item.id, metadata: { itemId: item.id } }] + }) + + const addPromise = service.addItems(base, [runningItem, pendingItem]) + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledWith(runningItem, expect.any(AbortSignal)) + }) + + let deleteResolved = false + const deletePromise = service.deleteItems(base, [runningItem, pendingItem]).then(() => { + deleteResolved = true + }) + + expect(deleteResolved).toBe(false) + + loadDeferred.resolve([{ text: runningItem.id, metadata: { itemId: runningItem.id } }]) + + await deletePromise + + await 
expect(addPromise).rejects.toThrow(DELETE_INTERRUPTED_REASON) + expect(deleteResolved).toBe(true) + expect(loadKnowledgeItemDocumentsMock).not.toHaveBeenCalledWith(pendingItem) + expect(vectorStoreAddMock).not.toHaveBeenCalled() + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(runningItem.id) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(pendingItem.id) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(runningItem.id, { + status: 'completed', + error: null + }) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(pendingItem.id, { + status: 'completed', + error: null + }) + }) + + it('persists failed status even when vector cleanup throws', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('item-1') + + vectorStoreDeleteMock.mockRejectedValue(new Error('cleanup failed')) + const store = { + add: vi.fn().mockRejectedValue(new Error('vector add failed')), + delete: vectorStoreDeleteMock, + query: vi.fn() + } + createVectorStoreMock.mockResolvedValue(store) + + await expect(service.addItems(base, [item])).rejects.toThrow('vector add failed') + + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'failed', + error: 'vector add failed' + }) + expect(loggerWarnMock).toHaveBeenCalledWith('Failed to cleanup knowledge item vectors after add failure', { + baseId: base.id, + itemId: item.id, + cleanupError: 'cleanup failed' + }) + expect(store.add).toHaveBeenCalled() + }) + + it('keeps the original add error and still cleans up vectors when failed status persistence also fails', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('item-failed-status') + + const store = { + add: vi.fn().mockResolvedValue(undefined), + delete: vectorStoreDeleteMock, + query: vi.fn() + } + createVectorStoreMock.mockResolvedValue(store) + knowledgeItemUpdateMock.mockImplementation(async (_id, dto) => { + if 
(dto.status === 'completed') { + throw new Error('completed write failed') + } + + if (dto.status === 'failed') { + throw new Error('failed write failed') + } + + return dto + }) + + await expect(service.addItems(base, [item])).rejects.toThrow('completed write failed') + + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'failed', + error: 'completed write failed' + }) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(item.id) + expect(loggerErrorMock).toHaveBeenCalledWith( + 'Failed to persist knowledge item failure state', + expect.objectContaining({ message: 'failed write failed' }), + expect.objectContaining({ + baseId: base.id, + itemId: item.id, + itemType: item.type, + originalError: 'completed write failed' + }) + ) + }) + + it('continues stop cleanup when interrupted vector deletion fails', async () => { + const service = new KnowledgeRuntimeService() + ;(service as any).addQueue = createSingleConcurrencyQueue(service) + + const base = createBase() + const runningItem = createNoteItem('note-stop-cleanup-running') + const pendingItem = createNoteItem('note-stop-cleanup-pending') + const loadDeferred = createDeferred>() + + loadKnowledgeItemDocumentsMock.mockImplementation(async (item) => { + if (item.id === runningItem.id) { + return await loadDeferred.promise + } + + return [{ text: item.id, metadata: { itemId: item.id } }] + }) + + vectorStoreDeleteMock.mockImplementation(async (itemId: string) => { + if (itemId === pendingItem.id) { + throw new Error('interrupted cleanup failed') + } + }) + + const addPromise = service.addItems(base, [runningItem, pendingItem]) + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledWith(runningItem, expect.any(AbortSignal)) + }) + + const stopPromise = (service as any).onStop() + + loadDeferred.resolve([{ text: runningItem.id, metadata: { itemId: runningItem.id } }]) + + await expect(stopPromise).resolves.toBeUndefined() + await 
expect(addPromise).rejects.toThrow(SHUTDOWN_INTERRUPTED_REASON) + + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(runningItem.id) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(pendingItem.id) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(runningItem.id, { + status: 'failed', + error: SHUTDOWN_INTERRUPTED_REASON + }) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(pendingItem.id, { + status: 'failed', + error: SHUTDOWN_INTERRUPTED_REASON + }) + expect(loggerWarnMock).toHaveBeenCalledWith('Failed to delete knowledge item vectors during interruption cleanup', { + baseId: base.id, + itemIds: [runningItem.id, pendingItem.id], + succeededItemIds: [runningItem.id], + failedItemIds: [pendingItem.id], + cleanupError: `Failed to delete vectors for knowledge items in base ${base.id}: ${pendingItem.id}` + }) + }) + + it('deletes vectors on stop when add already succeeded but completed status is still pending', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('note-stop-after-add') + const completedUpdateDeferred = createDeferred() + + knowledgeItemUpdateMock.mockImplementation(async (_id, dto) => { + if (dto.status === 'completed') { + return await completedUpdateDeferred.promise + } + + return dto + }) + + const addPromise = service.addItems(base, [item]) + + await vi.waitFor(() => { + expect(vectorStoreAddMock).toHaveBeenCalledTimes(1) + }) + await vi.waitFor(() => { + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + }) + + let stopResolved = false + const stopPromise = (service as any).onStop().then(() => { + stopResolved = true + }) + + expect(stopResolved).toBe(false) + + completedUpdateDeferred.resolve({ + status: 'completed', + error: null + }) + + await stopPromise + + await expect(addPromise).rejects.toThrow(SHUTDOWN_INTERRUPTED_REASON) + expect(stopResolved).toBe(true) + 
expect(vectorStoreAddMock).toHaveBeenCalledTimes(1) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(item.id) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'failed', + error: SHUTDOWN_INTERRUPTED_REASON + }) + }) + + it('fails interrupted items on stop after deleting their vectors', async () => { + const service = new KnowledgeRuntimeService() + ;(service as any).addQueue = createSingleConcurrencyQueue(service) + + const base = createBase() + const runningItem = createNoteItem('note-stop-running') + const pendingItem = createNoteItem('note-stop-pending') + const loadDeferred = createDeferred>() + + loadKnowledgeItemDocumentsMock.mockImplementation(async (item) => { + if (item.id === runningItem.id) { + return await loadDeferred.promise + } + + return [{ text: item.id, metadata: { itemId: item.id } }] + }) + + const addPromise = service.addItems(base, [runningItem, pendingItem]) + + await vi.waitFor(() => { + expect(loadKnowledgeItemDocumentsMock).toHaveBeenCalledWith(runningItem, expect.any(AbortSignal)) + }) + + let stopResolved = false + const stopPromise = (service as any).onStop().then(() => { + stopResolved = true + }) + + expect(stopResolved).toBe(false) + + loadDeferred.resolve([{ text: runningItem.id, metadata: { itemId: runningItem.id } }]) + + await stopPromise + + await expect(addPromise).rejects.toThrow(SHUTDOWN_INTERRUPTED_REASON) + expect(stopResolved).toBe(true) + expect(vectorStoreAddMock).not.toHaveBeenCalled() + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(runningItem.id) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(pendingItem.id) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(runningItem.id, { + status: 'failed', + error: SHUTDOWN_INTERRUPTED_REASON + }) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(pendingItem.id, { + status: 'failed', + error: SHUTDOWN_INTERRUPTED_REASON + }) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(runningItem.id, { + status: 'completed', + 
error: null + }) + }) + + it('does not leave an item stuck in pending when stop happens during the pending write', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('note-stop-during-pending') + const pendingUpdateDeferred = createDeferred() + + knowledgeItemUpdateMock.mockImplementation(async (_id, dto) => { + if (dto.status === 'pending') { + return await pendingUpdateDeferred.promise + } + + return dto + }) + + const addPromise = service.addItems(base, [item]) + + await vi.waitFor(() => { + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'pending', + error: null + }) + }) + + let stopResolved = false + const stopPromise = (service as any).onStop().then(() => { + stopResolved = true + }) + + expect(stopResolved).toBe(false) + + pendingUpdateDeferred.resolve({ + status: 'pending', + error: null + }) + + await stopPromise + + await expect(addPromise).rejects.toThrow(SHUTDOWN_INTERRUPTED_REASON) + expect(stopResolved).toBe(true) + expect(loadKnowledgeItemDocumentsMock).not.toHaveBeenCalledWith(item, expect.any(AbortSignal)) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith(item.id, { + status: 'failed', + error: SHUTDOWN_INTERRUPTED_REASON + }) + expect(knowledgeItemUpdateMock).not.toHaveBeenCalledWith(item.id, { + status: 'completed', + error: null + }) + }) + + it('deduplicates repeated item ids during delete', async () => { + const service = new KnowledgeRuntimeService() + const base = createBase() + const item = createNoteItem('note-delete-dedupe') + + await expect(service.deleteItems(base, [item, item])).resolves.toBeUndefined() + + expect(vectorStoreDeleteMock).toHaveBeenCalledTimes(1) + expect(vectorStoreDeleteMock).toHaveBeenCalledWith(item.id) + }) +}) diff --git a/src/main/services/knowledge/runtime/index.ts b/src/main/services/knowledge/runtime/index.ts new file mode 100644 index 0000000000..e7f5b8f3bd --- /dev/null +++ 
/**
 * Builds the knowledge-base fixture shared by the cleanup tests.
 * A fresh object is returned on every call so tests can spread/override it
 * without affecting each other.
 */
function createBase() {
  const timestamp = '2026-04-08T00:00:00.000Z'

  const base = {
    id: 'kb-1',
    name: 'KB',
    dimensions: 1024,
    embeddingModelId: 'ollama::nomic-embed-text',
    createdAt: timestamp,
    updatedAt: timestamp
  }

  return base
}
exists for the base', async () => { + const base = createBase() + + getStoreIfExistsMock.mockResolvedValueOnce(undefined) + + await expect(deleteItemVectors(base, ['item-1'])).resolves.toBeUndefined() + + expect(getStoreIfExistsMock).toHaveBeenCalledWith(base) + expect(vectorDeleteMock).not.toHaveBeenCalled() + }) + + it('deduplicates item ids before deleting from an existing vector store', async () => { + const base = createBase() + + getStoreIfExistsMock.mockResolvedValueOnce({ + delete: vectorDeleteMock + }) + vectorDeleteMock.mockResolvedValue(undefined) + + await expect(deleteItemVectors(base, ['item-1', 'item-1', 'item-2'])).resolves.toBeUndefined() + + expect(getStoreIfExistsMock).toHaveBeenCalledWith(base) + expect(vectorDeleteMock).toHaveBeenCalledTimes(2) + expect(vectorDeleteMock).toHaveBeenCalledWith('item-1') + expect(vectorDeleteMock).toHaveBeenCalledWith('item-2') + }) + + it('keeps deleting remaining items and reports partial failures', async () => { + const base = createBase() + const deleteError = new Error('delete failed for item-2') + + getStoreIfExistsMock.mockResolvedValueOnce({ + delete: vectorDeleteMock + }) + vectorDeleteMock.mockImplementation(async (itemId: string) => { + if (itemId === 'item-2') { + throw deleteError + } + }) + + await expect(deleteItemVectors(base, ['item-1', 'item-2'])).rejects.toMatchObject({ + name: 'DeleteItemVectorsError', + message: 'Failed to delete vectors for knowledge items in base kb-1: item-2', + baseId: 'kb-1', + succeededItemIds: ['item-1'], + failed: [ + { + itemId: 'item-2', + error: deleteError + } + ] + }) + + expect(vectorDeleteMock).toHaveBeenCalledTimes(2) + expect(vectorDeleteMock).toHaveBeenCalledWith('item-1') + expect(vectorDeleteMock).toHaveBeenCalledWith('item-2') + }) + + it('logs partial vector cleanup failures and continues when continueOnError is enabled', async () => { + const base = createBase() + + getStoreIfExistsMock.mockResolvedValueOnce({ + delete: vectorDeleteMock + }) + 
vectorDeleteMock.mockImplementation(async (itemId: string) => { + if (itemId === 'item-2') { + throw new Error('delete failed for item-2') + } + }) + + await expect( + deleteVectorsForEntries( + [ + { + base, + item: { + id: 'item-1' + } + }, + { + base, + item: { + id: 'item-2' + } + } + ] as any, + { continueOnError: true } + ) + ).resolves.toBeUndefined() + + expect(loggerWarnMock).toHaveBeenCalledWith('Failed to delete knowledge item vectors during interruption cleanup', { + baseId: 'kb-1', + itemIds: ['item-1', 'item-2'], + succeededItemIds: ['item-1'], + failedItemIds: ['item-2'], + cleanupError: 'Failed to delete vectors for knowledge items in base kb-1: item-2' + }) + }) + + it('throws vector cleanup errors when continueOnError is disabled', async () => { + const base = createBase() + + getStoreIfExistsMock.mockResolvedValueOnce({ + delete: vectorDeleteMock + }) + vectorDeleteMock.mockRejectedValueOnce(new Error('delete failed for item-1')) + + await expect( + deleteVectorsForEntries( + [ + { + base, + item: { id: 'item-1' } + } + ] as any, + { continueOnError: false } + ) + ).rejects.toMatchObject({ + name: 'DeleteItemVectorsError', + baseId: 'kb-1', + failed: [ + { + itemId: 'item-1' + } + ] + }) + + expect(loggerWarnMock).not.toHaveBeenCalled() + }) + + it('groups entries by base before deleting vectors', async () => { + const firstBase = createBase() + const secondBase = { + ...createBase(), + id: 'kb-2' + } + + getStoreIfExistsMock + .mockResolvedValueOnce({ delete: vectorDeleteMock }) + .mockResolvedValueOnce({ delete: vectorDeleteMock }) + vectorDeleteMock.mockResolvedValue(undefined) + + await expect( + deleteVectorsForEntries( + [ + { base: firstBase, item: { id: 'item-1' } }, + { base: firstBase, item: { id: 'item-2' } }, + { base: secondBase, item: { id: 'item-3' } } + ] as any, + { continueOnError: true } + ) + ).resolves.toBeUndefined() + + expect(getStoreIfExistsMock).toHaveBeenNthCalledWith(1, firstBase) + 
expect(getStoreIfExistsMock).toHaveBeenNthCalledWith(2, secondBase) + expect(vectorDeleteMock).toHaveBeenCalledWith('item-1') + expect(vectorDeleteMock).toHaveBeenCalledWith('item-2') + expect(vectorDeleteMock).toHaveBeenCalledWith('item-3') + }) + + it('marks interrupted items as failed and logs persistence errors per item', async () => { + knowledgeItemUpdateMock.mockImplementation(async (itemId: string) => { + if (itemId === 'item-2') { + throw new Error('persist failed') + } + }) + + await expect(failItems(['item-1', 'item-2', 'item-1'], 'stop')).resolves.toBeUndefined() + + expect(knowledgeItemUpdateMock).toHaveBeenCalledTimes(2) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith('item-1', { + status: 'failed', + error: 'stop' + }) + expect(knowledgeItemUpdateMock).toHaveBeenCalledWith('item-2', { + status: 'failed', + error: 'stop' + }) + expect(loggerErrorMock).toHaveBeenCalledWith( + 'Failed to persist interrupted knowledge item state', + expect.objectContaining({ message: 'persist failed' }), + { + itemId: 'item-2', + reason: 'stop' + } + ) + }) +}) diff --git a/src/main/services/knowledge/runtime/utils/__tests__/taskRuntime.test.ts b/src/main/services/knowledge/runtime/utils/__tests__/taskRuntime.test.ts new file mode 100644 index 0000000000..1c9854afb0 --- /dev/null +++ b/src/main/services/knowledge/runtime/utils/__tests__/taskRuntime.test.ts @@ -0,0 +1,47 @@ +import { describe, expect, it, vi } from 'vitest' + +import { assertTaskActive, runAbortable, SHUTDOWN_INTERRUPTED_REASON } from '../taskRuntime' + +describe('taskRuntime', () => { + it('runs active work and returns the step result', async () => { + const step = vi.fn(async () => 'done') + + await expect( + runAbortable(() => false, { itemId: 'item-1', signal: new AbortController().signal }, step) + ).resolves.toBe('done') + expect(step).toHaveBeenCalledTimes(1) + }) + + it('throws the abort reason when the signal was aborted', () => { + const controller = new AbortController() + 
/** One item whose vector deletion failed, together with the error that was raised. */
interface DeleteItemVectorFailure {
  itemId: string
  error: Error
}

/**
 * Aggregate error describing a vector deletion pass over one knowledge base
 * in which at least one item could not be deleted.
 *
 * The message lists the failed item ids (comma separated); the ids that were
 * deleted successfully and the per-item failures are exposed as fields.
 */
class DeleteItemVectorsError extends Error {
  readonly baseId: string
  readonly succeededItemIds: string[]
  readonly failed: DeleteItemVectorFailure[]

  constructor(baseId: string, succeededItemIds: string[], failed: DeleteItemVectorFailure[]) {
    super(
      `Failed to delete vectors for knowledge items in base ${baseId}: ${failed.map(({ itemId }) => itemId).join(', ')}`
    )
    this.baseId = baseId
    this.succeededItemIds = succeededItemIds
    this.failed = failed
    // Explicit name so callers and tests can match on `error.name`.
    this.name = 'DeleteItemVectorsError'
  }
}
/**
 * Deletes vectors for the given item ids within one knowledge base.
 *
 * Duplicate ids are collapsed before any work is done. Nothing happens when
 * the id list is empty or when `getStoreIfExists` yields no store for the
 * base. All deletes are issued together and awaited with
 * `Promise.allSettled`, so one failing item does not prevent the others from
 * being attempted.
 *
 * @param base - Knowledge base whose vector store should be cleaned.
 * @param itemIds - Item ids whose vectors should be removed (may contain duplicates).
 * @returns Resolves once every delete has settled successfully.
 * @throws DeleteItemVectorsError when at least one delete rejected; the error
 *   carries the succeeded ids and the per-item failures.
 */
export async function deleteItemVectors(base: KnowledgeBase, itemIds: string[]): Promise<void> {
  const uniqueItemIds = [...new Set(itemIds)]
  if (uniqueItemIds.length === 0) {
    return
  }

  const vectorStoreService = application.get('KnowledgeVectorStoreService')
  const vectorStore = await vectorStoreService.getStoreIfExists(base)
  if (!vectorStore) {
    return
  }

  const results = await Promise.allSettled(uniqueItemIds.map((itemId) => vectorStore.delete(itemId)))
  const succeededItemIds: string[] = []
  const failed: DeleteItemVectorFailure[] = []

  // `results` is index-aligned with `uniqueItemIds`.
  for (const [index, result] of results.entries()) {
    const itemId = uniqueItemIds[index]
    if (result.status === 'fulfilled') {
      succeededItemIds.push(itemId)
      continue
    }

    failed.push({
      itemId,
      // Normalize non-Error rejection values so consumers always get an Error.
      error: result.reason instanceof Error ? result.reason : new Error(String(result.reason))
    })
  }

  if (failed.length > 0) {
    throw new DeleteItemVectorsError(base.id, succeededItemIds, failed)
  }
}
/** Error message used when a task is interrupted because the service is stopping. */
export const SHUTDOWN_INTERRUPTED_REASON = 'Knowledge task interrupted by service shutdown'
/** Error message used when a task is interrupted because its item is being deleted. */
export const DELETE_INTERRUPTED_REASON = 'Knowledge task interrupted by item deletion'

/** Per-task context passed to the interruption helpers. */
export interface RuntimeTaskContext {
  itemId: string
  signal: AbortSignal
}

/**
 * Runs one async runtime step with interruption checks before and after the
 * step body.
 *
 * `isStopping` is invoked again after the step completes, so a stop that
 * begins while the step is running is still observed.
 *
 * @param isStopping - Returns true while the owning service is shutting down.
 * @param ctx - Task context carrying the abort signal.
 * @param step - Work to execute; may be synchronous or return a promise.
 * @returns The value produced by `step`.
 * @throws Error when the task is interrupted before or after the step
 *   (see {@link assertTaskActive}); errors thrown by `step` propagate as-is.
 */
export async function runAbortable<T>(
  isStopping: () => boolean,
  ctx: RuntimeTaskContext,
  step: () => Promise<T> | T
): Promise<T> {
  assertTaskActive(isStopping, ctx)
  const result = await step()
  assertTaskActive(isStopping, ctx)
  return result
}

/**
 * Throws when the runtime has been interrupted by shutdown or abort signal.
 *
 * The abort signal is checked first. A non-empty string abort reason becomes
 * the error message; any other reason falls back to
 * `SHUTDOWN_INTERRUPTED_REASON`.
 *
 * @param isStopping - Returns true while the owning service is shutting down.
 * @param ctx - Task context carrying the abort signal.
 * @throws Error carrying the interruption reason as its message.
 */
export function assertTaskActive(isStopping: () => boolean, ctx: RuntimeTaskContext): void {
  if (ctx.signal.aborted) {
    const reason =
      typeof ctx.signal.reason === 'string' && ctx.signal.reason.length > 0
        ? ctx.signal.reason
        : SHUTDOWN_INTERRUPTED_REASON
    throw new Error(reason)
  }

  if (isStopping()) {
    throw new Error(SHUTDOWN_INTERRUPTED_REASON)
  }
}
/**
 * Creates a unique temporary directory for one directory-expansion test and
 * returns its path. Uses the real (unmocked) fs/os modules.
 */
function createTempRoot() {
  const prefix = path.join(realOs.tmpdir(), 'knowledge-directory-expand-')
  return realFs.mkdtempSync(prefix)
}
+ +describe('expandDirectoryOwnerToCreateItems', () => { + let tempRoot: string | undefined + + afterEach(() => { + if (tempRoot) { + realFs.rmSync(tempRoot, { recursive: true, force: true }) + tempRoot = undefined + } + }) + + it('expands a directory owner into child createMany dto items with preserved hierarchy', async () => { + tempRoot = createTempRoot() + const rootDir = path.join(tempRoot, 'anna') + const nestedDir = path.join(rootDir, 'agents', 'skills') + realFs.mkdirSync(nestedDir, { recursive: true }) + realFs.writeFileSync(path.join(rootDir, '.dockerignore'), 'node_modules') + realFs.writeFileSync(path.join(nestedDir, 'skill.md'), '# skill') + + const items = await expandDirectoryOwnerToCreateItems({ + id: 'dir-owner-1', + baseId: 'kb-1', + groupId: null, + type: 'directory', + data: { + name: 'anna', + path: rootDir + }, + status: 'idle', + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + }) + + const agentsDir = items.find((item) => item.type === 'directory' && item.data.path === path.join(rootDir, 'agents')) + const skillsDir = items.find( + (item) => item.type === 'directory' && item.data.path === path.join(rootDir, 'agents', 'skills') + ) + const rootFile = items.find( + (item) => item.type === 'file' && item.data.file.path === path.join(rootDir, '.dockerignore') + ) + const nestedFile = items.find( + (item) => item.type === 'file' && item.data.file.path === path.join(nestedDir, 'skill.md') + ) + + expect(agentsDir).toMatchObject({ + ref: 'dir:/agents', + groupId: 'dir-owner-1' + }) + expect(skillsDir).toMatchObject({ + ref: 'dir:/agents/skills', + groupRef: 'dir:/agents' + }) + expect(rootFile).toBeUndefined() + expect(nestedFile).toMatchObject({ + groupRef: 'dir:/agents/skills', + type: 'file' + }) + expect(nestedFile && nestedFile.type === 'file' ? 
nestedFile.data.file : undefined).toMatchObject({ + name: 'skill.md', + origin_name: 'skill.md', + path: path.join(nestedDir, 'skill.md'), + ext: '.md', + count: 1 + }) + }) +}) diff --git a/src/main/services/knowledge/utils/__tests__/sitemap.test.ts b/src/main/services/knowledge/utils/__tests__/sitemap.test.ts new file mode 100644 index 0000000000..0e09c94f41 --- /dev/null +++ b/src/main/services/knowledge/utils/__tests__/sitemap.test.ts @@ -0,0 +1,128 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest' + +const { fetchMock, loggerWarnMock } = vi.hoisted(() => ({ + fetchMock: vi.fn(), + loggerWarnMock: vi.fn() +})) + +vi.mock('@logger', () => ({ + loggerService: { + withContext: () => ({ + debug: vi.fn(), + info: vi.fn(), + warn: loggerWarnMock, + error: vi.fn() + }) + } +})) + +vi.mock('electron', () => ({ + net: { + fetch: fetchMock + } +})) + +const { expandSitemapOwnerToCreateItems } = await import('../sitemap') + +describe('expandSitemapOwnerToCreateItems', () => { + beforeEach(() => { + fetchMock.mockReset() + loggerWarnMock.mockReset() + }) + + it('creates deduplicated url child items for a sitemap owner', async () => { + fetchMock.mockResolvedValue( + new Response( + [ + '', + ' https://example.com/page-1', + ' https://example.com/page-2', + ' https://example.com/page-1', + '' + ].join(''), + { status: 200 } + ) + ) + + const items = await expandSitemapOwnerToCreateItems({ + id: 'sitemap-owner-1', + baseId: 'kb-1', + groupId: null, + type: 'sitemap', + data: { + url: 'https://example.com/sitemap.xml', + name: 'https://example.com/sitemap.xml' + }, + status: 'idle', + error: null, + createdAt: '2026-04-08T00:00:00.000Z', + updatedAt: '2026-04-08T00:00:00.000Z' + }) + + expect(items).toEqual([ + { + groupId: 'sitemap-owner-1', + type: 'url', + data: { + url: 'https://example.com/page-1', + name: 'https://example.com/page-1' + } + }, + { + groupId: 'sitemap-owner-1', + type: 'url', + data: { + url: 'https://example.com/page-2', + name: 
/**
 * Creates a promise together with its externally callable settle functions,
 * letting a test decide exactly when a mocked async call completes.
 *
 * @returns The pending `promise` plus the `resolve` / `reject` functions that settle it.
 */
function createDeferred<T>() {
  // Definite-assignment assertions: the Promise executor runs synchronously,
  // so both callbacks are assigned before this function returns.
  let resolve!: (value: T | PromiseLike<T>) => void
  let reject!: (reason?: unknown) => void
  const promise = new Promise<T>((res, rej) => {
    resolve = res
    reject = rej
  })

  return { promise, resolve, reject }
}
fetchCallIndex = 0 + + fetchMock.mockImplementation(async () => { + const deferred = deferredResponses[fetchCallIndex] + fetchCallIndex += 1 + if (!deferred) { + throw new Error('Unexpected fetch call') + } + + activeFetches += 1 + maxActiveFetches = Math.max(maxActiveFetches, activeFetches) + + try { + return await deferred.promise + } finally { + activeFetches -= 1 + } + }) + + const requests = [ + fetchKnowledgeWebPage('https://example.com/1'), + fetchKnowledgeWebPage('https://example.com/2'), + fetchKnowledgeWebPage('https://example.com/3'), + fetchKnowledgeWebPage('https://example.com/4'), + fetchKnowledgeWebPage('https://example.com/5') + ] + + await vi.waitFor(() => { + expect(fetchMock).toHaveBeenCalledTimes(3) + expect(activeFetches).toBe(3) + }) + + deferredResponses[0].resolve(new Response('page 1', { status: 200 })) + + await vi.waitFor(() => { + expect(fetchMock).toHaveBeenCalledTimes(4) + expect(maxActiveFetches).toBeLessThanOrEqual(3) + }) + + deferredResponses[1].resolve(new Response('page 2', { status: 200 })) + deferredResponses[2].resolve(new Response('page 3', { status: 200 })) + deferredResponses[3].resolve(new Response('page 4', { status: 200 })) + deferredResponses[4].resolve(new Response('page 5', { status: 200 })) + + await expect(Promise.all(requests)).resolves.toEqual(['page 1', 'page 2', 'page 3', 'page 4', 'page 5']) + expect(maxActiveFetches).toBeLessThanOrEqual(3) + }) +}) diff --git a/src/main/services/knowledge/utils/chunk.ts b/src/main/services/knowledge/utils/chunk.ts new file mode 100644 index 0000000000..9d6e3bdcb9 --- /dev/null +++ b/src/main/services/knowledge/utils/chunk.ts @@ -0,0 +1,32 @@ +import type { KnowledgeBase, KnowledgeItem } from '@shared/data/types/knowledge' +import { Document, type Document as VectorStoreDocument, SentenceSplitter } from '@vectorstores/core' + +/** + * Splits source documents into chunked vector-store documents and attaches + * knowledge-item metadata needed by downstream indexing steps. 
+ *
+ * Every chunk keeps the metadata of the document it came from and additionally
+ * records the owning item (`itemId`, `itemType`), the index of the source
+ * document, and the chunk's position and total count within that document.
+ */
+export function chunkDocuments(base: KnowledgeBase, item: KnowledgeItem, documents: VectorStoreDocument[]) {
+  // Chunk size and overlap come from the knowledge base configuration.
+  const splitter = new SentenceSplitter({
+    chunkSize: base.chunkSize,
+    chunkOverlap: base.chunkOverlap
+  })
+
+  return documents.flatMap((document, documentIndex) => {
+    // Empty chunks are dropped before indices and counts are assigned.
+    const chunks = splitter.splitText(document.text).filter(Boolean)
+
+    return chunks.map(
+      (chunk, chunkIndex) =>
+        new Document({
+          text: chunk,
+          metadata: {
+            ...document.metadata,
+            itemId: item.id,
+            itemType: item.type,
+            sourceDocumentIndex: documentIndex,
+            chunkIndex,
+            chunkCount: chunks.length
+          }
+        })
+    )
+  })
+}
diff --git a/src/main/services/knowledge/utils/config.ts b/src/main/services/knowledge/utils/config.ts
new file mode 100644
index 0000000000..bdce8c540b
--- /dev/null
+++ b/src/main/services/knowledge/utils/config.ts
@@ -0,0 +1,43 @@
+/**
+ * Knowledge model configuration helpers.
+ *
+ * Temporary knowledge-domain implementation.
+ * TODO: consolidate this parser into shared model utils after v2 settles.
+ */
+
+/** The two halves of a composite model id of the form `providerId::modelId`. */
+export interface CompositeModelRef {
+  providerId: string
+  modelId: string
+}
+
+const COMPOSITE_MODEL_SEPARATOR = '::'
+
+/**
+ * Parse a composite model id stored in DB as `providerId::modelId`.
+ *
+ * Current scope is knowledge-domain only.
+ * Future model-id parsing should be unified in shared model utils.
+ *
+ * @param value - Composite id to parse.
+ * @returns The provider id and model id found on either side of the separator.
+ * @throws Error when the value is empty, has no separator, has more than one
+ *   separator, has an empty side, or has whitespace at the start or end of
+ *   either side.
+ */
+export function parseCompositeModelId(value: string): CompositeModelRef {
+  const separatorIndex = value.indexOf(COMPOSITE_MODEL_SEPARATOR)
+  const lastSeparatorIndex = value.lastIndexOf(COMPOSITE_MODEL_SEPARATOR)
+
+  // Reject: empty input, missing separator or empty provider (index <= 0), more than one
+  // separator occurrence (first and last index differ), and an empty model part.
+  if (
+    !value ||
+    separatorIndex <= 0 ||
+    separatorIndex !== lastSeparatorIndex ||
+    separatorIndex + COMPOSITE_MODEL_SEPARATOR.length >= value.length
+  ) {
+    throw new Error(`Invalid composite model id "${value}". Expected format: "providerId::modelId".`)
+  }
+
+  const providerId = value.slice(0, separatorIndex).trim()
+  const modelId = value.slice(separatorIndex + COMPOSITE_MODEL_SEPARATOR.length).trim()
+
+  // Strict format guard: no leading/trailing spaces and no whitespace around separator.
+  // Re-joining the trimmed parts only reproduces the input when nothing was trimmed.
+  if (!providerId || !modelId || `${providerId}${COMPOSITE_MODEL_SEPARATOR}${modelId}` !== value) {
+    throw new Error(`Invalid composite model id "${value}". Expected format: "providerId::modelId".`)
+  }
+
+  return { providerId, modelId }
+}
diff --git a/src/main/services/knowledge/utils/directory.ts b/src/main/services/knowledge/utils/directory.ts
new file mode 100644
index 0000000000..42bca1c37e
--- /dev/null
+++ b/src/main/services/knowledge/utils/directory.ts
@@ -0,0 +1,145 @@
+import fs from 'node:fs/promises'
+import path from 'node:path'
+
+import { getFileType } from '@main/utils/file'
+import type { CreateKnowledgeItemsDto } from '@shared/data/api/schemas/knowledges'
+import type { FileMetadata } from '@shared/data/types/file'
+import type { KnowledgeItem } from '@shared/data/types/knowledge'
+import type { NotesTreeNode } from '@types'
+import { v4 as uuidv4 } from 'uuid'
+
+// Shape of a single entry accepted by the batch create-items DTO.
+type CreateKnowledgeItemInput = CreateKnowledgeItemsDto['items'][number]
+
+/**
+ * Recursively reads a directory tree and converts it into note-tree nodes.
+ *
+ * Entries whose name starts with `.` are skipped, as are entries that are
+ * neither regular files nor directories. Each node's `treePath` is the
+ * entry's path relative to `rootPath`, written with forward slashes and a
+ * leading `/`.
+ *
+ * @param dirPath - Directory to read.
+ * @param rootPath - Root used to compute `treePath`; defaults to `dirPath`.
+ * @returns One node per visible file or directory, with directories carrying their children.
+ */
+async function readDirectoryTree(dirPath: string, rootPath: string = dirPath): Promise<NotesTreeNode[]> {
+  const entries = await fs.readdir(dirPath, { withFileTypes: true })
+  const nodes: NotesTreeNode[] = []
+
+  for (const entry of entries) {
+    if (entry.name.startsWith('.')) {
+      continue
+    }
+
+    // Decide from the directory entry itself before touching the file system again.
+    // Symbolic links and other special entries are reported as neither file nor
+    // directory and never produce a node, so calling stat on them would only add a
+    // failure mode (for example a dangling link aborting the whole expansion).
+    const isDirectory = entry.isDirectory()
+    if (!isDirectory && !entry.isFile()) {
+      continue
+    }
+
+    const entryPath = path.join(dirPath, entry.name)
+    const stats = await fs.stat(entryPath)
+    const relativePath = path.relative(rootPath, entryPath)
+    // Normalize Windows separators so tree paths look the same on every platform.
+    const treePath = `/${relativePath.replace(/\\/g, '/')}`
+
+    if (isDirectory) {
+      nodes.push({
+        id: uuidv4(),
+        name: entry.name,
+        type: 'folder',
+        treePath,
+        externalPath: entryPath,
+        createdAt: stats.birthtime.toISOString(),
+        updatedAt: stats.mtime.toISOString(),
+        children: await readDirectoryTree(entryPath, rootPath)
+      })
+      continue
+    }
+
+    nodes.push({
+      id: uuidv4(),
+      name: entry.name,
+      type: 'file',
+      treePath,
+      externalPath: entryPath,
+      createdAt: stats.birthtime.toISOString(),
+      updatedAt: stats.mtime.toISOString()
+    })
+  }
+
+  return nodes
+}
+
+/**
+ * Builds file metadata for an external file path so it can be stored as a
+ * knowledge file item.
+ *
+ * The file is referenced in place: `path` is the given path, and `name` and
+ * `origin_name` are both its base name.
+ *
+ * @param filePath - Path of the file on disk.
+ * @returns Metadata with a freshly generated id and `count` set to 1.
+ */
+async function createExternalFileMetadata(filePath: string): Promise<FileMetadata> {
+  const stats = await fs.stat(filePath)
+  const originName = path.basename(filePath)
+  const ext = path.extname(originName)
+
+  return {
+    id: uuidv4(),
+    origin_name: originName,
+    name: originName,
+    path: filePath,
+    created_at: stats.birthtime.toISOString(),
+    size: stats.size,
+    ext,
+    type: getFileType(ext),
+    count: 1
+  }
+}
+
+// Parent link of a created item: either the id of an existing item (`groupId`) or the
+// `ref` of another item in the same batch (`groupRef`).
+type GroupingTarget = { groupId: string } | { groupRef: string }
+
+/**
+ * Flattens a directory node into create-item inputs while preserving the
+ * parent-child grouping relationship.
+ *
+ * A file node becomes a single `file` item. A folder node becomes a
+ * `directory` item followed by the flattened items of its children, which
+ * point back at the folder through `groupRef`. Any other node type yields
+ * no items.
+ *
+ * @param node - Tree node to flatten.
+ * @param parent - Grouping link copied onto the item created for `node`.
+ * @returns Items in depth-first order, each parent before its children.
+ */
+async function flattenDirectoryNode(node: NotesTreeNode, parent: GroupingTarget): Promise<CreateKnowledgeItemInput[]> {
+  if (node.type === 'file') {
+    return [
+      {
+        ...parent,
+        type: 'file',
+        data: {
+          file: await createExternalFileMetadata(node.externalPath)
+        }
+      }
+    ]
+  }
+
+  if (node.type !== 'folder') {
+    return []
+  }
+
+  // Batch-local reference that the children use as their `groupRef`.
+  const ref = node.treePath === '/' ? 'root' : `dir:${node.treePath}`
+  const items: CreateKnowledgeItemInput[] = [
+    {
+      ref,
+      ...parent,
+      type: 'directory',
+      data: {
+        name: node.name,
+        path: node.externalPath
+      }
+    }
+  ]
+
+  for (const child of node.children ?? []) {
+    // Append one by one: spreading a very large subtree into push() can exceed the
+    // engine's argument limit and throw a RangeError.
+    for (const item of await flattenDirectoryNode(child, { groupRef: ref })) {
+      items.push(item)
+    }
+  }
+
+  return items
+}
+
+/**
+ * Expands a directory owner item into a batch of child knowledge items that
+ * mirror the directory structure on disk.
+ *
+ * @param owner - Knowledge item of type `directory`; its `data.path` is the directory to read.
+ * @returns Create-item inputs whose top-level entries are grouped under `owner.id`.
+ * @throws Error when `owner` is not a `directory` item.
+ */
+export async function expandDirectoryOwnerToCreateItems(
+  owner: KnowledgeItem
+): Promise<CreateKnowledgeItemsDto['items']> {
+  if (owner.type !== 'directory') {
+    throw new Error(`Knowledge item '${owner.id}' must be type 'directory', received '${owner.type}'`)
+  }
+
+  const resolvedPath = path.resolve(owner.data.path)
+  const children = await readDirectoryTree(resolvedPath)
+  const items: CreateKnowledgeItemsDto['items'] = []
+
+  for (const child of children) {
+    // See flattenDirectoryNode: avoid spreading potentially huge arrays into push().
+    for (const item of await flattenDirectoryNode(child, { groupId: owner.id })) {
+      items.push(item)
+    }
+  }
+
+  return items
+}
diff --git a/src/main/services/knowledge/utils/embed.ts b/src/main/services/knowledge/utils/embed.ts
new file mode 100644
index 0000000000..63c394aca2
--- /dev/null
+++ b/src/main/services/knowledge/utils/embed.ts
@@ -0,0 +1,38 @@
+import type { EmbeddingModelV3 } from '@ai-sdk/provider'
+import { type Document as VectorStoreDocument, NodeRelationship, TextNode } from '@vectorstores/core'
+import { embedMany } from 'ai'
+
+/**
+ * Embeds chunked documents and converts them into vector-store text nodes.
+ *
+ * All document texts are embedded in a single `embedMany` call. Each resulting
+ * node carries the document's text and metadata plus a SOURCE relationship
+ * whose node id is the document's `metadata.itemId`.
+ *
+ * @param model - Embedding model used for the whole batch.
+ * @param documents - Chunked documents to embed; an empty list short-circuits without calling the model.
+ * @param signal - Optional abort signal forwarded to `embedMany`.
+ * @returns One text node per input document, in input order.
+ * @throws Error when the model returns a different number of embeddings than documents.
+ */
+export async function embedDocuments(
+  model: EmbeddingModelV3,
+  documents: VectorStoreDocument[],
+  signal?: AbortSignal
+): Promise<TextNode[]> {
+  if (documents.length === 0) {
+    return []
+  }
+
+  const values = documents.map((document) => document.text)
+  const result = await embedMany({
+    model,
+    values,
+    abortSignal: signal
+  })
+
+  // Embeddings are matched to documents by index, so a count mismatch would otherwise
+  // silently produce nodes without an embedding.
+  if (result.embeddings.length !== documents.length) {
+    throw new Error(
+      `Embedding count mismatch: expected ${documents.length}, received ${result.embeddings.length}`
+    )
+  }
+
+  return documents.map(
+    (document, index) =>
+      new TextNode({
+        text: document.text,
+        embedding: result.embeddings[index],
+        metadata: document.metadata,
+        relationships: {
+          [NodeRelationship.SOURCE]: {
+            nodeId: String(document.metadata.itemId),
+            metadata: document.metadata
+          }
+        }
+      })
+  )
+}
diff --git a/src/main/services/knowledge/utils/model.ts b/src/main/services/knowledge/utils/model.ts
new file mode 100644
index 0000000000..bf0d3ee00d
--- /dev/null
+++ b/src/main/services/knowledge/utils/model.ts
@@ -0,0 +1,24 @@
+import type { EmbeddingModelV3 } from '@ai-sdk/provider'
+import type { KnowledgeBase } from '@shared/data/types/knowledge'
+import { createOllama } from 'ollama-ai-provider-v2'
+
+import { parseCompositeModelId } from './config'
+
+/**
+ * Temporary knowledge-domain model resolver.
+ * TODO: unify model acquisition after ai-core moves into main.
+ *
+ * Resolves the embedding model configured on a knowledge base.
+ */
+export function getEmbedModel(base: KnowledgeBase): EmbeddingModelV3 {
+  const { providerId, modelId } = parseCompositeModelId(base.embeddingModelId)
+  // todo: wait model/provider pr merged
+  // const {baseUrl, apiKey} = model/provider.getxxx
+
+  // Only Ollama is wired up for now; any other provider id is rejected explicitly.
+  if (providerId !== 'ollama') {
+    throw new Error(`Unsupported embedding provider: ${providerId}`)
+  }
+
+  return createOllama().textEmbeddingModel(modelId)
+}
diff --git a/src/main/services/knowledge/utils/sitemap.ts b/src/main/services/knowledge/utils/sitemap.ts
new file mode 100644
index 0000000000..36a401d2e5
--- /dev/null
+++ b/src/main/services/knowledge/utils/sitemap.ts
@@ -0,0 +1,75 @@
+import { loggerService } from '@logger'
+import type { CreateKnowledgeItemsDto } from '@shared/data/api/schemas/knowledges'
+import type { KnowledgeItem } from '@shared/data/types/knowledge'
+import { net } from 'electron'
+import { XMLParser } from 'fast-xml-parser'
+
+import { sanitizeKnowledgeUrl } from './url'
+
+const logger = loggerService.withContext('KnowledgeSitemapExpansion')
+const DEFAULT_FETCH_TIMEOUT_MS = 30000
+const sitemapParser = new XMLParser()
+
+// Expected shape of a parsed sitemap. `url` is either a list of entries or a single entry.
+type ParsedSitemapDocument = {
+  urlset?: { url?: Array<{ loc?: string }> | { loc?: string } }
+}
+
+/**
+ * Normalizes sitemap url entries into a flat string list.
+ *
+ * Accepts a single entry or a list of entries. Locations are trimmed; entries
+ * whose `loc` is missing, blank, or not a string are skipped. The declared
+ * types say `loc` is a string, but the value comes from parsing a remote
+ * document, so it is checked at runtime instead of trusted.
+ *
+ * @param value - The `urlset.url` value of a parsed sitemap, if any.
+ * @returns Trimmed, non-empty locations in document order.
+ */
+function normalizeLocs(value: Array<{ loc?: string }> | { loc?: string } | undefined): string[] {
+  if (!value) {
+    return []
+  }
+
+  const entries = Array.isArray(value) ? value : [value]
+  const locs: string[] = []
+
+  for (const entry of entries) {
+    const loc: unknown = entry?.loc
+    if (typeof loc !== 'string') {
+      continue
+    }
+
+    const trimmedLoc = loc.trim()
+    if (trimmedLoc) {
+      locs.push(trimmedLoc)
+    }
+  }
+
+  return locs
+}
+
+/**
+ * Expands a sitemap owner item into child url items fetched from the remote
+ * sitemap document.
+ *
+ * The sitemap url and every `<loc>` found in the document are passed through
+ * `sanitizeKnowledgeUrl`; duplicate page urls are collapsed.
+ *
+ * @param owner - Knowledge item of type `sitemap`; its `data.url` is the sitemap to fetch.
+ * @returns One `url` create-item input per distinct page url, grouped under `owner.id`.
+ * @throws Error when `owner` is not a `sitemap` item, when a url is rejected by
+ *   `sanitizeKnowledgeUrl`, or when the sitemap request fails or returns a non-ok status.
+ */
+export async function expandSitemapOwnerToCreateItems(owner: KnowledgeItem): Promise<CreateKnowledgeItemsDto['items']> {
+  if (owner.type !== 'sitemap') {
+    throw new Error(`Knowledge item '${owner.id}' must be type 'sitemap', received '${owner.type}'`)
+  }
+
+  const sitemapUrl = owner.data.url
+
+  try {
+    const safeSitemapUrl = sanitizeKnowledgeUrl(sitemapUrl)
+
+    const response = await net.fetch(safeSitemapUrl, {
+      signal: AbortSignal.timeout(DEFAULT_FETCH_TIMEOUT_MS)
+    })
+
+    if (!response.ok) {
+      throw new Error(`Failed to read sitemap ${safeSitemapUrl}: HTTP ${response.status}`)
+    }
+
+    const xml = await response.text()
+    const parsed = sitemapParser.parse(xml) as ParsedSitemapDocument
+    const pageUrls = [...new Set(normalizeLocs(parsed.urlset?.url).map((url) => sanitizeKnowledgeUrl(url)))]
+
+    if (pageUrls.length === 0) {
+      logger.warn('Sitemap expansion produced no URLs', {
+        ownerId: owner.id,
+        sitemapUrl: safeSitemapUrl
+      })
+    }
+
+    return pageUrls.map((url) => ({
+      groupId: owner.id,
+      type: 'url' as const,
+      data: {
+        url,
+        name: url
+      }
+    }))
+  } catch (error) {
+    // Log with context, then rethrow the original error unchanged for the caller.
+    const normalizedError = error instanceof Error ? error : new Error(String(error))
+    logger.error(`Failed to expand sitemap: ${sitemapUrl}`, normalizedError)
+    throw error
+  }
+}
diff --git a/src/main/services/knowledge/utils/url.ts b/src/main/services/knowledge/utils/url.ts
new file mode 100644
index 0000000000..16ae5c08ee
--- /dev/null
+++ b/src/main/services/knowledge/utils/url.ts
@@ -0,0 +1,69 @@
+import { loggerService } from '@logger'
+import { net } from 'electron'
+import PQueue from 'p-queue'
+import { sanitizeUrl } from 'strict-url-sanitise'
+
+const logger = loggerService.withContext('KnowledgeWebSearch')
+const DEFAULT_FETCH_TIMEOUT_MS = 30000
+const JINA_READER_BASE_URL = 'https://r.jina.ai/'
+const KNOWLEDGE_WEB_FETCH_CONCURRENCY = 3
+const KNOWLEDGE_WEB_FETCH_INTERVAL_CAP = 10
+const KNOWLEDGE_WEB_FETCH_INTERVAL_MS = 60_000
+
+// Module-level queue shared by every caller: at most 3 fetches in flight and at most
+// 10 fetches started per 60 seconds.
+const knowledgeWebFetchQueue = new PQueue({
+  concurrency: KNOWLEDGE_WEB_FETCH_CONCURRENCY,
+  intervalCap: KNOWLEDGE_WEB_FETCH_INTERVAL_CAP,
+  interval: KNOWLEDGE_WEB_FETCH_INTERVAL_MS
+})
+
+/**
+ * Validates and sanitizes a url before it is used for a knowledge fetch.
+ *
+ * When the raw url has no path, query, or hash and was written without a
+ * trailing slash, the trailing slash is removed from the sanitized result so
+ * the url keeps the form the caller provided.
+ *
+ * @param rawUrl - Url as provided by the caller or found in a sitemap.
+ * @returns The sanitized url.
+ * @throws Error `Invalid knowledge url: <rawUrl>` when sanitizing or parsing fails.
+ */
+export function sanitizeKnowledgeUrl(rawUrl: string): string {
+  try {
+    const sanitizedUrl = sanitizeUrl(rawUrl)
+    const parsedRawUrl = new URL(rawUrl)
+
+    if (parsedRawUrl.pathname === '/' && !rawUrl.endsWith('/') && !parsedRawUrl.search && !parsedRawUrl.hash) {
+      return sanitizedUrl.replace(/\/$/, '')
+    }
+
+    return sanitizedUrl
+  } catch {
+    throw new Error(`Invalid knowledge url: ${rawUrl}`)
+  }
+}
+
+/**
+ * Fetches a knowledge web page through the Jina reader endpoint and returns
+ * the normalized markdown payload.
+ *
+ * Requests go through the shared fetch queue. Every request is bounded by the
+ * default timeout, which starts when the request leaves the queue; a caller
+ * signal, when given, can additionally cancel the request while it is queued
+ * or in flight.
+ *
+ * @param url - Page url; it is sanitized before use.
+ * @param signal - Optional caller abort signal.
+ * @returns The trimmed response body.
+ * @throws Error when the url is invalid, the request is aborted or times out,
+ *   or the upstream response is not ok.
+ */
+export async function fetchKnowledgeWebPage(url: string, signal?: AbortSignal): Promise<string> {
+  try {
+    const safeUrl = sanitizeKnowledgeUrl(url)
+
+    const response = await knowledgeWebFetchQueue.add(
+      async () => {
+        // Created inside the task so time spent waiting in the queue does not count.
+        const timeoutSignal = AbortSignal.timeout(DEFAULT_FETCH_TIMEOUT_MS)
+
+        return await net.fetch(`${JINA_READER_BASE_URL}${safeUrl}`, {
+          // Keep the timeout even when the caller passes a signal; otherwise a stalled
+          // request would hold one of the queue slots indefinitely.
+          signal: signal ? AbortSignal.any([signal, timeoutSignal]) : timeoutSignal,
+          headers: {
+            'X-Retain-Images': 'none',
+            'X-Return-Format': 'markdown'
+          }
+        })
+      },
+      signal ? { signal } : undefined
+    )
+    if (!response) {
+      throw new Error(`Knowledge web fetch queue returned no response for ${safeUrl}`)
+    }
+
+    if (!response.ok) {
+      throw new Error(`Failed to fetch knowledge web page ${safeUrl}: HTTP ${response.status}`)
+    }
+
+    const markdown = (await response.text()).trim()
+
+    return markdown
+  } catch (error) {
+    // Log with context, then rethrow the original error unchanged for the caller.
+    const normalizedError = error instanceof Error ? error : new Error(String(error))
+    logger.error(`Failed to load knowledge web page: ${url}`, normalizedError)
+    throw error
+  }
+}
diff --git a/src/main/services/knowledge/vectorstore/KnowledgeVectorStoreService.ts b/src/main/services/knowledge/vectorstore/KnowledgeVectorStoreService.ts
new file mode 100644
index 0000000000..68b70ee310
--- /dev/null
+++ b/src/main/services/knowledge/vectorstore/KnowledgeVectorStoreService.ts
@@ -0,0 +1,94 @@
+import { loggerService } from '@logger'
+import { BaseService, Injectable, Phase, ServicePhase } from '@main/core/lifecycle'
+import type { KnowledgeBase } from '@shared/data/types/knowledge'
+import type { BaseVectorStore } from '@vectorstores/core'
+import { LibSQLVectorStore } from '@vectorstores/libsql'
+
+import { libSqlVectorStoreProvider } from './providers/LibSqlVectorStoreProvider'
+
+const logger = loggerService.withContext('KnowledgeVectorStoreService')
+
+/**
+ * Owns the vector store instances of all knowledge bases: creates them through
+ * the LibSQL provider, caches one instance per knowledge base id, and closes
+ * them on deletion and on service stop.
+ */
+@Injectable('KnowledgeVectorStoreService')
+@ServicePhase(Phase.WhenReady)
+export class KnowledgeVectorStoreService extends BaseService {
+  /** Open store instances keyed by knowledge base id. */
+  private instanceCache = new Map<string, BaseVectorStore>()
+
+  /**
+   * Returns the cached store for the knowledge base, creating and caching it on first use.
+   */
+  async createStore(base: KnowledgeBase): Promise<BaseVectorStore> {
+    const cachedStore = this.instanceCache.get(base.id)
+    if (cachedStore) {
+      logger.debug('Reusing cached vector store', { baseId: base.id })
+      return cachedStore
+    }
+
+    // Cache is keyed only by base.id because store-shaping config is treated as immutable
+    // for an existing knowledge base. If embedding model / dimensions change, callers must
+    // migrate into a new knowledge base instead of mutating the existing one in place.
+    const store = await libSqlVectorStoreProvider.create(base)
+
+    // Another createStore() call for the same base may have finished while this one was
+    // awaiting the provider. Keep the instance that is already cached and close the
+    // duplicate so its client is not leaked.
+    const concurrentStore = this.instanceCache.get(base.id)
+    if (concurrentStore) {
+      try {
+        this.closeStoreInstance(store)
+      } catch (error) {
+        logger.error('Failed to close vector store', error as Error, { baseId: base.id })
+      }
+      logger.debug('Reusing cached vector store', { baseId: base.id })
+      return concurrentStore
+    }
+
+    this.instanceCache.set(base.id, store)
+    logger.info('Created vector store', {
+      baseId: base.id,
+      dimensions: base.dimensions,
+      cacheSize: this.instanceCache.size
+    })
+    return store
+  }
+
+  /**
+   * Returns the store for the knowledge base when it is cached or already exists on disk,
+   * and `undefined` otherwise. Never creates a store that does not exist yet.
+   */
+  async getStoreIfExists(base: KnowledgeBase): Promise<BaseVectorStore | undefined> {
+    const cachedStore = this.instanceCache.get(base.id)
+    if (cachedStore) {
+      logger.debug('Using cached vector store from getStoreIfExists', { baseId: base.id })
+      return cachedStore
+    }
+
+    const exists = await libSqlVectorStoreProvider.exists(base.id)
+    if (!exists) {
+      logger.debug('Vector store does not exist on disk', { baseId: base.id })
+      return undefined
+    }
+
+    logger.info('Opening existing vector store from disk', { baseId: base.id })
+    return await this.createStore(base)
+  }
+
+  /**
+   * Closes the cached store (if any) and deletes the knowledge base's store through the
+   * provider. The cache entry is evicted even when closing or deleting throws.
+   */
+  async deleteStore(baseId: string): Promise<void> {
+    const store = this.instanceCache.get(baseId)
+
+    try {
+      this.closeStoreInstance(store)
+      await libSqlVectorStoreProvider.delete(baseId)
+      logger.info('Deleted vector store', {
+        baseId,
+        hadCachedStore: Boolean(store)
+      })
+    } finally {
+      this.instanceCache.delete(baseId)
+    }
+  }
+
+  /**
+   * Closes every cached store and clears the cache. A failure to close one store is
+   * logged and does not prevent the remaining stores from being closed.
+   */
+  protected async onStop(): Promise<void> {
+    const storeCount = this.instanceCache.size
+    logger.info('Stopping vector stores', { storeCount })
+
+    try {
+      for (const [baseId, store] of this.instanceCache.entries()) {
+        try {
+          this.closeStoreInstance(store)
+        } catch (error) {
+          logger.error('Failed to close vector store', error as Error, { baseId })
+        }
+      }
+    } finally {
+      this.instanceCache.clear()
+      logger.info('Stopped vector stores', { storeCount })
+    }
+  }
+
+  /** Closes the underlying client of a LibSQL store; other store types are left untouched. */
+  private closeStoreInstance(store: BaseVectorStore | undefined): void {
+    if (!store) {
+      return
+    }
+
+    if (store instanceof LibSQLVectorStore) {
+      store.client().close()
+    }
+  }
+}
diff --git a/src/main/services/knowledge/vectorstore/__tests__/KnowledgeVectorStoreService.test.ts b/src/main/services/knowledge/vectorstore/__tests__/KnowledgeVectorStoreService.test.ts
new file mode 100644
index 0000000000..0fb67d3a82
--- /dev/null
+++ b/src/main/services/knowledge/vectorstore/__tests__/KnowledgeVectorStoreService.test.ts
@@ -0,0 +1,201 @@
+import type * as LifecycleModule from '@main/core/lifecycle'
+import { beforeEach, describe, expect, it, vi } from 'vitest'
+
+const { loggerDebugMock, loggerErrorMock, loggerInfoMock, providerCreateMock, providerDeleteMock, providerExistsMock } =
+  vi.hoisted(() => ({
+    loggerDebugMock: vi.fn(),
+    loggerErrorMock: vi.fn(),
+    loggerInfoMock: vi.fn(),
+    providerCreateMock: vi.fn(),
+    providerDeleteMock: vi.fn(),
+    providerExistsMock: vi.fn()
+  }))
+
+// Keep the real lifecycle exports (decorators, Phase) and only swap BaseService for an
+// empty class so the service can be constructed directly in tests.
+vi.mock('@main/core/lifecycle', async (importOriginal) => {
+  const actual = await importOriginal<typeof LifecycleModule>()
+
+  class MockBaseService {}
+
+  return {
+    ...actual,
+    BaseService: MockBaseService
+  }
+})
+
+// Minimal stand-in for LibSQLVectorStore: `client().close` is a per-instance mock that
+// tests can replace to observe or fail the close call.
+vi.mock('@vectorstores/libsql', () => {
+  class MockLibSQLVectorStore {
+    closeMock = vi.fn()
+
+    client() {
+      return {
+        close: this.closeMock
+      }
+    }
+  }
+
+  return {
+    LibSQLVectorStore: MockLibSQLVectorStore
+  }
+})
+
+vi.mock('@logger', () => ({
+  loggerService: {
+    withContext: () => ({
+      debug: loggerDebugMock,
+      info: loggerInfoMock,
+      error: loggerErrorMock
+    })
+  }
+}))
+
+vi.mock('../providers/LibSqlVectorStoreProvider', () => ({
+  libSqlVectorStoreProvider: {
+    create: providerCreateMock,
+    delete: providerDeleteMock,
+    exists: providerExistsMock
+  }
+}))
+
+// Imported after the mocks are registered so the service picks up the mocked modules.
+const { KnowledgeVectorStoreService } = await import('../KnowledgeVectorStoreService')
+const { LibSQLVectorStore } = await import('@vectorstores/libsql')
+
+/** Builds a knowledge base fixture with the given id. */
+function createBase(id = 'kb-1') {
+  return {
+    id,
+    name: 'KB',
+    dimensions: 1024,
+    embeddingModelId: 'ollama::nomic-embed-text',
+    createdAt: '2026-04-08T00:00:00.000Z',
+    updatedAt: '2026-04-08T00:00:00.000Z'
+  }
+}
+
+/** Builds a mocked store whose client `close` is the given mock. */
+function createStore(closeMock = vi.fn()) {
+  const store = new LibSQLVectorStore({})
+  ;(store as unknown as { closeMock: () => void }).closeMock = closeMock
+  return store
+}
+
+describe('KnowledgeVectorStoreService', () => {
+  beforeEach(() => {
+    vi.clearAllMocks()
+    providerExistsMock.mockResolvedValue(false)
+  })
+
+  it('evicts a cached store even when provider delete fails', async () => {
+    const service = new KnowledgeVectorStoreService()
+    const base = createBase()
+    const firstCloseMock = vi.fn()
+    const firstStore = createStore(firstCloseMock)
+    const secondStore = createStore()
+
+    providerCreateMock.mockResolvedValueOnce(firstStore).mockResolvedValueOnce(secondStore)
+    providerDeleteMock.mockRejectedValueOnce(new Error('delete failed'))
+
+    await expect(service.createStore(base)).resolves.toBe(firstStore)
+    await expect(service.deleteStore(base.id)).rejects.toThrow('delete failed')
+    await expect(service.createStore(base)).resolves.toBe(secondStore)
+
+    expect(firstCloseMock).toHaveBeenCalledTimes(1)
+    expect(providerCreateMock).toHaveBeenCalledTimes(2)
+    expect(loggerInfoMock).toHaveBeenCalledWith('Created vector store', {
+      baseId: base.id,
+      dimensions: base.dimensions,
+      cacheSize: 1
+    })
+  })
+
+  it('clears cached stores during stop after closing them', async () => {
+    const service = new KnowledgeVectorStoreService()
+    const firstStore = createStore()
+    const secondCloseMock = vi.fn()
+    const secondStore = createStore(secondCloseMock)
+
+    providerCreateMock.mockResolvedValueOnce(firstStore).mockResolvedValueOnce(secondStore)
+
+    await service.createStore(createBase('kb-1'))
+    await service.createStore(createBase('kb-2'))
+
+    await (service as any).onStop()
+
+    const replacementStore = createStore()
+    providerCreateMock.mockResolvedValueOnce(replacementStore)
+
+    await expect(service.createStore(createBase('kb-2'))).resolves.toBe(replacementStore)
+    expect(secondCloseMock).toHaveBeenCalledTimes(1)
+    expect(loggerInfoMock).toHaveBeenCalledWith('Stopping vector stores', { storeCount: 2 })
+    expect(loggerInfoMock).toHaveBeenCalledWith('Stopped vector stores', { storeCount: 2 })
+  })
+
+  it('continues closing remaining stores when one close fails during stop', async () => {
+    const service = new KnowledgeVectorStoreService()
+    const firstCloseError = new Error('close failed')
+    const firstCloseMock = vi.fn(() => {
+      throw firstCloseError
+    })
+    const secondCloseMock = vi.fn()
+    const firstStore = createStore(firstCloseMock)
+    const secondStore = createStore(secondCloseMock)
+
+    providerCreateMock.mockResolvedValueOnce(firstStore).mockResolvedValueOnce(secondStore)
+
+    await service.createStore(createBase('kb-1'))
+    await service.createStore(createBase('kb-2'))
+
+    await expect((service as any).onStop()).resolves.toBeUndefined()
+
+    expect(firstCloseMock).toHaveBeenCalledTimes(1)
+    expect(secondCloseMock).toHaveBeenCalledTimes(1)
+    expect(loggerErrorMock).toHaveBeenCalledWith('Failed to close vector store', firstCloseError, {
+      baseId: 'kb-1'
+    })
+
+    const replacementStore = createStore()
+    providerCreateMock.mockResolvedValueOnce(replacementStore)
+
+    await expect(service.createStore(createBase('kb-2'))).resolves.toBe(replacementStore)
+  })
+
+  it('returns undefined from getStoreIfExists when no cached store or backing file exists', async () => {
+    const service = new KnowledgeVectorStoreService()
+    const base = createBase()
+
+    providerExistsMock.mockResolvedValueOnce(false)
+
+    await expect(service.getStoreIfExists(base)).resolves.toBeUndefined()
+
+    expect(providerExistsMock).toHaveBeenCalledWith(base.id)
+    expect(providerCreateMock).not.toHaveBeenCalled()
+    expect(loggerDebugMock).toHaveBeenCalledWith('Vector store does not exist on disk', { baseId: base.id })
+  })
+
+  it('opens an existing store from disk when getStoreIfExists detects a backing file', async () => {
+    const service = new KnowledgeVectorStoreService()
+    const base = createBase()
+    const store = createStore()
+
+    providerExistsMock.mockResolvedValueOnce(true)
+    providerCreateMock.mockResolvedValueOnce(store)
+
+    await expect(service.getStoreIfExists(base)).resolves.toBe(store)
+
+    expect(providerExistsMock).toHaveBeenCalledWith(base.id)
+    expect(providerCreateMock).toHaveBeenCalledWith(base)
+    expect(loggerInfoMock).toHaveBeenCalledWith('Opening existing vector store from disk', { baseId: base.id })
+  })
+
+  it('returns the cached store from getStoreIfExists without probing the provider', async () => {
+    const service = new KnowledgeVectorStoreService()
+    const base = createBase()
+    const store = createStore()
+
+    providerCreateMock.mockResolvedValueOnce(store)
+    await expect(service.createStore(base)).resolves.toBe(store)
+
+    await expect(service.getStoreIfExists(base)).resolves.toBe(store)
+
+    expect(providerExistsMock).not.toHaveBeenCalled()
+    expect(providerCreateMock).toHaveBeenCalledTimes(1)
+    expect(loggerDebugMock).toHaveBeenCalledWith('Using cached vector store from getStoreIfExists', { baseId: base.id })
+  })
+})
diff --git a/src/main/services/knowledge/vectorstore/providers/BaseVectorStoreProvider.ts b/src/main/services/knowledge/vectorstore/providers/BaseVectorStoreProvider.ts
new file mode 100644
index 0000000000..29290ca66a
--- /dev/null
+++ b/src/main/services/knowledge/vectorstore/providers/BaseVectorStoreProvider.ts
@@ -0,0 +1,8 @@
+import type { KnowledgeBase } from '@shared/data/types/knowledge'
+import type { BaseVectorStore } from '@vectorstores/core'
+
+/**
+ * Contract for a vector store backend: create (or open) the store of a
+ * knowledge base, delete it, and check whether it exists.
+ */
+export abstract class BaseVectorStoreProvider {
+  abstract create(base: KnowledgeBase): Promise<BaseVectorStore>
+  abstract delete(baseId: string): Promise<void>
+  abstract exists(baseId: string): Promise<boolean>
+}
diff --git a/src/main/services/knowledge/vectorstore/providers/LibSqlVectorStoreProvider.ts b/src/main/services/knowledge/vectorstore/providers/LibSqlVectorStoreProvider.ts
new file mode 100644
index 0000000000..b4a9a73788
--- /dev/null
+++ b/src/main/services/knowledge/vectorstore/providers/LibSqlVectorStoreProvider.ts
@@ -0,0 +1,62 @@
+import fs from 'node:fs'
+import { pathToFileURL } from 'node:url'
+
+import { loggerService } from '@logger'
+import { application } from '@main/core/application'
+import { sanitizeFilename } from
'@main/utils/file'
+import type { KnowledgeBase } from '@shared/data/types/knowledge'
+import type { BaseVectorStore } from '@vectorstores/core'
+import { LibSQLVectorStore } from '@vectorstores/libsql'
+
+import type { BaseVectorStoreProvider } from './BaseVectorStoreProvider'
+
+const logger = loggerService.withContext('LibSqlVectorStoreProvider')
+
+/**
+ * Vector store provider backed by one local LibSQL database file per
+ * knowledge base. The file location is derived from the knowledge base id.
+ */
+export class LibSqlVectorStoreProvider implements BaseVectorStoreProvider {
+  /**
+   * Creates a LibSQL vector store bound to the knowledge base's database file, using the
+   * base id as collection name and the base's configured dimensions.
+   */
+  async create(base: KnowledgeBase): Promise<BaseVectorStore> {
+    const dbPath = await this.getKnowledgeBaseFilePath(base.id)
+
+    return new LibSQLVectorStore({
+      collection: base.id,
+      dimensions: base.dimensions,
+      clientConfig: {
+        url: pathToFileURL(dbPath).toString()
+      }
+    })
+  }
+
+  /**
+   * Removes the knowledge base's database file. A missing file is not an error
+   * (`force: true`); any other failure is logged and rethrown.
+   */
+  async delete(baseId: string): Promise<void> {
+    const dbPath = await this.getKnowledgeBaseFilePath(baseId)
+
+    try {
+      // NOTE(review): only the main database file is removed here; confirm whether the
+      // LibSQL client can leave sidecar files next to it that also need cleanup.
+      await fs.promises.rm(dbPath, { force: true })
+    } catch (error) {
+      logger.error('Failed to delete knowledge base vector store file', error as Error, {
+        baseId,
+        dbPath
+      })
+      throw error
+    }
+  }
+
+  /**
+   * Reports whether the knowledge base's database file exists and is a regular file.
+   * A missing path yields `false`; any other stat error is rethrown.
+   */
+  async exists(baseId: string): Promise<boolean> {
+    const dbPath = await this.getKnowledgeBaseFilePath(baseId)
+
+    try {
+      const stat = await fs.promises.stat(dbPath)
+      return stat.isFile()
+    } catch (error) {
+      if ((error as NodeJS.ErrnoException).code === 'ENOENT') {
+        return false
+      }
+
+      throw error
+    }
+  }
+
+  /** Resolves the database file path for a knowledge base id; the id is sanitized for use as a file name. */
+  private async getKnowledgeBaseFilePath(baseId: string): Promise<string> {
+    return application.getPath('feature.knowledgebase.data', sanitizeFilename(baseId, '_'))
+  }
+}
+
+/** Shared provider instance used by the vector store service. */
+export const libSqlVectorStoreProvider = new LibSqlVectorStoreProvider()
diff --git a/src/preload/index.ts b/src/preload/index.ts
index 6925b7f77b..6b5504eee4 100644
--- a/src/preload/index.ts
+++ b/src/preload/index.ts
@@ -25,6 +25,7 @@ import type {
   UnifiedPreferenceType,
   UpgradeChannel
 } from '@shared/data/preference/preferenceTypes'
+import type { KnowledgeSearchResult as KnowledgeVectorSearchResult } from '@shared/data/types/knowledge'
 import type { ExternalAppInfo } from
'@shared/externalApp/types' import { IpcChannel } from '@shared/IpcChannel' import type { @@ -350,6 +351,18 @@ const api = { results }) }, + knowledgeRuntime: { + createBase: (baseId: string): Promise => + ipcRenderer.invoke(IpcChannel.KnowledgeRuntime_CreateBase, { baseId }), + deleteBase: (baseId: string): Promise => + ipcRenderer.invoke(IpcChannel.KnowledgeRuntime_DeleteBase, { baseId }), + addItems: (baseId: string, itemIds: string[]): Promise => + ipcRenderer.invoke(IpcChannel.KnowledgeRuntime_AddItems, { baseId, itemIds }), + deleteItems: (baseId: string, itemIds: string[]): Promise => + ipcRenderer.invoke(IpcChannel.KnowledgeRuntime_DeleteItems, { baseId, itemIds }), + search: (baseId: string, query: string): Promise => + ipcRenderer.invoke(IpcChannel.KnowledgeRuntime_Search, { baseId, query }) + }, memory: { add: (messages: string | AssistantMessage[], options?: AddMemoryOptions) => ipcRenderer.invoke(IpcChannel.Memory_Add, messages, options), diff --git a/src/renderer/src/windows/migrationV2/i18n/locales.ts b/src/renderer/src/windows/migrationV2/i18n/locales.ts index cb235b099c..cb8357b762 100644 --- a/src/renderer/src/windows/migrationV2/i18n/locales.ts +++ b/src/renderer/src/windows/migrationV2/i18n/locales.ts @@ -66,7 +66,8 @@ export const zhCN = { migrated_boot_config: '已迁移 {{processed}}/{{total}} 条启动配置', migrated_chats: '已迁移 {{processed}}/{{total}} 个对话,{{messages}} 条消息', migrated_preferences: '已迁移 {{processed}}/{{total}} 条配置', - migrated_knowledge: '已迁移 {{processed}}/{{total}} 条知识库记录' + migrated_knowledge: '已迁移 {{processed}}/{{total}} 条知识库记录', + migrated_knowledge_vectors: '已迁移 {{processed}}/{{total}} 个知识库向量工作单元' }, migration_completed: { title: '数据迁移完成!', @@ -162,7 +163,8 @@ export const enUS = { migrated_boot_config: 'Migrated {{processed}}/{{total}} boot config items', migrated_chats: 'Migrated {{processed}}/{{total}} conversations, {{messages}} messages', migrated_preferences: 'Migrated {{processed}}/{{total}} preferences', - migrated_knowledge: 
'Migrated {{processed}}/{{total}} knowledge records' + migrated_knowledge: 'Migrated {{processed}}/{{total}} knowledge records', + migrated_knowledge_vectors: 'Migrated {{processed}}/{{total}} knowledge vector work units' }, migration_completed: { title: 'Data Migration Completed!', diff --git a/tsconfig.node.json b/tsconfig.node.json index 95c48929cc..586439d12b 100644 --- a/tsconfig.node.json +++ b/tsconfig.node.json @@ -15,7 +15,9 @@ "@modelcontextprotocol/sdk/*": ["./node_modules/@modelcontextprotocol/sdk/dist/esm/*"], "@shared/*": ["./packages/shared/*"], "@test-mocks/*": ["./tests/__mocks__/*"], - "@types": ["./src/renderer/src/types/index.ts"] + "@types": ["./src/renderer/src/types/index.ts"], + "@vectorstores/libsql": ["./packages/vectorstores/libsql/src/index.ts"], + "@vectorstores/libsql/*": ["./packages/vectorstores/libsql/src/*"] }, "tsBuildInfoFile": ".tsbuildinfo/tsconfig.node.tsbuildinfo", "types": ["electron-vite/node", "vitest/globals"], @@ -33,6 +35,7 @@ "packages/mcp-trace/**/*", "packages/provider-registry/**/*", "packages/shared/**/*", + "packages/vectorstores/**/*", "tests/__mocks__/**/*" ] } diff --git a/tsconfig.web.json b/tsconfig.web.json index 67fa887f96..137b4cd7fc 100644 --- a/tsconfig.web.json +++ b/tsconfig.web.json @@ -23,7 +23,9 @@ "@renderer/*": ["./src/renderer/src/*"], "@shared/*": ["./packages/shared/*"], "@test-mocks/*": ["./tests/__mocks__/*"], - "@types": ["./src/renderer/src/types/index.ts"] + "@types": ["./src/renderer/src/types/index.ts"], + "@vectorstores/libsql": ["./packages/vectorstores/libsql/src/index.ts"], + "@vectorstores/libsql/*": ["./packages/vectorstores/libsql/src/*"] }, "tsBuildInfoFile": ".tsbuildinfo/tsconfig.web.tsbuildinfo", "useDefineForClassFields": true @@ -41,6 +43,7 @@ "packages/mcp-trace/**/*", "packages/provider-registry/**/*", "packages/shared/**/*", + "packages/vectorstores/**/*", "packages/ui/**/*" ], "exclude": ["packages/aiCore/src/**/__tests__/**"] diff --git 
a/v2-refactor-temp/docs/knowledge/knowledge-backend-decisions.md b/v2-refactor-temp/docs/knowledge/knowledge-backend-decisions.md new file mode 100644 index 0000000000..02fde25df5 --- /dev/null +++ b/v2-refactor-temp/docs/knowledge/knowledge-backend-decisions.md @@ -0,0 +1,596 @@ +# 知识库后端当前实现说明 + +本文档只记录 `src/main/services/knowledge` 当前已经落地的后端分层、调用边界和 runtime 编排行为。 + +它的目标不是描述理想方案,而是把当前代码中的稳定事实说明清楚,方便后续 v2 重构继续收敛。 + +## 1. 当前架构图 + +```text ++----------------------------------------------------------------------------------+ +| Callers | +| | +| UI (Data API) UI / preload IPC / main-side calls | ++------------------------------------------+---------------------------------------+ + | + +--------------------------+ +-----------------------------------+ + | Data API | | KnowledgeOrchestrationService | + | knowledge handlers | | caller-facing workflow facade | + +-------------+------------+ +-----------------+-----------------+ + | | + v v + +--------------------------+ +---------------------------+ + | KnowledgeBaseService |<---------| KnowledgeItemService | + | base data logic | | item data + status | + +-------------+------------+ +-------------+-------------+ + | | + v v + +----------------------+ +---------------------------+ + | SQLite / Drizzle | | KnowledgeRuntimeService | + +----------------------+ | runtime execution / queue | + +-------------+-------------+ + | + v + +---------------------------+ + | reader / chunk / embed / | + | rerank / vectorstore | + +-------------+-------------+ + | + v + +------------------------+ + | LibSQL vector store | + +------------------------+ +``` + +当前知识库后端已经分成三层: + +1. `KnowledgeBaseService` / `KnowledgeItemService` + - 负责 SQLite 中的知识库业务主数据 CRUD + - 负责 `knowledge_item.status` / `error` 的持久化更新 +2. `KnowledgeOrchestrationService` + - 负责对外 workflow 编排 + - 负责统一 caller-facing IPC + - 负责把 expand / create / add / delete / search 串成单次调用入口 +3. 
`KnowledgeRuntimeService` + - 负责 runtime 执行 + - 负责 reader / chunk / embedding / vector store 调用串联 + - 负责队列、中断、stop 清理和检索执行 + +## 2. Data Service 的定位 + +`src/main/data/services/KnowledgeBaseService.ts` 和 `src/main/data/services/KnowledgeItemService.ts` 属于 data services。 + +它们负责: + +1. SQLite 业务表读写 +2. DTO 校验后的数据落库 +3. `knowledge_item.data` 与 `type` 的一致性校验 +4. item 状态与错误信息的持久化 + +它们不负责: + +1. reader 调度 +2. embedding 调用 +3. 向量库写入与检索 +4. runtime queue 管理 + +## 3. `KnowledgeRuntimeService` 的定位 + +当前 runtime/vector 侧的底层执行 service 是 `KnowledgeRuntimeService`,不是旧文档中的 `KnowledgeService`。 + +对应实现: + +- `src/main/services/knowledge/runtime/KnowledgeRuntimeService.ts` +- `src/main/core/application/serviceRegistry.ts` + +它是一个 lifecycle service: + +1. `@Injectable('KnowledgeRuntimeService')` +2. `@ServicePhase(Phase.WhenReady)` +3. 已注册到应用 service registry + +它当前对内部调用方暴露的核心能力是: + +1. `createBase(base)` +2. `deleteBase(baseId)` +3. `addItems(base, items)` +4. `deleteItems(base, items)` +5. `search(base, query)` + +它负责: + +1. item 级索引任务入队与执行 +2. `knowledge_item.status` 的有限状态推进 +3. 失败与中断原因写回数据库 +4. 向量库实例的获取、删除和清理 +5. 检索后的 rerank 串联 +6. stop / delete 时的 queue 中断与向量清理补偿 + +它不负责: + +1. `knowledge_base` / `knowledge_item` 的主数据 CRUD +2. caller-facing IPC workflow 编排 +3. `directory` / `sitemap` owner item 的对外展开入口 +4. 持久化任务队列 +5. 自动重试 +6. 恢复未完成索引任务继续执行 +7. 暴露调度器内部概念给调用方 + +## 3.1 `KnowledgeOrchestrationService` 的定位 + +当前对外 workflow facade 是 `KnowledgeOrchestrationService`。 + +对应实现: + +- `src/main/services/knowledge/KnowledgeOrchestrationService.ts` +- `src/main/core/application/serviceRegistry.ts` + +它是一个 lifecycle service: + +1. `@Injectable('KnowledgeOrchestrationService')` +2. `@ServicePhase(Phase.WhenReady)` +3. 已注册到应用 service registry + +它当前对外暴露的核心 IPC 能力是: + +1. `createBase(baseId)` +2. `deleteBase(baseId)` +3. `addItems(baseId, itemIds)` +4. `deleteItems(baseId, itemIds)` +5. `search(baseId, query)` + +它负责: + +1. 统一 caller-facing knowledge runtime IPC +2. 对传入 item ids 做主数据读取 +3. 
对 `directory` / `sitemap` owner item 做内部 expand +4. 通过 `KnowledgeItemService.createMany()` 持久化 expanded child items +5. 过滤真正可索引的 leaf items,再交给 `KnowledgeRuntimeService.addItems()` +6. 协调 runtime 与 data service 的调用顺序 + +它不负责: + +1. 直接执行 reader / chunk / embed / vector write +2. 直接持有 queue +3. 直接持有 vector store 实例 + +## 4. 当前调用边界与调用方契约 + +### 4.1 UI + +```text +UI + | + +--> Data API -> knowledge handlers -> KnowledgeBaseService / KnowledgeItemService + | + \--> preload IPC -> KnowledgeOrchestrationService + -> KnowledgeRuntimeService +``` + +当前实现要求调用方明确区分两条调用路径: + +1. Data API + - 负责 `knowledge_base` / `knowledge_item` 的持久化 CRUD + - 负责调用方显式创建的 owner item / leaf item 主数据创建 + - 负责 `knowledge_item.status` / `error` 的持久化读写 +2. runtime IPC + - 负责统一的 knowledge workflow 入口 + - 负责必要时在 main process 内部展开 `directory` / `sitemap` + - 负责索引入队、向量写入和删除 + - 负责检索 + +当前 Data API 侧稳定接口是: + +1. `GET /knowledge-bases` +2. `POST /knowledge-bases` +3. `GET /knowledge-bases/:id` +4. `PATCH /knowledge-bases/:id` +5. `DELETE /knowledge-bases/:id` +6. `GET /knowledge-bases/:id/items` +7. `POST /knowledge-bases/:id/items` +8. `GET /knowledge-items/:id` +9. `PATCH /knowledge-items/:id` +10. `DELETE /knowledge-items/:id` + +preload 已暴露的 runtime IPC 通道是: + +1. `knowledge-runtime:create-base` +2. `knowledge-runtime:delete-base` +3. `knowledge-runtime:add-items` +4. `knowledge-runtime:delete-items` +5. `knowledge-runtime:search` + +### 4.1.1 Leaf item 的调用链 + +`file` / `url` / `note` 这类可直接索引的 leaf item,调用方应走: + +```text +caller + -> Data API create item(s) + -> get created item ids + -> preload IPC add-items(item ids) +``` + +也就是说: + +1. 先通过 Data API 创建持久化 `knowledge_item` +2. 再把 Data API 返回的 item ids 传给 runtime `addItems` +3. runtime 不负责替调用方补建 leaf item 主数据 +4. 
runtime `addItems` 的输入语义是“已经存在于 SQLite 中的 item ids” + +批量添加 files 时,当前契约就是: + +```text +caller + -> Data API create file items + -> get created file item ids + -> preload IPC add-items(file item ids) +``` + +### 4.1.2 Container item 的调用链 + +`directory` / `sitemap` 当前已经收口为与 leaf item 相同的“两步调用模型”。 + +当前调用方应使用: + +```text +caller + -> Data API create owner item + -> preload IPC add-items(owner item ids) +``` + +也就是说: + +1. owner item 的主数据创建仍然走 Data API +2. 对外 IPC 不再暴露 `expand*`,而是由 `KnowledgeOrchestrationService.addItems()` 在内部判断 owner item 类型 +3. 如果传入的是 `directory` / `sitemap` owner item,orchestration 会: + - expand owner + - 通过 `KnowledgeItemService.createMany()` 持久化 child items + - 过滤出 indexable leaf items + - 调用 `KnowledgeRuntimeService.addItems()` 入队索引 +4. `groupId` / `groupRef` 的职责仍然是把 owner / child / nested child 的持久化关系写进 `knowledge_item` +5. 当前调用方不再需要自己显式执行 “expand -> create children -> filter -> add” 这四步 + +这个边界是当前实现的硬约束: + +1. expand 仍然负责生成要创建的持久化 items +2. child item 的持久化仍然通过 `KnowledgeItemService.createMany()` 写入 SQLite +3. `KnowledgeRuntimeService` 仍然只负责编排可索引 items 的读取 / 切块 / embedding / vector write +4. orchestration 只是把上述步骤收口到一次 caller-facing IPC,不改变 data/runtime 的最终边界 +5. mixed batch 可用于持久化树结构,但不等于 mixed batch 可直接进入 runtime 索引队列 + +这个调用链仍然符合“Data Service 负责主数据,Runtime 负责索引执行,Orchestration 负责 workflow 收口”的分层,不属于边界漂移。 + +`directory` / `sitemap` 的当前内部流程可以进一步写成: + +```text +directory/sitemap + -> Data API create owner + -> IPC add-items(owner item ids) + -> orchestration expand owner + -> orchestration create expanded items + -> orchestration filter indexable leaf items + -> runtime add-items(indexable child items) +``` + +### 4.1.3 删除链路的当前约束 + +删除场景同样需要区分持久化删除与 runtime 删除。 + +item 删除时,调用方应理解为两件独立的事: + +1. runtime IPC `delete-items` + - 通过 orchestration 进入删除 workflow + - 中断 pending / running add task + - 删除 item 及其级联子项的向量 +2. 
Data API `DELETE /knowledge-items/:id` + - 删除 SQLite 中的 `knowledge_item` + - 依赖数据库 cascade 删除 grouped descendants + +base 删除时,调用方同样需要区分两步: + +1. runtime IPC `delete-base` + - 通过 orchestration 进入删除 workflow + - 中断该 base 下相关 add task + - 删除对应 vector store +2. Data API `DELETE /knowledge-bases/:id` + - 删除 SQLite 中的 base 和关联 items + +当前实现下,Data API 删除并不会替调用方清理向量库,也不会替调用方中断 runtime 任务。 + +### 4.2 Main 进程内部调用 + +主进程内部其他模块如果需要 caller-facing workflow 能力,应优先调用 `KnowledgeOrchestrationService`。 + +主进程内部如果已经明确持有 leaf items 且只需要底层索引执行能力,可以直接调用 `KnowledgeRuntimeService`。 + +主进程内部如果需要业务主数据能力,应直接调用 `KnowledgeBaseService` / `KnowledgeItemService`。 + +## 5. 当前 Queue 模型 + +### 5.1 已落地行为 + +当前实现使用一个进程内自定义 add queue: + +1. queue 持有者是 `KnowledgeRuntimeService` +2. queue 为单实例 in-memory queue +3. 默认 `concurrency = 5` +4. 所有 base 的 add item 任务共用这一条 queue +5. delete 行为不会进入 queue,而是先中断相关 add 任务,再直接删除向量 + +当前实现没有落地以下旧设计假设: + +1. 不是“每个 knowledge base 一条串行 queue” +2. 不是 round-robin scheduler +3. 没有全局持久化任务表 + +### 5.2 当前可观测状态 + +当前 queue 内部维护的是一份 `entries` map,entry 上记录: + +1. `item.id` +2. `status = pending | running` +3. `controller` +4. `promise` +5. `interruptedBy` + +它们的作用仅是: + +1. 跟踪哪些 add 任务仍在等待执行 +2. 跟踪哪些 add 任务正在运行 +3. 在 delete / shutdown 时中断对应任务 +4. 在 shutdown 时识别哪些 item 被中断并做失败补偿 + +这些状态都只是 runtime 内部实现细节,不是对外数据模型的一部分。 + +### 5.3 入队行为 + +`addItems(base, items)` 当前行为: + +1. 对传入的每个 item 分别先写 `status = pending` +2. 同时清空该 item 的旧 `error` +3. 每个 item 在自己的状态写入成功后,立即作为一个 add task 入队 +4. 如果同一个 item 已经在 pending 或 running 中,再次 enqueue 会直接复用已有 promise,不会重复入队 +5. 当前实现不是“整批状态先全部落库,再统一开始 enqueue”的原子批次启动模型 +6. 因此如果某个 item 在写 `pending` 或 enqueue 之前失败,其他已经成功启动的 item 仍可能继续执行 + +`deleteItems(base, items)` 当前行为: + +1. 不更新 item 状态 +2. 先对同 id 的 pending / running add task 做 interrupt +3. 等待相关 running add task settle +4. 直接删除这些 item 对应的向量 + +当前有: + +1. item 级 add 去重保护 +2. delete / stop 中断 add task 的机制 + +当前没有: + +1. 优先级队列 +2. 暂停 / 恢复 API +3. 自动重试 + +## 6. 
当前索引执行链路 + +一个 `knowledge_item` 的一次索引流程,当前是: + +```text +addItems + -> status = pending + -> queue task + -> loadKnowledgeItemDocuments(item) + -> chunkDocuments(base, item, documents) + -> getEmbedModel(base) + -> embedDocuments(model, chunks) + -> vectorStore.add(nodes) + -> status = completed +``` + +任意步骤抛错时: + +```text +catch error + -> status = failed + -> error = normalizedError.message + -> 向上抛出异常 +``` + +当前还没有落地 `fileProcessorId` 的执行链路。代码中这一段仍然是 `// todo file processing`。 + +## 7. `knowledge_item.status` 的当前实现边界 + +### 7.1 枚举定义 + +schema 和共享类型仍然保留完整状态集合: + +1. `idle` +2. `pending` +3. `file_processing` +4. `read` +5. `embed` +6. `completed` +7. `failed` + +### 7.2 当前 runtime 实际写入 + +`KnowledgeRuntimeService` 当前真正写入的状态只有: + +1. 入队前写 `pending` +2. 成功完成写 `completed` +3. 任意失败或 shutdown 中断写 `failed` + +也就是说: + +1. `file_processing` / `read` / `embed` 目前仍是预留状态 +2. 它们已进入 schema,但当前 runtime 尚未推进到这些中间态 + +这部分必须在文档中明确,因为旧文档把这些状态当成“当前已经落地的推进链路”,但实现并非如此。 + +## 8. Lifecycle 行为 + +`KnowledgeRuntimeService` 已经接入 lifecycle system,当前行为如下。 + +### 8.1 `onInit` + +当前做以下事情: + +1. `isStopping = false` +2. `addQueue.reset()` + +当前没有启动时“扫描中间状态并补偿失败”的逻辑。 + +### 8.2 `onStop` + +当前 stop 流程是: + +1. `isStopping = true` +2. 调用 `addQueue.interruptAll('stop', SHUTDOWN_INTERRUPTED_REASON)` +3. 收集中断的 entries 和 itemIds +4. 等待相关 running add task settle +5. best-effort 删除这些被中断 item 已写入的向量 +6. 将这些 item 批量写为 `failed` + +这意味着: + +1. 当前做了停止时的失败补偿 +2. 当前会在 stop 时清理被中断 item 的向量残留 +3. 但没有做重启后的自动恢复 + +## 9. Reader / Chunk / Embed / Search 的当前边界 + +### 9.1 Reader + +reader 由 `loadKnowledgeItemDocuments(item)` 按 `item.type` 分派: + +1. `file` -> `KnowledgeFileReader` +2. `url` -> `KnowledgeUrlReader` +3. `note` -> `KnowledgeNoteReader` +4. `sitemap` -> `KnowledgeSitemapReader` +5. `directory` -> `KnowledgeDirectoryReader` + +当前各 reader 的实际行为: + +1. `file` + - 按扩展名选择 reader + - 已支持 `pdf` / `csv` / `docx` / `epub` / `json` / `md` / `draftsexport` + - 其他扩展名回退到 `TextFileReader` +2.
`url` + - 通过 `https://r.jina.ai/` 抓取 markdown + - 元数据中保留 `itemId` / `itemType` / `sourceUrl` / `name` +3. `note` + - 直接把 `content` 包成一个 `Document` +4. `sitemap` + - 当前已保留 `KnowledgeSitemapReader` 代码路径 + - 但 runtime 侧暂时不直接索引 `sitemap` item + - 当前调用方会先创建 sitemap owner,再通过 runtime IPC 将其展开为具体 `url` item,再进入索引流程 +5. `directory` + - 当前只作为 container placeholder + - reader 会记录 warning 并返回空数组 + - 也就是说它不会直接产出可索引文档,调用方需要先创建 directory owner,再通过 runtime IPC 将其展开为具体子 item + +### 9.2 Chunk + +`chunkDocuments(base, item, documents)` 当前做的事情: + +1. 使用 `SentenceSplitter` +2. 读取 `base.chunkSize` 和 `base.chunkOverlap` +3. 为每个 chunk 写入元数据: + - `itemId` + - `itemType` + - `sourceDocumentIndex` + - `chunkIndex` + - `chunkCount` + +### 9.3 Embed + +`getEmbedModel(base)` 当前只支持: + +1. 从 `embeddingModelId` 解析 `providerId::modelId` +2. 仅接受 `providerId === 'ollama'` + +其他 provider 当前会直接抛错。 + +`embedDocuments(model, documents)` 当前会: + +1. 用 `embedMany` 批量生成 embeddings +2. 构造 `TextNode` +3. 在 `NodeRelationship.SOURCE` 上写回 `itemId` + +### 9.4 Search + +`search(base, query)` 当前链路是: + +```text +embed query + -> vectorStore.query(...) + -> map nodes into KnowledgeSearchResult[] + -> rerankKnowledgeSearchResults(base, query, results) +``` + +查询参数来自 base: + +1. `mode = base.searchMode ?? 'default'` +2. `similarityTopK = base.documentCount ?? 10` +3. `alpha = base.hybridAlpha` + +### 9.5 Rerank 的当前真实状态 + +当前 rerank 代码路径已经存在,但 runtime 配置解析尚未接通: + +1. `base.rerankModelId` 为空时直接跳过 +2. `resolveRerankRuntime(base)` 目前始终返回 `null` +3. 因此当前 search 实际上总是返回原始检索结果,不会真正发起 rerank 请求 + +换句话说,rerank 是“代码壳已存在,但还未真正启用”。 + +## 10. `KnowledgeVectorStoreService` 的边界 + +`KnowledgeVectorStoreService` 当前负责 runtime vector store 的最小缓存和生命周期管理。 + +它负责: + +1. 按 `base.id` 创建或复用 store +2. 删除单个 base 的 store 文件 +3. shutdown 时关闭所有已缓存 store + +它当前的重要约束是: + +1. cache key 只有 `base.id` +2. 默认把 store shaping 配置视为不可变 +3. 
如果 `embeddingModelId` / `dimensions` 发生变化,调用方应迁移到新的 knowledge base,而不是原地修改同一个 base 对应的向量文件 + +当前实际 provider 是 `LibSqlVectorStoreProvider`: + +1. 向量文件路径位于 `application.getPath('feature.knowledgebase.data', sanitizeFilename(baseId, '_'))` +2. 删除 base 时会删除对应文件 + +## 11. 当前明确不做的内容 + +当前实现没有做: + +1. 每个 base 一条串行 queue +2. round-robin scheduler +3. 独立的 `KnowledgeTaskService` +4. 独立的 `KnowledgeExecutionService` +5. 持久化任务队列 +6. 自动恢复索引继续执行 +7. 自动重试 +8. chunk 级 queue +9. runtime 在 `addItems` 内对 `directory` / `sitemap` item 做隐式自动展开 +10. 真正可用的 rerank runtime 配置接入 +11. 非 `ollama` embedding provider 支持 +12. `fileProcessorId` 驱动的文件处理链路 + +## 12. 后续更新本文档时的原则 + +后续只有在以下行为真正落地之后,才应更新本文档: + +1. runtime queue 从单队列改成 per-base queue +2. 中间状态 `file_processing` / `read` / `embed` 真的开始持久化写入 +3. rerank runtime 配置真正接通 +4. `fileProcessorId` 开始参与 runtime 执行链路 +5. runtime 在 `addItems` 中原生接管 `directory` / `sitemap` item 的隐式展开与索引编排 + +在这些行为落地之前,文档应继续以“当前已实现”为准,不提前写成目标设计。 diff --git a/v2-refactor-temp/docs/knowledge/knowledge-schema.md b/v2-refactor-temp/docs/knowledge/knowledge-schema.md index 2a67ed5b45..fd9f5373d5 100644 --- a/v2-refactor-temp/docs/knowledge/knowledge-schema.md +++ b/v2-refactor-temp/docs/knowledge/knowledge-schema.md @@ -73,11 +73,24 @@ This document records the current V2 knowledge target schema, migration constrai - optional query filters: `type`, `groupId` - Current runtime create flow uses: - `POST /knowledge-bases/:id/items` - - request bodies may carry `groupId` - - `groupId` may only point to an already existing owner item in the same knowledge base - - creating a new owner item and its grouped members is therefore a two-step flow in the current contract: - - create the owner item first - - create grouped members afterwards with `groupId = ownerItem.id` + - request bodies may carry `groupId`, `ref`, and `groupRef` + - `groupId` may point to an already existing owner item in the same knowledge base + - `ref` is an optional request-local reference key for one newly created item in the current batch +
- `groupRef` is an optional request-local owner reference that points to another item's `ref` in the same batch + - `ref` and `groupRef` are request-level helper fields only: + - they are not persisted to SQLite + - they are resolved by the DataApi/service layer before insert + - the persisted relationship is still `groupId = ownerItem.id` + - `groupId` and `groupRef` are mutually exclusive on one item + - `groupRef` must resolve to a `ref` present in the same request batch + - one request batch may therefore create: + - a new owner item and its grouped members together + - a multi-level same-base grouping tree + - the current create contract rejects invalid batch-local grouping: + - duplicate `ref` values in one request batch + - missing `groupRef` targets + - self-references + - cycles within one request batch - Current runtime update flow uses: - `PATCH /knowledge-items/:id` - mutable fields may include `data`, `status`, `error` @@ -132,6 +145,54 @@ This document records the current V2 knowledge target schema, migration constrai - otherwise -> `idle` - Temporary legacy states such as in-progress or failed processing are not preserved as V2 status during migration. +## Runtime Status Boundary + +- `knowledge_item.status` and `knowledge_item.error` remain part of the official V2 business schema. +- The runtime queue implementation is not part of the schema contract: + - no separate task table + - no persisted queue record + - no scheduler-specific stage column +- Runtime currently uses an in-memory `p-queue` based pipeline in `KnowledgeRuntimeService`. 
+- The schema-level status set is still: + - `idle` + - `pending` + - `file_processing` + - `read` + - `embed` + - `completed` + - `failed` +- But the current runtime implementation only persists: + - `pending` before enqueue + - `completed` after successful vector write + - `failed` on any exception or shutdown interruption +- `file_processing`, `read`, and `embed` remain reserved intermediate statuses in the schema and shared types, but are not written by the current runtime yet. +- In other words: + - queue structure is implementation detail + - item status is business state + - some business states are currently reserved for future runtime expansion + - these concerns must not be conflated + +## Current Runtime Consumption Notes + +- Runtime entrypoint: + - `src/main/services/knowledge/runtime/KnowledgeRuntimeService.ts` +- Reader dispatch code still exists for stored `knowledge_item.type` values: + - `file` -> file reader by extension + - `url` -> fetch markdown through Jina Reader + - `note` -> inline note content + - `sitemap` -> sitemap reader code path is present, but current runtime does not index `sitemap` items directly + - `directory` -> currently treated as a container placeholder and returns no documents +- This means `directory` and `sitemap` remain valid persisted `knowledge_item.type` values, but the current runtime does not index them directly. +- For container expansion flows, upstream callers may still create mixed persisted child batches under one owner/group, for example `directory` + `file`.
+- That mixed batch is a persistence concern, not an indexing contract: + - container items may be stored in `knowledge_item` + - but only concrete indexable leaf items should be submitted to runtime `addItems` +- In other words, callers must distinguish: + - create set: all items that should be persisted into `knowledge_item` + - index set: only items that runtime is allowed to index +- Upstream callers must therefore flatten containers into concrete child items and filter out non-indexable container types before indexing. +- Runtime embedding model resolution currently expects `knowledge_base.embeddingModelId` in `providerId::modelId` format and only supports `ollama` as the active provider. + ## Implementation Status - `video` and `memory` items are skipped during migration. @@ -140,3 +201,5 @@ This document records the current V2 knowledge target schema, migration constrai - Group ownership is represented implicitly by `groupId = ownerItem.id`; there is no standalone group table in the current phase. - `dimensions` resolution failure skips the entire base and all nested items, with warnings recorded in migration output. - Knowledge item status migration uses `uniqueId` instead of `processingStatus`. +- The current runtime service is `KnowledgeRuntimeService`, not the old `KnowledgeService` name used in earlier notes. +- Current runtime queue behavior is a single in-memory `PQueue({ concurrency: 5 })` shared across knowledge bases; there is no per-base serial queue yet. diff --git a/v2-refactor-temp/docs/knowledge/knowledge-vector-migrator.md b/v2-refactor-temp/docs/knowledge/knowledge-vector-migrator.md new file mode 100644 index 0000000000..00916014aa --- /dev/null +++ b/v2-refactor-temp/docs/knowledge/knowledge-vector-migrator.md @@ -0,0 +1,299 @@ +# Knowledge Vector Migrator Notes (V2) + +## 1. 文档目的 + +这份文档用于说明 V2 知识库向量迁移器的职责边界和核心规则。 + +它关注的是: + +1. V1 `embedjs` 向量库的数据来源 +2. V2 目标向量存储的落点 +3. 向量迁移过程中的关键字段转换 +4. 
文件安全、校验与跳过规则 + +这份文档只描述当前已经落地的迁移器行为,不展开到未来在线索引重建或最终 retrieval API 设计。 + +对应实现: + +- `src/main/data/migration/v2/migrators/KnowledgeVectorMigrator.ts` +- `src/main/data/migration/v2/migrators/README-KnowledgeVectorMigrator.md` + +## 2. 迁移器的职责 + +`KnowledgeVectorMigrator` 的职责不是迁移知识库业务主数据,而是: + +1. 读取 V1 每个 knowledge base 对应的 legacy `embedjs` 向量库 +2. 将旧的 chunk 向量数据转换为新的 libsql-backed `vectorstores` 布局 +3. 保证新向量数据能稳定关联回已经迁移完成的 V2 `knowledge_base` / `knowledge_item` + +换句话说: + +1. `KnowledgeMigrator` 负责业务主数据 +2. `KnowledgeVectorMigrator` 负责向量索引数据 + +两者共同完成知识库的完整迁移,但 source of truth 仍然是 V2 业务表,不是向量库。 + +## 3. 数据来源 + +迁移器依赖四类输入: + +### 3.1 已迁移的 knowledge base + +来源: + +- SQLite `knowledge_base` 表 + +作用: + +- 提供 base 身份 +- 提供 embedding `dimensions` +- 决定哪些 base 需要尝试迁移向量库 + +### 3.2 已迁移的 knowledge item + +来源: + +- SQLite `knowledge_item` 表 + +作用: + +- 作为新的业务 item 身份来源 +- 为 legacy loader identity 映射提供目标 `itemId` + +### 3.3 Legacy loader metadata + +来源: + +- Redux `knowledge.bases[].items[]` + +作用: + +- 从 V1 `uniqueId` / `uniqueIds[]` 反查到已经迁移后的 `knowledge_item.id` +- 建立旧向量记录与新业务 item 的映射关系 + +### 3.4 Legacy vector database + +来源: + +- `${getDataPath()}/KnowledgeBase/` + +作用: + +- 读取 V1 `embedjs` 的 `vectors` 表 +- 提供原始 chunk 文本、source、vector + +## 4. 目标存储 + +迁移目标不是继续保留旧 `embedjs` 格式,而是生成新的 vectorstores 兼容存储。 + +当前实现的目标结构是: + +- 目标文件:沿用原 knowledge DB 路径 +- 目标表:`libsql_vectorstores_embedding` + +迁移器会为目标存储补齐必要 schema: + +1. 主表字段 + - `id` + - `external_id` + - `collection` + - `document` + - `metadata` + - `embeddings` +2. 普通索引 + - `external_id` + - `collection` +3. 向量索引 + - `libsql_vector_idx(embeddings, 'metric=cosine')` +4. FTS 表和触发器 + +## 5. 核心转换规则 + +### 5.1 Loader identity 映射 + +V1 的向量记录使用 `uniqueLoaderId` 关联 loader。 + +V2 迁移时,不保留这个旧字段作为最终业务标识,而是把它映射成新的 `knowledge_item.id`,并写入: + +- `external_id` + +映射规则: + +1. 优先使用 legacy item 的 `uniqueIds[]` +2. 如果不存在,再回退到 legacy item 的 `uniqueId` +3. 
只有已经成功迁移到 V2 `knowledge_item` 的 item 才能参与映射 + +这一步的核心目标是:让新向量记录稳定关联回 V2 的业务 item,而不是继续依赖旧 loader identity。 + +这里有一个重要约束: + +1. 只有能够映射到 V2 `knowledge_item.id` 的 legacy 向量记录,才属于有效可迁移数据 +2. 无法映射到 `knowledge_item.id` 的 legacy 向量,即使仍存在于旧 `embedjs` DB 中,也视为无效索引残留 +3. 因此迁移器的目标不是“尽量保留旧向量文件中的所有内容”,而是“只保留能被当前 V2 业务表证明合法归属的向量数据” + +### 5.2 Chunk 内容映射 + +旧向量记录中的内容字段会转换为: + +- `pageContent` -> `document` +- `knowledge_item.id` -> `metadata.itemId` +- `source` -> optional `metadata.source` + +当前实现不会保留所有旧 metadata,只保留迁移和检索必需的最小信息。 +其中 `metadata.itemId` 与 `external_id` 保持同值,用于恢复稳定的业务 item 归属。 +如果 legacy row 的 `source` 为空,则迁移结果不会补默认值,也不会强制写出 `metadata.source`。 + +### 5.3 Embedding 复用 + +迁移器不会重新做 embedding。 + +它会直接复用 V1 已存在的向量: + +1. 从 legacy `vector` 字段读取 `F32_BLOB` +2. 反序列化为 `number[]` +3. 再写入新表的 `embeddings` + +这意味着: + +1. 迁移成本更低 +2. 不依赖在线模型调用 +3. 迁移阶段不会触发重新切块或重新嵌入 + +### 5.4 Chunk identity 重建 + +旧 chunk row 的 `id` 不会直接复用。 + +每一条迁移后的向量记录都会生成新的 UUID v4 `id`。 + +因此迁移的稳定关联语义不是依赖旧 chunk id,而是依赖: + +1. `baseId` +2. `external_id` = `knowledge_item.id` +3. chunk 文本与 source 对应的向量记录 + +## 6. 文件安全约束 + +当前迁移器采用“临时文件重建 + 就地替换”的策略。 + +规则如下: + +1. 先在原 DB 的同级路径写一个临时文件 + - `.vectorstore.tmp` +2. 临时文件写完整并校验成功后,再替换原文件 +3. 如果当前 base 在最终替换前失败,原始 legacy DB 保持不变 + +这意味着: + +1. 迁移过程尽量避免中途损坏原文件 +2. 当前流程依赖用户在迁移前已完成 V1 备份 +3. 迁移器自身不额外维护一份回滚副本 + +## IMPORTANT: 当前已接受的局限 + +以下行为是当前实现**明确接受**的限制,不应误读为“未来理想方案”: + +1. base 级执行失败属于迁移失败,不属于可跳过数据 + - 如果某个 base 在重建临时库、写入目标表或替换正式文件时失败,`execute()` 会直接返回 `success: false` + - 这类失败不会被计入 `skippedCount`,也不应只记 warning 后继续成功 +2. 当前实现**不会**在知识库目录中额外保留可重试的 legacy `.bak` 文件 + - 也就是说,迁移器不会为后续自动重试维护一份原地可恢复的 V1 向量源 +3. 当前替换策略仍然是“就地替换” + - 因此一旦失败发生在“原文件已移除,但新文件还未成功落位”的窗口中,磁盘上的 legacy source 可能已经不可继续复用 +4. 当前重试策略依赖用户在迁移前完成的完整 V1 备份 + - 如果迁移失败,后续再次迁移前,应先由用户手动恢复原始备份 + - 迁移器自身**不保证**失败后目录仍处于“可直接再次迁移”的状态 + - 这也是当前实现没有采用“三段式替换(old -> .bak -> new)”的直接原因 + - 当前产品策略是在开始迁移前要求用户先做一次完整文件备份 + - 因此失败恢复的 source of truth 是“用户手动恢复的迁移前备份”,不是迁移器在知识库目录内额外维护的一份临时回滚副本 +5. 
这一点很重要,后续可能需要继续讨论或改动 + - 如果未来项目希望支持“失败后无需手动恢复、可直接再次迁移”,那么文件替换与 backup 策略需要重新设计 + +## 7. 校验规则 + +当前实现会做至少以下校验: + +1. 每个 base 的目标行数必须与 prepared row 数一致 +2. 每条迁移后的记录都必须有非空 `external_id` +3. 每条迁移后的记录都必须有 `metadata.itemId`,并与 `external_id` 保持一致 + +如果不满足这些条件,应视为当前 base 迁移失败。 + +## 8. 跳过规则 + +以下情况会被跳过,而不是强行写入: + +1. `knowledge_base` 中不存在对应 base +2. legacy DB 文件不存在 +3. legacy DB 路径实际是目录 +4. legacy DB 不包含 `vectors` 表 +5. `uniqueLoaderId` 无法映射回已迁移的 `knowledge_item.id` +6. 向量记录缺少 `vector` 或 `vector` 为空 + +这些跳过通常会记录 warning,而不是让整个迁移流程全部中断。 + +补充说明: + +1. 如果某个 base 的 legacy 向量记录最终全部被跳过,则该 base 在 V2 中会被重建为空的 vector store +2. 这不是“回滚保留旧 DB”的场景,而是预期的数据清洗结果 +3. 原因是这些被跳过的记录无法稳定关联到当前 V2 `knowledge_item`,因此不再被视为有效业务索引数据 + +## 9. 当前边界与限制 + +当前迁移器只负责“向量索引重建”,不负责: + +1. 重新切块 +2. 重新 embedding +3. 重新生成业务 item +4. 校正旧知识库的业务配置 +5. 设计最终 retrieval service 的 API + +因此它的定位应该是: + +- 一次性的迁移工具 +- 不等同于运行时知识库索引服务 + +## 10. 对后续实现的影响 + +基于当前迁移器行为,后续 V2 运行时设计需要遵守以下前提: + +1. V2 业务真相仍然来自 `knowledge_base` / `knowledge_item` +2. 新向量记录必须能通过 `external_id` 稳定关联到 `knowledge_item.id` +3. 运行时不应继续依赖 V1 `embedjs` 的 `uniqueLoaderId` +4. 如果未来需要重建索引,应按 V2 业务表重新生成,而不是继续依赖旧迁移逻辑 + +## 11. 与其他文档的关系 + +- `knowledge-backend-decisions.md` + - 定义当前 `KnowledgeRuntimeService`、data services、queue 和 runtime/vector 边界 +- `knowledge-schema.md` + - 定义 V2 业务 schema +- 本文档 + - 专门说明向量迁移器如何把旧向量索引接到新的 V2 业务模型上 + +三者的关系可以简化为: + +1. schema 定义业务结构 +2. backend decisions 文档定义当前运行时边界 +3. vector migrator 文档定义旧向量索引如何迁移进新体系 + +## 12. 与当前 Runtime 的衔接 + +当前 runtime 向量侧实现位于: + +- `src/main/services/knowledge/runtime/KnowledgeRuntimeService.ts` +- `src/main/services/knowledge/vectorstore/KnowledgeVectorStoreService.ts` +- `src/main/services/knowledge/vectorstore/providers/LibSqlVectorStoreProvider.ts` + +这意味着迁移后的向量数据并不是孤立的一次性产物,而是会被当前 runtime 直接按 knowledge base 打开和查询。 + +当前已确认的衔接点是: + +1. runtime 通过 `KnowledgeVectorStoreService` 按 `base.id` 获取 store +2. 实际 store provider 是 `LibSqlVectorStoreProvider` +3.
runtime 检索和写入都基于 libsql vector store + +因此,迁移器与 runtime 的共同前提是: + +1. V2 业务真相来自 `knowledge_base` / `knowledge_item` +2. 运行时向量文件与迁移后的向量文件都属于同一类 libsql-backed vector store 体系 +3. 运行时关联业务 item 仍应以 `knowledge_item.id` 为稳定标识,而不是继续依赖 V1 loader identity diff --git a/vitest.config.ts b/vitest.config.ts index 8498c4bba6..e293e127e2 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -90,6 +90,18 @@ export default defineConfig({ } } }, + // vectorstores 包单元测试配置 + { + extends: true, + test: { + name: 'vectorstores', + environment: 'node', + include: [ + 'packages/vectorstores/**/*.{test,spec}.{ts,tsx}', + 'packages/vectorstores/**/__tests__/**/*.{test,spec}.{ts,tsx}' + ] + } + }, // packages/ui 单元测试配置 { extends: true,