From 930dc8adc7533dcca1d544ea42aeca92e4f18832 Mon Sep 17 00:00:00 2001 From: DAYANE GRISEL PALACIOS TORRES Date: Sun, 28 Jun 2026 19:15:51 -0600 Subject: [PATCH 1/2] feat(bundle): llama.cpp CPU Qwen3-Embedding-0.6B embedding bundle Adds a CPU-only embedding bundle so hosts without a compatible GPU (or that can't reach grackle-embed) can run semantic search locally. Serves Qwen3-Embedding-0.6B (Q8_0 GGUF, 1024-dim) via llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on 127.0.0.1:8007, registering the llamacpp-cpu-embed provider. Same model / vector space as the GPU embed bundles. Runs on macOS/Windows Docker Desktop; first request auto-downloads the GGUF via -hf and caches it. Regenerated registry/add-ons.json via build-registry. Co-Authored-By: Claude Opus 4.8 (1M context) --- bundles/llamacpp-cpu-qwen3-embed/README.md | 44 +++++++++++++++ .../docker-compose.yml | 45 ++++++++++++++++ .../llamacpp-cpu-qwen3-embed/manifest.json | 43 +++++++++++++++ registry/add-ons.json | 54 +++++++++++++++++++ 4 files changed, 186 insertions(+) create mode 100644 bundles/llamacpp-cpu-qwen3-embed/README.md create mode 100644 bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml create mode 100644 bundles/llamacpp-cpu-qwen3-embed/manifest.json diff --git a/bundles/llamacpp-cpu-qwen3-embed/README.md b/bundles/llamacpp-cpu-qwen3-embed/README.md new file mode 100644 index 00000000..2e555004 --- /dev/null +++ b/bundles/llamacpp-cpu-qwen3-embed/README.md @@ -0,0 +1,44 @@ +# llama.cpp Qwen3-Embedding-0.6B (CPU) + +A **CPU-only** embedding endpoint for Crow's semantic search, for hosts without a +compatible GPU (or that can't reach a shared embedding server like `grackle-embed`). +It serves **Qwen3-Embedding-0.6B** (Q8_0 GGUF, 1024-dim) via llama.cpp with an +OpenAI-compatible `/v1/embeddings` API on `127.0.0.1:8007`. 
+ +Same model — and therefore the same 1024-dim vector space — as the GPU +`vllm-cuda-embed` / `llamacpp-vulkan-qwen3-embed` bundles, so embeddings are +interchangeable across them. + +## Install + +From the Crow's Nest **Extensions** panel, install **llama.cpp Qwen3-Embedding-0.6B (CPU)**. +Requires Docker. The first start downloads the ~640MB GGUF and caches it in a +Docker volume. + +## Make it the embedding provider + +The bundle registers a provider with id **`llamacpp-cpu-embed`**. Point Crow's +semantic search at it (see *Choosing the embedding provider* in the AI Providers +guide): + +```sql +INSERT INTO dashboard_settings (key, value) VALUES ('embed_provider', 'llamacpp-cpu-embed') + ON CONFLICT(key) DO UPDATE SET value = excluded.value; +``` + +…or set `CROW_EMBED_PROVIDER=llamacpp-cpu-embed`. Allow ~30s for the cache to refresh. + +## Verify + +```bash +curl http://127.0.0.1:8007/v1/models +curl -s http://127.0.0.1:8007/v1/embeddings \ + -H 'Content-Type: application/json' \ + -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200 +``` + +## Notes + +- CPU inference: embedding a short text is fast; the one-time model load takes a + couple of seconds at container start. +- No API key, no GPU, no data leaves your machine. diff --git a/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml b/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml new file mode 100644 index 00000000..9788ebb2 --- /dev/null +++ b/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml @@ -0,0 +1,45 @@ +## llama.cpp CPU embedding server — Qwen3-Embedding-0.6B (OpenAI-compatible). +## +## No GPU required; runs on any Docker host including macOS/Windows Docker Desktop. +## The first start downloads the GGUF (~640MB) via -hf and caches it in the named +## volume, so subsequent restarts are fast. 
+## +## Usage (manual test): +## docker compose up -d +## curl http://127.0.0.1:8007/v1/models +## curl -s http://127.0.0.1:8007/v1/embeddings \ +## -H 'Content-Type: application/json' \ +## -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200 + +services: + llamacpp-cpu-qwen3-embed: + image: ghcr.io/ggml-org/llama.cpp:server + container_name: llamacpp-cpu-qwen3-embed + environment: + - LLAMA_CACHE=/models + volumes: + - llamacpp-cpu-qwen3-embed-models:/models + ports: + - "127.0.0.1:8007:8000" + command: + - --hf-repo + - Qwen/Qwen3-Embedding-0.6B-GGUF + - --hf-file + - Qwen3-Embedding-0.6B-Q8_0.gguf + - --alias + - qwen3-embedding-0.6b + - --embedding + - --pooling + - last # Qwen3-Embedding pools on the last token; mean pooling would yield different vectors than the GPU bundles + - --ctx-size + - "8192" + - --threads + - "4" + - --host + - 0.0.0.0 + - --port + - "8000" + restart: unless-stopped + +volumes: + llamacpp-cpu-qwen3-embed-models: diff --git a/bundles/llamacpp-cpu-qwen3-embed/manifest.json b/bundles/llamacpp-cpu-qwen3-embed/manifest.json new file mode 100644 index 00000000..0c4d91e8 --- /dev/null +++ b/bundles/llamacpp-cpu-qwen3-embed/manifest.json @@ -0,0 +1,43 @@ +{ + "id": "llamacpp-cpu-qwen3-embed", + "type": "bundle", + "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)", + "category": "ai", + "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. 
Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.", + "author": "Crow", + "version": "1.0.0", + "host": "local", + "icon": "cpu", + "tags": ["ai", "embeddings", "semantic-search", "cpu", "llama.cpp", "local", "privacy"], + "requires": { + "gpu_arch": ["cpu"], + "min_ram_mb": 2048, + "min_disk_mb": 3000 + }, + "env_vars": [], + "port": 8007, + "webUI": null, + "docker": { + "composefile": "docker-compose.yml" + }, + "providers": [ + { + "id": "llamacpp-cpu-embed", + "baseUrlTemplate": "http://{host_ip}:{port}/v1", + "apiKey": "none", + "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)", + "models": [ + { + "id": "qwen3-embedding-0.6b", + "task": "embed", + "dim": 1024, + "dimensions": 1024, + "contextLen": 32768, + "warm": false, + "priority": "background" + } + ] + } + ], + "notes": "CPU-only; the first start downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model)." +} diff --git a/registry/add-ons.json b/registry/add-ons.json index edecf8bf..9f581c9c 100644 --- a/registry/add-ons.json +++ b/registry/add-ons.json @@ -2422,6 +2422,60 @@ "notes": "Self-host via Docker or connect to an existing Linkding instance. Generate an API token in Linkding Settings > Integrations.", "official": true }, + { + "id": "llamacpp-cpu-qwen3-embed", + "type": "bundle", + "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)", + "category": "ai", + "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. 
Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.", + "author": "Crow", + "version": "1.0.0", + "host": "local", + "icon": "cpu", + "tags": [ + "ai", + "embeddings", + "semantic-search", + "cpu", + "llama.cpp", + "local", + "privacy" + ], + "requires": { + "gpu_arch": [ + "cpu" + ], + "min_ram_mb": 2048, + "min_disk_mb": 3000 + }, + "env_vars": [], + "port": 8007, + "webUI": null, + "docker": { + "composefile": "docker-compose.yml" + }, + "providers": [ + { + "id": "llamacpp-cpu-embed", + "baseUrlTemplate": "http://{host_ip}:{port}/v1", + "apiKey": "none", + "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)", + "models": [ + { + "id": "qwen3-embedding-0.6b", + "task": "embed", + "dim": 1024, + "dimensions": 1024, + "contextLen": 32768, + "warm": false, + "priority": "background" + } + ] + } + ], + "notes": "CPU-only; the first start downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model).", + "official": true + }, { "id": "llamacpp-qwen72b", "type": "bundle", From 6b09cdfacf6ea0a236c4f4e0749a7ab371e7ea7c Mon Sep 17 00:00:00 2001 From: kh0pper Date: Sun, 28 Jun 2026 20:29:18 -0500 Subject: [PATCH 2/2] fix(bundle): align llamacpp-cpu-embed contextLen with served --ctx-size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit manifest declared contextLen 32768 but docker-compose serves --ctx-size 8192, so inputs over 8K tokens would be silently rejected despite the advertised capacity. The CPU bundle intentionally caps ctx at 8192 for RAM; embedding inputs are capped at 8000 chars upstream, so 8192 is ample. Lower the declared contextLen (manifest + regenerated registry entry) to match reality. 
Vector space is unchanged (1024-dim, same model) — embeddings stay interchangeable with the GPU bundles; only max input length differs. --- bundles/llamacpp-cpu-qwen3-embed/manifest.json | 2 +- registry/add-ons.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bundles/llamacpp-cpu-qwen3-embed/manifest.json b/bundles/llamacpp-cpu-qwen3-embed/manifest.json index 0c4d91e8..7cdd5ef3 100644 --- a/bundles/llamacpp-cpu-qwen3-embed/manifest.json +++ b/bundles/llamacpp-cpu-qwen3-embed/manifest.json @@ -32,7 +32,7 @@ "task": "embed", "dim": 1024, "dimensions": 1024, - "contextLen": 32768, + "contextLen": 8192, "warm": false, "priority": "background" } diff --git a/registry/add-ons.json b/registry/add-ons.json index 9f581c9c..b297c678 100644 --- a/registry/add-ons.json +++ b/registry/add-ons.json @@ -2466,7 +2466,7 @@ "task": "embed", "dim": 1024, "dimensions": 1024, - "contextLen": 32768, + "contextLen": 8192, "warm": false, "priority": "background" }