From 930dc8adc7533dcca1d544ea42aeca92e4f18832 Mon Sep 17 00:00:00 2001
From: DAYANE GRISEL PALACIOS TORRES
 <dayanegriselpalaciostorres@MacBook-Air-de-DAYANE.local>
Date: Sun, 28 Jun 2026 19:15:51 -0600
Subject: [PATCH 1/2] feat(bundle): llama.cpp CPU Qwen3-Embedding-0.6B
 embedding bundle

Adds a CPU-only embedding bundle so hosts without a compatible GPU (or that
can't reach grackle-embed) can run semantic search locally. Serves
Qwen3-Embedding-0.6B (Q8_0 GGUF, 1024-dim) via llama.cpp with an OpenAI-compatible
/v1/embeddings endpoint on 127.0.0.1:8007, registering the llamacpp-cpu-embed
provider. Same model / vector space as the GPU embed bundles. Runs on macOS/Windows
Docker Desktop; first start auto-downloads the GGUF via --hf-repo and caches it.

Regenerated registry/add-ons.json via build-registry.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 bundles/llamacpp-cpu-qwen3-embed/README.md    | 44 +++++++++++++++
 .../docker-compose.yml                        | 45 ++++++++++++++++
 .../llamacpp-cpu-qwen3-embed/manifest.json    | 43 +++++++++++++++
 registry/add-ons.json                         | 54 +++++++++++++++++++
 4 files changed, 186 insertions(+)
 create mode 100644 bundles/llamacpp-cpu-qwen3-embed/README.md
 create mode 100644 bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml
 create mode 100644 bundles/llamacpp-cpu-qwen3-embed/manifest.json

diff --git a/bundles/llamacpp-cpu-qwen3-embed/README.md b/bundles/llamacpp-cpu-qwen3-embed/README.md
new file mode 100644
index 00000000..2e555004
--- /dev/null
+++ b/bundles/llamacpp-cpu-qwen3-embed/README.md
@@ -0,0 +1,44 @@
+# llama.cpp Qwen3-Embedding-0.6B (CPU)
+
+A **CPU-only** embedding endpoint for Crow's semantic search, for hosts without a
+compatible GPU (or that can't reach a shared embedding server like `grackle-embed`).
+It serves **Qwen3-Embedding-0.6B** (Q8_0 GGUF, 1024-dim) via llama.cpp with an
+OpenAI-compatible `/v1/embeddings` API on `127.0.0.1:8007`.
+
+Same model — and therefore the same 1024-dim vector space — as the GPU
+`vllm-cuda-embed` / `llamacpp-vulkan-qwen3-embed` bundles, so embeddings are
+interchangeable across them.
+
+## Install
+
+From the Crow's Nest **Extensions** panel, install **llama.cpp Qwen3-Embedding-0.6B (CPU)**.
+Requires Docker. The first start downloads the ~640MB GGUF and caches it in a
+Docker volume.
+
+## Make it the embedding provider
+
+The bundle registers a provider with id **`llamacpp-cpu-embed`**. Point Crow's
+semantic search at it (see *Choosing the embedding provider* in the AI Providers
+guide):
+
+```sql
+INSERT INTO dashboard_settings (key, value) VALUES ('embed_provider', 'llamacpp-cpu-embed')
+  ON CONFLICT(key) DO UPDATE SET value = excluded.value;
+```
+
+…or set `CROW_EMBED_PROVIDER=llamacpp-cpu-embed`. Allow ~30s for the cache to refresh.
+
+## Verify
+
+```bash
+curl http://127.0.0.1:8007/v1/models
+curl -s http://127.0.0.1:8007/v1/embeddings \
+  -H 'Content-Type: application/json' \
+  -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200
+```
+
+## Notes
+
+- CPU inference: embedding a short text is fast; the model is loaded once when the
+  container starts, which takes a couple of seconds.
+- No API key, no GPU, no data leaves your machine.
diff --git a/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml b/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml
new file mode 100644
index 00000000..9788ebb2
--- /dev/null
+++ b/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml
@@ -0,0 +1,45 @@
+## llama.cpp CPU embedding server — Qwen3-Embedding-0.6B (OpenAI-compatible).
+##
+## No GPU required; runs on any Docker host including macOS/Windows Docker Desktop.
+## The first start downloads the GGUF (~640MB) via -hf and caches it in the named
+## volume, so subsequent restarts are fast.
+##
+## Usage (manual test):
+##   docker compose up -d
+##   curl http://127.0.0.1:8007/v1/models
+##   curl -s http://127.0.0.1:8007/v1/embeddings \
+##     -H 'Content-Type: application/json' \
+##     -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200
+
+services:
+  llamacpp-cpu-qwen3-embed:
+    image: ghcr.io/ggml-org/llama.cpp:server
+    container_name: llamacpp-cpu-qwen3-embed
+    environment:
+      - LLAMA_CACHE=/models  # keep the downloaded GGUF on the named volume
+    volumes:
+      - llamacpp-cpu-qwen3-embed-models:/models
+    ports:
+      - "127.0.0.1:8007:8000"  # published on the host loopback only
+    command:
+      - --hf-repo
+      - Qwen/Qwen3-Embedding-0.6B-GGUF
+      - --hf-file
+      - Qwen3-Embedding-0.6B-Q8_0.gguf
+      - --alias
+      - qwen3-embedding-0.6b
+      - --embedding
+      - --pooling
+      - last  # Qwen3-Embedding uses last-token pooling; "mean" breaks parity with the GPU bundles
+      - --ctx-size
+      - "8192"
+      - --threads
+      - "4"
+      - --host
+      - 0.0.0.0  # bind inside the container; host exposure is governed by `ports`
+      - --port
+      - "8000"
+    restart: unless-stopped
+
+volumes:
+  llamacpp-cpu-qwen3-embed-models:
diff --git a/bundles/llamacpp-cpu-qwen3-embed/manifest.json b/bundles/llamacpp-cpu-qwen3-embed/manifest.json
new file mode 100644
index 00000000..0c4d91e8
--- /dev/null
+++ b/bundles/llamacpp-cpu-qwen3-embed/manifest.json
@@ -0,0 +1,43 @@
+{
+  "id": "llamacpp-cpu-qwen3-embed",
+  "type": "bundle",
+  "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)",
+  "category": "ai",
+  "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.",
+  "author": "Crow",
+  "version": "1.0.0",
+  "host": "local",
+  "icon": "cpu",
+  "tags": ["ai", "embeddings", "semantic-search", "cpu", "llama.cpp", "local", "privacy"],
+  "requires": {
+    "gpu_arch": ["cpu"],
+    "min_ram_mb": 2048,
+    "min_disk_mb": 3000
+  },
+  "env_vars": [],
+  "port": 8007,
+  "webUI": null,
+  "docker": {
+    "composefile": "docker-compose.yml"
+  },
+  "providers": [
+    {
+      "id": "llamacpp-cpu-embed",
+      "baseUrlTemplate": "http://{host_ip}:{port}/v1",
+      "apiKey": "none",
+      "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)",
+      "models": [
+        {
+          "id": "qwen3-embedding-0.6b",
+          "task": "embed",
+          "dim": 1024,
+          "dimensions": 1024,
+          "contextLen": 32768,
+          "warm": false,
+          "priority": "background"
+        }
+      ]
+    }
+  ],
+  "notes": "CPU-only; the first request downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model)."
+}
diff --git a/registry/add-ons.json b/registry/add-ons.json
index edecf8bf..9f581c9c 100644
--- a/registry/add-ons.json
+++ b/registry/add-ons.json
@@ -2422,6 +2422,60 @@
       "notes": "Self-host via Docker or connect to an existing Linkding instance. Generate an API token in Linkding Settings > Integrations.",
       "official": true
     },
+    {
+      "id": "llamacpp-cpu-qwen3-embed",
+      "type": "bundle",
+      "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)",
+      "category": "ai",
+      "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.",
+      "author": "Crow",
+      "version": "1.0.0",
+      "host": "local",
+      "icon": "cpu",
+      "tags": [
+        "ai",
+        "embeddings",
+        "semantic-search",
+        "cpu",
+        "llama.cpp",
+        "local",
+        "privacy"
+      ],
+      "requires": {
+        "gpu_arch": [
+          "cpu"
+        ],
+        "min_ram_mb": 2048,
+        "min_disk_mb": 3000
+      },
+      "env_vars": [],
+      "port": 8007,
+      "webUI": null,
+      "docker": {
+        "composefile": "docker-compose.yml"
+      },
+      "providers": [
+        {
+          "id": "llamacpp-cpu-embed",
+          "baseUrlTemplate": "http://{host_ip}:{port}/v1",
+          "apiKey": "none",
+          "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)",
+          "models": [
+            {
+              "id": "qwen3-embedding-0.6b",
+              "task": "embed",
+              "dim": 1024,
+              "dimensions": 1024,
+              "contextLen": 32768,
+              "warm": false,
+              "priority": "background"
+            }
+          ]
+        }
+      ],
+      "notes": "CPU-only; the first request downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model).",
+      "official": true
+    },
     {
       "id": "llamacpp-qwen72b",
       "type": "bundle",

From 6b09cdfacf6ea0a236c4f4e0749a7ab371e7ea7c Mon Sep 17 00:00:00 2001
From: kh0pper <kevin.hopper@maestro.press>
Date: Sun, 28 Jun 2026 20:29:18 -0500
Subject: [PATCH 2/2] fix(bundle): align llamacpp-cpu-embed contextLen with
 served --ctx-size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

manifest declared contextLen 32768 but docker-compose serves --ctx-size 8192,
so inputs over 8K tokens would be silently rejected despite the advertised
capacity. The CPU bundle intentionally caps ctx at 8192 for RAM; embedding
inputs are capped at 8000 chars upstream, so 8192 is ample. Lower the declared
contextLen (manifest + regenerated registry entry) to match reality. Vector
space is unchanged (1024-dim, same model) — embeddings stay interchangeable
with the GPU bundles; only max input length differs.
---
 bundles/llamacpp-cpu-qwen3-embed/manifest.json | 2 +-
 registry/add-ons.json                          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bundles/llamacpp-cpu-qwen3-embed/manifest.json b/bundles/llamacpp-cpu-qwen3-embed/manifest.json
index 0c4d91e8..7cdd5ef3 100644
--- a/bundles/llamacpp-cpu-qwen3-embed/manifest.json
+++ b/bundles/llamacpp-cpu-qwen3-embed/manifest.json
@@ -32,7 +32,7 @@
           "task": "embed",
           "dim": 1024,
           "dimensions": 1024,
-          "contextLen": 32768,
+          "contextLen": 8192,
           "warm": false,
           "priority": "background"
         }
diff --git a/registry/add-ons.json b/registry/add-ons.json
index 9f581c9c..b297c678 100644
--- a/registry/add-ons.json
+++ b/registry/add-ons.json
@@ -2466,7 +2466,7 @@
               "task": "embed",
               "dim": 1024,
               "dimensions": 1024,
-              "contextLen": 32768,
+              "contextLen": 8192,
               "warm": false,
               "priority": "background"
             }