kh0pper · kh0pper · Jun 29, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/bundles/llamacpp-cpu-qwen3-embed/README.md b/bundles/llamacpp-cpu-qwen3-embed/README.md
@@ -0,0 +1,44 @@
+# llama.cpp Qwen3-Embedding-0.6B (CPU)
+
+A **CPU-only** embedding endpoint for Crow's semantic search, for hosts without a
+compatible GPU (or that can't reach a shared embedding server like `grackle-embed`).
+It serves **Qwen3-Embedding-0.6B** (Q8_0 GGUF, 1024-dim) via llama.cpp with an
+OpenAI-compatible `/v1/embeddings` API on `127.0.0.1:8007`.
+
+Same model — and therefore the same 1024-dim vector space — as the GPU
+`vllm-cuda-embed` / `llamacpp-vulkan-qwen3-embed` bundles, so embeddings are
+interchangeable across them.
+
+## Install
+
+From the Crow's Nest **Extensions** panel, install **llama.cpp Qwen3-Embedding-0.6B (CPU)**.
+Requires Docker. The first request downloads the ~640MB GGUF and caches it in a
+Docker volume.
+
+## Make it the embedding provider
+
+The bundle registers a provider with id **`llamacpp-cpu-embed`**. Point Crow's
+semantic search at it (see *Choosing the embedding provider* in the AI Providers
+guide):
+
+```sql
+INSERT INTO dashboard_settings (key, value) VALUES ('embed_provider', 'llamacpp-cpu-embed')
+  ON CONFLICT(key) DO UPDATE SET value = excluded.value;
+```
+
+…or set `CROW_EMBED_PROVIDER=llamacpp-cpu-embed`. Allow ~30s for the cache to refresh.
+
+## Verify
+
+```bash
+curl http://127.0.0.1:8007/v1/models
+curl -s http://127.0.0.1:8007/v1/embeddings \
+  -H 'Content-Type: application/json' \
+  -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200
+```
+
+## Notes
+
+- CPU inference: embedding a short text is fast; the one-time model load takes a
+  couple of seconds on first call.
+- No API key, no GPU, no data leaves your machine.
diff --git a/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml b/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml
@@ -0,0 +1,45 @@
+## llama.cpp CPU embedding server — Qwen3-Embedding-0.6B (OpenAI-compatible).
+##
+## No GPU required; runs on any Docker host including macOS/Windows Docker Desktop.
+## The first start downloads the GGUF (~640MB) via -hf and caches it in the named
+## volume, so subsequent restarts are fast.
+##
+## Usage (manual test):
+##   docker compose up -d
+##   curl http://127.0.0.1:8007/v1/models
+##   curl -s http://127.0.0.1:8007/v1/embeddings \
+##     -H 'Content-Type: application/json' \
+##     -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200
+
+services:
+  llamacpp-cpu-qwen3-embed:
+    image: ghcr.io/ggml-org/llama.cpp:server
+    container_name: llamacpp-cpu-qwen3-embed
+    environment:
+      - LLAMA_CACHE=/models
+    volumes:
+      - llamacpp-cpu-qwen3-embed-models:/models
+    ports:
+      - "127.0.0.1:8007:8000"
+    command:
+      - --hf-repo
+      - Qwen/Qwen3-Embedding-0.6B-GGUF
+      - --hf-file
+      - Qwen3-Embedding-0.6B-Q8_0.gguf
+      - --alias
+      - qwen3-embedding-0.6b
+      - --embedding
+      - --pooling
+      - mean
+      - --ctx-size
+      - "8192"
+      - --threads
+      - "4"
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8000"
+    restart: unless-stopped
+
+volumes:
+  llamacpp-cpu-qwen3-embed-models:
diff --git a/bundles/llamacpp-cpu-qwen3-embed/manifest.json b/bundles/llamacpp-cpu-qwen3-embed/manifest.json
@@ -0,0 +1,43 @@
+{
+  "id": "llamacpp-cpu-qwen3-embed",
+  "type": "bundle",
+  "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)",
+  "category": "ai",
+  "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.",
+  "author": "Crow",
+  "version": "1.0.0",
+  "host": "local",
+  "icon": "cpu",
+  "tags": ["ai", "embeddings", "semantic-search", "cpu", "llama.cpp", "local", "privacy"],
+  "requires": {
+    "gpu_arch": ["cpu"],
+    "min_ram_mb": 2048,
+    "min_disk_mb": 3000
+  },
+  "env_vars": [],
+  "port": 8007,
+  "webUI": null,
+  "docker": {
+    "composefile": "docker-compose.yml"
+  },
+  "providers": [
+    {
+      "id": "llamacpp-cpu-embed",
+      "baseUrlTemplate": "http://{host_ip}:{port}/v1",
+      "apiKey": "none",
+      "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)",
+      "models": [
+        {
+          "id": "qwen3-embedding-0.6b",
+          "task": "embed",
+          "dim": 1024,
+          "dimensions": 1024,
+          "contextLen": 8192,
+          "warm": false,
+          "priority": "background"
+        }
+      ]
+    }
+  ],
+  "notes": "CPU-only; the first request downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model)."
+}
diff --git a/registry/add-ons.json b/registry/add-ons.json
@@ -2422,6 +2422,60 @@
       "notes": "Self-host via Docker or connect to an existing Linkding instance. Generate an API token in Linkding Settings > Integrations.",
       "official": true
     },
+    {
+      "id": "llamacpp-cpu-qwen3-embed",
+      "type": "bundle",
+      "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)",
+      "category": "ai",
+      "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.",
+      "author": "Crow",
+      "version": "1.0.0",
+      "host": "local",
+      "icon": "cpu",
+      "tags": [
+        "ai",
+        "embeddings",
+        "semantic-search",
+        "cpu",
+        "llama.cpp",
+        "local",
+        "privacy"
+      ],
+      "requires": {
+        "gpu_arch": [
+          "cpu"
+        ],
+        "min_ram_mb": 2048,
+        "min_disk_mb": 3000
+      },
+      "env_vars": [],
+      "port": 8007,
+      "webUI": null,
+      "docker": {
+        "composefile": "docker-compose.yml"
+      },
+      "providers": [
+        {
+          "id": "llamacpp-cpu-embed",
+          "baseUrlTemplate": "http://{host_ip}:{port}/v1",
+          "apiKey": "none",
+          "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)",
+          "models": [
+            {
+              "id": "qwen3-embedding-0.6b",
+              "task": "embed",
+              "dim": 1024,
+              "dimensions": 1024,
+              "contextLen": 8192,
+              "warm": false,
+              "priority": "background"
+            }
+          ]
+        }
+      ],
+      "notes": "CPU-only; the first request downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model).",
+      "official": true
+    },
     {
       "id": "llamacpp-qwen72b",
       "type": "bundle",