diff --git a/bundles/llamacpp-cpu-qwen3-embed/README.md b/bundles/llamacpp-cpu-qwen3-embed/README.md
new file mode 100644
index 00000000..2e555004
--- /dev/null
+++ b/bundles/llamacpp-cpu-qwen3-embed/README.md
@@ -0,0 +1,44 @@
+# llama.cpp Qwen3-Embedding-0.6B (CPU)
+
+A **CPU-only** embedding endpoint for Crow's semantic search, for hosts without a
+compatible GPU (or that can't reach a shared embedding server like `grackle-embed`).
+It serves **Qwen3-Embedding-0.6B** (Q8_0 GGUF, 1024-dim) via llama.cpp with an
+OpenAI-compatible `/v1/embeddings` API on `127.0.0.1:8007`.
+
+Same model — and therefore the same 1024-dim vector space — as the GPU
+`vllm-cuda-embed` / `llamacpp-vulkan-qwen3-embed` bundles, so embeddings are
+interchangeable across them.
+
+## Install
+
+From the Crow's Nest **Extensions** panel, install **llama.cpp Qwen3-Embedding-0.6B (CPU)**.
+Requires Docker. The first start downloads the ~640MB GGUF and caches it in a
+Docker volume; the endpoint comes up once the download has finished.
+
+## Make it the embedding provider
+
+The bundle registers a provider with id **`llamacpp-cpu-embed`**. Point Crow's
+semantic search at it (see *Choosing the embedding provider* in the AI Providers
+guide):
+
+```sql
+INSERT INTO dashboard_settings (key, value) VALUES ('embed_provider', 'llamacpp-cpu-embed')
+  ON CONFLICT(key) DO UPDATE SET value = excluded.value;
+```
+
+…or set `CROW_EMBED_PROVIDER=llamacpp-cpu-embed`. Allow ~30s for the cache to refresh.
+
+## Verify
+
+```bash
+curl http://127.0.0.1:8007/v1/models
+curl -s http://127.0.0.1:8007/v1/embeddings \
+  -H 'Content-Type: application/json' \
+  -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200
+```
+
+## Notes
+
+- CPU inference: embedding a short text is fast; the model load takes a couple
+  of seconds each time the container starts.
+- No API key, no GPU, no data leaves your machine.
diff --git a/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml b/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml
new file mode 100644
index 00000000..9788ebb2
--- /dev/null
+++ b/bundles/llamacpp-cpu-qwen3-embed/docker-compose.yml
@@ -0,0 +1,45 @@
+## llama.cpp CPU embedding server — Qwen3-Embedding-0.6B (OpenAI-compatible).
+##
+## No GPU required; runs on any Docker host including macOS/Windows Docker Desktop.
+## The first start downloads the GGUF (~640MB) via --hf-repo/--hf-file and caches it
+## in the named volume, so subsequent restarts are fast.
+##
+## Usage (manual test):
+##   docker compose up -d
+##   curl http://127.0.0.1:8007/v1/models
+##   curl -s http://127.0.0.1:8007/v1/embeddings \
+##     -H 'Content-Type: application/json' \
+##     -d '{"model":"qwen3-embedding-0.6b","input":"hello"}' | head -c 200
+
+services:
+  llamacpp-cpu-qwen3-embed:
+    image: ghcr.io/ggml-org/llama.cpp:server
+    container_name: llamacpp-cpu-qwen3-embed
+    environment:
+      - LLAMA_CACHE=/models  # download cache directory; backed by the named volume below
+    volumes:
+      - llamacpp-cpu-qwen3-embed-models:/models
+    ports:
+      - "127.0.0.1:8007:8000"  # published on the host loopback interface only
+    command:
+      - --hf-repo
+      - Qwen/Qwen3-Embedding-0.6B-GGUF
+      - --hf-file
+      - Qwen3-Embedding-0.6B-Q8_0.gguf
+      - --alias
+      - qwen3-embedding-0.6b  # model id clients send; must match the provider's model id in manifest.json
+      - --embedding
+      - --pooling
+      - last  # Qwen3-Embedding pools the last token; mean pooling breaks parity with other servers of this model
+      - --ctx-size
+      - "8192"  # NOTE(review): llama-server rejects embedding inputs larger than --ubatch-size; confirm it covers this context
+      - --threads
+      - "4"
+      - --host
+      - 0.0.0.0  # listen on all interfaces inside the container; host exposure is limited by `ports`
+      - --port
+      - "8000"
+    restart: unless-stopped
+
+volumes:
+  llamacpp-cpu-qwen3-embed-models:
diff --git a/bundles/llamacpp-cpu-qwen3-embed/manifest.json b/bundles/llamacpp-cpu-qwen3-embed/manifest.json
new file mode 100644
index 00000000..7cdd5ef3
--- /dev/null
+++ b/bundles/llamacpp-cpu-qwen3-embed/manifest.json
@@ -0,0 +1,43 @@
+{
+  "id": "llamacpp-cpu-qwen3-embed",
+  "type": "bundle",
+  "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)",
+  "category": "ai",
+  "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.",
+  "author": "Crow",
+  "version": "1.0.0",
+  "host": "local",
+  "icon": "cpu",
+  "tags": ["ai", "embeddings", "semantic-search", "cpu", "llama.cpp", "local", "privacy"],
+  "requires": {
+    "gpu_arch": ["cpu"],
+    "min_ram_mb": 2048,
+    "min_disk_mb": 3000
+  },
+  "env_vars": [],
+  "port": 8007,
+  "webUI": null,
+  "docker": {
+    "composefile": "docker-compose.yml"
+  },
+  "providers": [
+    {
+      "id": "llamacpp-cpu-embed",
+      "baseUrlTemplate": "http://{host_ip}:{port}/v1",
+      "apiKey": "none",
+      "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)",
+      "models": [
+        {
+          "id": "qwen3-embedding-0.6b",
+          "task": "embed",
+          "dim": 1024,
+          "dimensions": 1024,
+          "contextLen": 8192,
+          "warm": false,
+          "priority": "background"
+        }
+      ]
+    }
+  ],
+  "notes": "CPU-only; the first start downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model)."
+}
diff --git a/registry/add-ons.json b/registry/add-ons.json
index edecf8bf..b297c678 100644
--- a/registry/add-ons.json
+++ b/registry/add-ons.json
@@ -2422,6 +2422,60 @@
       "notes": "Self-host via Docker or connect to an existing Linkding instance. Generate an API token in Linkding Settings > Integrations.",
       "official": true
     },
+    {
+      "id": "llamacpp-cpu-qwen3-embed",
+      "type": "bundle",
+      "name": "llama.cpp Qwen3-Embedding-0.6B (CPU)",
+      "category": "ai",
+      "description": "CPU-only text embeddings. Qwen3-Embedding-0.6B Q8_0 GGUF served by llama.cpp with an OpenAI-compatible /v1/embeddings endpoint on port 8007. Runs anywhere Docker runs (including macOS/Windows Docker Desktop) — no GPU required. Registers the llamacpp-cpu-embed provider for semantic memory/research/blog search.",
+      "author": "Crow",
+      "version": "1.0.0",
+      "host": "local",
+      "icon": "cpu",
+      "tags": [
+        "ai",
+        "embeddings",
+        "semantic-search",
+        "cpu",
+        "llama.cpp",
+        "local",
+        "privacy"
+      ],
+      "requires": {
+        "gpu_arch": [
+          "cpu"
+        ],
+        "min_ram_mb": 2048,
+        "min_disk_mb": 3000
+      },
+      "env_vars": [],
+      "port": 8007,
+      "webUI": null,
+      "docker": {
+        "composefile": "docker-compose.yml"
+      },
+      "providers": [
+        {
+          "id": "llamacpp-cpu-embed",
+          "baseUrlTemplate": "http://{host_ip}:{port}/v1",
+          "apiKey": "none",
+          "description": "OpenAI-compatible embedding endpoint (Qwen3-Embedding-0.6B Q8_0, CPU llama.cpp)",
+          "models": [
+            {
+              "id": "qwen3-embedding-0.6b",
+              "task": "embed",
+              "dim": 1024,
+              "dimensions": 1024,
+              "contextLen": 8192,
+              "warm": false,
+              "priority": "background"
+            }
+          ]
+        }
+      ],
+      "notes": "CPU-only; the first start downloads the ~640MB GGUF and caches it in a Docker volume. After install, select it as the embedding provider — set dashboard_settings 'embed_provider' = 'llamacpp-cpu-embed' (or export CROW_EMBED_PROVIDER=llamacpp-cpu-embed). 1024-dim, vector-space compatible with grackle-embed / vllm-cuda-embed (same model).",
+      "official": true
+    },
     {
       "id": "llamacpp-qwen72b",
       "type": "bundle",