-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path.env.example
More file actions
114 lines (91 loc) · 4.51 KB
/
.env.example
File metadata and controls
114 lines (91 loc) · 4.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# Full configuration reference:
# Chinese: doc/configuration.zh.md
# English: doc/configuration.en.md
# Hugging Face access token (required).
# Accept terms at https://huggingface.co/pyannote/speaker-diarization-3.1
# then create a token at https://huggingface.co/settings/tokens
HF_TOKEN=hf_replace_me
# Shared secret used by clients (e.g. OpenPlaud). When set, every /api/*
# request must send `Authorization: Bearer $API_KEY` or `X-API-Key: $API_KEY`.
# Leave empty ONLY if the service is bound to a fully trusted network.
API_KEY=change-me-to-a-long-random-string
# Set to 1 only when you intentionally run without API_KEY authentication.
ALLOW_NO_AUTH=0
# Comma-separated browser origins allowed to call the API. The default `*`
# permits any origin; restrict this to specific origins in production.
CORS_ALLOW_ORIGINS=*
# Where transcripts, uploads, and voiceprints are persisted on the host.
DATA_DIR=./data
# Where HuggingFace model weights are cached on the host.
MODEL_CACHE_DIR=./models
# Host port to publish the service on.
HOST_PORT=8780
# Max bytes accepted per upload. Requests larger than this return 413.
# Default 2 GiB — override downward if your disk is small.
MAX_UPLOAD_BYTES=2147483648
# Runtime cache and conversion limits.
JOBS_MAX_CACHE=200
FFMPEG_TIMEOUT_SEC=1800
# Optional idle model unload. Defaults to 180 seconds (3 minutes). Set to 0
# to disable idle unload and keep loaded GPU models resident. When enabled,
# loaded GPU models are released after the serialized GPU runtime has been
# idle for this many seconds; the next lazy load reselects the CUDA device
# with the most free memory.
MODEL_IDLE_TIMEOUT_SEC=180
# UID/GID the container process runs as. Must match the owner of DATA_DIR
# and MODEL_CACHE_DIR on the host, otherwise writes fail. On a typical
# single-user Linux host, `id -u` and `id -g` both return 1000, which
# matches the defaults below.
APP_UID=1000
APP_GID=1000
# CUDA_VISIBLE_DEVICES is intentionally not defined here. By default
# docker-compose.yml requests all Docker-exposed NVIDIA GPUs and does not set
# CUDA_VISIBLE_DEVICES inside the container. To restrict visibility, add an
# explicit docker-compose.override.yml environment entry, optionally backed by
# a private local .env value. Container cuda:N indexes are remapped from that
# visible set and may not match physical host GPU numbers.
# Device the pipeline runs on.
# cuda — NVIDIA GPU (requires NVIDIA Container Toolkit inside docker)
# cpu — fallback, fine for macOS / Apple Silicon / no-GPU hosts
DEVICE=cuda
# Whisper model size. Built-in options (smallest → largest):
# tiny, base, small, medium, large-v3
# Recommendations:
# GPU with ≥ 12 GB VRAM → large-v3 (default, best quality)
# GPU with < 12 GB VRAM / macOS / CPU-only → medium (3–4× faster; Chinese quality still good)
# quick smoke tests → small
WHISPER_MODEL=large-v3
# Optional: mirror for HuggingFace model downloads (e.g. https://hf-mirror.com).
HF_ENDPOINT=https://huggingface.co
# Disable HuggingFace Xet/CAS downloads by default; set to 0 only if your
# environment is known to support hf-xet reliably.
HF_HUB_DISABLE_XET=1
# Short metadata timeout lets HuggingFace Hub fall back to local cache quickly.
HF_HUB_ETAG_TIMEOUT=3
# WhisperX forced-alignment controls.
# Alignment is attempted for all languages, including zh, by default. Chinese word-level
# alignment with the default model needs torch>=2.6 under recent transformers
# safety checks; the Docker image supplies torch 2.6.0.
# Only set this as a temporary operational fallback when you intentionally
# want to complete jobs without word-level timestamps for selected languages.
WHISPERX_ALIGN_DISABLED_LANGUAGES=
# Alignment defaults to CPU to isolate wav2vec2 alignment from the GPU ASR and
# speaker-embedding runtimes. Set to pipeline/asr/cuda/cuda:0 only if you have
# validated the target CUDA stack is stable for WhisperX alignment.
WHISPERX_ALIGN_DEVICE=cpu
# Optional comma-separated language=model overrides.
# Example: WHISPERX_ALIGN_MODEL_MAP=zh=your-org/your-zh-align-model
WHISPERX_ALIGN_MODEL_MAP=
# Optional alignment model cache directory and offline/cache-only mode.
WHISPERX_ALIGN_MODEL_DIR=
WHISPERX_ALIGN_CACHE_ONLY=0
# Noise reduction defaults. Omitting denoise_model in the API uses
# DENOISE_MODEL; explicitly sending denoise_model=none disables denoising for
# that request. DENOISE_SNR_THRESHOLD only gates DeepFilterNet skips;
# noisereduce runs whenever selected.
DENOISE_MODEL=none
DENOISE_SNR_THRESHOLD=10.0
# Speaker matching and diarization/embedding defaults.
VOICEPRINT_THRESHOLD=0.75
EMBEDDING_DIM=256
PYANNOTE_MIN_DURATION_OFF=0.5
MIN_EMBED_DURATION=1.5
MAX_EMBED_DURATION=10.0