From 9f6a390ad078c7579c0835947aeee08e060ee8e8 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Thu, 18 Jun 2026 19:38:10 +0800 Subject: [PATCH 1/2] feat: adjust sampling defaults --- examples/bench.py | 26 +++++++++++++- examples/test_infer.py | 6 ++-- python/infinilm/base_config.py | 6 ++-- python/infinilm/config/engine_config.py | 6 ++-- python/infinilm/llm/llm.py | 42 +++++++++++++++------- python/infinilm/server/inference_server.py | 19 ++++++---- 6 files changed, 77 insertions(+), 28 deletions(-) diff --git a/examples/bench.py b/examples/bench.py index 37ad326c0..5754bbf6c 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -1,7 +1,11 @@ import infinicore from infinilm.modeling_utils import load_model_state_dict_by_file from infinilm.distributed import DistConfig -from infinilm.infer_engine import GenerationConfig, InferEngine +from infinilm.infer_engine import ( + GenerationConfig, + InferEngine, + read_hf_generation_config, +) from infinilm.base_config import BaseConfig from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig from infinilm.processors import AutoInfinilmProcessor @@ -88,6 +92,23 @@ def read_json_file(file_path): return json.load(file) +def resolve_generation_defaults(model_path, top_k, top_p, temperature): + generation_config = read_hf_generation_config(model_path) + + def resolve(value, name, fallback, cast): + if value is None: + value = generation_config.get(name) + if value is None: + value = fallback + return cast(value) + + return ( + resolve(top_k, "top_k", 1, int), + resolve(top_p, "top_p", 1.0, float), + resolve(temperature, "temperature", 1.0, float), + ) + + def get_test_cases( model_path: str, batch_size_list: list[int], @@ -286,6 +307,9 @@ def run( enable_paged_attn = cfg.enable_paged_attn enable_graph = cfg.enable_graph attn_backend = cfg.attn + cfg.top_k, cfg.top_p, cfg.temperature = resolve_generation_defaults( + model_path, cfg.top_k, cfg.top_p, cfg.temperature + ) if isinstance(batch_size, int): batch_size = [batch_size] diff --git a/examples/test_infer.py b/examples/test_infer.py index dffe64d7f..8c78be6be 100644 --- a/examples/test_infer.py +++ b/examples/test_infer.py @@ -12,9 +12,9 @@ def test( tp=1, enable_paged_attn=False, enable_graph=False, - top_k=1, - top_p=1.0, - temperature=1.0, + top_k=None, + top_p=None, + temperature=None, attn_backend="default", use_mla=False, image_path=None, diff --git a/python/infinilm/base_config.py b/python/infinilm/base_config.py index 6b5515275..4c0591ae7 100644 --- a/python/infinilm/base_config.py +++ b/python/infinilm/base_config.py @@ -176,9 +176,9 @@ def _add_common_args(self): self.parser.add_argument( "--prompt", type=str, default="How are you", help="default prompt text" ) - self.parser.add_argument("--top-k", type=int, default=1) - self.parser.add_argument("--top-p", type=float, default=1.0) - self.parser.add_argument("--temperature", type=float, default=1.0) + self.parser.add_argument("--top-k", type=int, default=None) + self.parser.add_argument("--top-p", type=float, default=None) + self.parser.add_argument("--temperature", type=float, default=None) # --- debug --- self.parser.add_argument("--warmup", action="store_true") diff --git a/python/infinilm/config/engine_config.py b/python/infinilm/config/engine_config.py index 044cfdda4..0dc2c45fb 100644 --- a/python/infinilm/config/engine_config.py +++ b/python/infinilm/config/engine_config.py @@ -37,9 +37,9 @@ class EngineConfig: num_blocks: int = 512 block_size: int = 256 max_cache_len: int = 4096 - temperature: float = 1.0 - top_p: float = 0.8 - top_k: int = 1 + temperature: float | None = None + top_p: float | None = None + top_k: int | None = None enable_graph: bool = False attn_backend: str = "default" use_mla: bool = False diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index f8171c0db..c42d39f81 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -29,15 +29,33 @@ from infinilm.multimodal.multimodal import resolve_multimodal_inputs from infinilm.config.kv_transfer import KVTransferConfig from infinilm.config.engine_config import EngineConfig +from infinilm.infer_engine import read_hf_generation_config from infinilm.kv_connector import KVConnectorRole, KVConnectorFactory logger = logging.getLogger(__name__) +def _resolve_generation_defaults(config: EngineConfig) -> None: + generation_config = read_hf_generation_config(config.model_path) + + def resolve(name: str, fallback, cast): + value = getattr(config, name) + if value is None: + value = generation_config.get(name) + if value is None: + value = fallback + setattr(config, name, cast(value)) + + resolve("top_k", 1, int) + resolve("top_p", 1.0, float) + resolve("temperature", 1.0, float) + + class LLMEngine: """Low-level LLM engine that handles inference execution.""" def __init__(self, config: EngineConfig): + _resolve_generation_defaults(config) self.config = config self.model_runner = ModelRunner(config) @@ -296,9 +314,9 @@ def __init__( num_blocks: int = 512, block_size: int = 256, max_cache_len: int = 4096, - temperature: float = 1.0, - top_p: float = 0.8, - top_k: int = 1, + temperature: float | None = None, + top_p: float | None = None, + top_k: int | None = None, enable_graph: bool = False, attn_backend: str = "default", use_mla: bool = False, @@ -490,9 +508,9 @@ def __init__( num_blocks: int = 512, block_size: int = 256, max_cache_len: int = 4096, - temperature: float = 1.0, - top_p: float = 0.8, - top_k: int = 1, + temperature: float | None = None, + top_p: float | None = None, + top_k: int | None = None, enable_graph: bool = False, attn_backend: str = "default", kv_transfer_config: Optional[KVTransferConfig] = None, @@ -715,13 +733,13 @@ def add_request( elif prompt is not None: prompt_token_ids = self.engine.tokenize(prompt) else: - assert messages is not None, ( - "Either messages or prompt/prompt_token_ids must be provided" - ) + assert ( + messages is not None + ), "Either messages or prompt/prompt_token_ids must be provided" - assert apply_chat_template, ( - "apply_chat_template needs to be true for multi-role conversation" - ) + assert ( + apply_chat_template + ), "apply_chat_template needs to be true for multi-role conversation" prompt = self.engine.apply_chat_template( messages, add_generation_prompt=add_generation_prompt diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 645b00656..40253bdc8 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -103,9 +103,9 @@ def __init__( num_blocks: int = 512, block_size: int = 256, max_cache_len: int = 4096, - temperature: float = 1.0, - top_p: float = 0.8, - top_k: int = 1, + temperature: float | None = None, + top_p: float | None = None, + top_k: int | None = None, host: str = "0.0.0.0", port: int = 8000, enable_graph: bool = False, @@ -194,6 +194,9 @@ async def lifespan(app: FastAPI): use_mla=self.use_mla, kv_transfer_config=self.kv_transfer_config, ) + self.temperature = self.engine.config.temperature + self.top_p = self.engine.config.top_p + self.top_k = self.engine.config.top_k self.engine.start() logger.info(f"Engine initialized with model at {self.model_path}") logger.info(f" enable_graph: {self.enable_graph}") @@ -337,10 +340,14 @@ def pick(key: str, default): if isinstance(stop, str): stop = [stop] + temperature = pick("temperature", self.temperature) + top_p = pick("top_p", self.top_p) + top_k = pick("top_k", self.top_k) + return SamplingParams( - temperature=float(pick("temperature", self.temperature)), - top_p=float(pick("top_p", self.top_p)), - top_k=int(pick("top_k", self.top_k)), + temperature=float(1.0 if temperature is None else temperature), + top_p=float(1.0 if top_p is None else top_p), + top_k=int(1 if top_k is None else top_k), max_tokens=int(max_tokens) if max_tokens is not None else None, stop=stop, ignore_eos=self.ignore_eos, From 3b92df73e69eab0100679115b38b52e30da9e8d5 Mon Sep 17 00:00:00 2001 From: wooway777 Date: Thu, 18 Jun 2026 19:39:27 +0800 Subject: [PATCH 2/2] feat: rwkv5 world text --- csrc/models/rwkv/rwkv5_for_causal_lm.cpp | 336 ++++++++++++++++++ csrc/models/rwkv/rwkv5_for_causal_lm.hpp | 145 ++++++++ examples/bench.py | 28 +- python/infinilm/infer_engine.py | 12 +- python/infinilm/llm/llm.py | 14 + .../infinilm/llm/model_runner/model_runner.py | 9 +- python/infinilm/processors/__init__.py | 11 +- .../processors/basic_llm_processor.py | 55 ++- python/infinilm/processors/processor.py | 13 + python/infinilm/processors/rwkv_processor.py | 55 +++ python/infinilm/server/inference_server.py | 1 + 11 files changed, 646 insertions(+), 33 deletions(-) create mode 100644 csrc/models/rwkv/rwkv5_for_causal_lm.cpp create mode 100644 csrc/models/rwkv/rwkv5_for_causal_lm.hpp create mode 100644 python/infinilm/processors/rwkv_processor.py diff --git a/csrc/models/rwkv/rwkv5_for_causal_lm.cpp b/csrc/models/rwkv/rwkv5_for_causal_lm.cpp new file mode 100644 index 000000000..997e6eefc --- /dev/null +++ b/csrc/models/rwkv/rwkv5_for_causal_lm.cpp @@ -0,0 +1,336 @@ +#include "rwkv5_for_causal_lm.hpp" +#include "../../global_state/global_state.hpp" +#include "../models_registry.hpp" +#include "infinicore/context/context.hpp" +#include "infinicore/ops/add.hpp" +#include "infinicore/ops/layer_norm.hpp" +#include "infinicore/ops/lerp.hpp" +#include "infinicore/ops/mul.hpp" +#include "infinicore/ops/relu.hpp" +#include "infinicore/ops/rwkv5_wkv.hpp" +#include "infinicore/ops/sigmoid.hpp" +#include "infinicore/ops/silu.hpp" + +#include +#include + +namespace infinilm::models::rwkv { + +namespace { + +infinicore::Tensor layer_state_view(const infinicore::Tensor &state, + size_t batch_size, + size_t layer_idx, + size_t num_layers, + size_t hidden_size) { + return state->narrow({{0, 0, batch_size}, {1, layer_idx, 1}}) + ->as_strided({batch_size, hidden_size}, + {static_cast(num_layers * hidden_size), 1}); +} + +infinicore::Tensor layer_wkv_state_view(const infinicore::Tensor &state, + size_t batch_size, + size_t layer_idx, + size_t num_layers, + size_t num_heads, + size_t head_size) { + const ptrdiff_t batch_stride = static_cast(num_layers * num_heads * head_size * head_size); + return state->narrow({{0, 0, batch_size}, {1, layer_idx, 1}}) + ->as_strided({batch_size, num_heads, head_size, head_size}, + {batch_stride, + static_cast(head_size * head_size), + static_cast(head_size), + 1}); +} + +infinicore::Tensor scale_tensor(const infinicore::Tensor &x, float scale) { + if (scale == 1.0f) { + return x; + } + auto zeros = infinicore::Tensor::zeros(x->shape(), x->dtype(), x->device()); + return infinicore::op::lerp(zeros, x, scale); +} + +} // namespace + +std::shared_ptr create_rwkv5_model_config(std::shared_ptr model_config) { + const std::string &model_type = model_config->get("model_type"); + if ("rwkv5" != model_type) { + throw std::runtime_error("infinilm::models::rwkv::create_rwkv5_model_config: model_type is not rwkv5"); + } + auto &j = model_config->get_config_json(); + const size_t hidden_size = j.value("hidden_size", 2048); + j["attention_hidden_size"] = j.value("attention_hidden_size", hidden_size); + if (!j.contains("intermediate_size") || j["intermediate_size"].is_null()) { + j["intermediate_size"] = static_cast((static_cast(hidden_size) * 3.5) / 32) * 32; + } + j["head_size"] = j.value("head_size", 64); + j["num_attention_heads"] = j.value("num_attention_heads", j["attention_hidden_size"].get() / j["head_size"].get()); + j["head_size_divisor"] = j.value("head_size_divisor", 8); + j["layer_norm_eps"] = j.value("layer_norm_eps", j.value("layer_norm_epsilon", 1e-5)); + j["max_position_embeddings"] = j.value("max_position_embeddings", j.value("context_length", 4096)); + if (!j.contains("dtype") && !j.contains("torch_dtype")) { + j["torch_dtype"] = "bfloat16"; + } + j["tie_word_embeddings"] = false; + return model_config; +} + +Rwkv5SelfAttention::Rwkv5SelfAttention(std::shared_ptr config, + size_t layer_idx, + const infinicore::Device &device) + : layer_idx_(layer_idx) { + const auto &dtype = config->get_dtype(); + hidden_size_ = config->get("hidden_size"); + attention_hidden_size_ = config->get("attention_hidden_size"); + head_size_ = config->get("head_size"); + num_heads_ = attention_hidden_size_ / head_size_; + head_size_divisor_ = config->get_or("head_size_divisor", 8); + auto quantization_method = config->get_quantization_method(); + + INFINICORE_NN_PARAMETER_INIT(time_decay, ({num_heads_, head_size_}, dtype, device)); + INFINICORE_NN_PARAMETER_INIT(time_faaaa, ({num_heads_, head_size_}, dtype, device)); + INFINICORE_NN_PARAMETER_INIT(time_mix_gate, ({1, 1, hidden_size_}, dtype, device)); + INFINICORE_NN_PARAMETER_INIT(time_mix_key, ({1, 1, hidden_size_}, dtype, device)); + INFINICORE_NN_PARAMETER_INIT(time_mix_value, ({1, 1, hidden_size_}, dtype, device)); + INFINICORE_NN_PARAMETER_INIT(time_mix_receptance, ({1, 1, hidden_size_}, dtype, device)); + INFINICORE_NN_MODULE_INIT(key, hidden_size_, attention_hidden_size_, quantization_method, false, dtype, device); + INFINICORE_NN_MODULE_INIT(value, hidden_size_, attention_hidden_size_, quantization_method, false, dtype, device); + INFINICORE_NN_MODULE_INIT(receptance, hidden_size_, attention_hidden_size_, quantization_method, false, dtype, device); + INFINICORE_NN_MODULE_INIT(gate, hidden_size_, attention_hidden_size_, quantization_method, false, dtype, device); + INFINICORE_NN_MODULE_INIT(output, attention_hidden_size_, hidden_size_, quantization_method, false, dtype, device); + INFINICORE_NN_MODULE_INIT(ln_x, hidden_size_, 1e-5, dtype, device); +} + +infinicore::Tensor Rwkv5SelfAttention::shifted_hidden_(const infinicore::Tensor &hidden_states, + infinicore::Tensor &state) const { + const auto shape = hidden_states->shape(); + const size_t batch_size = shape[0]; + const size_t seq_len = shape[1]; + auto shifted = infinicore::Tensor::empty(shape, hidden_states->dtype(), hidden_states->device()); + shifted->narrow({{1, 0, 1}})->view({batch_size, hidden_size_})->copy_from(state); + for (size_t t = 1; t < seq_len; ++t) { + shifted->narrow({{1, t, 1}})->copy_from(hidden_states->narrow({{1, t - 1, 1}})); + } + state->copy_from(hidden_states->narrow({{1, seq_len - 1, 1}})->view({batch_size, hidden_size_})); + return shifted; +} + +infinicore::Tensor Rwkv5SelfAttention::group_norm_(const infinicore::Tensor &x) const { + const auto shape = x->shape(); + const size_t batch_size = shape[0]; + const size_t seq_len = shape[1]; + auto zeros = infinicore::Tensor::zeros(shape, x->dtype(), x->device()); + auto scaled = infinicore::op::lerp(zeros, x, 1.0f / static_cast(head_size_divisor_)); + auto y = infinicore::Tensor::empty(shape, x->dtype(), x->device()); + auto scaled_3d = scaled->view({batch_size * seq_len, num_heads_, head_size_}); + auto y_3d = y->view({batch_size * seq_len, num_heads_, head_size_}); + auto weight = ln_x_->weight(); + auto bias = ln_x_->bias(); + for (size_t h = 0; h < num_heads_; ++h) { + auto x_h = scaled_3d->narrow({{1, h, 1}})->view({batch_size * seq_len, head_size_}); + auto y_h = y_3d->narrow({{1, h, 1}})->view({batch_size * seq_len, head_size_}); + auto w_h = weight->narrow({{0, h * head_size_, head_size_}}); + auto b_h = bias->narrow({{0, h * head_size_, head_size_}}); + infinicore::op::layer_norm_(y_h, x_h, w_h, b_h, 1e-5f); + } + return y; +} + +infinicore::Tensor Rwkv5SelfAttention::forward(const infinicore::Tensor &hidden_states, + infinicore::Tensor &attn_x_state, + infinicore::Tensor &wkv_state) const { + auto shifted = shifted_hidden_(hidden_states, attn_x_state); + auto key_mix = infinicore::op::lerp(shifted, hidden_states, time_mix_key_); + auto value_mix = infinicore::op::lerp(shifted, hidden_states, time_mix_value_); + auto receptance_mix = infinicore::op::lerp(shifted, hidden_states, time_mix_receptance_); + auto gate_mix = infinicore::op::lerp(shifted, hidden_states, time_mix_gate_); + + auto key_states = key_->forward(key_mix); + auto value_states = value_->forward(value_mix); + auto receptance_states = receptance_->forward(receptance_mix); + auto gate_states = infinicore::op::silu(gate_->forward(gate_mix)); + auto wkv = infinicore::op::rwkv5_wkv(receptance_states, key_states, value_states, time_decay_, time_faaaa_, wkv_state); + auto normed = group_norm_(wkv); + auto gated = infinicore::op::mul(normed, gate_states); + return output_->forward(gated); +} + +Rwkv5FeedForward::Rwkv5FeedForward(std::shared_ptr config, + size_t layer_idx, + const infinicore::Device &device) + : layer_idx_(layer_idx) { + const auto &dtype = config->get_dtype(); + hidden_size_ = config->get("hidden_size"); + intermediate_size_ = config->get("intermediate_size"); + auto quantization_method = config->get_quantization_method(); + + INFINICORE_NN_PARAMETER_INIT(time_mix_key, ({1, 1, hidden_size_}, dtype, device)); + INFINICORE_NN_PARAMETER_INIT(time_mix_receptance, ({1, 1, hidden_size_}, dtype, device)); + INFINICORE_NN_MODULE_INIT(key, hidden_size_, intermediate_size_, quantization_method, false, dtype, device); + INFINICORE_NN_MODULE_INIT(receptance, hidden_size_, hidden_size_, quantization_method, false, dtype, device); + INFINICORE_NN_MODULE_INIT(value, intermediate_size_, hidden_size_, quantization_method, false, dtype, device); +} + +infinicore::Tensor Rwkv5FeedForward::shifted_hidden_(const infinicore::Tensor &hidden_states, + infinicore::Tensor &state) const { + const auto shape = hidden_states->shape(); + const size_t batch_size = shape[0]; + const size_t seq_len = shape[1]; + auto shifted = infinicore::Tensor::empty(shape, hidden_states->dtype(), hidden_states->device()); + shifted->narrow({{1, 0, 1}})->view({batch_size, hidden_size_})->copy_from(state); + for (size_t t = 1; t < seq_len; ++t) { + shifted->narrow({{1, t, 1}})->copy_from(hidden_states->narrow({{1, t - 1, 1}})); + } + state->copy_from(hidden_states->narrow({{1, seq_len - 1, 1}})->view({batch_size, hidden_size_})); + return shifted; +} + +infinicore::Tensor Rwkv5FeedForward::forward(const infinicore::Tensor &hidden_states, + infinicore::Tensor &ffn_x_state) const { + auto shifted = shifted_hidden_(hidden_states, ffn_x_state); + auto key_mix = infinicore::op::lerp(shifted, hidden_states, time_mix_key_); + auto receptance_mix = infinicore::op::lerp(shifted, hidden_states, time_mix_receptance_); + auto key_states = infinicore::op::relu(key_->forward(key_mix)); + key_states = infinicore::op::mul(key_states, key_states); + auto value_states = value_->forward(key_states); + auto receptance_states = infinicore::op::sigmoid(receptance_->forward(receptance_mix)); + return infinicore::op::mul(receptance_states, value_states); +} + +Rwkv5Block::Rwkv5Block(std::shared_ptr config, + size_t layer_idx, + const infinicore::Device &device) + : layer_idx_(layer_idx) { + const auto &dtype = config->get_dtype(); + const size_t hidden_size = config->get("hidden_size"); + const double eps = config->get("layer_norm_eps"); + rescale_every_ = config->get_or("rescale_every", 0); + if (layer_idx_ == 0) { + pre_ln_ = this->register_module("pre_ln", hidden_size, eps, dtype, device); + } + INFINICORE_NN_MODULE_INIT(ln1, hidden_size, eps, dtype, device); + INFINICORE_NN_MODULE_INIT(attention, config, layer_idx, device); + INFINICORE_NN_MODULE_INIT(ln2, hidden_size, eps, dtype, device); + INFINICORE_NN_MODULE_INIT(feed_forward, config, layer_idx, device); +} + +infinicore::Tensor Rwkv5Block::forward(const infinicore::Tensor &hidden_states, + infinicore::Tensor &attn_x_state, + infinicore::Tensor &wkv_state, + infinicore::Tensor &ffn_x_state) const { + auto x = hidden_states; + if (pre_ln_) { + x = pre_ln_->forward(x); + } + auto attn = attention_->forward(ln1_->forward(x), attn_x_state, wkv_state); + if (rescale_every_ > 0) { + const size_t scale_power = layer_idx_ / rescale_every_; + if (scale_power > 0) { + attn = scale_tensor(attn, 1.0f / static_cast(size_t{1} << scale_power)); + } + } + x = infinicore::op::add(x, attn); + auto ffn = feed_forward_->forward(ln2_->forward(x), ffn_x_state); + if (rescale_every_ > 0) { + const size_t scale_power = layer_idx_ / rescale_every_; + if (scale_power > 0) { + ffn = scale_tensor(ffn, 1.0f / static_cast(size_t{1} << scale_power)); + } + } + return infinicore::op::add(x, ffn); +} + +Rwkv5Model::Rwkv5Model(std::shared_ptr config, + const infinicore::Device &device) { + const auto &dtype = config->get_dtype(); + const size_t vocab_size = config->get("vocab_size"); + const size_t hidden_size = config->get("hidden_size"); + const size_t num_hidden_layers = config->get("num_hidden_layers"); + const double eps = config->get("layer_norm_eps"); + rescale_every_ = config->get_or("rescale_every", 0); + + INFINICORE_NN_MODULE_INIT(embeddings, vocab_size, hidden_size, std::nullopt, dtype, device); + blocks_.reserve(num_hidden_layers); + for (size_t i = 0; i < num_hidden_layers; ++i) { + blocks_.push_back(this->register_module("blocks." + std::to_string(i), config, i, device)); + } + INFINICORE_NN_MODULE_INIT(ln_out, hidden_size, eps, dtype, device); +} + +infinicore::Tensor Rwkv5Model::forward(const infinilm::InfinilmModel::Input &input, + infinicore::Tensor &attn_x_state, + infinicore::Tensor &wkv_state, + infinicore::Tensor &ffn_x_state) const { + auto hidden_states = embeddings_->forward(input.input_ids.value()); + const size_t batch_size = hidden_states->shape()[0]; + const size_t num_layers = blocks_.size(); + const size_t hidden_size = hidden_states->shape()[2]; + const size_t num_heads = wkv_state->shape()[2]; + const size_t head_size = wkv_state->shape()[3]; + + for (size_t i = 0; i < num_layers; ++i) { + auto attn_x = layer_state_view(attn_x_state, batch_size, i, num_layers, hidden_size); + auto attn_kv = layer_wkv_state_view(wkv_state, batch_size, i, num_layers, num_heads, head_size); + auto ffn_x = layer_state_view(ffn_x_state, batch_size, i, num_layers, hidden_size); + hidden_states = blocks_.at(i)->forward(hidden_states, attn_x, attn_kv, ffn_x); + if (rescale_every_ > 0 && (i + 1) % rescale_every_ == 0) { + hidden_states = scale_tensor(hidden_states, 0.5f); + } + } + return ln_out_->forward(hidden_states); +} + +Rwkv5ForCausalLM::Rwkv5ForCausalLM(std::shared_ptr config, + const infinicore::Device &device) + : device_(device), dtype_(config->get_dtype()) { + model_config_ = config; + num_hidden_layers_ = config->get("num_hidden_layers"); + hidden_size_ = config->get("hidden_size"); + head_size_ = config->get("head_size"); + num_heads_ = config->get("attention_hidden_size") / head_size_; + const size_t vocab_size = config->get("vocab_size"); + + INFINICORE_NN_MODULE_INIT(rwkv, config, device); + INFINICORE_NN_MODULE_INIT(head, hidden_size_, vocab_size, false, dtype_, device); +} + +void Rwkv5ForCausalLM::ensure_state_(size_t batch_size) const { + if (state_batch_size_ >= batch_size && attn_x_state_ && wkv_state_ && ffn_x_state_) { + return; + } + state_batch_size_ = batch_size; + attn_x_state_ = infinicore::Tensor::zeros({batch_size, num_hidden_layers_, hidden_size_}, dtype_, device_); + wkv_state_ = infinicore::Tensor::zeros({batch_size, num_hidden_layers_, num_heads_, head_size_, head_size_}, infinicore::DataType::F32, device_); + ffn_x_state_ = infinicore::Tensor::zeros({batch_size, num_hidden_layers_, hidden_size_}, dtype_, device_); +} + +InfinilmModel::Output Rwkv5ForCausalLM::forward(const InfinilmModel::Input &input) const { + const size_t batch_size = input.input_ids.value()->shape()[0]; + ensure_state_(batch_size); + auto hidden_states = rwkv_->forward(input, attn_x_state_, wkv_state_, ffn_x_state_); + auto logits = head_->forward(hidden_states); + return {logits}; +} + +void Rwkv5ForCausalLM::reset_cache(const cache::CacheConfig *cache_config) { + if (cache_config == nullptr) { + cache_config_.reset(); + } else { + cache_config_ = cache_config->unique_copy(); + } + infinilm::global_state::get_forward_context().kv_cache_vec.clear(); + state_batch_size_ = 0; + attn_x_state_.reset(); + wkv_state_.reset(); + ffn_x_state_.reset(); +} + +} // namespace infinilm::models::rwkv + +namespace { +INFINILM_REGISTER_CAUSAL_LM_MODEL( + rwkv5, + infinilm::models::rwkv::Rwkv5ForCausalLM, + infinilm::models::rwkv::create_rwkv5_model_config); +} // namespace diff --git a/csrc/models/rwkv/rwkv5_for_causal_lm.hpp b/csrc/models/rwkv/rwkv5_for_causal_lm.hpp new file mode 100644 index 000000000..302c4fbd7 --- /dev/null +++ b/csrc/models/rwkv/rwkv5_for_causal_lm.hpp @@ -0,0 +1,145 @@ +#pragma once + +#include "../../cache/kv_cache.hpp" +#include "../../config/model_config.hpp" +#include "../../layers/linear/linear.hpp" +#include "../infinilm_model.hpp" +#include "infinicore/nn/embedding.hpp" +#include "infinicore/nn/layer_norm.hpp" +#include "infinicore/nn/parameter.hpp" +#include "infinicore/tensor.hpp" + +#include +#include +#include + +namespace infinilm::models::rwkv { + +class Rwkv5SelfAttention : public infinicore::nn::Module { +public: + Rwkv5SelfAttention(std::shared_ptr config, + size_t layer_idx, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + infinicore::Tensor &attn_x_state, + infinicore::Tensor &wkv_state) const; + +private: + infinicore::Tensor shifted_hidden_(const infinicore::Tensor &hidden_states, + infinicore::Tensor &state) const; + infinicore::Tensor group_norm_(const infinicore::Tensor &x) const; + + size_t layer_idx_; + size_t hidden_size_; + size_t attention_hidden_size_; + size_t head_size_; + size_t num_heads_; + size_t head_size_divisor_; + + INFINICORE_NN_PARAMETER(time_decay); + INFINICORE_NN_PARAMETER(time_faaaa); + INFINICORE_NN_PARAMETER(time_mix_gate); + INFINICORE_NN_PARAMETER(time_mix_key); + INFINICORE_NN_PARAMETER(time_mix_value); + INFINICORE_NN_PARAMETER(time_mix_receptance); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, key); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, value); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, receptance); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, gate); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, output); + INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, ln_x); +}; + +class Rwkv5FeedForward : public infinicore::nn::Module { +public: + Rwkv5FeedForward(std::shared_ptr config, + size_t layer_idx, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + infinicore::Tensor &ffn_x_state) const; + +private: + infinicore::Tensor shifted_hidden_(const infinicore::Tensor &hidden_states, + infinicore::Tensor &state) const; + + size_t layer_idx_; + size_t hidden_size_; + size_t intermediate_size_; + + INFINICORE_NN_PARAMETER(time_mix_key); + INFINICORE_NN_PARAMETER(time_mix_receptance); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, key); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, receptance); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, value); +}; + +class Rwkv5Block : public infinicore::nn::Module { +public: + Rwkv5Block(std::shared_ptr config, + size_t layer_idx, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinicore::Tensor &hidden_states, + infinicore::Tensor &attn_x_state, + infinicore::Tensor &wkv_state, + infinicore::Tensor &ffn_x_state) const; + +private: + size_t layer_idx_; + size_t rescale_every_; + std::shared_ptr pre_ln_; + INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, ln1); + INFINICORE_NN_MODULE(Rwkv5SelfAttention, attention); + INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, ln2); + INFINICORE_NN_MODULE(Rwkv5FeedForward, feed_forward); +}; + +class Rwkv5Model : public infinicore::nn::Module { +public: + Rwkv5Model(std::shared_ptr config, + const infinicore::Device &device); + + infinicore::Tensor forward(const infinilm::InfinilmModel::Input &input, + infinicore::Tensor &attn_x_state, + infinicore::Tensor &wkv_state, + infinicore::Tensor &ffn_x_state) const; + +private: + size_t rescale_every_; + + INFINICORE_NN_MODULE(infinicore::nn::Embedding, embeddings); + INFINICORE_NN_MODULE_VEC(Rwkv5Block, blocks); + INFINICORE_NN_MODULE(infinicore::nn::LayerNorm, ln_out); +}; + +class Rwkv5ForCausalLM : public infinilm::InfinilmModel { +public: + Rwkv5ForCausalLM(std::shared_ptr config, + const infinicore::Device &device); + + Output forward(const Input &input) const override; + void reset_cache(const cache::CacheConfig *cache_config) override; + +private: + void ensure_state_(size_t batch_size) const; + + size_t num_hidden_layers_; + size_t hidden_size_; + size_t num_heads_; + size_t head_size_; + infinicore::Device device_; + infinicore::DataType dtype_; + mutable size_t state_batch_size_ = 0; + mutable infinicore::Tensor attn_x_state_; + mutable infinicore::Tensor wkv_state_; + mutable infinicore::Tensor ffn_x_state_; + + INFINICORE_NN_MODULE(Rwkv5Model, rwkv); + INFINICORE_NN_MODULE(infinilm::layers::linear::ReplicatedLinear, head); +}; + +std::shared_ptr create_rwkv5_model_config(std::shared_ptr model_config); + +} // namespace infinilm::models::rwkv diff --git a/examples/bench.py b/examples/bench.py index 5754bbf6c..ed76a6470 100644 --- a/examples/bench.py +++ b/examples/bench.py @@ -42,6 +42,10 @@ "num_key_value_heads": "num_attention_heads", "head_dim": lambda cfg: cfg["hidden_size"] // cfg["num_attention_heads"], }, + "rwkv5": { + "num_key_value_heads": "num_attention_heads", + "head_dim": "head_size", + }, } @@ -167,13 +171,18 @@ def get_test_cases( return case_dict -prompt_path = ( - "examples/bench_prompt.md" - if os.path.isfile("examples/bench_prompt.md") - else "InfiniLM/examples/bench_prompt.md" -) -with open(prompt_path, "r") as f: - prompt = f.read() +def read_bench_prompt(): + prompt_path = ( + "examples/bench_prompt.md" + if os.path.isfile("examples/bench_prompt.md") + else "InfiniLM/examples/bench_prompt.md" + ) + with open(prompt_path, "r") as f: + return f.read() + + +def has_prompt_override(): + return any(arg == "--prompt" or arg.startswith("--prompt=") for arg in sys.argv[1:]) def repeat_prompt(input_ids: list[int], target_length: int): @@ -195,8 +204,10 @@ def __init__( cache_config=None, enable_graph=False, attn_backend="default", + prompt=None, ) -> None: model_path = os.path.expanduser(model_path) + prompt = read_bench_prompt() if prompt is None else prompt # ---------------------------------------------------------------------------- # # 创建模型, # ---------------------------------------------------------------------------- # @@ -343,6 +354,8 @@ def run( if enable_paged_attn and attn_backend == "default": attn_backend = "paged-attn" + prompt = cfg.prompt if has_prompt_override() else read_bench_prompt() + test = TestModel( model_path, infini_device=infini_device, @@ -351,6 +364,7 @@ def run( cache_config=cache_config, enable_graph=enable_graph, attn_backend=attn_backend, + prompt=prompt, ) # ---------------------------------------------------------------------------- # diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py index 17ee6c12f..bff061669 100644 --- a/python/infinilm/infer_engine.py +++ b/python/infinilm/infer_engine.py @@ -24,6 +24,12 @@ def read_hf_config(model_path): and config_dict.get("dtype") is None ): config_dict["torch_dtype"] = "float32" + if config_dict.get("model_type") == "rwkv5": + if config_dict.get("torch_dtype") is None and config_dict.get("dtype") is None: + config_dict["torch_dtype"] = "bfloat16" + config_dict.setdefault( + "max_position_embeddings", config_dict.get("context_length", 4096) + ) if "model_type" not in config_dict: raise ValueError( f"`model_type` is not specified in the config file `{config_path}`." @@ -239,6 +245,10 @@ def generate( if _measure_and_log_time: time_measurements = [] + is_rwkv = self.model_type == "rwkv5" + if is_rwkv: + self.reset_cache(self.get_cache_config()) + block_tables = None max_blocks_per_batch = 0 if self.enable_paged_attn: @@ -263,7 +273,7 @@ def generate( batch_size, seq_len = input_ids.shape[:2] - if self.enable_paged_attn: + if self.enable_paged_attn and not is_rwkv: input_ids = input_ids.view([1, batch_size * seq_len]) position_ids = infinicore.from_list( list(range(past_seq_len, past_seq_len + seq_len)) * batch_size, diff --git a/python/infinilm/llm/llm.py b/python/infinilm/llm/llm.py index c42d39f81..7a5c59362 100644 --- a/python/infinilm/llm/llm.py +++ b/python/infinilm/llm/llm.py @@ -31,10 +31,16 @@ from infinilm.config.engine_config import EngineConfig from infinilm.infer_engine import read_hf_generation_config from infinilm.kv_connector import KVConnectorRole, KVConnectorFactory +from infinilm.processors import AutoInfinilmProcessor logger = logging.getLogger(__name__) +def _resolve_cache_type_for_model(model_path: str, cache_type: str) -> str: + processor_cls = AutoInfinilmProcessor.get_processor_class(model_path) + return processor_cls.resolve_cache_type(cache_type) + + def _resolve_generation_defaults(config: EngineConfig) -> None: generation_config = read_hf_generation_config(config.model_path) @@ -55,6 +61,9 @@ class LLMEngine: """Low-level LLM engine that handles inference execution.""" def __init__(self, config: EngineConfig): + config.cache_type = _resolve_cache_type_for_model( + config.model_path, config.cache_type + ) _resolve_generation_defaults(config) self.config = config @@ -397,6 +406,11 @@ def generate( elif sampling_params.max_tokens is None: sampling_params = sampling_params.clone() sampling_params.max_tokens = self.config.max_tokens + if apply_chat_template and not sampling_params.stop: + default_stop = self.engine.processor.default_stop_strings() + if default_stop: + sampling_params = sampling_params.clone() + sampling_params.stop = default_stop requests = [] for content in contents: diff --git a/python/infinilm/llm/model_runner/model_runner.py b/python/infinilm/llm/model_runner/model_runner.py index 5eca6d372..88a37fbed 100644 --- a/python/infinilm/llm/model_runner/model_runner.py +++ b/python/infinilm/llm/model_runner/model_runner.py @@ -87,10 +87,7 @@ def __init__(self, config: EngineConfig): # Initialize KV connector self.kv_connector = None - if ( - self.kv_transfer_config is not None - and self.kv_transfer_config.kv_connector - ): + if self.kv_transfer_config is not None and self.kv_transfer_config.kv_connector: connector_name = self.kv_transfer_config.kv_connector self.kv_connector = KVConnectorFactory.create_connector( connector_name=connector_name, @@ -170,6 +167,10 @@ def execute_model(self, scheduler_output) -> ModelRunnerOutput: ) def _model_forward(self, scheduler_output): + self.processor.prepare_model_forward( + scheduler_output, self.model_engine, self.config + ) + # Build model inputs model_input = self.processor.build_model_inputs( scheduler_output, diff --git a/python/infinilm/processors/__init__.py b/python/infinilm/processors/__init__.py index 67f97acac..5382fe4ed 100644 --- a/python/infinilm/processors/__init__.py +++ b/python/infinilm/processors/__init__.py @@ -22,6 +22,12 @@ class AutoInfinilmProcessor: """Factory class to instantiate the appropriate model processor.""" + @classmethod + def get_processor_class(cls, model_dir_path: str): + config = AutoConfig.from_pretrained(model_dir_path, trust_remote_code=True) + model_type = config.model_type.lower() + return get_processor_class(model_type) + @classmethod def from_pretrained(cls, model_dir_path: str, **kwargs) -> InfinilmProcessor: """Instantiate a processor based on the model's configuration. @@ -30,8 +36,5 @@ def from_pretrained(cls, model_dir_path: str, **kwargs) -> InfinilmProcessor: registered Processor. Falls back to the registered default processor for unregistered or standard architectures. """ - config = AutoConfig.from_pretrained(model_dir_path, trust_remote_code=True) - model_type = config.model_type.lower() - - processor_cls = get_processor_class(model_type) + processor_cls = cls.get_processor_class(model_dir_path) return processor_cls(model_dir_path) diff --git a/python/infinilm/processors/basic_llm_processor.py b/python/infinilm/processors/basic_llm_processor.py index 6948aa41c..69f3a0380 100644 --- a/python/infinilm/processors/basic_llm_processor.py +++ b/python/infinilm/processors/basic_llm_processor.py @@ -24,8 +24,6 @@ def __call__(self, prompt: str, return_tensors: str = None, **kwargs) -> dict: if return_tensors is None: return self.tokenizer(prompt, add_special_tokens=False) elif return_tensors == "infini": - import infinicore - result = {} for key, tensor in self.tokenizer( prompt, return_tensors="pt", add_special_tokens=False @@ -44,21 +42,15 @@ def apply_chat_template( tokenize: bool = True, **kwargs, ): - normalized_conversation = [] - for message in conversation: - if isinstance(message["content"], list): - assert len(message["content"]) == 1, ( - "Only one content item supported in list" - ) - content_item = message["content"][0] - assert "type" in content_item and "text" in content_item, ( - "Content dict must have 'type' and 'text' keys" - ) - normalized_conversation.append( - {"role": message["role"], "content": content_item["text"]} - ) - else: - normalized_conversation.append(message) + normalized_conversation = self._normalize_conversation(conversation) + if self.tokenizer.chat_template is None: + prompt = self._messages_to_prompt(normalized_conversation) + if not tokenize: + return prompt + return self.tokenizer(prompt, add_special_tokens=False, **kwargs)[ + "input_ids" + ] + return self.tokenizer.apply_chat_template( conversation=normalized_conversation, add_generation_prompt=add_generation_prompt, @@ -66,6 +58,35 @@ def apply_chat_template( **kwargs, ) + @classmethod + def _normalize_conversation(cls, conversation): + normalized_conversation = [] + for message in conversation: + normalized_conversation.append( + {**message, "content": cls._content_to_text(message.get("content", ""))} + ) + return normalized_conversation + + @staticmethod + def _content_to_text(content) -> str: + if isinstance(content, list): + parts = [] + for item in content: + if isinstance(item, dict) and item.get("type") == "text": + parts.append(str(item.get("text", ""))) + elif isinstance(item, str): + parts.append(item) + return "".join(parts) + return str(content) + + @staticmethod + def _messages_to_prompt(conversation) -> str: + return "\n".join( + str(message.get("content", "")) + for message in conversation + if message.get("content", "") + ) + @override def build_model_inputs( self, diff --git a/python/infinilm/processors/processor.py b/python/infinilm/processors/processor.py index a2952bc1e..b813a7d58 100644 --- a/python/infinilm/processors/processor.py +++ b/python/infinilm/processors/processor.py @@ -29,6 +29,19 @@ def build_model_inputs(self, scheduler_output, **kwargs) -> dict: """Build batched infinilm model inputs from the scheduler output.""" raise NotImplementedError("build_model_inputs is not implemented yet") + @classmethod + def resolve_cache_type(cls, cache_type: str) -> str: + """Return the cache type this processor supports for LLM scheduling.""" + return cache_type + + def prepare_model_forward(self, scheduler_output, model_engine, engine_config): + """Adjust scheduler/model state before a model forward pass.""" + return None + + def default_stop_strings(self) -> list[str]: + """Return model-specific stop strings for chat generation.""" + return [] + def get_tokenizer(self): """Return the text tokenizer associated with this processor.""" raise NotImplementedError("get_tokenizer is not implemented yet") diff --git a/python/infinilm/processors/rwkv_processor.py b/python/infinilm/processors/rwkv_processor.py new file mode 100644 index 000000000..8ade18642 --- /dev/null +++ b/python/infinilm/processors/rwkv_processor.py @@ -0,0 +1,55 @@ +import logging + +from .basic_llm_processor import BasicLLMProcessor +from .processor import register_processor + +logger = logging.getLogger(__name__) + + +@register_processor("rwkv5") +class Rwkv5Processor(BasicLLMProcessor): + @classmethod + def resolve_cache_type(cls, cache_type: str) -> str: + if cache_type == "paged": + logger.info("RWKV5 uses recurrent state; using static cache scheduling") + return "static" + return cache_type + + def prepare_model_forward(self, scheduler_output, model_engine, engine_config): + if engine_config.cache_type == "static" and scheduler_output.is_prefill: + scheduler_output.prefix_hit_len = 0 + model_engine.reset_cache(model_engine.get_cache_config()) + + def default_stop_strings(self) -> list[str]: + return ["\n\nUser:", "\nUser:"] + + def apply_chat_template( + self, + conversation, + add_generation_prompt: bool = False, + tokenize: bool = True, + **kwargs, + ): + normalized_conversation = self._normalize_conversation(conversation) + if self.tokenizer.chat_template is not None: + return self.tokenizer.apply_chat_template( + conversation=normalized_conversation, + add_generation_prompt=add_generation_prompt, + tokenize=tokenize, + **kwargs, + ) + + prompt_parts = [ + "User: hi\n" + "Assistant: Hi. I am your assistant and I will provide expert full response in full details. " + "Please feel free to ask any question and I will always answer it." + ] + for message in normalized_conversation: + role = "Assistant" if message.get("role") == "assistant" else "User" + prompt_parts.append(f"{role}: {message.get('content', '')}") + prompt = "\n\n".join(prompt_parts) + if add_generation_prompt: + prompt = prompt.rstrip() + "\n\nAssistant:" + if not tokenize: + return prompt + return self.tokenizer(prompt, add_special_tokens=False, **kwargs)["input_ids"] diff --git a/python/infinilm/server/inference_server.py b/python/infinilm/server/inference_server.py index 40253bdc8..dc1ffa4bf 100644 --- a/python/infinilm/server/inference_server.py +++ b/python/infinilm/server/inference_server.py @@ -197,6 +197,7 @@ async def lifespan(app: FastAPI): self.temperature = self.engine.config.temperature self.top_p = self.engine.config.top_p self.top_k = self.engine.config.top_k + self.cache_type = self.engine.config.cache_type self.engine.start() logger.info(f"Engine initialized with model at {self.model_path}") logger.info(f" enable_graph: {self.enable_graph}")