-
Notifications
You must be signed in to change notification settings - Fork 73
feat(moe): add MoE inference and expert parallel support #444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| #include "spdlog/spdlog.h" | ||
| #include <future> | ||
| #include <stdexcept> | ||
| #include <unordered_set> | ||
|
|
||
| namespace infinilm::engine { | ||
|
|
||
|
|
@@ -67,19 +68,19 @@ void InferEngine::load_param(const std::string &name, const infinicore::Tensor & | |
| } | ||
| } | ||
|
|
||
| void InferEngine::load_params(const std::unordered_map<std::string, infinicore::Tensor> &params) { | ||
| void InferEngine::load_params(const std::unordered_map<std::string, infinicore::Tensor> &params, bool strict) { | ||
| if (workers_.size() <= 1 || weight_load_mode_ == "sync") { | ||
| for (auto &worker : workers_) { | ||
| worker->load_params(params); | ||
| worker->load_params(params, strict); | ||
| } | ||
| return; | ||
| } | ||
|
|
||
| std::vector<std::future<void>> futures; | ||
| futures.reserve(workers_.size()); | ||
| for (auto &worker : workers_) { | ||
| futures.emplace_back(std::async(std::launch::async, [&worker, &params] { | ||
| worker->load_params(params); | ||
| futures.emplace_back(std::async(std::launch::async, [&worker, &params, strict] { | ||
| worker->load_params(params, strict); | ||
| })); | ||
| } | ||
| for (auto &future : futures) { | ||
|
|
@@ -118,7 +119,16 @@ std::vector<std::string> InferEngine::state_dict_keys() { | |
| if (0 == workers_.size()) { | ||
| throw std::runtime_error(" Model object not found. "); | ||
| } | ||
| return workers_.front()->state_dict_keys(); | ||
| std::vector<std::string> keys; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个写法,我看了好一会才看懂。 `std::vector<std::string> keys_vec(keys.begin(), keys.end());` |
||
| std::unordered_set<std::string> seen; | ||
| for (auto &worker : workers_) { | ||
| for (const auto &key : worker->state_dict_keys()) { | ||
| if (seen.emplace(key).second) { | ||
| keys.push_back(key); | ||
| } | ||
| } | ||
| } | ||
| return keys; | ||
| } | ||
|
|
||
| //------------------------------------------------------ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -87,14 +87,15 @@ void RankWorker::load_param(const std::string &name, | |
| //------------------------------------------------------ | ||
| // load_params -- synchronous batch load | ||
| //------------------------------------------------------ | ||
| void RankWorker::load_params(const std::unordered_map<std::string, infinicore::Tensor> &params) { | ||
| void RankWorker::load_params(const std::unordered_map<std::string, infinicore::Tensor> &params, bool strict) { | ||
| { | ||
| std::lock_guard<std::mutex> lock(mutex_); | ||
| if (should_exit_) { | ||
| throw std::runtime_error("RankWorker is closing; cannot load_params"); | ||
| } | ||
|
|
||
| pending_params_ = params; | ||
| pending_params_strict_ = strict; | ||
| job_cmd_ = Command::LOAD_BATCH; | ||
| has_job_ = true; | ||
| job_done_ = false; | ||
|
|
@@ -295,6 +296,7 @@ void RankWorker::thread_loop() { | |
| std::string local_param_name; | ||
| infinicore::Tensor local_param; | ||
| std::unordered_map<std::string, infinicore::Tensor> local_params; | ||
| bool local_params_strict = true; | ||
| Input local_args; | ||
| std::unique_ptr<cache::CacheConfig> local_cache_config; | ||
|
|
||
|
|
@@ -314,6 +316,8 @@ void RankWorker::thread_loop() { | |
| local_param = pending_param_; | ||
| } else if (local_cmd == Command::LOAD_BATCH) { | ||
| local_params = std::move(pending_params_); | ||
| local_params_strict = pending_params_strict_; | ||
| pending_params_strict_ = true; | ||
| pending_params_.clear(); | ||
| } else if (local_cmd == Command::PREPROCESS) { | ||
|
|
||
|
|
@@ -353,7 +357,7 @@ void RankWorker::thread_loop() { | |
|
|
||
| } else if (local_cmd == Command::LOAD_BATCH) { | ||
| try { | ||
| model_->load_parameters_no_sync(local_params); | ||
| model_->load_parameters_no_sync(local_params, local_params_strict); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 等价于这个写法么: `model_->load_parameters_no_sync(local_params, strict);` |
||
| infinicore::context::syncStream(); | ||
| } catch (const std::exception &e) { | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,93 @@ | ||
| #pragma once | ||
|
|
||
| #include "topk_output.hpp" | ||
|
|
||
| #include "infinicore/tensor.hpp" | ||
|
|
||
| #include <cstddef> | ||
|
|
||
| namespace infinilm::layers::moe { | ||
|
|
||
| enum class DispatchOutputFormat { | ||
| Standard, | ||
| DeepEPNormal, | ||
| DeepEPLL, | ||
| }; | ||
|
|
||
| enum class CombineInputFormat { | ||
| Standard, | ||
| DeepEPNormal, | ||
| DeepEPLL, | ||
| }; | ||
|
|
||
| struct DispatchOutput { | ||
| DispatchOutputFormat format = DispatchOutputFormat::Standard; | ||
| infinicore::Tensor hidden_states; | ||
| infinicore::Tensor hidden_states_scale; | ||
| TopKOutput topk_output; | ||
| infinicore::Tensor expert_map; | ||
| }; | ||
|
|
||
| struct MoeRoutingMetadata { | ||
| infinicore::Tensor sorted_token_ids; | ||
| infinicore::Tensor expert_ids; | ||
| infinicore::Tensor num_tokens_post_padded; | ||
|
|
||
| infinicore::Tensor expert_offsets; | ||
| infinicore::Tensor blockscale_offsets; | ||
| infinicore::Tensor problem_sizes1; | ||
| infinicore::Tensor problem_sizes2; | ||
| infinicore::Tensor input_permutation; | ||
| infinicore::Tensor output_permutation; | ||
|
|
||
| bool has_grouped_gemm_metadata = false; | ||
| }; | ||
|
|
||
| struct CombineInput { | ||
| CombineInputFormat format = CombineInputFormat::Standard; | ||
| infinicore::Tensor hidden_states; | ||
| TopKOutput topk_output; | ||
| MoeRoutingMetadata routing_metadata; | ||
| }; | ||
|
|
||
| struct MoeWeights { | ||
| infinicore::Tensor packed_w13; | ||
| infinicore::Tensor packed_w2; | ||
|
|
||
| bool empty() const { | ||
| return !packed_w13 && !packed_w2; | ||
| } | ||
|
|
||
| bool has_packed_dense_weights() const { | ||
| return packed_w13 && packed_w2; | ||
| } | ||
| }; | ||
|
|
||
| struct MoeWorkspace { | ||
| infinicore::Tensor ep_gathered_hidden_states; | ||
| infinicore::Tensor ep_gathered_topk_weights; | ||
| infinicore::Tensor ep_gathered_topk_ids; | ||
| infinicore::Tensor ep_reduced_hidden_states; | ||
| infinicore::Tensor fused_moe_output; | ||
|
|
||
| infinicore::Tensor sorted_token_ids; | ||
| infinicore::Tensor expert_ids; | ||
| infinicore::Tensor num_tokens_post_padded; | ||
| infinicore::Tensor expert_offsets; | ||
| infinicore::Tensor blockscale_offsets; | ||
| infinicore::Tensor problem_sizes1; | ||
| infinicore::Tensor problem_sizes2; | ||
| infinicore::Tensor input_permutation; | ||
| infinicore::Tensor output_permutation; | ||
|
|
||
| size_t sorted_token_ids_capacity = 0; | ||
| size_t expert_ids_capacity = 0; | ||
| size_t ep_gathered_tokens_capacity = 0; | ||
| size_t ep_reduced_tokens_capacity = 0; | ||
| size_t fused_moe_output_tokens_capacity = 0; | ||
| size_t blockscale_offsets_capacity = 0; | ||
| size_t permutation_capacity = 0; | ||
| size_t prepared_num_experts = 0; | ||
| }; | ||
|
|
||
| } // namespace infinilm::layers::moe |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,13 @@ | ||
| #pragma once | ||
|
|
||
| #include "infinicore/tensor.hpp" | ||
|
|
||
| namespace infinilm::layers::moe { | ||
|
|
||
| struct TopKOutput { | ||
| infinicore::Tensor topk_weights; | ||
| infinicore::Tensor topk_ids; | ||
| infinicore::Tensor router_logits; | ||
| }; | ||
|
|
||
| } // namespace infinilm::layers::moe |
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这个新增的replay_output变量,以及graph编译时新增和修改的代码。可以注释或解释一下么,不知道啥意思