From a04ef3e1c88cafb0409f9aebbd9cc4b95b675001 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Thu, 22 Jan 2026 16:42:10 +0800 Subject: [PATCH 01/42] feat:add Qwen2.5omni text modal processing --- examples/CMakeLists.txt | 1 + examples/qwen2_5omni/CMakeLists.txt | 3 + examples/qwen2_5omni/config_qwen2_5omni.json | 495 ++++++++++++++++++ examples/qwen2_5omni/text_infer.cpp | 72 +++ .../qwen2_5omni/configuration_qwen2_5omni.hpp | 97 ++++ .../qwen2_5omni/modeling_qwen2_5omni.hpp | 357 +++++++++++++ .../qwen2_5omni/tokenization_qwen2_5omni.hpp | 252 +++++++++ 7 files changed, 1277 insertions(+) create mode 100644 examples/qwen2_5omni/CMakeLists.txt create mode 100644 examples/qwen2_5omni/config_qwen2_5omni.json create mode 100644 examples/qwen2_5omni/text_infer.cpp create mode 100644 mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp create mode 100644 mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp create mode 100644 mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 180c3cbe6..a2426f229 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(qwen2vl) add_subdirectory(qwen2vl_tracer) add_subdirectory(qwen2_5vl) add_subdirectory(qwen2_5vl_tracer) +add_subdirectory(qwen2_5omni) add_subdirectory(llama) add_subdirectory(minicpm_o) add_subdirectory(minicpm4) diff --git a/examples/qwen2_5omni/CMakeLists.txt b/examples/qwen2_5omni/CMakeLists.txt new file mode 100644 index 000000000..3141b56d7 --- /dev/null +++ b/examples/qwen2_5omni/CMakeLists.txt @@ -0,0 +1,3 @@ +add_executable(mllm-qwen2_5-omni-text-runner text_infer.cpp) +target_link_libraries(mllm-qwen2_5-omni-text-runner PRIVATE MllmRT MllmCPUBackend) +target_include_directories(mllm-qwen2_5-omni-text-runner PRIVATE ${MLLM_INCLUDE_DIR}) diff --git a/examples/qwen2_5omni/config_qwen2_5omni.json b/examples/qwen2_5omni/config_qwen2_5omni.json new file mode 100644 index 000000000..633e1b2b1 --- /dev/null +++ b/examples/qwen2_5omni/config_qwen2_5omni.json @@ -0,0 +1,495 @@ +{ + "architectures": [ + "Qwen2_5OmniModel" + ], + "enable_audio_output": true, + "enable_talker": true, + "model_type": "qwen2_5_omni", + "talker_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "Qwen2.5-Omni-7B/talker", + "architectures": [ + "Qwen2OmniTalkerForConditionalGeneration" + ], + "attention_dropout": 0.0, + "audio_end_token_id": 151648, + "audio_start_token_id": 151647, + "audio_token_index": 151646, + "embedding_size": 3584, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 896, + "image_token_index": 151655, + "init_std": 0.02, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2_5_omni_talker", + "num_attention_heads": 12, + "num_hidden_layers": 24, + "num_key_value_heads": 4, + "position_id_per_seconds": 25, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "seconds_per_chunk": 2, + "sliding_window": 32768, + "spatial_merge_size": 2, + "torch_dtype": "bfloat16", + "tts_codec_end_token_id": 8294, + "tts_codec_mask_token_id": 8296, + "tts_codec_pad_token_id": 8292, + "tts_codec_start_token_id": 8293, + "tts_text_end_token_id": 151861, + "tts_text_pad_token_id": 151859, + "tts_text_start_token_id": 151860, + "use_cache": true, + "use_sliding_window": false, + "video_token_index": 151656, + 
"vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vocab_size": 8448 + }, + "thinker_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "Qwen2.5-Omni-7B/thinker", + "architectures": [ + "Qwen2OmniNaViTThinkerForConditionalGeneration" + ], + "audio_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "activation_dropout": 0.0, + "activation_function": "gelu", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "d_model": 1280, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.0, + "early_stopping": false, + "encoder_attention_heads": 20, + "encoder_ffn_dim": 5120, + "encoder_layerdrop": 0.0, + "encoder_layers": 32, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "init_std": 0.02, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_source_positions": 1500, + "min_length": 0, + "model_type": "qwen2_5_omni_audio_encoder", + "n_window": 100, + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 32, + "num_mel_bins": 128, + "num_return_sequences": 1, + "output_attentions": false, + "output_dim": 3584, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "scale_embedding": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "text_config": { + "model_type": "qwen2_5_omni_text", + "hidden_act": "silu", + "hidden_size": 3584, + "init_std": 0.02, + "intermediate_size": 18944, + "vocab_size": 152064, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "use_cache": true, + "rope_theta": 1000000.0, + "use_sliding_window": false, + "sliding_window": 32768, + "attention_dropout": 0.0, + "tie_word_embeddings": false + }, + "audio_end_token_id": 151648, + "audio_start_token_id": 151647, + "audio_token_index": 151646, + "bos_token_id": 151644, + "eos_token_id": 151645, + "ignore_index": -100, + "image_token_index": 151655, + "init_std": 0.02, + "model_type": "qwen2_5_omni_thinker", + "pad_token_id": 151643, + "position_id_per_seconds": 25, + "seconds_per_chunk": 2, + "torch_dtype": "bfloat16", + "user_token_id": 872, + "video_token_index": 151656, + "vision_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + 
"begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "depth": 32, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "embed_dim": 1280, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fullatt_block_indexes": [ + 7, + 15, + 23, + 31 + ], + "hidden_act": "silu", + "hidden_size": 1280, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "in_channels": 3, + "in_chans": 3, + "init_std": 0.02, + "intermediate_size": 3420, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "qwen2_5_omni_vision_encoder", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_heads": 16, + "num_return_sequences": 1, + "out_hidden_size": 3584, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "temporal_patch_size": 2, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "tokens_per_second": 25, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "window_size": 112 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654 + }, + "token2wav_config": { + "_attn_implementation_autoset": true, + "bigvgan_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "mel_dim": 80, + "min_length": 0, + "model_type": "qwen2_5_omni_bigvgan", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": 
null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "upsample_initial_channel": 1536, + "upsample_kernel_sizes": [ + 11, + 7, + 4, + 4, + 4, + 4 + ], + "upsample_rates": [ + 5, + 3, + 2, + 2, + 2, + 2 + ], + "use_bfloat16": false, + "use_bias_at_final": false + }, + "dit_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "depth": 22, + "dim": 1024, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.1, + "early_stopping": false, + "emb_dim": 512, + "enc_attention_channels": 64, + "enc_channels": [ + 256, + 256, + 256, + 256, + 768 + ], + "enc_dilations": [ + 1, + 2, + 3, + 4, + 1 + ], + "enc_dim": 128, + "enc_emb_dim": 192, + "enc_global_context": true, + "enc_kernel_sizes": [ + 5, + 3, + 3, + 3, + 1 + ], + "enc_lin_neurons": 192, + "enc_res2net_scale": 2, + "enc_se_channels": 64, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "ff_mult": 2, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 64, + "heads": 16, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "mel_dim": 80, + "min_length": 0, + "model_type": "qwen2_5_omni_dit", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_embeds": 8193, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repeats": 2, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "model_type": "qwen2_5_omni_token2wav" + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.50.0.dev0" +} \ No newline at end of file diff --git a/examples/qwen2_5omni/text_infer.cpp b/examples/qwen2_5omni/text_infer.cpp new file mode 100644 index 000000000..299a0e07d --- /dev/null +++ b/examples/qwen2_5omni/text_infer.cpp @@ -0,0 +1,72 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
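+// Text-only inference example: loads the Qwen2.5-Omni config, tokenizer, and thinker
+// weights, reads a single prompt from stdin, and streams the generated reply token by token.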
+ +#include +#include +#include +#include +#include +#include + +using mllm::Argparse; + +MLLM_MAIN({ + mllm::Logger::level() = mllm::LogLevel::kError; + + auto& help = Argparse::add("-h|--help").help("Show help message"); + auto& model_path = Argparse::add("-m|--model_path").help("Model path").required(true); + auto& model_version = Argparse::add("-mv|--model_version").help("Model version").required(true); + auto& tokenizer_path = Argparse::add("-t|--tokenizer_path").help("Tokenizer directory").required(true); + auto& config_path = Argparse::add("-c|--config_path").help("Config path").required(true); + + Argparse::parse(argc, argv); + + mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1; + if (model_version.get() == "v1") { + file_version = mllm::ModelFileVersion::kV1; + } else if (model_version.get() == "v2") { + file_version = mllm::ModelFileVersion::kV2; + } + + if (help.isSet()) { + Argparse::printHelp(); + mllm::shutdownContext(); + return 0; + } + + { + auto qwen2_5omni_cfg = mllm::models::qwen2_5omni::Qwen2_5OmniConfig(config_path.get()); + auto qwen2_5omni_tokenizer = mllm::models::qwen2_5omni::Qwen2_5OmniTokenizer(tokenizer_path.get()); + auto qwen2_5omni = mllm::models::qwen2_5omni::Qwen2_5OmniForCausalLM(qwen2_5omni_cfg); + + auto param = mllm::load(model_path.get(), file_version); + qwen2_5omni.thinker_.load(param); + + fmt::print("\n{:*^60}\n", " Qwen2.5-Omni Text CLI "); + fmt::print("Enter 'exit' or 'quit' to end the session\n\n"); + + std::string prompt_text; + + fmt::print("💬 Prompt text (or 'exit/quit'): "); + std::getline(std::cin, prompt_text); + + if (prompt_text == "exit" || prompt_text == "quit") { return 0; } + + try { + fmt::print("🔄 Processing...\n"); + auto inputs = qwen2_5omni_tokenizer.convertMessage({.prompt = prompt_text}); + + fmt::print("\n🤖 Response: "); + for (auto& step : qwen2_5omni.chat(inputs)) { + std::wcout << qwen2_5omni_tokenizer.detokenize(step.cur_token_id) << std::flush; + } + + fmt::print("\n{}\n", std::string(60, '-')); + } catch (const std::exception& e) { fmt::print("\n❌ Error: {}\n{}\n", e.what(), std::string(60, '-')); } + + qwen2_5omni.perfSummary(); + } + + mllm::print("\n"); + mllm::memoryReport(); +}) diff --git a/mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp b/mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp new file mode 100644 index 000000000..2b0cb1ee8 --- /dev/null +++ b/mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp @@ -0,0 +1,97 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
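+// Flattens config_qwen2_5omni.json (thinker_config.text_config plus the special token ids)
+// into the fields used by the C++ text model; falls back to top-level keys when no
+// thinker_config block is present.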
+#pragma once + +#include +#include + +#include "mllm/core/aops/LinearOp.hpp" +#include "mllm/engine/ConfigFile.hpp" + +namespace mllm::models::qwen2_5omni { + +struct Qwen2_5OmniConfig : protected ConfigFile { + Qwen2_5OmniConfig() = default; + + explicit Qwen2_5OmniConfig(const std::string& file_path) : ConfigFile(file_path) { + auto& root = data(); + + if (root.contains("thinker_config")) { + auto& thinker_cfg = root["thinker_config"]; + auto& text_cfg = thinker_cfg["text_config"]; + + hidden_size = text_cfg["hidden_size"]; + intermediate_size = text_cfg["intermediate_size"]; + num_attention_heads = text_cfg["num_attention_heads"]; + num_key_value_heads = text_cfg["num_key_value_heads"]; + num_hidden_layers = text_cfg["num_hidden_layers"]; + max_position_embeddings = text_cfg["max_position_embeddings"]; + rms_norm_eps = text_cfg["rms_norm_eps"]; + vocab_size = text_cfg["vocab_size"]; + rope_theta = text_cfg["rope_theta"]; + tie_word_embeddings = text_cfg.value("tie_word_embeddings", false); + + if (text_cfg.contains("rope_scaling") && text_cfg["rope_scaling"].contains("mrope_section")) { + mrope_section = text_cfg["rope_scaling"]["mrope_section"].get>(); + } + + bos_token_id = thinker_cfg.value("bos_token_id", bos_token_id); + eos_token_id = thinker_cfg.value("eos_token_id", eos_token_id); + pad_token_id = thinker_cfg.value("pad_token_id", pad_token_id); + image_token_id = thinker_cfg.value("image_token_index", image_token_id); + audio_token_id = thinker_cfg.value("audio_token_index", audio_token_id); + video_token_id = thinker_cfg.value("video_token_index", video_token_id); + } else { + hidden_size = root["hidden_size"]; + intermediate_size = root["intermediate_size"]; + num_attention_heads = root["num_attention_heads"]; + num_key_value_heads = root["num_key_value_heads"]; + num_hidden_layers = root["num_hidden_layers"]; + max_position_embeddings = root["max_position_embeddings"]; + rms_norm_eps = root["rms_norm_eps"]; + vocab_size = root["vocab_size"]; + rope_theta = root["rope_theta"]; + tie_word_embeddings = root.value("tie_word_embeddings", tie_word_embeddings); + if (root.contains("mrope_section")) { + mrope_section = root["mrope_section"].get>(); + } + bos_token_id = root.value("bos_token_id", bos_token_id); + eos_token_id = root.value("eos_token_id", eos_token_id); + pad_token_id = root.value("pad_token_id", pad_token_id); + image_token_id = root.value("image_token_id", image_token_id); + audio_token_id = root.value("audio_token_id", audio_token_id); + video_token_id = root.value("video_token_id", video_token_id); + } + + max_cache_length = root.value("max_cache_length", max_position_embeddings); + + if (root.contains("linear_impl_type")) { + linear_impl_type = aops::str2LinearImplTypes(root["linear_impl_type"]); + } + } + + int32_t hidden_size = 3584; + int32_t intermediate_size = 18944; + int32_t num_attention_heads = 28; + int32_t num_key_value_heads = 4; + int32_t num_hidden_layers = 28; + int32_t max_position_embeddings = 32768; + float rms_norm_eps = 1e-06f; + int32_t vocab_size = 152064; + std::vector mrope_section = {16, 24, 24}; + float rope_theta = 1000000.0f; + bool tie_word_embeddings = false; + + int32_t max_cache_length = 32768; + + int64_t bos_token_id = 151644; + int64_t eos_token_id = 151645; + int64_t pad_token_id = 151643; + int64_t image_token_id = 151655; + int64_t audio_token_id = 151646; + int64_t video_token_id = 151656; + + aops::LinearImplTypes linear_impl_type = aops::LinearImplTypes::kDefault; +}; + +} // namespace mllm::models::qwen2_5omni diff --git 
a/mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp b/mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp new file mode 100644 index 000000000..7bd00baa7 --- /dev/null +++ b/mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp @@ -0,0 +1,357 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. +#pragma once + +#include + +#include "mllm/mllm.hpp" +#include "mllm/nn/Module.hpp" +#include "mllm/nn/Nn.hpp" +#include "mllm/nn/Functional.hpp" +#include "mllm/nn/lmcache/StaticCache.hpp" +#include "mllm/models/ARGeneration.hpp" +#include "mllm/utils/Enumerate.hpp" + +#include "mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp" + +namespace mllm::models::qwen2_5omni { + +inline auto makeMultimodalRoPEInvFreq(int output_dim, float rope_theta) -> Tensor { + auto inv_freq = Tensor::empty({output_dim / 2}, kFloat32, kCPU).alloc(); + auto inv_freq_ptr = inv_freq.ptr(); + for (int i = 0; i < output_dim / 2; i++) { inv_freq_ptr[i] = 1.0f / std::pow(rope_theta, 2.0f * i / output_dim); } + return inv_freq; +} + +inline auto makeMultimodalPositionEmbedding(Tensor& position_ids, const Tensor& inv_freq, int seq_len, int output_dim, + const std::vector& mrope_section) -> std::pair { + MLLM_RT_ASSERT_EQ(position_ids.shape().size(), 3); + MLLM_RT_ASSERT_EQ(position_ids.shape()[1], 1); + + Tensor tmp_sin = Tensor::empty({3, position_ids.shape()[2], inv_freq.shape()[0] * 2}).alloc(); + Tensor tmp_cos = Tensor::empty({3, position_ids.shape()[2], inv_freq.shape()[0] * 2}).alloc(); + + for (int b = 0; b < 3; ++b) { + for (int d = 0; d < inv_freq.shape()[0]; ++d) { + for (int s = 0; s < position_ids.shape()[2]; ++s) { + auto value = inv_freq.ptr()[d] * (*position_ids.offsettedPtr({b, 0, s})); + *tmp_cos.offsettedPtr({b, s, d}) = cosf(value); + *tmp_cos.offsettedPtr({b, s, d + inv_freq.shape()[0]}) = cosf(value); + *tmp_sin.offsettedPtr({b, s, d}) = sinf(value); + *tmp_sin.offsettedPtr({b, s, d + inv_freq.shape()[0]}) = sinf(value); + } + } + } + + Tensor sin = Tensor::nil(); + Tensor cos = Tensor::nil(); + + if (!mrope_section.empty()) { + auto double_rope_section = mrope_section; + for (int i : mrope_section) { double_rope_section.push_back(i); } + + int num_rows = tmp_sin.shape()[1]; + int num_cols = tmp_sin.shape()[2]; + + sin = Tensor::empty({num_rows, num_cols}, kFloat32, kCPU).alloc(); + cos = Tensor::empty({num_rows, num_cols}, kFloat32, kCPU).alloc(); + + std::vector start_cols; + int current_start = 0; + start_cols.push_back(current_start); + for (int s : double_rope_section) { + current_start += s; + start_cols.push_back(current_start); + } + + for (int j = 0; j < static_cast(double_rope_section.size()); ++j) { + int layer = j % 3; + int s_j = double_rope_section[j]; + int start_col_in = start_cols[j]; + int start_col_out = start_cols[j]; + for (int row = 0; row < num_rows; ++row) { + auto in_cos_row_ptr = tmp_cos.offsettedPtr({layer, row, 0}); + auto out_cos_row_ptr = cos.offsettedPtr({row, 0}); + for (int c = 0; c < s_j; ++c) { out_cos_row_ptr[start_col_out + c] = in_cos_row_ptr[start_col_in + c]; } + + auto in_sin_row_ptr = tmp_sin.offsettedPtr({layer, row, 0}); + auto out_sin_row_ptr = sin.offsettedPtr({row, 0}); + for (int c = 0; c < s_j; ++c) { out_sin_row_ptr[start_col_out + c] = in_sin_row_ptr[start_col_in + c]; } + } + } + } else { + sin = tmp_sin; + cos = tmp_cos; + } + + return {sin, cos}; +} + +class Qwen2_5OmniMLP final : public nn::Module { + nn::Linear gate_proj_; + nn::Linear up_proj_; + nn::Linear down_proj_; + nn::SiLU silu_; + + public: + Qwen2_5OmniMLP() = default; + 
Qwen2_5OmniMLP(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + gate_proj_ = reg("gate_proj", cfg.hidden_size, cfg.intermediate_size, false, cfg.linear_impl_type); + silu_ = reg("act"); + up_proj_ = reg("up_proj", cfg.hidden_size, cfg.intermediate_size, false, cfg.linear_impl_type); + down_proj_ = reg("down_proj", cfg.intermediate_size, cfg.hidden_size, false, cfg.linear_impl_type); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto x = gate_proj_(inputs[0]); + x = silu_(x); + auto y = up_proj_(inputs[0]); + x = x * y; + x = down_proj_(x); + return {x}; + } +}; + +class Qwen2_5OmniAttention final : public nn::Module { + nn::Linear q_proj_; + nn::Linear k_proj_; + nn::Linear v_proj_; + nn::Linear o_proj_; + nn::MultimodalRoPE q_rope_; + nn::MultimodalRoPE k_rope_; + nn::CausalMask mask_; + nn::Softmax softmax_; + + int hidden_size_; + int head_dim_; + int num_attention_heads_; + int num_key_value_heads_; + int num_key_value_groups_; + + public: + Qwen2_5OmniAttention() = default; + + Qwen2_5OmniAttention(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + hidden_size_ = cfg.hidden_size; + num_attention_heads_ = cfg.num_attention_heads; + num_key_value_heads_ = cfg.num_key_value_heads; + head_dim_ = hidden_size_ / num_attention_heads_; + num_key_value_groups_ = num_attention_heads_ / num_key_value_heads_; + + q_proj_ = reg("q_proj", hidden_size_, head_dim_ * num_attention_heads_, true, cfg.linear_impl_type); + k_proj_ = reg("k_proj", hidden_size_, head_dim_ * num_key_value_heads_, true, cfg.linear_impl_type); + v_proj_ = reg("v_proj", hidden_size_, head_dim_ * num_key_value_heads_, true, cfg.linear_impl_type); + o_proj_ = reg("o_proj", head_dim_ * num_attention_heads_, hidden_size_, false, cfg.linear_impl_type); + + q_rope_ = reg( + "q_rope", aops::Qwen2VLMultimodalRoPEOpOptions{.rope_theta = cfg.rope_theta, + .max_position_embeddings = cfg.max_position_embeddings, + .mrope_section = cfg.mrope_section}); + k_rope_ = reg( + "k_rope", aops::Qwen2VLMultimodalRoPEOpOptions{.rope_theta = cfg.rope_theta, + .max_position_embeddings = cfg.max_position_embeddings, + .mrope_section = cfg.mrope_section}); + + mask_ = reg("mask"); + softmax_ = reg("softmax", -1); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto x = inputs[0]; + auto llm_embedding_sin = inputs[1]; + auto llm_embedding_cos = inputs[2]; + auto past_kv_cache = args[0].get(); + + auto query_states = q_proj_(x); + auto key_states = k_proj_(x); + auto value_states = v_proj_(x); + + int B = inputs[0].shape()[0]; + int S = inputs[0].shape()[1]; + + query_states = query_states.view({B, S, num_attention_heads_, head_dim_}); + key_states = key_states.view({B, S, num_key_value_heads_, head_dim_}); + value_states = value_states.view({B, S, num_key_value_heads_, head_dim_}); + + query_states = query_states.transpose(1, 2); + key_states = key_states.transpose(1, 2); + value_states = value_states.transpose(1, 2); + + query_states = q_rope_(query_states, llm_embedding_sin, llm_embedding_cos); + key_states = k_rope_(key_states, llm_embedding_sin, llm_embedding_cos); + + auto [k, v] = past_kv_cache->updateKVCache(layer_idx_, key_states, value_states); + key_states = k; + value_states = v; + + Tensor attn; + if (key_states.dtype() == kFloat32) { + attn = nn::functional::matmul(query_states, key_states, false, true) * (1.f / sqrtf(head_dim_)); + attn = mask_(attn); + attn = softmax_(attn); + } else if 
(key_states.dtype() == kFloat16) { + attn = nn::functional::matmul(query_states.to(kFloat32), key_states.to(kFloat32), false, true) * (1.f / sqrtf(head_dim_)); + attn = mask_(attn); + attn = softmax_(attn); + attn = attn.to(kFloat16); + } + + auto output = nn::functional::matmul(attn, value_states); + output = output.transpose(1, 2).view({B, S, num_attention_heads_ * head_dim_}); + output = o_proj_(output); + return {output}; + } + + int layer_idx_; +}; + +class Qwen2_5OmniDecoder final : public nn::Module { + public: + Qwen2_5OmniAttention self_attn_; + Qwen2_5OmniMLP mlp_; + nn::RMSNorm input_layer_norm_; + nn::RMSNorm post_attention_layer_norm_; + + Qwen2_5OmniDecoder() = default; + + Qwen2_5OmniDecoder(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + self_attn_ = reg("self_attn", cfg); + mlp_ = reg("mlp", cfg); + input_layer_norm_ = reg("input_layernorm", cfg.rms_norm_eps); + post_attention_layer_norm_ = reg("post_attention_layernorm", cfg.rms_norm_eps); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto llm_embedding_sin = inputs[1]; + auto llm_embedding_cos = inputs[2]; + auto& kv_cache = args[0]; + + auto x = input_layer_norm_(inputs[0]); + x = self_attn_(x, llm_embedding_sin, llm_embedding_cos, kv_cache)[0]; + auto tmp = x + inputs[0]; + x = post_attention_layer_norm_(tmp); + x = mlp_(x)[0]; + x = x + tmp; + return {x}; + } +}; + +class Qwen2_5OmniText final : public nn::Module { + nn::ModuleList decode_blocks_; + nn::RMSNorm norm_; + + public: + Qwen2_5OmniText() = default; + + Qwen2_5OmniText(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + decode_blocks_ = reg>("layers", cfg.num_hidden_layers, cfg); + for (auto [idx, b] : enumerate(decode_blocks_.list())) { b.self_attn_.layer_idx_ = idx; } + + norm_ = reg("norm", cfg.rms_norm_eps); + embedding_ = reg("embed_tokens", cfg.vocab_size, cfg.hidden_size); + + auto inv = makeMultimodalRoPEInvFreq(cfg.hidden_size / cfg.num_attention_heads, cfg.rope_theta); + registerBuffer("inv_freq", inv); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto& blocks = decode_blocks_.list(); + auto x = inputs[0]; + auto llm_embedding_sin = inputs[1]; + auto llm_embedding_cos = inputs[2]; + auto& kv_cache = args[0]; + + for (auto& block : blocks) { x = block(x, llm_embedding_sin, llm_embedding_cos, kv_cache)[0]; } + x = norm_(x); + + return {x}; + } + + nn::Embedding embedding_; +}; + +class Qwen2_5OmniThinker final : public nn::Module { + public: + Qwen2_5OmniThinker() = default; + Qwen2_5OmniThinker(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + model_ = reg("model", cfg); + lm_head_ = reg("lm_head", cfg.hidden_size, cfg.vocab_size, false, cfg.linear_impl_type); + } + + Qwen2_5OmniText model_; + nn::Linear lm_head_; +}; + +class Qwen2_5OmniForCausalLM : public ARGeneration { + public: + explicit Qwen2_5OmniForCausalLM(const Qwen2_5OmniConfig& cfg) : cfg_(cfg), thinker_("thinker", cfg) { + kv_cache_ = nn::StaticCache(cfg.max_cache_length, cfg.num_hidden_layers, + cfg.num_attention_heads, + cfg.num_key_value_heads, + cfg.hidden_size / cfg.num_attention_heads, + kFloat32, + kFloat32, + kCPU, + false); + eos_token_id_ = cfg.eos_token_id; + max_length_ = cfg.max_cache_length; + } + + ARGenerationOutputPast forward(const ARGenerationOutputPast& input, const ARGenerationArgs& args) override { + auto sequence = input.at("sequence"); + + auto input_embeddings = 
thinker_.model_.embedding_(sequence); + + Tensor position_ids = Tensor::nil(); + if (input.count("position_ids")) { + position_ids = input.at("position_ids"); + } + position_ids = getPositionIds(sequence, position_ids); + + auto [llm_embedding_sin, llm_embedding_cos] = + makeMultimodalPositionEmbedding(position_ids, thinker_.model_.getBuffer("inv_freq"), cfg_.max_position_embeddings, + cfg_.hidden_size / cfg_.num_attention_heads, cfg_.mrope_section); + + auto hidden_states = thinker_.model_(input_embeddings, llm_embedding_sin, llm_embedding_cos, AnyValue(&kv_cache_))[0]; + auto seq_len = hidden_states.shape()[1]; + auto last_hidden = hidden_states[{kAll, {seq_len - 1}, kAll}]; + auto logits = thinker_.lm_head_(last_hidden); + + return { + {"sequence", logits}, + {"position_ids", position_ids}, + }; + } + + Qwen2_5OmniThinker thinker_; + + private: + Tensor getPositionIds(Tensor& input_ids, Tensor& position_ids) const { + MLLM_RT_ASSERT_EQ(input_ids.shape().size(), 2); + + if (!position_ids.isNil()) { + auto last_pos = *position_ids.offsettedPtr({0, 0, position_ids.shape()[2] - 1}); + auto ret_position_ids = Tensor::empty({3, 1, 1}, kInt64, kCPU).alloc(); + *ret_position_ids.offsettedPtr({0, 0, 0}) = last_pos + 1; + *ret_position_ids.offsettedPtr({1, 0, 0}) = last_pos + 1; + *ret_position_ids.offsettedPtr({2, 0, 0}) = last_pos + 1; + return ret_position_ids; + } + + auto B = input_ids.shape()[0]; + auto S = input_ids.shape()[1]; + MLLM_RT_ASSERT_EQ(B, 1); + + Tensor out = Tensor::empty({3, B, S}, kInt64, kCPU).alloc(); + for (int d = 0; d < 3; ++d) { + auto out_ptr = out.offsettedPtr({d, 0, 0}); + for (int64_t s = 0; s < S; ++s) { out_ptr[s] = s; } + } + return out; + } + + const Qwen2_5OmniConfig& cfg_; + nn::StaticCache kv_cache_; +}; + +} // namespace mllm::models::qwen2_5omni diff --git a/mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp b/mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp new file mode 100644 index 000000000..8674af9f5 --- /dev/null +++ b/mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp @@ -0,0 +1,252 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
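+// BPE tokenizer using the Qwen2/Qwen2-VL pre-tokenization regex plus the Qwen2.5-Omni
+// special tokens (<|im_start|>, <|AUDIO|>, <|IMAGE|>, <|VIDEO|>, ...), and the chat-template
+// builder that wraps a user prompt with system/user/assistant markers.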
+#pragma once + +#include +#include + +#include "mllm/preprocessor/tokenizers/BPE.hpp" +#include "mllm/preprocessor/tokenizers/Unicode.hpp" +#include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp" +#include "mllm/models/ARGeneration.hpp" + +namespace mllm::models::qwen2_5omni { + +// same regex as Qwen2/Qwen2-VL tokenizers +inline bool qwen2_5OmniTokenizerMatchPattern(const std::wstring& str, size_t& pos, std::wstring& matched) { + if (pos >= str.size()) return false; + + static const std::wstring contractions[] = {L"'s", L"'t", L"'re", L"'ve", L"'m", L"'ll", L"'d"}; + for (const auto& contraction : contractions) { + if (pos + contraction.size() <= str.size() && str.compare(pos, contraction.size(), contraction) == 0) { + matched = contraction; + pos += contraction.size(); + return true; + } + } + + { + size_t original_pos = pos; + bool has_prefix = false; + matched.clear(); + + if (!preprocessor::isLetter(str[pos]) && !preprocessor::isDigit(str[pos]) && str[pos] != L'\r' && str[pos] != L'\n') { + matched += str[pos]; + ++pos; + has_prefix = true; + } + + if (pos < str.size() && preprocessor::isLetter(str[pos])) { + do { + matched += str[pos]; + ++pos; + } while (pos < str.size() && preprocessor::isLetter(str[pos])); + return true; + } else if (has_prefix) { + pos = original_pos; + matched.clear(); + } + } + + if (preprocessor::isDigit(str[pos])) { + matched = str.substr(pos, 1); + ++pos; + return true; + } + + { + size_t original_pos = pos; + matched.clear(); + size_t start = pos; + + if (str[pos] == L' ') { ++pos; } + + if (pos < str.size() && !std::iswspace(str[pos]) && !preprocessor::isLetter(str[pos]) && !preprocessor::isDigit(str[pos])) { + do { + ++pos; + } while (pos < str.size() && !std::iswspace(str[pos]) && !preprocessor::isLetter(str[pos]) + && !preprocessor::isDigit(str[pos])); + + matched = str.substr(start, pos - start); + + while (pos < str.size() && (str[pos] == L'\r' || str[pos] == L'\n')) { + matched += str[pos]; + ++pos; + } + return true; + } else { + pos = original_pos; + } + } + + { + size_t start = pos; + while (pos < str.size() && std::iswspace(str[pos])) ++pos; + if (pos < str.size() && (str[pos] == L'\r' || str[pos] == L'\n')) { + while (pos < str.size() && (str[pos] == L'\r' || str[pos] == L'\n')) ++pos; + matched = str.substr(start, pos - start); + return true; + } else { + pos = start; + } + } + + if (std::iswspace(str[pos])) { + size_t start = pos; + while (pos < str.size() && std::iswspace(str[pos])) ++pos; + if (pos >= str.size() || std::iswspace(str[pos])) { + matched = str.substr(start, pos - start); + return true; + } else { + pos = start; + } + } + + if (std::iswspace(str[pos])) { + size_t start = pos; + while (pos < str.size() && std::iswspace(str[pos])) ++pos; + matched = str.substr(start, pos - start); + return true; + } + + return false; +} + +inline bool qwen2_5OmniRegex(const std::string& str, std::vector& splitted) { + auto w_string = preprocessor::utf8string2WideString(str); + size_t pos = 0; + while (pos < w_string.size()) { + std::wstring matched; + if (qwen2_5OmniTokenizerMatchPattern(w_string, pos, matched)) { + splitted.push_back(matched); + } else { + ++pos; + } + } + return true; +} + +struct Qwen2_5OmniMessage { + std::string prompt; + std::string system_prompt = "You are a helpful assistant."; + + [[nodiscard]] std::string buildChatMessage() const { + std::string result; + if (!system_prompt.empty()) { + result += "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"; + } + result += "<|im_start|>user\n" + prompt + "<|im_end|>\n"; + 
result += "<|im_start|>assistant\n"; + return result; + } +}; + +class Qwen2_5OmniTokenizer final : public mllm::preprocessor::AutoTokenizer { + public: + explicit Qwen2_5OmniTokenizer(const std::string& file_path) { + preprocessor::initLocal(); + preprocessor::makeBytes2UnicodeMap(bytes_2_unicode_dict_); + for (auto& kv : bytes_2_unicode_dict_) { bytes_2_unicode_dict_inverse_.insert({kv.second, kv.first}); } + bpe_.initFromSentencePieceJson(file_path); + special_tokens_trie_.add(L"<|endoftext|>"); + special_tokens_trie_.add(L"<|im_start|>"); + special_tokens_trie_.add(L"<|im_end|>"); + special_tokens_trie_.add(L"<|object_ref_start|>"); + special_tokens_trie_.add(L"<|object_ref_end|>"); + special_tokens_trie_.add(L"<|box_start|>"); + special_tokens_trie_.add(L"<|box_end|>"); + special_tokens_trie_.add(L"<|quad_start|>"); + special_tokens_trie_.add(L"<|quad_end|>"); + special_tokens_trie_.add(L"<|vision_bos|>"); + special_tokens_trie_.add(L"<|vision_eos|>"); + special_tokens_trie_.add(L"<|vision_pad|>"); + special_tokens_trie_.add(L"<|image_pad|>"); + special_tokens_trie_.add(L"<|video_pad|>"); + special_tokens_trie_.add(L"<|AUDIO|>"); + special_tokens_trie_.add(L"<|audio_bos|>"); + special_tokens_trie_.add(L"<|audio_eos|>"); + special_tokens_trie_.add(L"<|IMAGE|>"); + special_tokens_trie_.add(L"<|VIDEO|>"); + } + + std::vector _tokenize(const std::string& str) override { + std::vector ret; + std::vector splitted; + ::mllm::models::qwen2_5omni::qwen2_5OmniRegex(str, splitted); + for (const auto& s : splitted) { + auto utf_8_str = preprocessor::wideString2Utf8String(s); + std::wstring mapped_str; + for (unsigned char c : utf_8_str) { mapped_str.push_back(bytes_2_unicode_dict_[c]); } + + auto bpe_ts = bpe_._bpe(mapped_str); + + for (const auto& bpe_t : bpe_ts) { ret.push_back(bpe_t); } + } + + return ret; + } + + std::vector tokenize(const std::string& str) override { + auto tokens = special_tokens_trie_.split(preprocessor::utf8string2WideString(str)); + std::vector all_tokens; + for (const auto& token : tokens) { + if (special_tokens_trie_.isSpecialToken(token)) { + all_tokens.emplace_back(token); + continue; + } + auto tmp_tokens = _tokenize(preprocessor::wideString2Utf8String(token)); + all_tokens.insert(all_tokens.end(), tmp_tokens.begin(), tmp_tokens.end()); + } + return all_tokens; + } + + std::wstring _detokenize(int64_t pos_idx) override { return bpe_._lookup_inverse_vocab(pos_idx); } + + std::wstring detokenize(int64_t pos_idx) override { + auto str = _detokenize(pos_idx); + std::string utf_8_str; + for (wchar_t c : str) { utf_8_str.push_back((unsigned char)(bytes_2_unicode_dict_inverse_[c])); } + return {mllm::preprocessor::utf8string2WideString(utf_8_str)}; + } + + Tensor convert2Ids(const std::vector& strs) override { + std::vector ids; + ids.reserve(strs.size()); + for (const auto& str : strs) { ids.emplace_back(bpe_._lookup_vocab(str)); } + Tensor ret = Tensor::empty({1, static_cast(ids.size())}, kInt64, kCPU) + .setMemType(kExtraInput) + .setName("qwen2_5omni-tokenizer-i0") + .alloc(); + + auto ptr = ret.ptr(); + for (size_t i = 0; i < ids.size(); ++i) { ptr[i] = ids[i]; } + + return ret; + } + + ARGenerationOutputPast convertMessage(const Qwen2_5OmniMessage& message) { + auto applied_string = message.buildChatMessage(); + auto sequence_str = tokenize(applied_string); + + std::vector ids; + ids.reserve(sequence_str.size()); + for (const auto& str : sequence_str) { ids.emplace_back(bpe_._lookup_vocab(str)); } + + Tensor sequence = Tensor::empty({1, static_cast(ids.size())}, 
kInt64, kCPU) + .setMemType(kNormal) + .setName("qwen2_5omni-tokenizer-i0") + .alloc(); + + auto ptr = sequence.ptr(); + for (size_t i = 0; i < ids.size(); ++i) { ptr[i] = ids[i]; } + + return {{"sequence", sequence}}; + } + + private: + preprocessor::BPE bpe_; + std::unordered_map bytes_2_unicode_dict_; + std::unordered_map bytes_2_unicode_dict_inverse_; +}; + +} // namespace mllm::models::qwen2_5omni From c9333abef049e87f02faa8b9edf3943e3e082cc0 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Fri, 23 Jan 2026 14:50:29 +0800 Subject: [PATCH 02/42] add qwen2.5omni vision, audio modal --- examples/qwen2_5omni/CMakeLists.txt | 8 + examples/qwen2_5omni/audio_infer.cpp | 84 ++ examples/qwen2_5omni/image_infer.cpp | 84 ++ .../audio_preprocessor_qwen2_5omni.hpp | 240 ++++ .../qwen2_5omni/configuration_qwen2_5omni.hpp | 82 ++ .../qwen2_5omni/modeling_qwen2_5omni.hpp | 1033 ++++++++++++++++- .../qwen2_5omni/tokenization_qwen2_5omni.hpp | 135 ++- 7 files changed, 1659 insertions(+), 7 deletions(-) create mode 100644 examples/qwen2_5omni/audio_infer.cpp create mode 100644 examples/qwen2_5omni/image_infer.cpp create mode 100644 mllm/models/qwen2_5omni/audio_preprocessor_qwen2_5omni.hpp diff --git a/examples/qwen2_5omni/CMakeLists.txt b/examples/qwen2_5omni/CMakeLists.txt index 3141b56d7..479c3a635 100644 --- a/examples/qwen2_5omni/CMakeLists.txt +++ b/examples/qwen2_5omni/CMakeLists.txt @@ -1,3 +1,11 @@ add_executable(mllm-qwen2_5-omni-text-runner text_infer.cpp) target_link_libraries(mllm-qwen2_5-omni-text-runner PRIVATE MllmRT MllmCPUBackend) target_include_directories(mllm-qwen2_5-omni-text-runner PRIVATE ${MLLM_INCLUDE_DIR}) + +add_executable(mllm-qwen2_5-omni-image-runner image_infer.cpp) +target_link_libraries(mllm-qwen2_5-omni-image-runner PRIVATE MllmRT MllmCPUBackend) +target_include_directories(mllm-qwen2_5-omni-image-runner PRIVATE ${MLLM_INCLUDE_DIR}) + +add_executable(mllm-qwen2_5-omni-audio-runner audio_infer.cpp) +target_link_libraries(mllm-qwen2_5-omni-audio-runner PRIVATE MllmRT MllmCPUBackend) +target_include_directories(mllm-qwen2_5-omni-audio-runner PRIVATE ${MLLM_INCLUDE_DIR}) diff --git a/examples/qwen2_5omni/audio_infer.cpp b/examples/qwen2_5omni/audio_infer.cpp new file mode 100644 index 000000000..014b4688f --- /dev/null +++ b/examples/qwen2_5omni/audio_infer.cpp @@ -0,0 +1,84 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
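+// Audio inference example: passes a WAV file path and prompt through convertAudioMessage
+// and streams the thinker's response.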
+ +#include +#include +#include +#include +#include +#include + +using mllm::Argparse; + +MLLM_MAIN({ + mllm::Logger::level() = mllm::LogLevel::kError; + + auto& help = Argparse::add("-h|--help").help("Show help message"); + auto& model_path = Argparse::add("-m|--model_path").help("Model path").required(true); + auto& model_version = Argparse::add("-mv|--model_version").help("Model version").required(true); + auto& tokenizer_path = Argparse::add("-t|--tokenizer_path").help("Tokenizer directory").required(true); + auto& config_path = Argparse::add("-c|--config_path").help("Config path").required(true); + + Argparse::parse(argc, argv); + + mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1; + if (model_version.get() == "v1") { + file_version = mllm::ModelFileVersion::kV1; + } else if (model_version.get() == "v2") { + file_version = mllm::ModelFileVersion::kV2; + } + + if (help.isSet()) { + Argparse::printHelp(); + mllm::shutdownContext(); + return 0; + } + + { + auto qwen2_5omni_cfg = mllm::models::qwen2_5omni::Qwen2_5OmniConfig(config_path.get()); + auto qwen2_5omni_tokenizer = mllm::models::qwen2_5omni::Qwen2_5OmniTokenizer(tokenizer_path.get()); + auto qwen2_5omni = mllm::models::qwen2_5omni::Qwen2_5OmniForCausalLM(qwen2_5omni_cfg); + + auto param = mllm::load(model_path.get(), file_version); + qwen2_5omni.thinker_.load(param); + + fmt::print("\n{:*^60}\n", " Qwen2.5-Omni Audio CLI "); + fmt::print("Enter 'exit' or 'quit' to end the session\n\n"); + + std::string audio_path; + std::string prompt_text; + + fmt::print("Audio path (or 'exit/quit'): "); + //std::getline(std::cin, audio_path); + //if (audio_path == "exit" || audio_path == "quit") { return 0; } + audio_path = "/Users/kkkai/Desktop/mllm2-former/mllm/rsc/recognize.wav"; + + fmt::print("Prompt text: "); + //std::getline(std::cin, prompt_text); + //if (prompt_text.empty()) { prompt_text = "Please describe the audio."; } + prompt_text = "复述这段音频"; + + try { + fmt::print("Processing...\n"); + auto inputs = qwen2_5omni_tokenizer.convertAudioMessage({.prompt = prompt_text, .audio_file_path = audio_path}); + + fmt::print("\nResponse: "); + qwen2_5omni.streamGenerate(inputs, + { + {"do_sample", mllm::AnyValue(false)}, + {"max_length", mllm::AnyValue(qwen2_5omni_cfg.max_cache_length)}, + }, + [&](int64_t token_id) { + auto str = qwen2_5omni_tokenizer.detokenize(token_id); + std::wcout << str << std::flush; + }); + + fmt::print("\n{}\n", std::string(60, '-')); + } catch (const std::exception& e) { fmt::print("\nError: {}\n{}\n", e.what(), std::string(60, '-')); } + + qwen2_5omni.perfSummary(); + } + + mllm::print("\n"); + mllm::memoryReport(); +}) diff --git a/examples/qwen2_5omni/image_infer.cpp b/examples/qwen2_5omni/image_infer.cpp new file mode 100644 index 000000000..41bf770b1 --- /dev/null +++ b/examples/qwen2_5omni/image_infer.cpp @@ -0,0 +1,84 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
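+// Image inference example: passes an image path and prompt through convertVisionMessage
+// and streams the thinker's response.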
+ +#include +#include +#include +#include +#include +#include + +using mllm::Argparse; + +MLLM_MAIN({ + mllm::Logger::level() = mllm::LogLevel::kError; + + auto& help = Argparse::add("-h|--help").help("Show help message"); + auto& model_path = Argparse::add("-m|--model_path").help("Model path").required(true); + auto& model_version = Argparse::add("-mv|--model_version").help("Model version").required(true); + auto& tokenizer_path = Argparse::add("-t|--tokenizer_path").help("Tokenizer directory").required(true); + auto& config_path = Argparse::add("-c|--config_path").help("Config path").required(true); + + Argparse::parse(argc, argv); + + mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1; + if (model_version.get() == "v1") { + file_version = mllm::ModelFileVersion::kV1; + } else if (model_version.get() == "v2") { + file_version = mllm::ModelFileVersion::kV2; + } + + if (help.isSet()) { + Argparse::printHelp(); + mllm::shutdownContext(); + return 0; + } + + { + auto qwen2_5omni_cfg = mllm::models::qwen2_5omni::Qwen2_5OmniConfig(config_path.get()); + auto qwen2_5omni_tokenizer = + mllm::models::qwen2_5omni::Qwen2_5OmniTokenizer(tokenizer_path.get(), qwen2_5omni_cfg.visual_spatial_merge_size); + auto qwen2_5omni = mllm::models::qwen2_5omni::Qwen2_5OmniForCausalLM(qwen2_5omni_cfg); + + auto param = mllm::load(model_path.get(), file_version); + qwen2_5omni.thinker_.load(param); + + fmt::print("\n{:*^60}\n", " Qwen2.5-Omni Image CLI "); + fmt::print("Enter 'exit' or 'quit' to end the session\n\n"); + + std::string image_path; + std::string prompt_text; + + fmt::print("Image path (or 'exit/quit'): "); + image_path = "../../../mllm2-former/mllm/rsc/pics.jpg"; + //std::getline(std::cin, image_path); + if (image_path == "exit" || image_path == "quit") { return 0; } + + fmt::print("Prompt text: "); + prompt_text = "描述图片中物体"; + //std::getline(std::cin, prompt_text); + + try { + fmt::print("Processing...\n"); + auto inputs = qwen2_5omni_tokenizer.convertVisionMessage({.prompt = prompt_text, .img_file_path = image_path}); + + fmt::print("\nResponse: "); + qwen2_5omni.streamGenerate(inputs, + { + {"do_sample", mllm::AnyValue(false)}, + {"max_length", mllm::AnyValue(qwen2_5omni_cfg.max_cache_length)}, + }, + [&](int64_t token_id) { + auto str = qwen2_5omni_tokenizer.detokenize(token_id); + std::wcout << str << std::flush; + }); + + fmt::print("\n{}\n", std::string(60, '-')); + } catch (const std::exception& e) { fmt::print("\nError: {}\n{}\n", e.what(), std::string(60, '-')); } + + qwen2_5omni.perfSummary(); + } + + mllm::print("\n"); + mllm::memoryReport(); +}) diff --git a/mllm/models/qwen2_5omni/audio_preprocessor_qwen2_5omni.hpp b/mllm/models/qwen2_5omni/audio_preprocessor_qwen2_5omni.hpp new file mode 100644 index 000000000..392bfc17b --- /dev/null +++ b/mllm/models/qwen2_5omni/audio_preprocessor_qwen2_5omni.hpp @@ -0,0 +1,240 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
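+// Whisper-style log-mel feature extraction: Hann-window STFT, Slaney mel filterbank,
+// log10 with dynamic-range clipping, then (x + 4) / 4 normalization; also computes how many
+// audio tokens remain after the encoder's conv and pooling stages.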
+#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "mllm/core/Tensor.hpp" +#include "mllm/nn/Module.hpp" +#include "mllm/nn/Nn.hpp" +#include "mllm/nn/Functional.hpp" +#include "mllm/preprocessor/audio/Audio.hpp" + +namespace mllm::models::qwen2_5omni { + +inline float hertz_to_mel_slaney(float freq) { + constexpr float kMinLogHertz = 1000.0f; + constexpr float kMinLogMel = 15.0f; + const float logstep = 27.0f / std::log(6.4f); + + if (freq < kMinLogHertz) { + return 3.0f * freq / 200.0f; + } + return kMinLogMel + std::log(freq / kMinLogHertz) * logstep; +} + +inline float mel_to_hertz_slaney(float mel) { + constexpr float kMinLogHertz = 1000.0f; + constexpr float kMinLogMel = 15.0f; + const float logstep = std::log(6.4f) / 27.0f; + + if (mel < kMinLogMel) { + return 200.0f * mel / 3.0f; + } + return kMinLogHertz * std::exp(logstep * (mel - kMinLogMel)); +} + +inline Tensor create_hann_window(int32_t window_length, bool periodic = true) { + int32_t length = periodic ? window_length + 1 : window_length; + auto window = Tensor::empty({1, window_length}, kFloat32, kCPU).alloc(); + float* window_ptr = window.ptr(); + + for (int32_t i = 0; i < window_length; ++i) { + float n = static_cast(i); + float denominator = periodic ? static_cast(length) : static_cast(length - 1); + window_ptr[i] = 0.5f - 0.5f * std::cos(2.0f * M_PI * n / denominator); + } + + return window; +} + +inline Tensor create_mel_filterbank(int32_t num_frequency_bins, int32_t num_mel_filters, float min_frequency, + float max_frequency, int32_t sampling_rate) { + std::vector fft_freqs(num_frequency_bins); + for (int32_t i = 0; i < num_frequency_bins; ++i) { + fft_freqs[i] = static_cast(i) * (sampling_rate / 2.0f) / (num_frequency_bins - 1); + } + + float mel_min = hertz_to_mel_slaney(min_frequency); + float mel_max = hertz_to_mel_slaney(max_frequency); + + std::vector mel_freqs(num_mel_filters + 2); + for (int32_t i = 0; i < num_mel_filters + 2; ++i) { + mel_freqs[i] = mel_min + static_cast(i) * (mel_max - mel_min) / (num_mel_filters + 1); + } + + std::vector filter_freqs(num_mel_filters + 2); + for (int32_t i = 0; i < num_mel_filters + 2; ++i) { filter_freqs[i] = mel_to_hertz_slaney(mel_freqs[i]); } + + auto mel_filters = Tensor::empty({num_frequency_bins, num_mel_filters}, kFloat32, kCPU).alloc(); + float* filters_ptr = mel_filters.ptr(); + std::fill_n(filters_ptr, num_frequency_bins * num_mel_filters, 0.0f); + + for (int32_t mel_idx = 0; mel_idx < num_mel_filters; ++mel_idx) { + float left_freq = filter_freqs[mel_idx]; + float center_freq = filter_freqs[mel_idx + 1]; + float right_freq = filter_freqs[mel_idx + 2]; + + for (int32_t freq_idx = 0; freq_idx < num_frequency_bins; ++freq_idx) { + float freq = fft_freqs[freq_idx]; + float value = 0.0f; + + if (freq >= left_freq && freq <= center_freq && center_freq != left_freq) { + value = (freq - left_freq) / (center_freq - left_freq); + } else if (freq >= center_freq && freq <= right_freq && right_freq != center_freq) { + value = (right_freq - freq) / (right_freq - center_freq); + } + + filters_ptr[freq_idx * num_mel_filters + mel_idx] = value; + } + } + + for (int32_t mel_idx = 0; mel_idx < num_mel_filters; ++mel_idx) { + float enorm = 2.0f / (filter_freqs[mel_idx + 2] - filter_freqs[mel_idx]); + for (int32_t freq_idx = 0; freq_idx < num_frequency_bins; ++freq_idx) { + filters_ptr[freq_idx * num_mel_filters + mel_idx] *= enorm; + } + } + + return mel_filters; +} + +class MelSpectrogramFeatures final : public nn::Module { + int32_t 
n_fft_; + int32_t hop_length_; + int32_t win_length_; + int32_t n_mels_; + std::string padding_; + int power_; + nn::STFT stft_; + Tensor window_; + Tensor melscale_fbanks_; + + public: + MelSpectrogramFeatures() = default; + + explicit inline MelSpectrogramFeatures(const std::string& name, int32_t sample_rate = 16000, int32_t n_fft = 400, + int32_t hop_length = 160, int32_t n_mels = 128, + const std::string& padding = "center", int power = 2) + : nn::Module(name), n_fft_(n_fft), hop_length_(hop_length), n_mels_(n_mels), padding_(padding), power_(power) { + if (padding != "center" && padding != "same") { throw std::invalid_argument("Padding must be 'center' or 'same'."); } + + win_length_ = n_fft_; + stft_ = reg("stft", n_fft_, hop_length_, win_length_, true, true, "reflect", true); + window_ = create_hann_window(win_length_, true); + melscale_fbanks_ = create_mel_filterbank(n_fft_ / 2 + 1, n_mels_, 0.0f, 8000.0f, sample_rate); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto audio = inputs[0]; // [B, T] + + if (padding_ == "same") { + NYI("apply same padding in MelSpectrogramFeatures not implemented"); + } + + auto stft_result = stft_(audio, window_); + auto specgram = stft_result.abs(); + if (power_ == 2) { + specgram = specgram * specgram; + } else if (power_ != 1) { + NYI("power != 1 and power != 2 not implemented"); + } + + auto mel_specgram = nn::functional::matmul(specgram.T(), melscale_fbanks_).T(); + mel_specgram = nn::functional::clip(mel_specgram, 1e-10f, std::numeric_limits::max()); + mel_specgram = nn::functional::log(mel_specgram) / std::log(10.0f); + auto max_val = mel_specgram.max(); + float threshold = max_val.item() - 8.0f; + mel_specgram = nn::functional::clip(mel_specgram, threshold, std::numeric_limits::max()); + mel_specgram = (mel_specgram + 4.0f) / 4.0f; + + return {mel_specgram}; + } +}; + +struct Qwen2_5OmniAudioFeatures { + Tensor input_features = Tensor::nil(); + int32_t feature_length = 0; +}; + +class Qwen2_5OmniAudioPreprocessor { + MelSpectrogramFeatures mel_extractor_; + int32_t sample_rate_; + int32_t n_mels_; + int32_t hop_length_; + int32_t chunk_length_; + int32_t n_samples_; + + public: + explicit Qwen2_5OmniAudioPreprocessor(int32_t sample_rate = 16000, int32_t n_mels = 128, int32_t hop_length = 160, + int32_t chunk_length = 300) + : mel_extractor_("feature_extractor.mel_spec", sample_rate, 400, hop_length, n_mels, "center", 2), + sample_rate_(sample_rate), + n_mels_(n_mels), + hop_length_(hop_length), + chunk_length_(chunk_length), + n_samples_(chunk_length * sample_rate) {} + + [[nodiscard]] Qwen2_5OmniAudioFeatures processAudioFile(const std::string& audio_file_path) { + auto audio_data = mllm::audio::readWAV(audio_file_path, sample_rate_); + if (audio_data.empty()) { return {}; } + return processAudioData(audio_data.data(), static_cast(audio_data.size())); + } + + [[nodiscard]] Qwen2_5OmniAudioFeatures processAudioData(const float* audio_data, int32_t audio_length) { + Qwen2_5OmniAudioFeatures result; + if (audio_data == nullptr || audio_length <= 0) { return result; } + + int32_t padded_length = n_samples_; + int32_t effective_length = std::min(audio_length, padded_length); + + auto audio_tensor = Tensor::empty({1, padded_length}, kFloat32, kCPU).alloc(); + float* audio_ptr = audio_tensor.ptr(); + + if (audio_length <= padded_length) { + std::memcpy(audio_ptr, audio_data, audio_length * sizeof(float)); + std::fill(audio_ptr + audio_length, audio_ptr + padded_length, 0.0f); + } else { + 
std::memcpy(audio_ptr, audio_data, padded_length * sizeof(float)); + } + + auto mel_spec = mel_extractor_.forward({audio_tensor}, {})[0]; // [1, n_mels, n_frames] + + int32_t valid_frames = calcFeatureLength(effective_length); + int32_t max_frames = mel_spec.shape()[2]; + if (valid_frames > max_frames) { valid_frames = max_frames; } + if (valid_frames <= 0) { return result; } + + auto trimmed = Tensor::empty({1, n_mels_, valid_frames}, kFloat32, kCPU).alloc(); + for (int32_t m = 0; m < n_mels_; ++m) { + auto src_ptr = mel_spec.offsettedPtr({0, m, 0}); + auto dst_ptr = trimmed.offsettedPtr({0, m, 0}); + std::memcpy(dst_ptr, src_ptr, valid_frames * sizeof(float)); + } + + result.input_features = trimmed; + result.feature_length = valid_frames; + return result; + } + + [[nodiscard]] int32_t calcFeatureLength(int32_t audio_length) const { + if (audio_length <= 0) { return 0; } + return (audio_length + hop_length_ - 1) / hop_length_; + } + + [[nodiscard]] int32_t calcAudioTokenLength(int32_t feature_length) const { + if (feature_length <= 0) { return 0; } + int32_t after_conv = (feature_length - 1) / 2 + 1; + if (after_conv < 2) { return 0; } + int32_t after_pool = (after_conv - 2) / 2 + 1; + return std::max(0, after_pool); + } +}; + +} // namespace mllm::models::qwen2_5omni diff --git a/mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp b/mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp index 2b0cb1ee8..d0e000642 100644 --- a/mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp +++ b/mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp @@ -35,12 +35,48 @@ struct Qwen2_5OmniConfig : protected ConfigFile { mrope_section = text_cfg["rope_scaling"]["mrope_section"].get>(); } + if (thinker_cfg.contains("vision_config")) { + auto& vision_cfg = thinker_cfg["vision_config"]; + visual_in_chans = vision_cfg.value("in_channels", vision_cfg.value("in_chans", visual_in_chans)); + visual_hidden_size = vision_cfg.value("hidden_size", vision_cfg.value("embed_dim", visual_hidden_size)); + visual_patch_size = vision_cfg.value("patch_size", vision_cfg.value("spatial_patch_size", visual_patch_size)); + visual_temporal_patch_size = vision_cfg.value("temporal_patch_size", visual_temporal_patch_size); + visual_spatial_merge_size = vision_cfg.value("spatial_merge_size", visual_spatial_merge_size); + visual_out_hidden_size = vision_cfg.value("out_hidden_size", visual_out_hidden_size); + visual_num_heads = vision_cfg.value("num_heads", visual_num_heads); + visual_depth = vision_cfg.value("depth", visual_depth); + visual_intermediate_size = vision_cfg.value("intermediate_size", visual_intermediate_size); + if (vision_cfg.contains("fullatt_block_indexes")) { + visual_fullatt_block_indexes = vision_cfg["fullatt_block_indexes"].get>(); + } + visual_window_size = vision_cfg.value("window_size", visual_window_size); + } + + if (thinker_cfg.contains("audio_config")) { + auto& audio_cfg = thinker_cfg["audio_config"]; + audio_d_model = audio_cfg.value("d_model", audio_d_model); + audio_num_mel_bins = audio_cfg.value("num_mel_bins", audio_num_mel_bins); + audio_encoder_layers = audio_cfg.value("encoder_layers", audio_encoder_layers); + audio_encoder_attention_heads = audio_cfg.value("encoder_attention_heads", audio_encoder_attention_heads); + audio_encoder_ffn_dim = audio_cfg.value("encoder_ffn_dim", audio_encoder_ffn_dim); + audio_max_source_positions = audio_cfg.value("max_source_positions", audio_max_source_positions); + audio_n_window = audio_cfg.value("n_window", audio_n_window); + audio_output_dim = 
audio_cfg.value("output_dim", audio_output_dim); + } + bos_token_id = thinker_cfg.value("bos_token_id", bos_token_id); eos_token_id = thinker_cfg.value("eos_token_id", eos_token_id); pad_token_id = thinker_cfg.value("pad_token_id", pad_token_id); image_token_id = thinker_cfg.value("image_token_index", image_token_id); audio_token_id = thinker_cfg.value("audio_token_index", audio_token_id); video_token_id = thinker_cfg.value("video_token_index", video_token_id); + audio_start_token_id = thinker_cfg.value("audio_start_token_id", audio_start_token_id); + audio_end_token_id = thinker_cfg.value("audio_end_token_id", audio_end_token_id); + vision_start_token_id = thinker_cfg.value("vision_start_token_id", vision_start_token_id); + vision_end_token_id = thinker_cfg.value("vision_end_token_id", vision_end_token_id); + vision_token_id = thinker_cfg.value("vision_token_id", vision_token_id); + position_id_per_seconds = thinker_cfg.value("position_id_per_seconds", position_id_per_seconds); + seconds_per_chunk = thinker_cfg.value("seconds_per_chunk", seconds_per_chunk); } else { hidden_size = root["hidden_size"]; intermediate_size = root["intermediate_size"]; @@ -55,12 +91,30 @@ struct Qwen2_5OmniConfig : protected ConfigFile { if (root.contains("mrope_section")) { mrope_section = root["mrope_section"].get>(); } + if (root.contains("audio_config")) { + auto& audio_cfg = root["audio_config"]; + audio_d_model = audio_cfg.value("d_model", audio_d_model); + audio_num_mel_bins = audio_cfg.value("num_mel_bins", audio_num_mel_bins); + audio_encoder_layers = audio_cfg.value("encoder_layers", audio_encoder_layers); + audio_encoder_attention_heads = audio_cfg.value("encoder_attention_heads", audio_encoder_attention_heads); + audio_encoder_ffn_dim = audio_cfg.value("encoder_ffn_dim", audio_encoder_ffn_dim); + audio_max_source_positions = audio_cfg.value("max_source_positions", audio_max_source_positions); + audio_n_window = audio_cfg.value("n_window", audio_n_window); + audio_output_dim = audio_cfg.value("output_dim", audio_output_dim); + } bos_token_id = root.value("bos_token_id", bos_token_id); eos_token_id = root.value("eos_token_id", eos_token_id); pad_token_id = root.value("pad_token_id", pad_token_id); image_token_id = root.value("image_token_id", image_token_id); audio_token_id = root.value("audio_token_id", audio_token_id); video_token_id = root.value("video_token_id", video_token_id); + audio_start_token_id = root.value("audio_start_token_id", audio_start_token_id); + audio_end_token_id = root.value("audio_end_token_id", audio_end_token_id); + vision_start_token_id = root.value("vision_start_token_id", vision_start_token_id); + vision_end_token_id = root.value("vision_end_token_id", vision_end_token_id); + vision_token_id = root.value("vision_token_id", vision_token_id); + position_id_per_seconds = root.value("position_id_per_seconds", position_id_per_seconds); + seconds_per_chunk = root.value("seconds_per_chunk", seconds_per_chunk); } max_cache_length = root.value("max_cache_length", max_position_embeddings); @@ -82,6 +136,27 @@ struct Qwen2_5OmniConfig : protected ConfigFile { float rope_theta = 1000000.0f; bool tie_word_embeddings = false; + int32_t visual_in_chans = 3; + int32_t visual_hidden_size = 1280; + int32_t visual_patch_size = 14; + int32_t visual_temporal_patch_size = 2; + int32_t visual_spatial_merge_size = 2; + int32_t visual_out_hidden_size = 3584; + int32_t visual_num_heads = 16; + int32_t visual_depth = 32; + int32_t visual_intermediate_size = 3420; + std::vector 
visual_fullatt_block_indexes = {7, 15, 23, 31}; + int32_t visual_window_size = 112; + + int32_t audio_d_model = 1280; + int32_t audio_num_mel_bins = 128; + int32_t audio_encoder_layers = 32; + int32_t audio_encoder_attention_heads = 20; + int32_t audio_encoder_ffn_dim = 5120; + int32_t audio_max_source_positions = 1500; + int32_t audio_n_window = 100; + int32_t audio_output_dim = 3584; + int32_t max_cache_length = 32768; int64_t bos_token_id = 151644; @@ -90,6 +165,13 @@ struct Qwen2_5OmniConfig : protected ConfigFile { int64_t image_token_id = 151655; int64_t audio_token_id = 151646; int64_t video_token_id = 151656; + int64_t audio_start_token_id = 151647; + int64_t audio_end_token_id = 151648; + int64_t vision_start_token_id = 151652; + int64_t vision_end_token_id = 151653; + int64_t vision_token_id = 151654; + int32_t position_id_per_seconds = 25; + int32_t seconds_per_chunk = 2; aops::LinearImplTypes linear_impl_type = aops::LinearImplTypes::kDefault; }; diff --git a/mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp b/mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp index 7bd00baa7..fac087bae 100644 --- a/mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp +++ b/mllm/models/qwen2_5omni/modeling_qwen2_5omni.hpp @@ -2,9 +2,14 @@ // Licensed under the MIT License. #pragma once +#include #include +#include +#include +#include #include "mllm/mllm.hpp" +#include "mllm/core/SlicePrimitives.hpp" #include "mllm/nn/Module.hpp" #include "mllm/nn/Nn.hpp" #include "mllm/nn/Functional.hpp" @@ -87,6 +92,756 @@ inline auto makeMultimodalPositionEmbedding(Tensor& position_ids, const Tensor& return {sin, cos}; } +inline auto makeWindowIndex(const Tensor& grid_thw, int window_size, int spatial_merge_size, + int patch_size) -> std::pair, std::vector> { + MLLM_RT_ASSERT_EQ(grid_thw.shape().size(), 2); + const int grid_num = grid_thw.shape()[0]; + + const int vit_merger_window_size = window_size / spatial_merge_size / patch_size; + const int spatial_merge_unit = spatial_merge_size * spatial_merge_size; + + std::vector window_index; + std::vector cu_window_seqlens = {0}; + int window_index_id = 0; + + for (int grid_idx = 0; grid_idx < grid_num; ++grid_idx) { + const int grid_t = grid_thw.constAt({grid_idx, 0}); + const int grid_h = grid_thw.constAt({grid_idx, 1}); + const int grid_w = grid_thw.constAt({grid_idx, 2}); + + const int llm_grid_h = grid_h / spatial_merge_size; + const int llm_grid_w = grid_w / spatial_merge_size; + const int pad_h = (vit_merger_window_size - llm_grid_h % vit_merger_window_size) % vit_merger_window_size; + const int pad_w = (vit_merger_window_size - llm_grid_w % vit_merger_window_size) % vit_merger_window_size; + + const int num_windows_h = (llm_grid_h + pad_h) / vit_merger_window_size; + const int num_windows_w = (llm_grid_w + pad_w) / vit_merger_window_size; + const int total_windows = grid_t * num_windows_h * num_windows_w; + + std::vector>> index( + grid_t, std::vector>(llm_grid_h, std::vector(llm_grid_w))); + + int counter = 0; + for (int t = 0; t < grid_t; t++) { + for (int h = 0; h < llm_grid_h; h++) { + for (int w = 0; w < llm_grid_w; w++) { index[t][h][w] = counter++; } + } + } + + std::vector>> index_padded( + grid_t, std::vector>(llm_grid_h + pad_h, std::vector(llm_grid_w + pad_w, -100))); + + for (int t = 0; t < grid_t; t++) { + for (int h = 0; h < llm_grid_h; h++) { + for (int w = 0; w < llm_grid_w; w++) { index_padded[t][h][w] = index[t][h][w]; } + } + } + + std::vector seqlens(total_windows, 0); + for (int t = 0; t < grid_t; t++) { + for (int wh = 0; wh < num_windows_h; 
wh++) { + for (int ww = 0; ww < num_windows_w; ww++) { + const int window_idx = t * num_windows_h * num_windows_w + wh * num_windows_w + ww; + for (int h = 0; h < vit_merger_window_size; h++) { + for (int w = 0; w < vit_merger_window_size; w++) { + const int orig_h = wh * vit_merger_window_size + h; + const int orig_w = ww * vit_merger_window_size + w; + if (index_padded[t][orig_h][orig_w] != -100) { + window_index.push_back(index_padded[t][orig_h][orig_w] + window_index_id); + seqlens[window_idx]++; + } + } + } + } + } + } + + int cumulative = cu_window_seqlens.back(); + for (int i = 0; i < total_windows; i++) { + cumulative += seqlens[i] * spatial_merge_unit; + cu_window_seqlens.push_back(cumulative); + } + + window_index_id += grid_t * llm_grid_h * llm_grid_w; + } + + return {window_index, cu_window_seqlens}; +} + +inline auto makeVisualRoPEInvFreq(int32_t dims, float theta) -> Tensor { + const int half_dim = dims / (2 * 2); + Tensor inv_freq = Tensor::empty({half_dim}, kFloat32).alloc(); + float* inv_freq_ptr = inv_freq.ptr(); + const float dims_inv = 1.0f / static_cast(dims / 2); + for (int i = 0; i < half_dim; ++i) { + const float exponent = (2.0f * i) * dims_inv; + inv_freq_ptr[i] = 1.0f / std::pow(theta, exponent); + } + return inv_freq; +} + +inline auto makeVisualRotaryPosEmbIds(Tensor& grid_thw, int32_t spatial_merge_size) -> Tensor { + MLLM_RT_ASSERT_EQ(grid_thw.shape().size(), 2); + + const auto img_nums = grid_thw.shape()[0]; + int total_positions = 0; + for (int row = 0; row < img_nums; ++row) { + const int* dims = grid_thw.offsettedPtr({row, 0}); + total_positions += dims[0] * dims[1] * dims[2]; + } + + Tensor out = Tensor::empty({total_positions, 2}, kInt32).alloc(); + int* out_ptr = out.ptr(); + int out_offset = 0; + + for (int row = 0; row < img_nums; ++row) { + const int* dims = grid_thw.offsettedPtr({row, 0}); + const int t = dims[0]; + const int h = dims[1]; + const int w = dims[2]; + + const int num_h_blocks = h / spatial_merge_size; + const int num_w_blocks = w / spatial_merge_size; + const int total_blocks = num_h_blocks * num_w_blocks; + const int block_area = spatial_merge_size * spatial_merge_size; + const int grid_size = h * w; + + std::vector flatten_hpos(grid_size); + std::vector flatten_wpos(grid_size); + + for (int block_idx = 0; block_idx < total_blocks; ++block_idx) { + const int i_h = block_idx / num_w_blocks; + const int i_w = block_idx % num_w_blocks; + const int start_idx = block_idx * block_area; + + const int base_h = i_h * spatial_merge_size; + const int base_w = i_w * spatial_merge_size; + + for (int j_h = 0; j_h < spatial_merge_size; ++j_h) { + const int global_h = base_h + j_h; + for (int j_w = 0; j_w < spatial_merge_size; ++j_w) { + const int global_w = base_w + j_w; + const int pos = start_idx + j_h * spatial_merge_size + j_w; + flatten_hpos[pos] = global_h; + flatten_wpos[pos] = global_w; + } + } + } + + for (int frame = 0; frame < t; ++frame) { + for (int pos = 0; pos < grid_size; ++pos) { + const int out_idx = out_offset + (frame * grid_size + pos) * 2; + out_ptr[out_idx] = flatten_hpos[pos]; + out_ptr[out_idx + 1] = flatten_wpos[pos]; + } + } + out_offset += t * grid_size * 2; + } + + return out; +} + +inline auto makeVisualRotaryPosEmbFull(Tensor& inv_freq, int seq_len) -> Tensor { + MLLM_RT_ASSERT(seq_len > 0); + const int32_t dim = inv_freq.shape()[0]; + Tensor freqs = Tensor::empty({seq_len, dim}, kFloat32, kCPU).alloc(); + float* inv_freq_ptr = inv_freq.ptr(); + float* freqs_ptr = freqs.ptr(); + for (int i = 0; i < seq_len; ++i) { + 
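+    // Outer product: freqs[i][j] = i * inv_freq[j]; the sin/cos lookup tables are derived from this table later.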
const float i_val = static_cast(i); + float* row_ptr = freqs_ptr + i * dim; + for (int j = 0; j < dim; ++j) { row_ptr[j] = i_val * inv_freq_ptr[j]; } + } + return freqs; +} + +inline auto makeVisualRotaryPosEmb(Tensor& rotary_pos_emb_full, Tensor& pos_ids, Tensor& grid_thw) -> Tensor { + const int32_t dim = rotary_pos_emb_full.shape()[1]; + const int32_t batch_size = pos_ids.shape()[0]; + const int32_t seq_len = pos_ids.shape()[1]; + + int total_positions = 0; + for (int row = 0; row < grid_thw.shape()[0]; ++row) { + const int* dims = grid_thw.offsettedPtr({row, 0}); + total_positions += dims[0] * dims[1] * dims[2]; + } + + Tensor out = Tensor::empty({batch_size, seq_len * dim}, kFloat32, kCPU).alloc(); + + auto rotary_pos_emb_full_ptr = rotary_pos_emb_full.ptr(); + auto pos_ids_ptr = pos_ids.ptr(); + + if (rotary_pos_emb_full.shape()[0] <= 0 || dim <= 0 || batch_size <= 0) { + MLLM_ERROR_EXIT(ExitCode::kSliceOB, "Invalid tensor dimensions"); + } + + if (total_positions != batch_size) { MLLM_ERROR_EXIT(ExitCode::kSliceOB, "Grid dimensions mismatch with batch size"); } + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < seq_len; ++j) { + const int idx = pos_ids_ptr[i * seq_len + j]; + if (idx < 0 || idx >= rotary_pos_emb_full.shape()[0]) { + MLLM_ERROR_EXIT(ExitCode::kSliceOB, "Position index out of bounds"); + } + } + } + + for (int i = 0; i < batch_size; ++i) { + auto batch_ptr = out.offsettedPtr({i, 0}); + size_t offset = 0; + for (int j = 0; j < seq_len; ++j) { + const int idx = pos_ids_ptr[i * seq_len + j]; + auto emb_ptr = rotary_pos_emb_full_ptr + idx * dim; + std::copy(emb_ptr, emb_ptr + dim, batch_ptr + offset); + offset += dim; + } + } + + return out; +} + +inline auto makeVisualRotarySinCos(Tensor& rotary_pos_emb) -> std::pair { + const auto seq = rotary_pos_emb.shape()[0]; + const auto dim = rotary_pos_emb.shape()[1]; + + auto rotary_pos_emb_ptr = rotary_pos_emb.ptr(); + + Tensor sin_pos_emb = Tensor::empty({seq, dim}, kFloat32, kCPU).alloc(); + Tensor cos_pos_emb = Tensor::empty({seq, dim}, kFloat32, kCPU).alloc(); + + auto sin_pos_emb_ptr = sin_pos_emb.ptr(); + auto cos_pos_emb_ptr = cos_pos_emb.ptr(); + + for (int i = 0; i < seq; i++) { + for (int j = 0; j < dim; j++) { + sin_pos_emb_ptr[i * dim + j] = std::sin(rotary_pos_emb_ptr[i * dim + j]); + cos_pos_emb_ptr[i * dim + j] = std::cos(rotary_pos_emb_ptr[i * dim + j]); + } + } + + return {sin_pos_emb, cos_pos_emb}; +} + +inline auto makeAudioSinusoidalPosEmb(int32_t length, int32_t channels, float max_timescale = 10000.0f) -> Tensor { + MLLM_RT_ASSERT(channels % 2 == 0); + auto pos_emb = Tensor::empty({length, channels}, kFloat32, kCPU).alloc(); + auto pos_ptr = pos_emb.ptr(); + + const int half = channels / 2; + const float log_timescale_increment = std::log(max_timescale) / static_cast(half - 1); + + std::vector inv_timescales(half); + for (int i = 0; i < half; ++i) { + inv_timescales[i] = std::exp(-log_timescale_increment * static_cast(i)); + } + + for (int t = 0; t < length; ++t) { + for (int i = 0; i < half; ++i) { + const float scaled_time = static_cast(t) * inv_timescales[i]; + pos_ptr[t * channels + i] = std::sin(scaled_time); + pos_ptr[t * channels + half + i] = std::cos(scaled_time); + } + } + + return pos_emb; +} + +class Qwen2_5OmniPatchEmbed final : public nn::Module { + int32_t in_chans_; + int32_t embed_dim_; + int32_t patch_size_; + int32_t temporal_patch_size_; + + nn::Conv3D proj_; + + public: + Qwen2_5OmniPatchEmbed() = default; + + explicit Qwen2_5OmniPatchEmbed(const std::string& name, const 
Qwen2_5OmniConfig& cfg) : nn::Module(name) { + in_chans_ = cfg.visual_in_chans; + embed_dim_ = cfg.visual_hidden_size; + patch_size_ = cfg.visual_patch_size; + temporal_patch_size_ = cfg.visual_temporal_patch_size; + + proj_ = reg("proj", cfg.visual_in_chans, cfg.visual_hidden_size, + std::vector{cfg.visual_temporal_patch_size, cfg.visual_patch_size, cfg.visual_patch_size}, + std::vector{cfg.visual_temporal_patch_size, cfg.visual_patch_size, cfg.visual_patch_size}, + false); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto hidden_states = inputs[0]; + hidden_states = hidden_states.view({-1, in_chans_, temporal_patch_size_, patch_size_, patch_size_}); + hidden_states = proj_(hidden_states).view({-1, embed_dim_}); + return {hidden_states}; + } +}; + +class Qwen2_5OmniPatchMerger final : public nn::Module { + int32_t hidden_size_; + int32_t spatial_merge_size_; + int32_t context_dim_; + + nn::RMSNorm ln_q_; + nn::Linear mlp_0_; + nn::Linear mlp_2_; + nn::GELU mlp_gelu_; + + public: + Qwen2_5OmniPatchMerger() = default; + + explicit Qwen2_5OmniPatchMerger(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + context_dim_ = cfg.visual_hidden_size; + spatial_merge_size_ = cfg.visual_spatial_merge_size; + hidden_size_ = context_dim_ * spatial_merge_size_ * spatial_merge_size_; + + ln_q_ = reg("ln_q", 1e-6); + mlp_0_ = reg("mlp.0", hidden_size_, hidden_size_, true, cfg.linear_impl_type); + mlp_gelu_ = reg("mlp.gelu"); + mlp_2_ = reg("mlp.2", hidden_size_, cfg.visual_out_hidden_size, true, cfg.linear_impl_type); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto o = ln_q_(inputs[0]).view({-1, hidden_size_}); + o = mlp_0_(o); + o = mlp_gelu_(o); + o = mlp_2_(o); + return {o}; + } +}; + +class Qwen2_5OmniVisionMLP final : public nn::Module { + nn::Linear gate_proj_; + nn::Linear up_proj_; + nn::Linear down_proj_; + nn::SiLU silu_; + + public: + Qwen2_5OmniVisionMLP() = default; + explicit Qwen2_5OmniVisionMLP(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + gate_proj_ = reg("gate_proj", cfg.visual_hidden_size, cfg.visual_intermediate_size, true); + silu_ = reg("act"); + up_proj_ = reg("up_proj", cfg.visual_hidden_size, cfg.visual_intermediate_size, true); + down_proj_ = reg("down_proj", cfg.visual_intermediate_size, cfg.visual_hidden_size, true); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto x = gate_proj_(inputs[0]); + x = silu_(x); + auto y = up_proj_(inputs[0]); + x = x * y; + x = down_proj_(x); + return {x}; + } +}; + +class Qwen2_5OmniVisionAttention final : public nn::Module { + int32_t dim_; + int32_t num_heads_; + int32_t head_dim_; + + nn::Linear q_; + nn::Linear k_; + nn::Linear v_; + nn::Linear proj_; + nn::Softmax softmax_; + nn::VisionRoPE vision_rope_q_; + nn::VisionRoPE vision_rope_k_; + + public: + Qwen2_5OmniVisionAttention() = default; + + explicit Qwen2_5OmniVisionAttention(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + dim_ = cfg.visual_hidden_size; + num_heads_ = cfg.visual_num_heads; + head_dim_ = dim_ / num_heads_; + + q_ = reg("q", dim_, dim_, true, cfg.linear_impl_type); + k_ = reg("k", dim_, dim_, true, cfg.linear_impl_type); + v_ = reg("v", dim_, dim_, true, cfg.linear_impl_type); + proj_ = reg("proj", dim_, dim_, true, cfg.linear_impl_type); + softmax_ = reg("softmax", -1); + + vision_rope_q_ = reg("vision_rope_q", 
aops::VisionRoPEOpOptionsType::kQwen2VL, + aops::Qwen2VLRoPEOpOptions{ + .dims = head_dim_, + .spatial_merge_size = cfg.visual_spatial_merge_size, + .theta = 10000.0, + }); + vision_rope_k_ = reg("vision_rope_k", aops::VisionRoPEOpOptionsType::kQwen2VL, + aops::Qwen2VLRoPEOpOptions{ + .dims = head_dim_, + .spatial_merge_size = cfg.visual_spatial_merge_size, + .theta = 10000.0, + }); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto hidden_states = inputs[0]; + auto visual_embedding_sin = inputs[1]; + auto visual_embedding_cos = inputs[2]; + auto& mask = inputs[3]; + + auto seq_length = hidden_states.shape()[0]; + + auto query_states = q_(hidden_states).view({seq_length, num_heads_, head_dim_}).unsqueeze(0); + auto key_states = k_(hidden_states).view({seq_length, num_heads_, head_dim_}).unsqueeze(0); + auto value_states = v_(hidden_states).view({seq_length, num_heads_, head_dim_}).unsqueeze(0); + + query_states = vision_rope_q_(query_states, visual_embedding_sin, visual_embedding_cos); + key_states = vision_rope_k_(key_states, visual_embedding_sin, visual_embedding_cos); + + query_states = query_states.transpose(1, 2); + key_states = key_states.transpose(1, 2); + value_states = value_states.transpose(1, 2); + + auto attn = nn::functional::matmul(query_states, key_states, false, true) * (1.f / sqrtf(head_dim_)); + if (mask) { attn = attn + mask; } + attn = softmax_(attn); + + auto attn_output = nn::functional::matmul(attn, value_states); + attn_output = attn_output.transpose(1, 2).view({seq_length, -1}); + attn_output = proj_(attn_output); + return {attn_output}; + } +}; + +class Qwen2_5OmniVisionBlock final : public nn::Module { + nn::RMSNorm norm1_; + nn::RMSNorm norm2_; + + Qwen2_5OmniVisionAttention attn_; + Qwen2_5OmniVisionMLP mlp_; + + public: + Qwen2_5OmniVisionBlock() = default; + + explicit Qwen2_5OmniVisionBlock(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + norm1_ = reg("norm1", 1e-6); + norm2_ = reg("norm2", 1e-6); + attn_ = reg("attn", cfg); + mlp_ = reg("mlp", cfg); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto hidden_states = inputs[0]; + auto visual_embedding_sin = inputs[1]; + auto visual_embedding_cos = inputs[2]; + auto mask = inputs[3]; + + hidden_states = hidden_states + attn_(norm1_(hidden_states), visual_embedding_sin, visual_embedding_cos, mask)[0]; + hidden_states = hidden_states + mlp_(norm2_(hidden_states))[0]; + return {hidden_states}; + } +}; + +class Qwen2_5OmniVisionEncoder final : public nn::Module { + Qwen2_5OmniPatchEmbed patch_embed_; + Qwen2_5OmniPatchMerger patch_merger_; + nn::ModuleList blocks_; + std::vector visual_fullatt_block_indexes_; + int32_t visual_window_size_ = 0; + int32_t visual_spatial_merge_size_ = 1; + int32_t visual_patch_size_ = 1; + int32_t spatial_merge_unit_ = 1; + + public: + Qwen2_5OmniVisionEncoder() = default; + + explicit Qwen2_5OmniVisionEncoder(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + visual_window_size_ = cfg.visual_window_size; + visual_spatial_merge_size_ = cfg.visual_spatial_merge_size; + visual_patch_size_ = cfg.visual_patch_size; + spatial_merge_unit_ = visual_spatial_merge_size_ * visual_spatial_merge_size_; + visual_fullatt_block_indexes_ = cfg.visual_fullatt_block_indexes; + patch_embed_ = reg("patch_embed", cfg); + patch_merger_ = reg("merger", cfg); + blocks_ = reg>("blocks", cfg.visual_depth, cfg); + } + + std::vector forward(const std::vector& 
inputs, const std::vector& args) override { + auto hidden_states = inputs[0]; + auto embedding_sin = inputs[1]; + auto embedding_cos = inputs[2]; + auto& grid_thw = inputs[3]; + + hidden_states = patch_embed_(hidden_states)[0]; + auto [window_index, cu_window_seqlens] = + makeWindowIndex(grid_thw, visual_window_size_, visual_spatial_merge_size_, visual_patch_size_); + + auto seq_len = hidden_states.shape()[0]; + hidden_states = hidden_states.view({seq_len / spatial_merge_unit_, spatial_merge_unit_, -1}); + hidden_states = hidden_states[{window_index, {kAll}, {kAll}}]; + hidden_states = hidden_states.view({seq_len, -1}); + + embedding_sin = embedding_sin.view({seq_len / spatial_merge_unit_, spatial_merge_unit_, -1}); + embedding_sin = embedding_sin[{window_index, {kAll}, {kAll}}]; + embedding_sin = embedding_sin.view({seq_len, -1}); + embedding_cos = embedding_cos.view({seq_len / spatial_merge_unit_, spatial_merge_unit_, -1}); + embedding_cos = embedding_cos[{window_index, {kAll}, {kAll}}]; + embedding_cos = embedding_cos.view({seq_len, -1}); + + auto mask = Tensor::empty({1, 1, seq_len, seq_len}, DataTypes::kFloat32, DeviceTypes::kCPU).alloc(); + { + auto mask_ptr = mask.ptr(); + const mllm_fp32_t neg_inf = -1e12f; + for (int i = 0; i < seq_len * seq_len; ++i) { mask_ptr[i] = neg_inf; } + for (int i = 1; i < cu_window_seqlens.size(); ++i) { + const int start = cu_window_seqlens[i - 1]; + const int end = cu_window_seqlens[i]; + for (int r = start; r < end; ++r) { + for (int c = start; c < end; ++c) { mask_ptr[r * seq_len + c] = 0.0f; } + } + } + } + + for (auto [layer_idx, b] : enumerate(blocks_.list())) { + if (std::find(visual_fullatt_block_indexes_.begin(), visual_fullatt_block_indexes_.end(), layer_idx) + != visual_fullatt_block_indexes_.end()) { + hidden_states = b(hidden_states, embedding_sin, embedding_cos, Tensor::nil())[0]; + } else { + hidden_states = b(hidden_states, embedding_sin, embedding_cos, mask)[0]; + } + } + + hidden_states = patch_merger_(hidden_states)[0]; + + std::vector reverse_indices(window_index.size()); + std::iota(reverse_indices.begin(), reverse_indices.end(), 0); + std::sort(reverse_indices.begin(), reverse_indices.end(), + [&window_index](int i, int j) { return window_index[i] < window_index[j]; }); + hidden_states = hidden_states[{reverse_indices, {kAll}}]; + + return {hidden_states}; + } +}; + +class Qwen2_5OmniAudioAttention final : public nn::Module { + int32_t embed_dim_ = 0; + int32_t num_heads_ = 0; + int32_t head_dim_ = 0; + + nn::Linear k_proj_; + nn::Linear v_proj_; + nn::Linear q_proj_; + nn::Linear out_proj_; + + public: + Qwen2_5OmniAudioAttention() = default; + + explicit Qwen2_5OmniAudioAttention(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + embed_dim_ = cfg.audio_d_model; + num_heads_ = cfg.audio_encoder_attention_heads; + head_dim_ = embed_dim_ / num_heads_; + + k_proj_ = reg("k_proj", embed_dim_, embed_dim_, false); + v_proj_ = reg("v_proj", embed_dim_, embed_dim_, true); + q_proj_ = reg("q_proj", embed_dim_, embed_dim_, true); + out_proj_ = reg("out_proj", embed_dim_, embed_dim_, true); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto hidden_states = inputs[0]; // [seq_len, embed_dim] + auto seq_len = hidden_states.shape()[0]; + + auto hidden = hidden_states.unsqueeze(0); // [1, seq_len, embed_dim] + auto query_states = q_proj_(hidden); + auto key_states = k_proj_(hidden); + auto value_states = v_proj_(hidden); + + query_states = query_states.view({1, seq_len, 
num_heads_, head_dim_}).transpose(1, 2); + key_states = key_states.view({1, seq_len, num_heads_, head_dim_}).transpose(1, 2); + value_states = value_states.view({1, seq_len, num_heads_, head_dim_}).transpose(1, 2); + + float scale = 1.0f / std::sqrt(static_cast(head_dim_)); + auto attn_weights = nn::functional::matmul(query_states, key_states.transpose(-2, -1)) * scale; + attn_weights = nn::functional::softmax(attn_weights, -1); + auto attn_output = nn::functional::matmul(attn_weights, value_states); + + attn_output = attn_output.transpose(1, 2).contiguous().view({1, seq_len, embed_dim_}); + attn_output = out_proj_(attn_output); + + return {attn_output.squeeze(0)}; + } +}; + +class Qwen2_5OmniAudioEncoderLayer final : public nn::Module { + Qwen2_5OmniAudioAttention self_attn_; + nn::LayerNorm self_attn_layer_norm_; + nn::Linear fc1_; + nn::Linear fc2_; + nn::LayerNorm final_layer_norm_; + nn::GELU activation_fn_; + + public: + Qwen2_5OmniAudioEncoderLayer() = default; + + explicit Qwen2_5OmniAudioEncoderLayer(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + const int32_t embed_dim = cfg.audio_d_model; + self_attn_ = reg("self_attn", cfg); + self_attn_layer_norm_ = + reg("self_attn_layer_norm", std::vector{embed_dim}, true, true, 1e-5); + fc1_ = reg("fc1", embed_dim, cfg.audio_encoder_ffn_dim, true); + fc2_ = reg("fc2", cfg.audio_encoder_ffn_dim, embed_dim, true); + final_layer_norm_ = reg("final_layer_norm", std::vector{embed_dim}, true, true, 1e-5); + activation_fn_ = reg("activation_fn"); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto hidden_states = inputs[0]; + auto residual = hidden_states; + + hidden_states = self_attn_layer_norm_(hidden_states); + hidden_states = self_attn_(hidden_states)[0]; + hidden_states = residual + hidden_states; + + residual = hidden_states; + hidden_states = final_layer_norm_(hidden_states); + hidden_states = fc1_(hidden_states); + hidden_states = activation_fn_(hidden_states); + hidden_states = fc2_(hidden_states); + hidden_states = residual + hidden_states; + + if (hidden_states.dtype() == kFloat16) { + const float clamp_value = 65504.0f - 1000.0f; + hidden_states = nn::functional::clip(hidden_states, -clamp_value, clamp_value); + } + + return {hidden_states}; + } +}; + +class Qwen2_5OmniAudioEncoder final : public nn::Module { + nn::Conv1D conv1_; + nn::Conv1D conv2_; + nn::GELU gelu_; + nn::ModuleList layers_; + nn::LayerNorm ln_post_; + nn::AvgPool1d avg_pooler_; + nn::Linear proj_; + nn::Embedding audio_bos_eos_token_; + + int32_t num_mel_bins_ = 0; + int32_t embed_dim_ = 0; + int32_t n_window_ = 0; + int32_t output_dim_ = 0; + + public: + Qwen2_5OmniAudioEncoder() = default; + + explicit Qwen2_5OmniAudioEncoder(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { + num_mel_bins_ = cfg.audio_num_mel_bins; + embed_dim_ = cfg.audio_d_model; + n_window_ = cfg.audio_n_window; + output_dim_ = cfg.audio_output_dim; + + conv1_ = reg("conv1", num_mel_bins_, embed_dim_, 3, 1, 1); + conv2_ = reg("conv2", embed_dim_, embed_dim_, 3, 2, 1); + gelu_ = reg("gelu"); + audio_bos_eos_token_ = reg("audio_bos_eos_token", 2, cfg.audio_output_dim); + layers_ = reg>("layers", cfg.audio_encoder_layers, cfg); + ln_post_ = reg("ln_post", std::vector{embed_dim_}, true, true, 1e-5); + avg_pooler_ = reg("avg_pooler", 2, 2); + proj_ = reg("proj", embed_dim_, cfg.audio_output_dim, true); + + auto pos_emb = makeAudioSinusoidalPosEmb(cfg.audio_max_source_positions, embed_dim_); + 
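+    // Fixed sinusoidal table: column i holds sin(t * 10000^(-i/(half-1))) and column half+i the matching cos; registered below as the positional_embedding buffer.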
registerBuffer("positional_embedding", pos_emb); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto input_features = inputs[0]; // [B, n_mels, T] + MLLM_RT_ASSERT_EQ(input_features.shape().size(), 3); + + const int32_t batch_size = input_features.shape()[0]; + MLLM_RT_ASSERT_EQ(input_features.shape()[1], num_mel_bins_); + const int32_t feature_len = input_features.shape()[2]; + MLLM_RT_ASSERT(feature_len > 0); + + auto pos_emb = getBuffer("positional_embedding"); + + std::vector audio_outputs; + audio_outputs.reserve(batch_size); + + for (int32_t b = 0; b < batch_size; ++b) { + Tensor audio_b = input_features[make_slice(b), kAll, kAll].view({1, num_mel_bins_, feature_len}).contiguous(); + + const int32_t chunk_size = n_window_ * 2; + const int32_t num_chunks = (feature_len + chunk_size - 1) / chunk_size; + + std::vector chunk_outputs; + chunk_outputs.reserve(num_chunks); + + for (int32_t chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + const int32_t start = chunk_idx * chunk_size; + const int32_t chunk_len = std::min(chunk_size, feature_len - start); + auto chunk = Tensor::empty({1, num_mel_bins_, chunk_len}, kFloat32, kCPU).alloc(); + for (int32_t m = 0; m < num_mel_bins_; ++m) { + auto src_ptr = audio_b.offsettedPtr({0, m, start}); + auto dst_ptr = chunk.offsettedPtr({0, m, 0}); + std::memcpy(dst_ptr, src_ptr, chunk_len * sizeof(float)); + } + + auto x = conv1_(chunk); + x = gelu_(x); + x = conv2_(x); + x = gelu_(x); + x = x.transpose(1, 2).contiguous(); // [1, T2, D] + + const int32_t t2 = x.shape()[1]; + MLLM_RT_ASSERT(t2 <= pos_emb.shape()[0]); + auto pos_ptr = pos_emb.ptr(); + auto x_ptr = x.ptr(); + for (int32_t t = 0; t < t2; ++t) { + const float* pos_row = pos_ptr + t * embed_dim_; + float* x_row = x_ptr + t * embed_dim_; + for (int32_t d = 0; d < embed_dim_; ++d) { x_row[d] += pos_row[d]; } + } + + auto hidden_states = x.squeeze(0); // [T2, D] + for (auto& layer : layers_.list()) { hidden_states = layer(hidden_states)[0]; } + if (hidden_states.shape()[0] < 2) { continue; } + + auto pooled = hidden_states.unsqueeze(0).transpose(1, 2); // [1, D, T] + pooled = avg_pooler_(pooled); + pooled = pooled.transpose(1, 2).squeeze(0); // [T', D] + pooled = ln_post_(pooled); + pooled = proj_(pooled); + chunk_outputs.push_back(pooled); + } + + int32_t total_len = 0; + for (const auto& chunk : chunk_outputs) { total_len += chunk.shape()[0]; } + + auto merged = Tensor::empty({total_len, output_dim_}, kFloat32, kCPU).alloc(); + int32_t offset = 0; + for (const auto& chunk : chunk_outputs) { + const int32_t len = chunk.shape()[0]; + const float* src_ptr = chunk.ptr(); + float* dst_ptr = merged.offsettedPtr({offset, 0}); + std::memcpy(dst_ptr, src_ptr, len * output_dim_ * sizeof(float)); + offset += len; + } + + audio_outputs.push_back(merged); + } + + int32_t total_audio_tokens = 0; + for (const auto& out : audio_outputs) { total_audio_tokens += out.shape()[0]; } + + auto output = Tensor::empty({total_audio_tokens, output_dim_}, kFloat32, kCPU).alloc(); + int32_t offset = 0; + for (const auto& out : audio_outputs) { + const int32_t len = out.shape()[0]; + const float* src_ptr = out.ptr(); + float* dst_ptr = output.offsettedPtr({offset, 0}); + std::memcpy(dst_ptr, src_ptr, len * output_dim_ * sizeof(float)); + offset += len; + } + + return {output}; + } +}; + class Qwen2_5OmniMLP final : public nn::Module { nn::Linear gate_proj_; nn::Linear up_proj_; @@ -275,10 +1030,14 @@ class Qwen2_5OmniThinker final : public nn::Module { Qwen2_5OmniThinker() 
= default; Qwen2_5OmniThinker(const std::string& name, const Qwen2_5OmniConfig& cfg) : nn::Module(name) { model_ = reg("model", cfg); + audio_tower_ = reg("audio_tower", cfg); + visual_ = reg("visual", cfg); lm_head_ = reg("lm_head", cfg.hidden_size, cfg.vocab_size, false, cfg.linear_impl_type); } Qwen2_5OmniText model_; + Qwen2_5OmniAudioEncoder audio_tower_; + Qwen2_5OmniVisionEncoder visual_; nn::Linear lm_head_; }; @@ -302,11 +1061,97 @@ class Qwen2_5OmniForCausalLM : public ARGeneration { auto input_embeddings = thinker_.model_.embedding_(sequence); - Tensor position_ids = Tensor::nil(); - if (input.count("position_ids")) { - position_ids = input.at("position_ids"); + if (input.count("input_features")) { + auto input_features = input.at("input_features"); + auto audio_embeddings = thinker_.audio_tower_(input_features)[0]; + MLLM_RT_ASSERT_EQ(audio_embeddings.shape()[1], input_embeddings.shape()[2]); + if (audio_embeddings.dtype() != input_embeddings.dtype()) { + audio_embeddings = audio_embeddings.to(input_embeddings.dtype()); + } + + MLLM_RT_ASSERT_EQ(sequence.shape()[0], 1); + auto S = sequence.shape()[1]; + std::vector audio_positions; + audio_positions.reserve(audio_embeddings.shape()[0]); + auto input_ids_ptr = sequence.ptr(); + for (int s = 0; s < S; ++s) { + if (input_ids_ptr[s] == cfg_.audio_token_id) { audio_positions.push_back(s); } + } + MLLM_RT_ASSERT_EQ(static_cast(audio_positions.size()), audio_embeddings.shape()[0]); + + auto D = input_embeddings.shape()[2]; + if (input_embeddings.dtype() == kFloat32) { + for (size_t i = 0; i < audio_positions.size(); ++i) { + auto out_ptr = input_embeddings.offsettedPtr({0, audio_positions[i], 0}); + auto in_ptr = audio_embeddings.offsettedPtr({static_cast(i), 0}); + std::copy(in_ptr, in_ptr + D, out_ptr); + } + } else if (input_embeddings.dtype() == kFloat16) { + for (size_t i = 0; i < audio_positions.size(); ++i) { + auto out_ptr = input_embeddings.offsettedPtr({0, audio_positions[i], 0}); + auto in_ptr = audio_embeddings.offsettedPtr({static_cast(i), 0}); + std::copy(in_ptr, in_ptr + D, out_ptr); + } + } else { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Unsupported embedding dtype for Qwen2.5-Omni audio input."); + } } - position_ids = getPositionIds(sequence, position_ids); + + if (input.count("img")) { + auto img = input.at("img"); + auto grid_thw = input.at("grid_thw"); + + auto inv_freq = makeVisualRoPEInvFreq(cfg_.visual_hidden_size / cfg_.visual_num_heads, 10000.0f); + auto pos_ids = makeVisualRotaryPosEmbIds(grid_thw, cfg_.visual_spatial_merge_size); + + int max_grid = 0; + for (int row = 0; row < grid_thw.shape()[0]; ++row) { + const int* dims = grid_thw.offsettedPtr({row, 0}); + max_grid = std::max({max_grid, dims[1], dims[2]}); + } + MLLM_RT_ASSERT(max_grid > 0); + auto rotary_pos_emb_full = makeVisualRotaryPosEmbFull(inv_freq, max_grid); + auto pos_emb = makeVisualRotaryPosEmb(rotary_pos_emb_full, pos_ids, grid_thw); + auto [visual_embedding_sin, visual_embedding_cos] = makeVisualRotarySinCos(pos_emb); + + auto visual_embeddings = thinker_.visual_(img, visual_embedding_sin, visual_embedding_cos, grid_thw)[0]; + MLLM_RT_ASSERT_EQ(visual_embeddings.shape()[1], input_embeddings.shape()[2]); + if (visual_embeddings.dtype() != input_embeddings.dtype()) { + visual_embeddings = visual_embeddings.to(input_embeddings.dtype()); + } + + MLLM_RT_ASSERT_EQ(sequence.shape()[0], 1); + auto S = sequence.shape()[1]; + std::vector image_positions; + image_positions.reserve(visual_embeddings.shape()[0]); + auto input_ids_ptr = sequence.ptr(); 
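+      // Collect the sequence positions of image placeholder tokens so the visual embeddings can be scattered back in prompt order.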
+ for (int s = 0; s < S; ++s) { + if (input_ids_ptr[s] == cfg_.image_token_id) { image_positions.push_back(s); } + } + MLLM_RT_ASSERT_EQ(static_cast(image_positions.size()), visual_embeddings.shape()[0]); + + auto D = input_embeddings.shape()[2]; + if (input_embeddings.dtype() == kFloat32) { + for (size_t i = 0; i < image_positions.size(); ++i) { + auto out_ptr = input_embeddings.offsettedPtr({0, image_positions[i], 0}); + auto in_ptr = visual_embeddings.offsettedPtr({static_cast(i), 0}); + std::copy(in_ptr, in_ptr + D, out_ptr); + } + } else if (input_embeddings.dtype() == kFloat16) { + for (size_t i = 0; i < image_positions.size(); ++i) { + auto out_ptr = input_embeddings.offsettedPtr({0, image_positions[i], 0}); + auto in_ptr = visual_embeddings.offsettedPtr({static_cast(i), 0}); + std::copy(in_ptr, in_ptr + D, out_ptr); + } + } else { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Unsupported embedding dtype for Qwen2.5-Omni image input."); + } + } + + Tensor position_ids = input.count("position_ids") ? input.at("position_ids") : Tensor::nil(); + Tensor img = input.count("img") ? input.at("img") : Tensor::nil(); + Tensor grid_thw = input.count("grid_thw") ? input.at("grid_thw") : Tensor::nil(); + position_ids = getPositionIds(img, grid_thw, sequence, position_ids); auto [llm_embedding_sin, llm_embedding_cos] = makeMultimodalPositionEmbedding(position_ids, thinker_.model_.getBuffer("inv_freq"), cfg_.max_position_embeddings, @@ -326,9 +1171,21 @@ class Qwen2_5OmniForCausalLM : public ARGeneration { Qwen2_5OmniThinker thinker_; private: - Tensor getPositionIds(Tensor& input_ids, Tensor& position_ids) const { + Tensor getPositionIds(Tensor& img, Tensor& grid_thw, Tensor& input_ids, Tensor& position_ids) const { MLLM_RT_ASSERT_EQ(input_ids.shape().size(), 2); + bool has_multimodal = false; + auto input_ids_ptr = input_ids.ptr(); + auto seq_len = input_ids.shape()[1]; + for (int s = 0; s < seq_len; ++s) { + if (input_ids_ptr[s] == cfg_.vision_start_token_id || input_ids_ptr[s] == cfg_.audio_start_token_id) { + has_multimodal = true; + break; + } + } + + if (has_multimodal) { return getPositionIdsPrefill(input_ids, grid_thw); } + if (!position_ids.isNil()) { auto last_pos = *position_ids.offsettedPtr({0, 0, position_ids.shape()[2] - 1}); auto ret_position_ids = Tensor::empty({3, 1, 1}, kInt64, kCPU).alloc(); @@ -339,7 +1196,7 @@ class Qwen2_5OmniForCausalLM : public ARGeneration { } auto B = input_ids.shape()[0]; - auto S = input_ids.shape()[1]; + auto S = seq_len; MLLM_RT_ASSERT_EQ(B, 1); Tensor out = Tensor::empty({3, B, S}, kInt64, kCPU).alloc(); @@ -350,6 +1207,170 @@ class Qwen2_5OmniForCausalLM : public ARGeneration { return out; } + Tensor getPositionIdsPrefill(Tensor& input_ids, Tensor& image_grid_thw) const { + MLLM_RT_ASSERT_EQ(input_ids.shape().size(), 2); + + auto B = input_ids.shape()[0]; + auto S = input_ids.shape()[1]; + MLLM_RT_ASSERT_EQ(B, 1); + + Tensor position_ids = Tensor::empty({3, B, S}, kInt64, kCPU).alloc(); + + auto input_ids_ptr = input_ids.ptr(); + + auto fill_text_positions = [&](int start_seq, int len, int64_t start_id) { + for (int d = 0; d < 3; ++d) { + auto out_ptr = position_ids.offsettedPtr({d, 0, 0}); + for (int i = 0; i < len; ++i) { out_ptr[start_seq + i] = start_id + i; } + } + }; + + int seq_idx = 0; + int image_idx = 0; + int64_t current_max_position_id = -1; + const int total_images = image_grid_thw.isNil() ? 
0 : image_grid_thw.shape()[0]; + + while (seq_idx < S) { + int next_vision = -1; + int next_audio = -1; + for (int i = seq_idx; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.vision_start_token_id) { + next_vision = i; + break; + } + } + for (int i = seq_idx; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.audio_start_token_id) { + next_audio = i; + break; + } + } + + if (next_vision == -1 && next_audio == -1) { + const int text_len = S - seq_idx; + if (text_len > 0) { fill_text_positions(seq_idx, text_len, current_max_position_id + 1); } + break; + } + + const bool is_vision = (next_vision != -1) && (next_audio == -1 || next_vision < next_audio); + const int segment_start = is_vision ? next_vision : next_audio; + + const int text_len = segment_start - seq_idx; + if (text_len > 0) { + fill_text_positions(seq_idx, text_len, current_max_position_id + 1); + current_max_position_id += text_len; + } + + if (is_vision) { + fill_text_positions(segment_start, 1, current_max_position_id + 1); + current_max_position_id += 1; + + int vision_end = -1; + for (int i = segment_start + 1; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.vision_end_token_id) { + vision_end = i; + break; + } + } + MLLM_RT_ASSERT(vision_end != -1); + MLLM_RT_ASSERT(image_idx < total_images); + if (image_grid_thw.isNil()) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Missing grid_thw for Qwen2.5-Omni vision input."); + } + MLLM_RT_ASSERT_EQ(image_grid_thw.shape().size(), 2); + + std::vector image_positions; + for (int i = segment_start + 1; i < vision_end; ++i) { + if (input_ids_ptr[i] == cfg_.image_token_id) { + image_positions.push_back(i); + } else { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Unsupported token inside vision segment."); + } + } + + const int* grid_dims = image_grid_thw.offsettedPtr({image_idx, 0}); + const int grid_t = grid_dims[0]; + const int grid_h = grid_dims[1]; + const int grid_w = grid_dims[2]; + + const int image_token_len = (grid_t * grid_h * grid_w) + / (cfg_.visual_spatial_merge_size * cfg_.visual_spatial_merge_size); + MLLM_RT_ASSERT_EQ(static_cast(image_positions.size()), image_token_len); + + const int inputs_t = grid_t; + const int inputs_h = grid_h / cfg_.visual_spatial_merge_size; + const int inputs_w = grid_w / cfg_.visual_spatial_merge_size; + + const int64_t vision_start_id = current_max_position_id + 1; + int pos_counter = 0; + for (int ti = 0; ti < inputs_t; ++ti) { + const int64_t t_id = vision_start_id + static_cast(ti) * cfg_.position_id_per_seconds; + for (int hi = 0; hi < inputs_h; ++hi) { + for (int wi = 0; wi < inputs_w; ++wi) { + const auto seq_pos = image_positions[pos_counter++]; + *position_ids.offsettedPtr({0, 0, seq_pos}) = t_id; + *position_ids.offsettedPtr({1, 0, seq_pos}) = vision_start_id + hi; + *position_ids.offsettedPtr({2, 0, seq_pos}) = vision_start_id + wi; + } + } + } + + const int64_t dim_0_tail = vision_start_id + static_cast(inputs_t - 1) * cfg_.position_id_per_seconds; + const int64_t dim_1_tail = vision_start_id + inputs_h - 1; + const int64_t dim_2_tail = vision_start_id + inputs_w - 1; + current_max_position_id = std::max({dim_0_tail, dim_1_tail, dim_2_tail}); + + fill_text_positions(vision_end, 1, current_max_position_id + 1); + current_max_position_id += 1; + + seq_idx = vision_end + 1; + image_idx += 1; + } else { + fill_text_positions(segment_start, 1, current_max_position_id + 1); + current_max_position_id += 1; + + int audio_end = -1; + for (int i = segment_start + 1; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.audio_end_token_id) { + audio_end = i; + break; + } 
+ } + MLLM_RT_ASSERT(audio_end != -1); + + std::vector audio_positions; + for (int i = segment_start + 1; i < audio_end; ++i) { + if (input_ids_ptr[i] == cfg_.audio_token_id) { + audio_positions.push_back(i); + } else { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Unsupported token inside audio segment."); + } + } + + const int audio_len = static_cast(audio_positions.size()); + if (audio_len == 0) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Empty audio tokens inside audio segment."); + } + const int64_t audio_start_id = current_max_position_id + 1; + for (int i = 0; i < audio_len; ++i) { + const int64_t pos_id = audio_start_id + i; + for (int d = 0; d < 3; ++d) { + *position_ids.offsettedPtr({d, 0, audio_positions[i]}) = pos_id; + } + } + current_max_position_id += audio_len; + + fill_text_positions(audio_end, 1, current_max_position_id + 1); + current_max_position_id += 1; + + seq_idx = audio_end + 1; + } + } + + MLLM_RT_ASSERT_EQ(image_idx, total_images); + return position_ids; + } + const Qwen2_5OmniConfig& cfg_; nn::StaticCache kv_cache_; }; diff --git a/mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp b/mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp index 8674af9f5..961b5c8f2 100644 --- a/mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp +++ b/mllm/models/qwen2_5omni/tokenization_qwen2_5omni.hpp @@ -2,6 +2,7 @@ // Licensed under the MIT License. #pragma once +#include #include #include @@ -9,6 +10,9 @@ #include "mllm/preprocessor/tokenizers/Unicode.hpp" #include "mllm/preprocessor/tokenizers/AutoTokenizer.hpp" #include "mllm/models/ARGeneration.hpp" +#include "mllm/models/qwen2vl/image_preprocessor_qwen2vl.hpp" +#include "mllm/models/qwen2_5omni/audio_preprocessor_qwen2_5omni.hpp" +#include "mllm/utils/Common.hpp" namespace mllm::models::qwen2_5omni { @@ -141,9 +145,52 @@ struct Qwen2_5OmniMessage { } }; +struct Qwen2_5OmniVisionMessage { + std::string prompt; + std::string img_file_path; + std::string system_prompt = "You are a helpful assistant."; + + [[nodiscard]] std::string buildChatMessage() const { + std::string result; + if (!system_prompt.empty()) { + result += "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"; + } + result += "<|im_start|>user\n<|vision_bos|><|IMAGE|><|vision_eos|>" + prompt + "<|im_end|>\n"; + result += "<|im_start|>assistant\n"; + return result; + } +}; + +struct Qwen2_5OmniAudioMessage { + std::string prompt; + std::string audio_file_path; + std::string system_prompt = "You are a helpful assistant."; + + [[nodiscard]] std::string buildChatMessage() const { + std::string result; + if (!system_prompt.empty()) { + result += "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"; + } + result += "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>" + prompt + "<|im_end|>\n"; + result += "<|im_start|>assistant\n"; + return result; + } +}; + class Qwen2_5OmniTokenizer final : public mllm::preprocessor::AutoTokenizer { public: - explicit Qwen2_5OmniTokenizer(const std::string& file_path) { + explicit Qwen2_5OmniTokenizer(const std::string& file_path, + int32_t spatial_merge_size = 2, + int32_t min_pixels = 56 * 56, + int32_t max_pixels = 1280 * 1280, + int32_t audio_sample_rate = 16000, + int32_t audio_n_mels = 128, + int32_t audio_hop_length = 160, + int32_t audio_chunk_length = 300) + //interestingly, the answer went bad when setting max_pixels higher, eg. 
3584*3584) + : image_preprocessor_(min_pixels, max_pixels), + audio_preprocessor_(audio_sample_rate, audio_n_mels, audio_hop_length, audio_chunk_length), + spatial_merge_size_(spatial_merge_size) { preprocessor::initLocal(); preprocessor::makeBytes2UnicodeMap(bytes_2_unicode_dict_); for (auto& kv : bytes_2_unicode_dict_) { bytes_2_unicode_dict_inverse_.insert({kv.second, kv.first}); } @@ -243,10 +290,96 @@ class Qwen2_5OmniTokenizer final : public mllm::preprocessor::AutoTokenizer { return {{"sequence", sequence}}; } + ARGenerationOutputPast convertVisionMessage(const Qwen2_5OmniVisionMessage& message) { + auto applied_string = message.buildChatMessage(); + + auto [img, grid_thw] = image_preprocessor_(message.img_file_path); + + auto sequence_str = tokenize(applied_string); + std::vector ids; + ids.reserve(sequence_str.size()); + for (const auto& str : sequence_str) { ids.emplace_back(bpe_._lookup_vocab(str)); } + + auto grid_t = grid_thw.ptr()[0]; + auto grid_h = grid_thw.ptr()[1]; + auto grid_w = grid_thw.ptr()[2]; + int32_t img_token_nums = grid_t * grid_h * grid_w; + img_token_nums /= (spatial_merge_size_ * spatial_merge_size_); + + auto image_token_id = bpe_._lookup_vocab(L"<|IMAGE|>"); + { + auto it = std::find(ids.begin(), ids.end(), image_token_id); + if (it == ids.end()) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Missing <|IMAGE|> token in Qwen2.5-Omni prompt template."); + } + ids.insert(it + 1, img_token_nums - 1, image_token_id); + } + + Tensor sequence = Tensor::empty({1, static_cast(ids.size())}, kInt64, kCPU) + .setMemType(kNormal) + .setName("qwen2_5omni-tokenizer-i0") + .alloc(); + + auto ptr = sequence.ptr(); + for (size_t i = 0; i < ids.size(); ++i) { ptr[i] = ids[i]; } + + return { + {"sequence", sequence}, + {"img", img}, + {"grid_thw", grid_thw}, + }; + } + + ARGenerationOutputPast convertAudioMessage(const Qwen2_5OmniAudioMessage& message) { + auto applied_string = message.buildChatMessage(); + auto sequence_str = tokenize(applied_string); + + std::vector ids; + ids.reserve(sequence_str.size()); + for (const auto& str : sequence_str) { ids.emplace_back(bpe_._lookup_vocab(str)); } + + auto audio_result = audio_preprocessor_.processAudioFile(message.audio_file_path); + if (audio_result.input_features.isNil() || audio_result.feature_length <= 0) { + MLLM_ERROR_EXIT(ExitCode::kIOError, "Failed to extract audio features for Qwen2.5-Omni."); + } + + int32_t audio_token_nums = audio_preprocessor_.calcAudioTokenLength(audio_result.feature_length); + if (audio_token_nums <= 0) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Invalid audio token length for Qwen2.5-Omni."); + } + + auto audio_token_id = bpe_._lookup_vocab(L"<|AUDIO|>"); + { + auto it = std::find(ids.begin(), ids.end(), audio_token_id); + if (it == ids.end()) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Missing <|AUDIO|> token in Qwen2.5-Omni prompt template."); + } + ids.insert(it + 1, audio_token_nums - 1, audio_token_id); + } + + Tensor sequence = Tensor::empty({1, static_cast(ids.size())}, kInt64, kCPU) + .setMemType(kNormal) + .setName("qwen2_5omni-tokenizer-i0") + .alloc(); + + auto ptr = sequence.ptr(); + for (size_t i = 0; i < ids.size(); ++i) { ptr[i] = ids[i]; } + + audio_result.input_features.setName("input_features"); + + return { + {"sequence", sequence}, + {"input_features", audio_result.input_features}, + }; + } + private: preprocessor::BPE bpe_; std::unordered_map bytes_2_unicode_dict_; std::unordered_map bytes_2_unicode_dict_inverse_; + mllm::models::qwen2vl::Qwen2VLImagePreprocessor 
image_preprocessor_; + Qwen2_5OmniAudioPreprocessor audio_preprocessor_; + int32_t spatial_merge_size_ = 2; }; } // namespace mllm::models::qwen2_5omni From e959822f3f1c09f915a2bc0b4f55c8c90eafcf5c Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 17 Jan 2026 02:31:56 +0000 Subject: [PATCH 03/42] fix: Enhance quantization modules. Introduced FixedActivationQDQ for fixed quantization parameters, updated ActivationQDQ to use MovingAverageMinMaxObserver, and adjusted eps values for better precision. Modified Qwen3 model to utilize FixedActivationQDQ for sigmoid output and ensured dtype consistency in attention calculations. --- .../qualcomm/transformers/core/qdq.py | 117 +++++++++++++++++- .../qualcomm/transformers/core/rms_norm.py | 4 +- .../transformers/qwen3/modeling_qwen3.py | 34 ++++- .../qualcomm/transformers/qwen3/runner.py | 1 + .../qualcomm/transformers/qwen3/train.py | 1 + 5 files changed, 147 insertions(+), 10 deletions(-) diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py index ce67729f4..8a4f90687 100644 --- a/pymllm/backends/qualcomm/transformers/core/qdq.py +++ b/pymllm/backends/qualcomm/transformers/core/qdq.py @@ -1,6 +1,13 @@ import torch import torch.nn as nn -from torch.ao.quantization import FakeQuantize, MinMaxObserver +from torch.ao.quantization import ( + FakeQuantize, + MovingAverageMinMaxObserver, +) +from torch.ao.quantization.observer import FixedQParamsObserver + +DEFAULT_EPS_8BIT = 0.0001 / 255 +DEFAULT_EPS_16BIT = 0.0001 / 65535 class ActivationQDQ(nn.Module): @@ -30,16 +37,24 @@ def __init__(self, bits=8, qscheme=torch.per_tensor_affine): self.quant_min = 0 self.quant_max = (2**bits) - 1 + if bits == 8: + eps = DEFAULT_EPS_8BIT + elif bits == 16: + eps = DEFAULT_EPS_16BIT + else: + raise ValueError(f"Unsupported bit width: {bits}") + # 2. Initialize FakeQuantize - # MinMaxObserver calculates scale and zero_point based on observed tensors. + # MovingAverageMinMaxObserver calculates scale and zero_point based on observed tensors. # Passing quant_min/max to the observer ensures consistency. self.fake_quant = FakeQuantize( - observer=MinMaxObserver.with_args( - qscheme=self.qscheme, + observer=MovingAverageMinMaxObserver.with_args( dtype=self.dtype, + qscheme=self.qscheme, quant_min=self.quant_min, quant_max=self.quant_max, reduce_range=False, + eps=eps, ), quant_min=self.quant_min, quant_max=self.quant_max, @@ -72,3 +87,97 @@ def disable_fakequant(self): def extra_repr(self): mode = "Symmetric" if "symmetric" in str(self.qscheme) else "Asymmetric" return f"bits={self.bits}, mode={mode}, q_range=({self.quant_min}, {self.quant_max}), dtype={self.dtype}" + + +class FixedActivationQDQ(nn.Module): + """ + Fixed activation Quantization-DeQuantization (QDQ) module. + Uses pre-determined scale and zero_point instead of dynamic observation. + Supports both Symmetric and Asymmetric (Affine) quantization. + Uses torch.qint32 as a unified type to support various bit-widths. + """ + + def __init__(self, scale, zero_point, bits=8, qscheme=torch.per_tensor_affine): + super().__init__() + self.bits = bits + self.qscheme = qscheme + + # Define the simulation dtype as qint32 to avoid overflow across different bit-widths + self.dtype = torch.qint32 + + # 1. 
Calculate quantization range based on bits and scheme + if qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric]: + # Symmetric: range is [-(2^(bits-1)), 2^(bits-1) - 1] + # e.g., 8-bit: -128 to 127 + self.quant_min = -(2 ** (bits - 1)) + self.quant_max = 2 ** (bits - 1) - 1 + else: + # Asymmetric (Affine): range is [0, 2^bits - 1] + # e.g., 8-bit: 0 to 255 + self.quant_min = 0 + self.quant_max = (2**bits) - 1 + + if bits not in [8, 16]: + raise ValueError(f"Unsupported bit width: {bits}") + + # 2. Convert scale and zero_point to tensors if needed + if not isinstance(scale, torch.Tensor): + scale = torch.tensor(scale, dtype=torch.float32) + if not isinstance(zero_point, torch.Tensor): + zero_point = torch.tensor(zero_point, dtype=torch.int32) + + # 3. Initialize FakeQuantize with fixed parameters + # Use FakeQuantize with FixedQParamsObserver for fixed scale and zero_point + self.fake_quant = FakeQuantize.with_args( + observer=FixedQParamsObserver.with_args( + scale=scale, + zero_point=zero_point, + ), + dtype=self.dtype, + qscheme=self.qscheme, + quant_min=self.quant_min, + quant_max=self.quant_max, + )() + + def forward(self, x): + # Applies fake quantization with fixed scale and zero_point: + # rounds to nearest integer and clamps to [min, max], + # then dequantizes back to float to simulate quantization noise. + return self.fake_quant(x) + + # Control methods for quantization-aware training (QAT) + # Note: FixedActivationQDQ doesn't have observer, so these methods + # only control fake quantization behavior + def enable_observer(self): + """No-op: FixedActivationQDQ doesn't use observer.""" + pass + + def disable_observer(self): + """No-op: FixedActivationQDQ doesn't use observer.""" + pass + + def enable_fakequant(self): + """Enable simulation of quantization error.""" + self.fake_quant.enable_fakequant() + + def disable_fakequant(self): + """Disable quantization simulation (act as identity).""" + self.fake_quant.disable_fakequant() + + @property + def scale(self): + """Get the fixed scale value.""" + return self.fake_quant.scale + + @property + def zero_point(self): + """Get the fixed zero_point value.""" + return self.fake_quant.zero_point + + def extra_repr(self): + mode = "Symmetric" if "symmetric" in str(self.qscheme) else "Asymmetric" + scale_val = self.scale.item() if self.scale.numel() == 1 else self.scale + zp_val = ( + self.zero_point.item() if self.zero_point.numel() == 1 else self.zero_point + ) + return f"bits={self.bits}, mode={mode}, scale={scale_val}, zero_point={zp_val}, q_range=({self.quant_min}, {self.quant_max}), dtype={self.dtype}" diff --git a/pymllm/backends/qualcomm/transformers/core/rms_norm.py b/pymllm/backends/qualcomm/transformers/core/rms_norm.py index 0101d6aee..b3964469f 100644 --- a/pymllm/backends/qualcomm/transformers/core/rms_norm.py +++ b/pymllm/backends/qualcomm/transformers/core/rms_norm.py @@ -21,7 +21,9 @@ def __init__( # Quantization configuration for Weight self.weight_fake_quant = FakeQuantize( observer=MinMaxObserver.with_args( - qscheme=torch.per_tensor_affine, dtype=torch.qint32 + qscheme=torch.per_tensor_affine, + dtype=torch.qint32, + eps=0.0001 / 65535, ), quant_min=0, quant_max=2 ** (quant_bits) - 1, diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 9c0696328..0bbcbffd8 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -49,9 +49,11 
@@ from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm from pymllm.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, - QLinearW8A16_PerChannelSym, ) -from pymllm.backends.qualcomm.transformers.core.qdq import ActivationQDQ +from pymllm.backends.qualcomm.transformers.core.qdq import ( + ActivationQDQ, + FixedActivationQDQ, +) class Qwen3MLP(nn.Module): @@ -76,7 +78,12 @@ def __init__(self, config): self.gate_proj_output_qdq = ActivationQDQ(bits=16) self.act_output_qdq = ActivationQDQ(bits=16) self.down_proj_input_qdq = ActivationQDQ(bits=16) - self.sigmoid_output_qdq = ActivationQDQ(bits=16) + # For sigmoid output: scale = 1 / (q_max - q_min + 1), zp = 0 + # For 16-bit: q_min = 0, q_max = 65535 + sigmoid_scale = 1.0 / (65535 - 0 + 1) # 1 / 65536 + self.sigmoid_output_qdq = FixedActivationQDQ( + scale=sigmoid_scale, zero_point=0, bits=16 + ) def forward(self, x): x = self.up_proj_input_qdq(x) @@ -281,7 +288,7 @@ def forward( torch.matmul(query_states, key_states.transpose(2, 3)) ) * self.scaling_qdq( - torch.ones(1, dtype=torch.bfloat16, device=value_states.device) + torch.ones(1, dtype=value_states.dtype, device=value_states.device) * self.scaling ) ) @@ -292,7 +299,8 @@ def forward( attn_vv = self.minus_0_output_qdq( attn_min + self.neg_20_qdq( - torch.ones(1, dtype=torch.bfloat16, device=value_states.device) * (-20) + torch.ones(1, dtype=value_states.dtype, device=value_states.device) + * (-20) ) ) attn_weights = torch.where(attention_mask == 0, attn_weights, attn_vv) @@ -315,6 +323,7 @@ def forward( class Qwen3DecoderLayer(GradientCheckpointingLayer): def __init__(self, config: Qwen3Config, layer_idx: int): super().__init__() + self.layer_dix = layer_idx self.hidden_size = config.hidden_size self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx) @@ -362,6 +371,15 @@ def forward( position_embeddings=position_embeddings, **kwargs, ) + + if self.layer_dix == 2: + print("1", hidden_states.min(), hidden_states.max()) + print( + "2", + self.add_0_lhs_input_qdq(hidden_states).min(), + self.add_0_lhs_input_qdq(hidden_states).max(), + ) + hidden_states = self.add_0_output_qdq( residual + self.add_0_lhs_input_qdq(hidden_states) ) @@ -567,6 +585,12 @@ def forward( self.mllm_max_cos_embedding, self.mllm_max_sin_embedding = self.rotary_emb( hidden_states, max_position_ids ) + self.mllm_max_cos_embedding = self.mllm_max_cos_embedding.to( + inputs_embeds.dtype + ) + self.mllm_max_sin_embedding = self.mllm_max_sin_embedding.to( + inputs_embeds.dtype + ) self.mllm_max_cos_embedding = self.cos_embedding_input_qdq( self.mllm_max_cos_embedding ) diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index 53ab40a9e..88f5ce84e 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -44,6 +44,7 @@ def __init__(self, model_path: str, mllm_qualcomm_max_length=2048): self.model = Qwen3ForCausalLM.from_pretrained( model_path, attn_implementation="eager", + dtype=torch.bfloat16, ) self.model.cuda() self.mllm_qualcomm_max_length = mllm_qualcomm_max_length diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/backends/qualcomm/transformers/qwen3/train.py index 13ad2785a..33351918f 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/train.py @@ -44,6 +44,7 @@ def main(): # !!! # Things below is for deploy. 
We will turn all fp32 weights and some buffers(rope) to quantized dtype. # !!! + # This line maybe error. we need use quantized weight!!! not embed_tokens.weight!!! m.model.lm_head.weight = torch.nn.Parameter( m.model.model.embed_tokens.weight.clone() ) From 0672432d6d7f94a2567dfb2e1dbbb2b2e76985e9 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 17 Jan 2026 03:29:56 +0000 Subject: [PATCH 04/42] fix: Suppress deprecated comma-subscript warnings in CMake and remove debug print statements from Qwen3DecoderLayer --- mllm/CMakeLists.txt | 4 ++++ .../qualcomm/transformers/qwen3/modeling_qwen3.py | 10 ++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/mllm/CMakeLists.txt b/mllm/CMakeLists.txt index 9df6b7741..fd796f95a 100644 --- a/mllm/CMakeLists.txt +++ b/mllm/CMakeLists.txt @@ -56,6 +56,10 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "App endif() endif() +# FIXME: @oreomaker Need to remove comma features in slice! +# Suppress comma-subscript warnings (deprecated C++ feature that will be removed in C++26) +target_compile_options(MllmRT PUBLIC -Wno-comma-subscript) + # ONLY APPLE CAN DO ! # Processing OpenMP if(MLLM_KERNEL_USE_THREADS AND MLLM_KERNEL_THREADS_VENDOR_OPENMP) diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 0bbcbffd8..dc6486043 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -372,14 +372,6 @@ def forward( **kwargs, ) - if self.layer_dix == 2: - print("1", hidden_states.min(), hidden_states.max()) - print( - "2", - self.add_0_lhs_input_qdq(hidden_states).min(), - self.add_0_lhs_input_qdq(hidden_states).max(), - ) - hidden_states = self.add_0_output_qdq( residual + self.add_0_lhs_input_qdq(hidden_states) ) @@ -388,6 +380,8 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) + # if self.layer_dix == 2: + # print(hidden_states.min(), hidden_states.max()) hidden_states = residual + self.add_1_lhs_input_qdq(hidden_states) return hidden_states From 927f7eb8c76afa1664bb482e0b425f04f4f022db Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 19 Jan 2026 07:45:23 +0000 Subject: [PATCH 05/42] feat(qualcomm): Add installation targets for flatbuffers and MllmQNNBackend in CMake, enhance PTQPass with unsolved tensor value checks, and update quantization specifications in RMSNorm and model file conversion. 
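For reference, the int16 symmetric spec this patch attaches to the fake RMSNorm runtime bias (scale = 1/32767, zero_point = 0, values clamped to [-32768, 32767]) corresponds to the usual symmetric fake-quant round-trip. A minimal NumPy sketch of that round-trip; the helper name fake_qdq_int16_sym is illustrative only and not part of this patch set:

    import numpy as np

    def fake_qdq_int16_sym(x, scale=1.0 / 32767):
        # Symmetric int16: zero_point is 0, quantized values clamp to [-32768, 32767].
        q = np.clip(np.round(x / scale), -32768, 32767)
        # Dequantize back to float to simulate the quantization error.
        return q * scale

    # The runtime bias tensor is all zeros, so the round-trip is exact:
    print(fake_qdq_int16_sym(np.zeros(4)))  # -> [0. 0. 0. 0.]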
--- CMakeLists.txt | 7 +++ .../qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp | 3 +- mllm/backends/qnn/CMakeLists.txt | 7 +++ mllm/backends/qnn/aot/passes/PTQPass.cpp | 44 +++++++++++++++++++ mllm/backends/qnn/aot/visitor/RMSNorm.cpp | 5 ++- .../qualcomm/transformers/core/qdq.py | 4 +- .../qualcomm/transformers/core/qlinear.py | 4 +- .../transformers/qwen3/modeling_qwen3.py | 2 - .../qualcomm/transformers/qwen3/runner.py | 2 +- pymllm/convertor/model_file_v2.py | 12 ++++- 10 files changed, 80 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 298b412c0..fca470ee5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -332,6 +332,13 @@ install( ARCHIVE DESTINATION lib RUNTIME DESTINATION bin) +install( + TARGETS flatbuffers + EXPORT MllmTargets + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin) + if(MLLM_BUILD_SDK_C_BINDING) install( TARGETS MllmSdkC diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp index 9eed37267..f1b20a1a2 100644 --- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp +++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp @@ -272,7 +272,8 @@ class Qwen3Attention final : public nn::Module { auto attn_min = ptq::QDQ(this, attn.min(-1, true), "reduce_min_output_qdq"); auto minus_value = Tensor::constant(-20, kFloat32); minus_value = ptq::QDQ(this, minus_value, "neg_20_qdq"); - attn = nn::functional::where(causal_mask.equal(0.f), attn, attn_min.addConstant(minus_value)); + auto attn_vv = ptq::QDQ(this, attn_min.addConstant(minus_value), "minus_0_output_qdq"); + attn = nn::functional::where(causal_mask.equal(0.f), attn, attn_vv); attn = ptq::QDQ(this, nn::functional::softmax(attn, -1), "softmax_output_qdq"); auto y = ptq::QDQ(this, nn::functional::matmul(attn, vh), "attn_value_matmul_output_qdq"); y = y.transpose(1, 2).view({1, 1, -1, num_attention_heads_ * head_dim_}, /*ssa=*/true); diff --git a/mllm/backends/qnn/CMakeLists.txt b/mllm/backends/qnn/CMakeLists.txt index 0ad833792..83b4a43f9 100644 --- a/mllm/backends/qnn/CMakeLists.txt +++ b/mllm/backends/qnn/CMakeLists.txt @@ -44,3 +44,10 @@ get_property(current_includes DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INC message(STATUS "MLLM_QNN INCLUDES: ${current_includes}") #print include directories target_link_libraries(MllmQNNBackend PUBLIC MllmRT) + +install( + TARGETS MllmQNNBackend + EXPORT MllmTargets + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib + RUNTIME DESTINATION bin) diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index 1d42d58d3..7172db475 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -300,6 +300,45 @@ void recursiveSolveNormal(const std::shared_ptr& ir_ctx, const ir }); } +void recursiveCheckUnsolved(const std::shared_ptr& ir_ctx, const ir::graph::SubGraphOp::ptr_t& call_op) { + auto wow = ir::IRWriter(ir_ctx, call_op->getTopRegion()); + wow.walk([&](ir::IRWriter& w, const ir::Op::ptr_t& op) -> ir::IRWriter::WalkResult { + if (op->isa_()) { + auto linalg_op = op->cast_(); + std::string op_name = linalg_op->getAOp()->getName(); + + auto inputs = op->inputs(); + auto outputs = op->outputs(); + + for (auto iii : inputs) { + if (!iii->isa_()) continue; + auto tv = iii->cast_(); + if (!tv->getAttr("quant_recipe")) continue; + auto f_spec = tv->getAttr("quant_recipe")->cast_(); + if (!f_spec->spec_->solved) { + MLLM_WARN("PTQPass: TensorValue '{}' is not solved, used by Op: '{}'", tv->name(), 
op_name); + } + } + + for (auto ooo : outputs) { + if (!ooo->isa_()) continue; + auto tv = ooo->cast_(); + if (!tv->getAttr("quant_recipe")) continue; + auto f_spec = tv->getAttr("quant_recipe")->cast_(); + if (!f_spec->spec_->solved) { + MLLM_WARN("PTQPass: TensorValue '{}' is not solved, produced by Op: '{}'", tv->name(), op_name); + } + } + } + + if (op->isa_()) { + auto ns = op->cast_()->getSymbolAttr()->str(); + recursiveCheckUnsolved(w.getContext(), w.getContext()->lookupSymbolTable(ns)->cast_()); + } + return ir::IRWriter::WALK_CONTINUE; + }); +} + } // namespace uint8_t PTQPass::run(const ir::node_ptr_t& op) { @@ -330,6 +369,11 @@ uint8_t PTQPass::run(const ir::node_ptr_t& op) { getCtx()->lookupSymbolTable(call_main_graph_op->getSymbolAttr()->str())->cast_(), pf); + // Check for unsolved tensorValues and warn + recursiveCheckUnsolved( + writer.getContext(), + getCtx()->lookupSymbolTable(call_main_graph_op->getSymbolAttr()->str())->cast_()); + return ir::PASS_RET_SUCCESS; } diff --git a/mllm/backends/qnn/aot/visitor/RMSNorm.cpp b/mllm/backends/qnn/aot/visitor/RMSNorm.cpp index 27f72e2e2..351e2562a 100644 --- a/mllm/backends/qnn/aot/visitor/RMSNorm.cpp +++ b/mllm/backends/qnn/aot/visitor/RMSNorm.cpp @@ -47,9 +47,12 @@ bool QnnAOTRMSNormPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& op) auto bias_tensor = mllm::Tensor::zeros(weight->tensor_.shape(), weight->tensor_.dtype()); auto bias_node = ir::tensor::TensorValue::build(writer.getContext().get(), bias_tensor); bias_node->tensor_.setName(a->getName() + "_runtime_bias"); + bias_node->name() = a->getName() + "_runtime_bias"; // fake bias quant recipe - auto quant_spec = mllm::ir::linalg::QuantizationSpecSymPerTensor::create(0, 0, kInt32, kFloat32, Tensor::ones({1})); + auto bias_scale = Tensor::ones({1}); + bias_scale.at({0}) = 1.0 / 32767; + auto quant_spec = mllm::ir::linalg::QuantizationSpecSymPerTensor::create(-32768, 32767, kInt16, kFloat32, bias_scale); auto quant_attr = mllm::ir::linalg::LinalgIRQuantizatonSpecAttr::build(writer.getContext().get(), quant_spec); bias_node->setAttr("quant_recipe", quant_attr); diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py index 8a4f90687..f1c4d20dc 100644 --- a/pymllm/backends/qualcomm/transformers/core/qdq.py +++ b/pymllm/backends/qualcomm/transformers/core/qdq.py @@ -2,7 +2,7 @@ import torch.nn as nn from torch.ao.quantization import ( FakeQuantize, - MovingAverageMinMaxObserver, + MinMaxObserver, ) from torch.ao.quantization.observer import FixedQParamsObserver @@ -48,7 +48,7 @@ def __init__(self, bits=8, qscheme=torch.per_tensor_affine): # MovingAverageMinMaxObserver calculates scale and zero_point based on observed tensors. # Passing quant_min/max to the observer ensures consistency. 
self.fake_quant = FakeQuantize( - observer=MovingAverageMinMaxObserver.with_args( + observer=MinMaxObserver.with_args( dtype=self.dtype, qscheme=self.qscheme, quant_min=self.quant_min, diff --git a/pymllm/backends/qualcomm/transformers/core/qlinear.py b/pymllm/backends/qualcomm/transformers/core/qlinear.py index d9c55e759..255f52ffb 100644 --- a/pymllm/backends/qualcomm/transformers/core/qlinear.py +++ b/pymllm/backends/qualcomm/transformers/core/qlinear.py @@ -296,7 +296,9 @@ def convert_to_conv2d_deploy_hwio(self): s1_permuted = ( s1.view(self.out_features, -1).t().contiguous() ) # [Out, Blocks] -> [Blocks, Out] - s1_hwio = s1_permuted.view(1, 1, -1, self.out_features) # Shape: [1, 1, Blocks, Out] + s1_hwio = s1_permuted.view( + 1, 1, -1, self.out_features + ) # Shape: [1, 1, Blocks, Out] del self.weight self.register_buffer("weight", w_hwio) diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index dc6486043..2f099088e 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -380,8 +380,6 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states = self.mlp(hidden_states) - # if self.layer_dix == 2: - # print(hidden_states.min(), hidden_states.max()) hidden_states = residual + self.add_1_lhs_input_qdq(hidden_states) return hidden_states diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index 88f5ce84e..ed302f215 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -44,7 +44,7 @@ def __init__(self, model_path: str, mllm_qualcomm_max_length=2048): self.model = Qwen3ForCausalLM.from_pretrained( model_path, attn_implementation="eager", - dtype=torch.bfloat16, + dtype=torch.float32, ) self.model.cuda() self.mllm_qualcomm_max_length = mllm_qualcomm_max_length diff --git a/pymllm/convertor/model_file_v2.py b/pymllm/convertor/model_file_v2.py index 302e3e21b..976c04411 100644 --- a/pymllm/convertor/model_file_v2.py +++ b/pymllm/convertor/model_file_v2.py @@ -24,6 +24,14 @@ MLLM_MODEL_FILE_V2_TENSOR_SHAPE_LENGTH = 16 +def _torch_tensor_bytes(tensor: "torch.Tensor") -> bytes: + # Use uint8 view to preserve raw bytes for dtypes not supported by numpy. 
+ t = tensor.detach().cpu().contiguous() + if t.dim() == 0: + t = t.reshape(1) + return t.view(torch.uint8).numpy().tobytes() + + class ModelFileV2Descriptor: SIZE = 532 @@ -132,7 +140,7 @@ def streaming_write(self, tensor_name, tensor_obj): if MLLM_FIND_TORCH_AVAILABLE and isinstance(tensor_obj, torch.Tensor): # PyTorch tensor shape = list(tensor_obj.shape) - tensor_data = tensor_obj.detach().cpu().numpy().tobytes() + tensor_data = _torch_tensor_bytes(tensor_obj) true_dtype = MLLM_TYPE_MAPPING[tensor_obj.dtype] elif MLLM_FIND_NUMPY_AVAILABLE and isinstance(tensor_obj, np.ndarray): # Numpy array @@ -203,7 +211,7 @@ def static_write(self, tensor_obj): if MLLM_FIND_TORCH_AVAILABLE and isinstance(tensor, torch.Tensor): # PyTorch tensor shape = list(tensor.shape) - tensor_data = tensor.detach().cpu().numpy().tobytes() + tensor_data = _torch_tensor_bytes(tensor) true_dtype = MLLM_TYPE_MAPPING[tensor.dtype] elif MLLM_FIND_NUMPY_AVAILABLE and isinstance(tensor, np.ndarray): # Numpy array From d2e6b36edf6b799c126fa71c77d090f0a2bcb7bb Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 19 Jan 2026 13:08:59 +0000 Subject: [PATCH 06/42] feat(qualcomm): Refactor Qwen3 model to integrate ConcatObserver for improved quantization, enhance rotate_half function to utilize observers, and ensure consistent scale and zero_point across concatenated inputs. --- .../qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp | 35 +++---- .../qnn/aot/passes/LLMQuantRecipePass.cpp | 17 +++- mllm/backends/qnn/aot/passes/PTQPass.cpp | 93 +++++++++++++++++++ .../qualcomm/transformers/core/observer.py | 56 +++++++++++ .../qualcomm/transformers/core/qdq.py | 8 +- .../transformers/qwen3/modeling_qwen3.py | 65 ++++++++++++- .../qualcomm/transformers/qwen3/runner.py | 21 ++++- .../qualcomm/transformers/qwen3/train.py | 5 +- 8 files changed, 268 insertions(+), 32 deletions(-) create mode 100644 pymllm/backends/qualcomm/transformers/core/observer.py diff --git a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp index f1b20a1a2..a2d054bad 100644 --- a/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp +++ b/examples/qwen3_qnn_aot/modeling_qwen_qnn_aot.hpp @@ -15,14 +15,6 @@ namespace mllm::models::qwen3 { -Tensor rotateHalf(Tensor x) { // NOLINT - // X is [x, x, x, D] - auto D = x.size(-1); - auto x1 = x.slice({kAll, kAll, kAll, {kAll, D / 2}}, /*ssa=*/true); - auto x2 = x.slice({kAll, kAll, kAll, {D / 2, kAll}}, /*ssa=*/true); - return nn::functional::concat({-x2, x1}, -1); -} - namespace ptq { Tensor QDQ(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch) { @@ -112,6 +104,14 @@ Tensor QDQ_ROPE(nn::Module* m, Tensor in, const std::string& qdq_name_in_pytorch } // namespace ptq +Tensor rotateHalf(Tensor x, nn::Module* m, const std::string& qdq_name_in_pytorch) { // NOLINT + // X is [x, x, x, D] + auto D = x.size(-1); + auto x1 = x.slice({kAll, kAll, kAll, {kAll, D / 2}}, /*ssa=*/true); + auto x2 = x.slice({kAll, kAll, kAll, {D / 2, kAll}}, /*ssa=*/true); + return nn::functional::concat({ptq::QDQ(m, -x2, qdq_name_in_pytorch), x1}, -1); +} + using vi32 = std::vector; #define CONV2D_PROPERTY vi32{1, 1}, vi32{1, 1}, vi32{0, 0}, vi32{1, 1}, false, aops::Conv2DOpImplType::kQNN_LPBQ_w4a16o16_G32 @@ -232,14 +232,16 @@ class Qwen3Attention final : public nn::Module { // [B, H, S, D] auto cos = llm_embedding_cos.unsqueeze(1); auto sin = llm_embedding_sin.unsqueeze(1); - query_states = ptq::QDQ(this, - ptq::QDQ(this, query_states * cos, "q_rope_mul_0_output_qdq") - + 
ptq::QDQ(this, rotateHalf(query_states) * sin, "q_rope_mul_1_output_qdq"), - "q_rope_add_0_output_qdq"); - key_states = ptq::QDQ(this, - ptq::QDQ(this, key_states * cos, "k_rope_mul_0_output_qdq") - + ptq::QDQ(this, rotateHalf(key_states) * sin, "k_rope_mul_1_output_qdq"), - "k_rope_add_0_output_qdq"); + query_states = + ptq::QDQ(this, + ptq::QDQ(this, query_states * cos, "q_rope_mul_0_output_qdq") + + ptq::QDQ(this, rotateHalf(query_states, this, "q_rope_neg_half_qdq") * sin, "q_rope_mul_1_output_qdq"), + "q_rope_add_0_output_qdq"); + key_states = + ptq::QDQ(this, + ptq::QDQ(this, key_states * cos, "k_rope_mul_0_output_qdq") + + ptq::QDQ(this, rotateHalf(key_states, this, "k_rope_neg_half_qdq") * sin, "k_rope_mul_1_output_qdq"), + "k_rope_add_0_output_qdq"); // De-quantization and quantization again key_states = key_states.to(kFloat32); @@ -274,6 +276,7 @@ class Qwen3Attention final : public nn::Module { minus_value = ptq::QDQ(this, minus_value, "neg_20_qdq"); auto attn_vv = ptq::QDQ(this, attn_min.addConstant(minus_value), "minus_0_output_qdq"); attn = nn::functional::where(causal_mask.equal(0.f), attn, attn_vv); + attn = ptq::QDQ(this, attn, "where_attn_qdq"); attn = ptq::QDQ(this, nn::functional::softmax(attn, -1), "softmax_output_qdq"); auto y = ptq::QDQ(this, nn::functional::matmul(attn, vh), "attn_value_matmul_output_qdq"); y = y.transpose(1, 2).view({1, 1, -1, num_attention_heads_ * head_dim_}, /*ssa=*/true); diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp index 90ee4ad72..957fdf321 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp @@ -369,8 +369,7 @@ bool LLMQuantRecipeNegPattern::isMatch(const mllm::ir::op_ptr_t& op) { } bool LLMQuantRecipeNegPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_t& node) { - return shareQuantSpecSingleInputToSingleOutputAndSetOpQuantAnnoAttr(writer.getContext(), - node->cast_()); + return noSharingSingleInAndSingleOutQuantAnnoAttr(writer.getContext(), node->cast_()); } //===----------------------------------------------------------------------===// @@ -651,8 +650,15 @@ bool LLMQuantRecipeConcatPattern::rewrite(ir::IRWriter& writer, const ir::op_ptr return false; } - MLLM_RETURN_FALSE_IF_NOT(i_0->getAttr("quant_recipe")); - MLLM_RETURN_FALSE_IF_NOT(i_1->getAttr("quant_recipe")); + // Create quant_recipe if not present + if (!i_0->getAttr("quant_recipe")) { + auto i_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_0->cast_()); + i_0->setAttr("quant_recipe", i_0_spec); + } + if (!i_1->getAttr("quant_recipe")) { + auto i_1_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_1->cast_()); + i_1->setAttr("quant_recipe", i_1_spec); + } o_0->setAttr("quant_recipe", i_0->getAttr("quant_recipe")); @@ -795,7 +801,8 @@ bool LLMQuantRecipeWherePattern::rewrite(ir::IRWriter& writer, const ir::op_ptr_ MLLM_RETURN_FALSE_IF_NOT(i_1->getAttr("quant_recipe")); MLLM_RETURN_FALSE_IF_NOT(i_2->getAttr("quant_recipe")); - o_0->setAttr("quant_recipe", i_2->getAttr("quant_recipe")); + auto o_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), o_0->cast_()); + o_0->setAttr("quant_recipe", o_0_spec); auto annotation_attr = writer.create(); annotation_attr->annotation_.inputs.emplace_back( diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index 7172db475..82869ab16 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ 
b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -339,6 +339,94 @@ void recursiveCheckUnsolved(const std::shared_ptr& ir_ctx, const }); } +void recursiveCheckConcatInputs(const std::shared_ptr& ir_ctx, const ir::graph::SubGraphOp::ptr_t& call_op) { + auto wow = ir::IRWriter(ir_ctx, call_op->getTopRegion()); + wow.walk([&](ir::IRWriter& w, const ir::Op::ptr_t& op) -> ir::IRWriter::WalkResult { + if (op->isa_()) { + auto concat_op = op->cast_(); + std::string op_name = concat_op->getAOp()->getName(); + + auto inputs = op->inputs(); + if (inputs.empty()) { return ir::IRWriter::WALK_CONTINUE; } + + // Get first input's scale and zero_point as reference + Tensor ref_scale; + Tensor ref_zero_point; + bool has_ref = false; + std::string ref_input_name; + + for (auto iii : inputs) { + if (!iii->isa_()) continue; + auto tv = iii->cast_(); + if (!tv->getAttr("quant_recipe")) continue; + auto f_spec = tv->getAttr("quant_recipe")->cast_(); + + if (f_spec->spec_->type == ir::linalg::QuantizationSpecType::kAsymPerTensor) { + auto this_spec = std::static_pointer_cast(f_spec->spec_); + if (!this_spec->solved) continue; + + if (!has_ref) { + ref_scale = this_spec->scale; + ref_zero_point = this_spec->zero_point; + ref_input_name = tv->name(); + has_ref = true; + } else { + // Check if scale and zero_point match + auto cur_scale = this_spec->scale; + auto cur_zero_point = this_spec->zero_point; + + MLLM_RT_ASSERT_EQ(ref_scale.numel(), 1); + MLLM_RT_ASSERT_EQ(cur_scale.numel(), 1); + MLLM_RT_ASSERT_EQ(ref_zero_point.numel(), 1); + MLLM_RT_ASSERT_EQ(cur_zero_point.numel(), 1); + + auto ref_scale_v = ref_scale.item(); + auto cur_scale_v = cur_scale.item(); + auto ref_zp_v = ref_zero_point.item(); + auto cur_zp_v = cur_zero_point.item(); + + if (std::abs(ref_scale_v - cur_scale_v) > 1e-6 || ref_zp_v != cur_zp_v) { + MLLM_ERROR("PTQPass: ConcatOp '{}' has mismatched scale/zp between inputs. " + "Input '{}': scale={}, zp={}; Input '{}': scale={}, zp={}", + op_name, ref_input_name, ref_scale_v, ref_zp_v, tv->name(), cur_scale_v, cur_zp_v); + } + } + } else if (f_spec->spec_->type == ir::linalg::QuantizationSpecType::kSymPerTensor) { + auto this_spec = std::static_pointer_cast(f_spec->spec_); + if (!this_spec->solved) continue; + + if (!has_ref) { + ref_scale = this_spec->scale; + ref_input_name = tv->name(); + has_ref = true; + } else { + // Check if scale matches + auto cur_scale = this_spec->scale; + + MLLM_RT_ASSERT_EQ(ref_scale.numel(), 1); + MLLM_RT_ASSERT_EQ(cur_scale.numel(), 1); + + auto ref_scale_v = ref_scale.item(); + auto cur_scale_v = cur_scale.item(); + + if (std::abs(ref_scale_v - cur_scale_v) > 1e-6) { + MLLM_ERROR("PTQPass: ConcatOp '{}' has mismatched scale between inputs. 
" + "Input '{}': scale={}; Input '{}': scale={}", + op_name, ref_input_name, ref_scale_v, tv->name(), cur_scale_v); + } + } + } + } + } + + if (op->isa_()) { + auto ns = op->cast_()->getSymbolAttr()->str(); + recursiveCheckConcatInputs(w.getContext(), w.getContext()->lookupSymbolTable(ns)->cast_()); + } + return ir::IRWriter::WALK_CONTINUE; + }); +} + } // namespace uint8_t PTQPass::run(const ir::node_ptr_t& op) { @@ -374,6 +462,11 @@ uint8_t PTQPass::run(const ir::node_ptr_t& op) { writer.getContext(), getCtx()->lookupSymbolTable(call_main_graph_op->getSymbolAttr()->str())->cast_()); + // Check Concat inputs have consistent scale and zero_point + recursiveCheckConcatInputs( + writer.getContext(), + getCtx()->lookupSymbolTable(call_main_graph_op->getSymbolAttr()->str())->cast_()); + return ir::PASS_RET_SUCCESS; } diff --git a/pymllm/backends/qualcomm/transformers/core/observer.py b/pymllm/backends/qualcomm/transformers/core/observer.py new file mode 100644 index 000000000..67a946b10 --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/core/observer.py @@ -0,0 +1,56 @@ +import torch +from torchao.quantization.pt2e import UniformQuantizationObserverBase + + +class ConcatObserver(UniformQuantizationObserverBase): + """ + Fetch maximum data range of all tensors to be concatenated + """ + + def __init__( + self, + dtype=torch.uint8, + qscheme=torch.per_tensor_affine, + reduce_range=False, + quant_min=None, + quant_max=None, + factory_kwargs=None, + eps=torch.finfo(torch.float32).eps, # noqa: B008 + is_dynamic=False, + **kwargs, + ) -> None: + super().__init__( + dtype=dtype, + qscheme=qscheme, + reduce_range=reduce_range, + quant_min=quant_min, + quant_max=quant_max, + factory_kwargs=factory_kwargs, + eps=eps, + is_dynamic=is_dynamic, + **kwargs, + ) + + factory_kwargs = torch.nn.factory_kwargs(factory_kwargs) + self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) + self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) + # get concat node and its inputs + self.input_observers = [] + + def add_observer(self, observer): + self.input_observers.append(observer) + + def forward(self, x_orig): + # calculate the min / max first + self.min_val = min(self.min_val, x_orig.min()) + self.max_val = max(self.max_val, x_orig.max()) + + # update min / max for all observers of input nodes + for observers in self.input_observers: + observers.min_val = self.min_val + observers.max_val = self.max_val + + return x_orig + + def calculate_qparams(self): + return self._calculate_qparams(self.min_val, self.max_val) diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/backends/qualcomm/transformers/core/qdq.py index f1c4d20dc..c13011a51 100644 --- a/pymllm/backends/qualcomm/transformers/core/qdq.py +++ b/pymllm/backends/qualcomm/transformers/core/qdq.py @@ -78,11 +78,11 @@ def disable_observer(self): def enable_fakequant(self): """Enable simulation of quantization error.""" - self.fake_quant.enable_fakequant() + self.fake_quant.enable_fake_quant() def disable_fakequant(self): """Disable quantization simulation (act as identity).""" - self.fake_quant.disable_fakequant() + self.fake_quant.disable_fake_quant() def extra_repr(self): mode = "Symmetric" if "symmetric" in str(self.qscheme) else "Asymmetric" @@ -158,11 +158,11 @@ def disable_observer(self): def enable_fakequant(self): """Enable simulation of quantization error.""" - self.fake_quant.enable_fakequant() + self.fake_quant.enable_fake_quant() def disable_fakequant(self): """Disable 
quantization simulation (act as identity).""" - self.fake_quant.disable_fakequant() + self.fake_quant.disable_fake_quant() @property def scale(self): diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 2f099088e..92efaa06d 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -54,6 +54,7 @@ ActivationQDQ, FixedActivationQDQ, ) +from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver class Qwen3MLP(nn.Module): @@ -100,11 +101,13 @@ def forward(self, x): return o -def rotate_half(x): +def rotate_half( + x, x_observer, x2_neg_fake_quant: ActivationQDQ, concat_observer: ConcatObserver +): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) + return concat_observer(torch.cat((x2_neg_fake_quant(-x2), x1), dim=-1)) def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): @@ -214,6 +217,39 @@ def __init__(self, config: Qwen3Config, layer_idx: int): self.k_rope_mul_1_output_qdq = ActivationQDQ(bits=16) self.k_rope_add_0_output_qdq = ActivationQDQ(bits=16) + self.q_rope_concat_observer = ConcatObserver( + dtype=torch.int32, + qscheme=torch.per_tensor_affine, + reduce_range=False, + quant_min=0, + quant_max=2**16 - 1, + eps=0.0001 / 65535, + is_dynamic=False, + ) + self.q_rope_neg_half_qdq = ActivationQDQ(bits=16) + self.k_rope_concat_observer = ConcatObserver( + dtype=torch.int32, + qscheme=torch.per_tensor_affine, + reduce_range=False, + quant_min=0, + quant_max=2**16 - 1, + eps=0.0001 / 65535, + is_dynamic=False, + ) + self.k_rope_neg_half_qdq = ActivationQDQ(bits=16) + self.k_rope_concat_observer.add_observer( + self.k_norm_output_qdq.fake_quant.activation_post_process + ) + self.k_rope_concat_observer.add_observer( + self.k_rope_neg_half_qdq.fake_quant.activation_post_process + ) + self.q_rope_concat_observer.add_observer( + self.q_norm_output_qdq.fake_quant.activation_post_process + ) + self.q_rope_concat_observer.add_observer( + self.q_rope_neg_half_qdq.fake_quant.activation_post_process + ) + # In qnn, is uint8 sym. 
self.k_cast_to_int8_qdq = ActivationQDQ( bits=8, qscheme=torch.per_tensor_symmetric @@ -231,6 +267,7 @@ def __init__(self, config: Qwen3Config, layer_idx: int): self.minus_0_output_qdq = ActivationQDQ(bits=16) self.softmax_output_qdq = ActivationQDQ(bits=16) self.attn_value_matmul_output_qdq = ActivationQDQ(bits=16) + self.where_attn_qdq = ActivationQDQ(bits=16) @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( @@ -263,11 +300,27 @@ def forward( sin = sin.unsqueeze(1) query_states = self.q_rope_add_0_output_qdq( self.q_rope_mul_0_output_qdq(query_states * cos) - + self.q_rope_mul_1_output_qdq(rotate_half(query_states) * sin) + + self.q_rope_mul_1_output_qdq( + rotate_half( + query_states, + self.q_norm_output_qdq.fake_quant.activation_post_process, + self.q_rope_neg_half_qdq, + self.q_rope_concat_observer, + ) + * sin + ) ) key_states = self.k_rope_add_0_output_qdq( self.k_rope_mul_0_output_qdq(key_states * cos) - + self.k_rope_mul_1_output_qdq(rotate_half(key_states) * sin) + + self.k_rope_mul_1_output_qdq( + rotate_half( + key_states, + self.k_norm_output_qdq.fake_quant.activation_post_process, + self.k_rope_neg_half_qdq, + self.k_rope_concat_observer, + ) + * sin + ) ) key_states = self.k_cast_to_int8_qdq(key_states) @@ -303,7 +356,9 @@ def forward( * (-20) ) ) - attn_weights = torch.where(attention_mask == 0, attn_weights, attn_vv) + attn_weights = self.where_attn_qdq( + torch.where(attention_mask == 0, attn_weights, attn_vv) + ) attn_weights = self.softmax_output_qdq( nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index ed302f215..6565ca7e6 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -2,7 +2,10 @@ from tqdm import tqdm from modelscope.msdatasets import MsDataset from transformers import AutoTokenizer -from pymllm.backends.qualcomm.transformers.core.qdq import ActivationQDQ +from pymllm.backends.qualcomm.transformers.core.qdq import ( + ActivationQDQ, + FixedActivationQDQ, +) from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm from pymllm.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, @@ -31,6 +34,16 @@ def enable_qdq_observer(m): m.enable_observer() +def enable_fake_quant(m): + if isinstance(m, ActivationQDQ) or isinstance(m, FixedActivationQDQ): + m.enable_fakequant() + + +def disable_fake_quant(m): + if isinstance(m, ActivationQDQ) or isinstance(m, FixedActivationQDQ): + m.disable_fakequant() + + def convert_weight(m): if isinstance(m, QLinearLPBQ) or isinstance(m, QLinearW8A16_PerChannelSym): m.convert_to_conv2d_deploy_hwio() @@ -61,6 +74,12 @@ def freeze_activation(self): def enable_activation_update(self): self.model.apply(enable_qdq_observer) + def enable_fake_quant(self): + self.model.apply(enable_fake_quant) + + def disable_fake_quant(self): + self.model.apply(disable_fake_quant) + def compile(self): print("Compile Start.") self.model = torch.compile( diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/backends/qualcomm/transformers/qwen3/train.py index 33351918f..25361f372 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/train.py @@ -37,8 +37,11 @@ def main(): args = parser.parse_args() m = Qwen3Quantizer(args.model_path, mllm_qualcomm_max_length=args.max_length) + + # FIXME: 
Should disable or not. + m.disable_fake_quant() m.calibrate(num_samples=args.num_samples, max_seq_length=args.max_length) - # m.compile() + m.enable_fake_quant() m.infer(args.infer_text) # !!! From 48c259a8e87b4b0fabb6eaeca7074a1656500e55 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Tue, 20 Jan 2026 09:16:50 +0000 Subject: [PATCH 07/42] feat(cpu): Implement fill operations for various data types including zeros, ones, specific values, arange, and random fills. Introduce a new fill-inl.hpp file for optimized implementations and update kernel dispatch to include these operations. Enhance CPUFillOp to utilize the new fill functions for better performance and maintainability. --- mllm/backends/cpu/kernels/common/fill-inl.hpp | 363 ++++++++++++++++++ .../cpu/kernels/common/kernel_dispatch.cpp | 180 ++++++++- .../cpu/kernels/common/kernel_dispatch.hpp | 217 +++++++++++ mllm/backends/cpu/ops/FillOp.cpp | 118 +++--- mllm/backends/qnn/aot/passes/PTQPass.cpp | 6 +- mllm/ffi/Extension.cc | 16 + pymllm/__init__.py | 16 +- pymllm/ffi/__init__.py | 67 +++- 8 files changed, 928 insertions(+), 55 deletions(-) create mode 100644 mllm/backends/cpu/kernels/common/fill-inl.hpp diff --git a/mllm/backends/cpu/kernels/common/fill-inl.hpp b/mllm/backends/cpu/kernels/common/fill-inl.hpp new file mode 100644 index 000000000..4c799daf6 --- /dev/null +++ b/mllm/backends/cpu/kernels/common/fill-inl.hpp @@ -0,0 +1,363 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +// NOTE: Do NOT use #pragma once here! +// Highway's foreach_target.h mechanism requires -inl.hpp files to be included +// multiple times, once for each target architecture (AVX3_DL, AVX10_2, etc.). + +#include +#include +#include "mllm/core/DataTypes.hpp" + +HWY_BEFORE_NAMESPACE(); +namespace mllm::cpu::common { // NOLINT +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +//===----------------------------------------------------------------------===// +// Fill Zeros +//===----------------------------------------------------------------------===// +template +HWY_INLINE void fill_zeros_impl(T* HWY_RESTRICT dst, size_t count) { + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + const hn::Vec zero = hn::Zero(d); + size_t idx = 0; + + for (; idx + N <= count; idx += N) { hn::StoreU(zero, d, dst + idx); } + + if (idx < count) { hn::StoreN(zero, d, dst + idx, count - idx); } +} + +// Specialization for types not supported by Highway SIMD, use memset +template +HWY_INLINE void fill_zeros_scalar(T* HWY_RESTRICT dst, size_t count) { + if constexpr (std::is_trivial_v) { + std::memset(dst, 0, count * sizeof(T)); + } else { + T zero_val{}; + for (size_t i = 0; i < count; ++i) { dst[i] = zero_val; } + } +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_fp32(mllm_fp32_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_fp64(mllm_fp64_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_i32(mllm_int32_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_u32(mllm_uint32_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_i64(mllm_int64_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_u64(mllm_uint64_t* HWY_RESTRICT dst, size_t size) 
{ + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_i16(mllm_int16_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_u16(mllm_uint16_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_i8(mllm_int8_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_zeros_u8(mllm_uint8_t* HWY_RESTRICT dst, size_t size) { + fill_zeros_impl(dst, size); +} + +//===----------------------------------------------------------------------===// +// Fill Ones +//===----------------------------------------------------------------------===// +template +HWY_INLINE void fill_ones_impl(T* HWY_RESTRICT dst, size_t count) { + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + const hn::Vec one = hn::Set(d, static_cast(1)); + size_t idx = 0; + + for (; idx + N <= count; idx += N) { hn::StoreU(one, d, dst + idx); } + + if (idx < count) { hn::StoreN(one, d, dst + idx, count - idx); } +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_fp32(mllm_fp32_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_fp64(mllm_fp64_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_i32(mllm_int32_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_u32(mllm_uint32_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_i64(mllm_int64_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_u64(mllm_uint64_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_i16(mllm_int16_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_u16(mllm_uint16_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_i8(mllm_int8_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_ones_u8(mllm_uint8_t* HWY_RESTRICT dst, size_t size) { + fill_ones_impl(dst, size); +} + +//===----------------------------------------------------------------------===// +// Fill Specific Value +//===----------------------------------------------------------------------===// +template +HWY_INLINE void fill_value_impl(T* HWY_RESTRICT dst, size_t count, T value) { + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + const hn::Vec v = hn::Set(d, value); + size_t idx = 0; + + for (; idx + N <= count; idx += N) { hn::StoreU(v, d, dst + idx); } + + if (idx < count) { hn::StoreN(v, d, dst + idx, count - idx); } +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_fp32(mllm_fp32_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_fp64(mllm_fp64_t* HWY_RESTRICT dst, size_t size, mllm_fp64_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_i32(mllm_int32_t* HWY_RESTRICT dst, size_t size, mllm_int32_t value) { + fill_value_impl(dst, size, value); +} + +static 
HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_u32(mllm_uint32_t* HWY_RESTRICT dst, size_t size, mllm_uint32_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_i64(mllm_int64_t* HWY_RESTRICT dst, size_t size, mllm_int64_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_u64(mllm_uint64_t* HWY_RESTRICT dst, size_t size, mllm_uint64_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_i16(mllm_int16_t* HWY_RESTRICT dst, size_t size, mllm_int16_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_u16(mllm_uint16_t* HWY_RESTRICT dst, size_t size, mllm_uint16_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_i8(mllm_int8_t* HWY_RESTRICT dst, size_t size, mllm_int8_t value) { + fill_value_impl(dst, size, value); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_value_u8(mllm_uint8_t* HWY_RESTRICT dst, size_t size, mllm_uint8_t value) { + fill_value_impl(dst, size, value); +} + +//===----------------------------------------------------------------------===// +// Fill Arange (start, end, step) +//===----------------------------------------------------------------------===// +template +HWY_INLINE void fill_arange_impl(T* HWY_RESTRICT dst, size_t count, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + if (step == 0) { + fill_value_impl(dst, count, static_cast(start)); + return; + } + + // Calculate the actual number of elements to fill + size_t n = 0; + if ((step > 0 && start < end) || (step < 0 && start > end)) { + mllm_fp32_t n_float = (end - start) / step; + if (n_float > 0) { + n = static_cast(std::ceil(n_float)); + if (step > 0) { + if (start + (n - 1) * step >= end) --n; + } else { + if (start + (n - 1) * step <= end) --n; + } + n = std::min(n, count); + } + } + + // Use SIMD for float types where we can vectorize the computation + if constexpr (std::is_same_v) { + const hn::ScalableTag d; + const size_t N = hn::Lanes(d); + + // Create increment vector: [0, 1, 2, 3, ...] * step + const hn::Vec step_vec = hn::Set(d, step); + const hn::Vec n_step_vec = hn::Set(d, step * static_cast(N)); + + // Create base offsets [0, 1, 2, 3, ...] 
+ hn::Vec base = hn::Iota(d, 0); + base = hn::Mul(base, step_vec); + hn::Vec current_start = hn::Add(hn::Set(d, start), base); + + size_t idx = 0; + for (; idx + N <= n; idx += N) { + hn::StoreU(current_start, d, dst + idx); + current_start = hn::Add(current_start, n_step_vec); + } + + // Handle remaining elements + for (; idx < n; ++idx) { dst[idx] = static_cast(start + idx * step); } + } else { + // Scalar fallback for other types + for (size_t i = 0; i < n; ++i) { dst[i] = static_cast(start + i * step); } + } +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_fp32(mllm_fp32_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_i32(mllm_int32_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_u32(mllm_uint32_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_i64(mllm_int64_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_u64(mllm_uint64_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_i16(mllm_int16_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_u16(mllm_uint16_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_i8(mllm_int8_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_arange_u8(mllm_uint8_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, mllm_fp32_t step) { + fill_arange_impl(dst, size, start, end, step); +} + +//===----------------------------------------------------------------------===// +// Fill Random (using LCG random number generator) +//===----------------------------------------------------------------------===// +template +HWY_INLINE void fill_random_impl(T* HWY_RESTRICT dst, size_t count, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + const uint64_t multiplier = 1103515245ULL; + const uint64_t increment = 12345ULL; + const uint64_t modulus = 1ULL << 31; // 2^31 + const mllm_fp32_t range = end - start; + + if (range == 0) { + fill_value_impl(dst, count, static_cast(start)); + return; + } + + uint64_t state = seed; + state = (multiplier * state + increment) % modulus; + + for (size_t i = 0; i < count; ++i) { + state = (multiplier * state + increment) % modulus; + const mllm_fp32_t random_value = static_cast(state) / static_cast(modulus - 1); + dst[i] = static_cast(start + random_value * range); + } +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_fp32(mllm_fp32_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { 
+ fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_i32(mllm_int32_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_u32(mllm_uint32_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_i64(mllm_int64_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_u64(mllm_uint64_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_i16(mllm_int16_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_u16(mllm_uint16_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_i8(mllm_int8_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +static HWY_NOINLINE HWY_MAYBE_UNUSED void fill_random_u8(mllm_uint8_t* HWY_RESTRICT dst, size_t size, mllm_fp32_t start, + mllm_fp32_t end, uint64_t seed) { + fill_random_impl(dst, size, start, end, seed); +} + +} // namespace HWY_NAMESPACE +} // namespace mllm::cpu::common +HWY_AFTER_NAMESPACE(); diff --git a/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp b/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp index 1ad3cee93..7e81adfdf 100644 --- a/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp +++ b/mllm/backends/cpu/kernels/common/kernel_dispatch.cpp @@ -17,6 +17,7 @@ // Include all inline implementations here #include "mllm/backends/cpu/kernels/common/elewise-inl.hpp" +#include "mllm/backends/cpu/kernels/common/fill-inl.hpp" #if HWY_ONCE namespace mllm::cpu::common { @@ -69,11 +70,188 @@ HWY_DLLEXPORT void call_elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp3 // GELU //===----------------------------------------------------------------------===// // HWY_EXPORT(gelu_fp32); -// +// // HWY_DLLEXPORT void call_gelu_fp32(mllm_fp32_t* out, const mllm_fp32_t* in, size_t n) { // HWY_DYNAMIC_DISPATCH(gelu_fp32)(out, in, n); // } +//===----------------------------------------------------------------------===// +// Fill Zeros +//===----------------------------------------------------------------------===// +HWY_EXPORT(fill_zeros_fp32); +HWY_EXPORT(fill_zeros_fp64); +HWY_EXPORT(fill_zeros_i32); +HWY_EXPORT(fill_zeros_u32); +HWY_EXPORT(fill_zeros_i64); +HWY_EXPORT(fill_zeros_u64); +HWY_EXPORT(fill_zeros_i16); +HWY_EXPORT(fill_zeros_u16); +HWY_EXPORT(fill_zeros_i8); +HWY_EXPORT(fill_zeros_u8); + +HWY_DLLEXPORT void call_fill_zeros_fp32(mllm_fp32_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_fp32)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_fp64(mllm_fp64_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_fp64)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_i32(mllm_int32_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_i32)(dst, n); } +HWY_DLLEXPORT 
void call_fill_zeros_u32(mllm_uint32_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_u32)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_i64(mllm_int64_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_i64)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_u64(mllm_uint64_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_u64)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_i16(mllm_int16_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_i16)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_u16(mllm_uint16_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_u16)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_i8(mllm_int8_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_i8)(dst, n); } +HWY_DLLEXPORT void call_fill_zeros_u8(mllm_uint8_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_zeros_u8)(dst, n); } + +//===----------------------------------------------------------------------===// +// Fill Ones +//===----------------------------------------------------------------------===// +HWY_EXPORT(fill_ones_fp32); +HWY_EXPORT(fill_ones_fp64); +HWY_EXPORT(fill_ones_i32); +HWY_EXPORT(fill_ones_u32); +HWY_EXPORT(fill_ones_i64); +HWY_EXPORT(fill_ones_u64); +HWY_EXPORT(fill_ones_i16); +HWY_EXPORT(fill_ones_u16); +HWY_EXPORT(fill_ones_i8); +HWY_EXPORT(fill_ones_u8); + +HWY_DLLEXPORT void call_fill_ones_fp32(mllm_fp32_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_fp32)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_fp64(mllm_fp64_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_fp64)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_i32(mllm_int32_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_i32)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_u32(mllm_uint32_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_u32)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_i64(mllm_int64_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_i64)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_u64(mllm_uint64_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_u64)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_i16(mllm_int16_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_i16)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_u16(mllm_uint16_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_u16)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_i8(mllm_int8_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_i8)(dst, n); } +HWY_DLLEXPORT void call_fill_ones_u8(mllm_uint8_t* dst, size_t n) { HWY_DYNAMIC_DISPATCH(fill_ones_u8)(dst, n); } + +//===----------------------------------------------------------------------===// +// Fill Specific Value +//===----------------------------------------------------------------------===// +HWY_EXPORT(fill_value_fp32); +HWY_EXPORT(fill_value_fp64); +HWY_EXPORT(fill_value_i32); +HWY_EXPORT(fill_value_u32); +HWY_EXPORT(fill_value_i64); +HWY_EXPORT(fill_value_u64); +HWY_EXPORT(fill_value_i16); +HWY_EXPORT(fill_value_u16); +HWY_EXPORT(fill_value_i8); +HWY_EXPORT(fill_value_u8); + +HWY_DLLEXPORT void call_fill_value_fp32(mllm_fp32_t* dst, size_t n, mllm_fp32_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_fp32)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_fp64(mllm_fp64_t* dst, size_t n, mllm_fp64_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_fp64)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_i32(mllm_int32_t* dst, size_t n, mllm_int32_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_i32)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_u32(mllm_uint32_t* dst, size_t n, mllm_uint32_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_u32)(dst, n, value); +} +HWY_DLLEXPORT void 
call_fill_value_i64(mllm_int64_t* dst, size_t n, mllm_int64_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_i64)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_u64(mllm_uint64_t* dst, size_t n, mllm_uint64_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_u64)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_i16(mllm_int16_t* dst, size_t n, mllm_int16_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_i16)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_u16(mllm_uint16_t* dst, size_t n, mllm_uint16_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_u16)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_i8(mllm_int8_t* dst, size_t n, mllm_int8_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_i8)(dst, n, value); +} +HWY_DLLEXPORT void call_fill_value_u8(mllm_uint8_t* dst, size_t n, mllm_uint8_t value) { + HWY_DYNAMIC_DISPATCH(fill_value_u8)(dst, n, value); +} + +//===----------------------------------------------------------------------===// +// Fill Arange +//===----------------------------------------------------------------------===// +HWY_EXPORT(fill_arange_fp32); +HWY_EXPORT(fill_arange_i32); +HWY_EXPORT(fill_arange_u32); +HWY_EXPORT(fill_arange_i64); +HWY_EXPORT(fill_arange_u64); +HWY_EXPORT(fill_arange_i16); +HWY_EXPORT(fill_arange_u16); +HWY_EXPORT(fill_arange_i8); +HWY_EXPORT(fill_arange_u8); + +HWY_DLLEXPORT void call_fill_arange_fp32(mllm_fp32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_fp32)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_i32(mllm_int32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_i32)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_u32(mllm_uint32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_u32)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_i64(mllm_int64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_i64)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_u64(mllm_uint64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_u64)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_i16(mllm_int16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_i16)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_u16(mllm_uint16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_u16)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_i8(mllm_int8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_i8)(dst, n, start, end, step); +} +HWY_DLLEXPORT void call_fill_arange_u8(mllm_uint8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) { + HWY_DYNAMIC_DISPATCH(fill_arange_u8)(dst, n, start, end, step); +} + +//===----------------------------------------------------------------------===// +// Fill Random +//===----------------------------------------------------------------------===// +HWY_EXPORT(fill_random_fp32); +HWY_EXPORT(fill_random_i32); +HWY_EXPORT(fill_random_u32); +HWY_EXPORT(fill_random_i64); +HWY_EXPORT(fill_random_u64); +HWY_EXPORT(fill_random_i16); +HWY_EXPORT(fill_random_u16); +HWY_EXPORT(fill_random_i8); +HWY_EXPORT(fill_random_u8); + 
+HWY_DLLEXPORT void call_fill_random_fp32(mllm_fp32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_fp32)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_i32(mllm_int32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_i32)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_u32(mllm_uint32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_u32)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_i64(mllm_int64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_i64)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_u64(mllm_uint64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_u64)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_i16(mllm_int16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_i16)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_u16(mllm_uint16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_u16)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_i8(mllm_int8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_i8)(dst, n, start, end, seed); +} +HWY_DLLEXPORT void call_fill_random_u8(mllm_uint8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) { + HWY_DYNAMIC_DISPATCH(fill_random_u8)(dst, n, start, end, seed); +} + } // namespace mllm::cpu::common #endif // HWY_ONCE diff --git a/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp b/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp index eb100ac43..4df34db0e 100644 --- a/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp +++ b/mllm/backends/cpu/kernels/common/kernel_dispatch.hpp @@ -7,6 +7,7 @@ #include "mllm/utils/CPUArchHelper.hpp" #if !(defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)) +#include #include "mllm/core/DataTypes.hpp" // Platform-specific definitions used for declaring an interface, independent of @@ -30,6 +31,222 @@ HWY_DLLEXPORT void call_elewise_sub_scalar_fp32(mllm_fp32_t* out, const mllm_fp3 HWY_DLLEXPORT void call_elewise_mul_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n); HWY_DLLEXPORT void call_elewise_div_scalar_fp32(mllm_fp32_t* out, const mllm_fp32_t* x, mllm_fp32_t y, size_t n); +//===----------------------------------------------------------------------===// +// Fill Zeros +//===----------------------------------------------------------------------===// +HWY_DLLEXPORT void call_fill_zeros_fp32(mllm_fp32_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_fp64(mllm_fp64_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_i32(mllm_int32_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_u32(mllm_uint32_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_i64(mllm_int64_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_u64(mllm_uint64_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_i16(mllm_int16_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_u16(mllm_uint16_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_i8(mllm_int8_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_zeros_u8(mllm_uint8_t* dst, size_t n); + 
+//===----------------------------------------------------------------------===// +// Fill Ones +//===----------------------------------------------------------------------===// +HWY_DLLEXPORT void call_fill_ones_fp32(mllm_fp32_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_fp64(mllm_fp64_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_i32(mllm_int32_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_u32(mllm_uint32_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_i64(mllm_int64_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_u64(mllm_uint64_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_i16(mllm_int16_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_u16(mllm_uint16_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_i8(mllm_int8_t* dst, size_t n); +HWY_DLLEXPORT void call_fill_ones_u8(mllm_uint8_t* dst, size_t n); + +//===----------------------------------------------------------------------===// +// Fill Specific Value +//===----------------------------------------------------------------------===// +HWY_DLLEXPORT void call_fill_value_fp32(mllm_fp32_t* dst, size_t n, mllm_fp32_t value); +HWY_DLLEXPORT void call_fill_value_fp64(mllm_fp64_t* dst, size_t n, mllm_fp64_t value); +HWY_DLLEXPORT void call_fill_value_i32(mllm_int32_t* dst, size_t n, mllm_int32_t value); +HWY_DLLEXPORT void call_fill_value_u32(mllm_uint32_t* dst, size_t n, mllm_uint32_t value); +HWY_DLLEXPORT void call_fill_value_i64(mllm_int64_t* dst, size_t n, mllm_int64_t value); +HWY_DLLEXPORT void call_fill_value_u64(mllm_uint64_t* dst, size_t n, mllm_uint64_t value); +HWY_DLLEXPORT void call_fill_value_i16(mllm_int16_t* dst, size_t n, mllm_int16_t value); +HWY_DLLEXPORT void call_fill_value_u16(mllm_uint16_t* dst, size_t n, mllm_uint16_t value); +HWY_DLLEXPORT void call_fill_value_i8(mllm_int8_t* dst, size_t n, mllm_int8_t value); +HWY_DLLEXPORT void call_fill_value_u8(mllm_uint8_t* dst, size_t n, mllm_uint8_t value); + +//===----------------------------------------------------------------------===// +// Fill Arange +//===----------------------------------------------------------------------===// +HWY_DLLEXPORT void call_fill_arange_fp32(mllm_fp32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_i32(mllm_int32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_u32(mllm_uint32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_i64(mllm_int64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_u64(mllm_uint64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_i16(mllm_int16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_u16(mllm_uint16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_i8(mllm_int8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); +HWY_DLLEXPORT void call_fill_arange_u8(mllm_uint8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step); + +//===----------------------------------------------------------------------===// +// Fill Random +//===----------------------------------------------------------------------===// +HWY_DLLEXPORT void call_fill_random_fp32(mllm_fp32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, 
uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_i32(mllm_int32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_u32(mllm_uint32_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_i64(mllm_int64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_u64(mllm_uint64_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_i16(mllm_int16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_u16(mllm_uint16_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_i8(mllm_int8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+HWY_DLLEXPORT void call_fill_random_u8(mllm_uint8_t* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed);
+
+//===----------------------------------------------------------------------===//
+// Template wrapper for generic fill operations
+//===----------------------------------------------------------------------===//
+template <typename T>
+inline void fill_zeros_anytype(T* dst, size_t n) {
+  if constexpr (std::is_same_v<T, mllm_fp32_t>) {
+    call_fill_zeros_fp32(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_fp64_t>) {
+    call_fill_zeros_fp64(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int32_t>) {
+    call_fill_zeros_i32(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint32_t>) {
+    call_fill_zeros_u32(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int64_t>) {
+    call_fill_zeros_i64(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint64_t>) {
+    call_fill_zeros_u64(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int16_t>) {
+    call_fill_zeros_i16(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint16_t>) {
+    call_fill_zeros_u16(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int8_t>) {
+    call_fill_zeros_i8(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint8_t>) {
+    call_fill_zeros_u8(dst, n);
+  } else {
+    // Fallback for unsupported types
+    std::memset(dst, 0, n * sizeof(T));
+  }
+}
+
+template <typename T>
+inline void fill_ones_anytype(T* dst, size_t n) {
+  if constexpr (std::is_same_v<T, mllm_fp32_t>) {
+    call_fill_ones_fp32(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_fp64_t>) {
+    call_fill_ones_fp64(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int32_t>) {
+    call_fill_ones_i32(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint32_t>) {
+    call_fill_ones_u32(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int64_t>) {
+    call_fill_ones_i64(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint64_t>) {
+    call_fill_ones_u64(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int16_t>) {
+    call_fill_ones_i16(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint16_t>) {
+    call_fill_ones_u16(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_int8_t>) {
+    call_fill_ones_i8(dst, n);
+  } else if constexpr (std::is_same_v<T, mllm_uint8_t>) {
+    call_fill_ones_u8(dst, n);
+  } else {
+    // Fallback
+    for (size_t i = 0; i < n; ++i) { dst[i] = static_cast<T>(1); }
+  }
+}
+
+template <typename T>
+inline void fill_value_anytype(T* dst, size_t n, mllm_fp32_t value) {
+  if constexpr (std::is_same_v<T, mllm_fp32_t>) {
+    call_fill_value_fp32(dst, n, value);
+  } else if constexpr (std::is_same_v<T, mllm_fp64_t>) {
+    call_fill_value_fp64(dst, n, static_cast<mllm_fp64_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_int32_t>) {
+    call_fill_value_i32(dst, n, static_cast<mllm_int32_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_uint32_t>) {
+    call_fill_value_u32(dst, n, static_cast<mllm_uint32_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_int64_t>) {
+    call_fill_value_i64(dst, n, static_cast<mllm_int64_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_uint64_t>) {
+    call_fill_value_u64(dst, n, static_cast<mllm_uint64_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_int16_t>) {
+    call_fill_value_i16(dst, n, static_cast<mllm_int16_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_uint16_t>) {
+    call_fill_value_u16(dst, n, static_cast<mllm_uint16_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_int8_t>) {
+    call_fill_value_i8(dst, n, static_cast<mllm_int8_t>(value));
+  } else if constexpr (std::is_same_v<T, mllm_uint8_t>) {
+    call_fill_value_u8(dst, n, static_cast<mllm_uint8_t>(value));
+  } else {
+    // Fallback
+    for (size_t i = 0; i < n; ++i) { dst[i] = static_cast<T>(value); }
+  }
+}
+
+template <typename T>
+inline void fill_arange_anytype(T* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, mllm_fp32_t step) {
+  if constexpr (std::is_same_v<T, mllm_fp32_t>) {
+    call_fill_arange_fp32(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_int32_t>) {
+    call_fill_arange_i32(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_uint32_t>) {
+    call_fill_arange_u32(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_int64_t>) {
+    call_fill_arange_i64(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_uint64_t>) {
+    call_fill_arange_u64(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_int16_t>) {
+    call_fill_arange_i16(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_uint16_t>) {
+    call_fill_arange_u16(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_int8_t>) {
+    call_fill_arange_i8(dst, n, start, end, step);
+  } else if constexpr (std::is_same_v<T, mllm_uint8_t>) {
+    call_fill_arange_u8(dst, n, start, end, step);
+  } else {
+    // Fallback
+    for (size_t i = 0; i < n; ++i) { dst[i] = static_cast<T>(start + i * step); }
+  }
+}
+
+template <typename T>
+inline void fill_random_anytype(T* dst, size_t n, mllm_fp32_t start, mllm_fp32_t end, uint64_t seed) {
+  if constexpr (std::is_same_v<T, mllm_fp32_t>) {
+    call_fill_random_fp32(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_int32_t>) {
+    call_fill_random_i32(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_uint32_t>) {
+    call_fill_random_u32(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_int64_t>) {
+    call_fill_random_i64(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_uint64_t>) {
+    call_fill_random_u64(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_int16_t>) {
+    call_fill_random_i16(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_uint16_t>) {
+    call_fill_random_u16(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_int8_t>) {
+    call_fill_random_i8(dst, n, start, end, seed);
+  } else if constexpr (std::is_same_v<T, mllm_uint8_t>) {
+    call_fill_random_u8(dst, n, start, end, seed);
+  } else {
+    // Fallback using LCG
+    const uint64_t multiplier = 1103515245ULL;
+    const uint64_t increment = 12345ULL;
+    const uint64_t modulus = 1ULL << 31;
+    const mllm_fp32_t range = end - start;
+    uint64_t state = seed;
+    for (size_t i = 0; i < n; ++i) {
+      state = (multiplier * state + increment) % modulus;
+      const mllm_fp32_t random_value = static_cast<mllm_fp32_t>(state) / static_cast<mllm_fp32_t>(modulus - 1);
+      dst[i] = static_cast<T>(start + random_value * range);
+    }
+  }
+}
+
 } // namespace mllm::cpu::common
 #endif
diff --git a/mllm/backends/cpu/ops/FillOp.cpp b/mllm/backends/cpu/ops/FillOp.cpp
index e4d935f51..cf5cee47e 100644
--- a/mllm/backends/cpu/ops/FillOp.cpp
+++ b/mllm/backends/cpu/ops/FillOp.cpp
@@ -21,7 +21,7 @@ void CPUFillOp::forward(const std::vector<Tensor>& inputs, std::vector<Tensor>&
   switch (dst.dtype()) {
     case kFloat32: {
 #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86)
-      x86::fill_zeros(dst.ptr<mllm_fp32_t>(), dst.numel(), threads);
+      common::fill_zeros_anytype(dst.ptr<mllm_fp32_t>(), dst.numel());
 #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM)
       arm::fill_zeros(dst.ptr<mllm_fp32_t>(), dst.numel(), threads);
 #endif
@@ -29,7 +29,8
@@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kFloat16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + // FP16 not directly supported by Highway on x86, use scalar fallback + std::memset(dst.ptr(), 0, dst.numel() * sizeof(mllm_fp16_t)); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_fp16(dst.ptr(), dst.numel(), threads); #endif @@ -37,7 +38,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -45,7 +46,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -53,7 +54,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -61,7 +62,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -69,7 +70,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -77,7 +78,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -85,7 +86,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -93,7 +94,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_zeros_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_zeros_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -110,7 +111,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& switch (dst.dtype()) { case kFloat32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - x86::fill_ones(dst.ptr(), dst.numel(), 
threads); + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones(dst.ptr(), dst.numel(), threads); #endif @@ -118,7 +119,9 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kFloat16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + // FP16 not directly supported by Highway on x86, use scalar fallback + auto ptr = dst.ptr(); + for (size_t i = 0; i < dst.numel(); ++i) { ptr[i] = static_cast(1.0f); } #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_fp16(dst.ptr(), dst.numel(), threads); #endif @@ -126,7 +129,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -134,7 +137,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -142,7 +145,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -150,7 +153,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -158,7 +161,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -166,7 +169,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -174,7 +177,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif @@ -182,7 +185,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_ones_anytype(dst.ptr(), dst.numel()); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_ones_anytype(dst.ptr(), dst.numel(), threads); #endif 
@@ -199,7 +202,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& switch (dst.dtype()) { case kFloat32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - x86::fill_arange(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); #endif @@ -207,7 +210,9 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kFloat16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + // FP16 not directly supported by Highway on x86, use scalar fallback + auto ptr = dst.ptr(); + for (size_t i = 0; i < dst.numel(); ++i) { ptr[i] = static_cast(options_.start + i * options_.step); } #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_fp16(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); #endif @@ -215,7 +220,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -224,7 +229,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -233,7 +238,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -242,7 +247,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -251,7 +256,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -260,7 +265,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + 
common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -269,7 +274,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -278,7 +283,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_arange_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.step, threads); @@ -295,7 +300,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& switch (dst.dtype()) { case kFloat32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - x86::fill_random(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -303,7 +308,18 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kFloat16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + // FP16 not directly supported by Highway on x86, use scalar fallback + const uint64_t multiplier = 1103515245ULL; + const uint64_t increment = 12345ULL; + const uint64_t modulus = 1ULL << 31; + const mllm_fp32_t range = options_.end - options_.start; + uint64_t state = options_.seed; + auto ptr = dst.ptr(); + for (size_t i = 0; i < dst.numel(); ++i) { + state = (multiplier * state + increment) % modulus; + const mllm_fp32_t random_value = static_cast(state) / static_cast(modulus - 1); + ptr[i] = static_cast(options_.start + random_value * range); + } #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_fp16(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -311,7 +327,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -319,7 +335,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ 
-327,7 +343,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -335,7 +351,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -343,7 +359,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -351,7 +367,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -359,7 +375,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -367,7 +383,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_random_anytype(dst.ptr(), dst.numel(), options_.start, options_.end, options_.seed, threads); #endif @@ -383,7 +399,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& switch (dst.dtype()) { case kFloat32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - x86::fill_specific_value(dst.ptr(), dst.numel(), options_.value, threads); + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -391,7 +407,9 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kFloat16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + // FP16 not directly supported by Highway on x86, use scalar fallback + auto ptr = dst.ptr(); + for (size_t i = 0; i < dst.numel(); ++i) { ptr[i] = 
static_cast(options_.value); } #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_fp16(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -399,7 +417,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -407,7 +425,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -415,7 +433,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -423,7 +441,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -431,7 +449,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt64: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -439,7 +457,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt32: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -447,7 +465,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt16: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif @@ -455,7 +473,7 @@ void CPUFillOp::forward(const std::vector& inputs, std::vector& } case kUInt8: { #if defined(MLLM_HOST_ARCH_X86_64) || defined(MLLM_HOST_ARCH_X86) - // TODO + common::fill_value_anytype(dst.ptr(), dst.numel(), options_.value); #elif defined(MLLM_HOST_ARCH_ARM64) || defined(MLLM_HOST_ARCH_ARM) arm::fill_specific_value_anytype(dst.ptr(), dst.numel(), options_.value, threads); #endif diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index 82869ab16..0d34a51b2 100644 --- 
a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -387,8 +387,10 @@ void recursiveCheckConcatInputs(const std::shared_ptr& ir_ctx, co if (std::abs(ref_scale_v - cur_scale_v) > 1e-6 || ref_zp_v != cur_zp_v) { MLLM_ERROR("PTQPass: ConcatOp '{}' has mismatched scale/zp between inputs. " - "Input '{}': scale={}, zp={}; Input '{}': scale={}, zp={}", - op_name, ref_input_name, ref_scale_v, ref_zp_v, tv->name(), cur_scale_v, cur_zp_v); + "Input '{}': scale={}, zp={}, scale_name={}, zp_name={}; Input '{}': scale={}, zp={}, scale_name={}, " + "zp_name={}", + op_name, ref_input_name, ref_scale_v, ref_zp_v, ref_scale.name(), ref_zero_point.name(), tv->name(), + cur_scale_v, cur_zp_v, cur_scale.name(), cur_zero_point.name()); } } } else if (f_spec->spec_->type == ir::linalg::QuantizationSpecType::kSymPerTensor) { diff --git a/mllm/ffi/Extension.cc b/mllm/ffi/Extension.cc index 22449f883..cb999191d 100644 --- a/mllm/ffi/Extension.cc +++ b/mllm/ffi/Extension.cc @@ -53,9 +53,25 @@ TVM_FFI_STATIC_INIT_BLOCK() { refl::GlobalDef().def("mllm.cpu_", []() -> mllm::ffi::Device { return mllm::ffi::Device(::mllm::DeviceTypes::kCPU); }); refl::GlobalDef().def("mllm.cuda_", []() -> mllm::ffi::Device { return mllm::ffi::Device(::mllm::DeviceTypes::kCUDA); }); refl::GlobalDef().def("mllm.qnn_", []() -> mllm::ffi::Device { return mllm::ffi::Device(::mllm::DeviceTypes::kQNN); }); + // Floating point types refl::GlobalDef().def("mllm.float32_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kFloat32); }); refl::GlobalDef().def("mllm.float16_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kFloat16); }); refl::GlobalDef().def("mllm.bfloat16_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kBFloat16); }); + + // Signed integer types + refl::GlobalDef().def("mllm.int8_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kInt8); }); + refl::GlobalDef().def("mllm.int16_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kInt16); }); + refl::GlobalDef().def("mllm.int32_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kInt32); }); + refl::GlobalDef().def("mllm.int64_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kInt64); }); + + // Unsigned integer types + refl::GlobalDef().def("mllm.uint8_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kUInt8); }); + refl::GlobalDef().def("mllm.uint16_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kUInt16); }); + refl::GlobalDef().def("mllm.uint32_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kUInt32); }); + refl::GlobalDef().def("mllm.uint64_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kUInt64); }); + + // Bool type + refl::GlobalDef().def("mllm.bool_", []() -> mllm::ffi::DType { return mllm::ffi::DType(::mllm::DataTypes::kUInt8); }); } //===----------------------------------------------------------------------===// diff --git a/pymllm/__init__.py b/pymllm/__init__.py index 66240b714..1bd31cd6c 100644 --- a/pymllm/__init__.py +++ b/pymllm/__init__.py @@ -12,12 +12,27 @@ from . import service from . 
import backends from .ffi import ( + # Floating point types float32, float16, bfloat16, + # Signed integer types + int8, + int16, + int32, + int64, + # Unsigned integer types + uint8, + uint16, + uint32, + uint64, + # Bool type + boolean, + # Devices cpu, cuda, qnn, + # Tensor and utilities Tensor, empty, echo, @@ -26,7 +41,6 @@ is_numpy_available, from_torch, from_numpy, - empty, zeros, ones, arange, diff --git a/pymllm/ffi/__init__.py b/pymllm/ffi/__init__.py index 17bd04c19..9780eabb0 100644 --- a/pymllm/ffi/__init__.py +++ b/pymllm/ffi/__init__.py @@ -48,6 +48,10 @@ def to_pod(self) -> int: return tvm_ffi.get_global_func("mllm.DType.to_pod")(self) +# ============================================================================= +# DType factory functions +# ============================================================================= +# Floating point types def float32_() -> DType: return _ffi_api.float32_() @@ -60,6 +64,45 @@ def bfloat16_() -> DType: return _ffi_api.bfloat16_() +# Signed integer types +def int8_() -> DType: + return _ffi_api.int8_() + + +def int16_() -> DType: + return _ffi_api.int16_() + + +def int32_() -> DType: + return _ffi_api.int32_() + + +def int64_() -> DType: + return _ffi_api.int64_() + + +# Unsigned integer types +def uint8_() -> DType: + return _ffi_api.uint8_() + + +def uint16_() -> DType: + return _ffi_api.uint16_() + + +def uint32_() -> DType: + return _ffi_api.uint32_() + + +def uint64_() -> DType: + return _ffi_api.uint64_() + + +# Bool type (backed by uint8) +def bool_() -> DType: + return _ffi_api.bool_() + + def cpu_() -> Device: return _ffi_api.cpu_() @@ -219,10 +262,32 @@ def is_contiguous(self): return tvm_ffi.get_global_func("mllm.Tensor.is_contiguous")(self) -# Global dtypes +# ============================================================================= +# Global dtype instances +# ============================================================================= +# Floating point types float32: DType = float32_() float16: DType = float16_() bfloat16: DType = bfloat16_() + +# Signed integer types +int8: DType = int8_() +int16: DType = int16_() +int32: DType = int32_() +int64: DType = int64_() + +# Unsigned integer types +uint8: DType = uint8_() +uint16: DType = uint16_() +uint32: DType = uint32_() +uint64: DType = uint64_() + +# Bool type (use 'boolean' to avoid shadowing Python's built-in 'bool') +boolean: DType = bool_() + +# ============================================================================= +# Global device instances +# ============================================================================= cpu: Device = cpu_() cuda: Device = cuda_() qnn: Device = qnn_() From e976d11e4dbbc6baf7d1717e69aeab2dda7ffbf6 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 21 Jan 2026 13:25:41 +0000 Subject: [PATCH 08/42] feat(qnn): Enhance QNNBackend initialization with improved logging and error handling; update default log level to verbose. Add QEmbedding class for quantized embedding operations in PyTorch. Introduce build tasks for Android and x86 QNN AOT SDKs. 
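
For reviewers, the intended use of the new QEmbedding module is the usual three-step
fake-quantization flow: calibrate with the observer enabled, freeze the weight
scale/zero-point, then convert the module to its integer-weight deploy form. The sketch
below is illustrative only (the example shapes are made up and the import path is assumed
from the file location added in this patch); it is not code shipped here:

    import torch
    from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding

    emb = QEmbedding(num_embeddings=1000, embedding_dim=64, quant_bits=16)

    # 1. Calibration: the observer tracks min/max while fake-quant noise is applied.
    token_ids = torch.randint(0, 1000, (1, 128))
    _ = emb(token_ids)

    # 2. Lock scale/zero_point so later runs reuse fixed quantization parameters.
    emb.freeze_weight()

    # 3. Replace the float weight with an integer buffer plus scale/zero_point buffers.
    emb.convert_to_deploy()
    state = emb.state_dict()  # 'weight' is now an integer tensor, with 'scale' and 'zero_point'
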
--- .gitignore | 1 + mllm/CMakeLists.txt | 9 +- mllm/backends/qnn/QNNBackend.cpp | 89 +++++++++--- mllm/backends/qnn/QNNBackend.hpp | 2 +- mllm/backends/qnn/Register.cpp | 15 +- mllm/backends/qnn/aot/QnnWrappersAPI.cpp | 7 + .../qualcomm/transformers/core/embedding.py | 133 ++++++++++++++++++ tasks/build_sdk_android_qnn_aot.yaml | 22 +++ tasks/build_sdk_x86_qnn_aot.yaml | 2 +- 9 files changed, 255 insertions(+), 25 deletions(-) create mode 100644 pymllm/backends/qualcomm/transformers/core/embedding.py create mode 100644 tasks/build_sdk_android_qnn_aot.yaml diff --git a/.gitignore b/.gitignore index 22e2a9a6f..7397d6ecc 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ tasks/mllmteam* build*/ install*/ mllm-sdk-*/ +mllm-install-*/ # Pymllm related stubs/ diff --git a/mllm/CMakeLists.txt b/mllm/CMakeLists.txt index fd796f95a..06fa5aab2 100644 --- a/mllm/CMakeLists.txt +++ b/mllm/CMakeLists.txt @@ -58,7 +58,14 @@ endif() # FIXME: @oreomaker Need to remove comma features in slice! # Suppress comma-subscript warnings (deprecated C++ feature that will be removed in C++26) -target_compile_options(MllmRT PUBLIC -Wno-comma-subscript) +# This flag is only available in Clang 13+ and GCC 10+ +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + target_compile_options(MllmRT PUBLIC -Wno-comma-subscript) +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "10.0") + target_compile_options(MllmRT PUBLIC -Wno-comma-subscript) + endif() +endif() # ONLY APPLE CAN DO ! # Processing OpenMP diff --git a/mllm/backends/qnn/QNNBackend.cpp b/mllm/backends/qnn/QNNBackend.cpp index 54da97c9d..05ebedfcb 100644 --- a/mllm/backends/qnn/QNNBackend.cpp +++ b/mllm/backends/qnn/QNNBackend.cpp @@ -29,15 +29,28 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) { QNNViewOpFactory, QNNRMSNormOpFactory, QNNTransposeOpFactory, QNNX2XOpFactory, QNNCastTypeOpFactory, QNNParamOpFactory, QNNSiLUOpFactory, QNNEmbeddingOpFactory>(); - QnnLog_Level_t qnnLogLevel = QNN_LOG_LEVEL_ERROR; // default QNN log level + QnnLog_Level_t qnnLogLevel = QNN_LOG_LEVEL_VERBOSE; // default QNN log level profilingLevel_ = ProfilingLevel::OFF; debug_ = false; // when set true, NATIVE tensor will be regared as APP_READ tensor - loadQNNSymbol(); - loadQNNSystemSymbol(); + if (!loadQNNSymbol()) { + MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN symbols"); + } else { + MLLM_INFO("QNN symbols loaded successfully"); + } + + if (!loadQNNSystemSymbol()) { + MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to load QNN System symbols"); + } else { + MLLM_INFO("QNN System symbols loaded successfully"); + } runtime_ = QNNRuntime::create(profilingLevel_, qnnLogLevel); - if (!runtime_) { MLLM_ERROR_EXIT(1, "Failed to create QNN Runtime"); } + if (!runtime_) { + MLLM_ERROR_EXIT(ExitCode::kQnnError, "Failed to create QNN Runtime"); + } else { + MLLM_INFO("QNN Runtime created successfully"); + } // check QNN capability, detect QNN features for future use char* backendBuildId{nullptr}; @@ -59,6 +72,7 @@ QNNBackend::QNNBackend() : Backend(kQNN, createQNNAllocator()) { perf_ = QNNPerf::create(&runtime_->qnnInterface); perf_->setPowerConfigBurst(); perf_->setRpcLatencyAndPolling(); + MLLM_INFO("QNN Perf created successfully"); } QNNPerf::QNNPerf(const QNN_INTERFACE_VER_TYPE* qnnInterface) { @@ -204,11 +218,13 @@ QNNRuntime* QNNRuntime::initRuntime(ProfilingLevel profilingLevel, QnnLog_Level_ // Create Log Qnn_LogHandle_t logHandle = nullptr; { - 
QnnLog_Callback_t logCallback = &__mllmQnnLoggerCallback; + QnnLog_Callback_t logCallback = __mllmQnnLoggerCallback; if ((QNN_GET_ERROR_CODE(qnnInterface.logCreate(logCallback, qnnLogLevel, &logHandle)) != QNN_SUCCESS) || (logHandle == nullptr)) { MLLM_ERROR("Failed to initialize logging in the backend."); return nullptr; + } else { + MLLM_INFO("Logging initialized successfully"); } } @@ -220,6 +236,8 @@ QNNRuntime* QNNRuntime::initRuntime(ProfilingLevel profilingLevel, QnnLog_Level_ || (backendHandle == nullptr)) { MLLM_ERROR("Failed to create the backend."); return nullptr; + } else { + MLLM_INFO("Backend created successfully"); } } @@ -227,16 +245,13 @@ QNNRuntime* QNNRuntime::initRuntime(ProfilingLevel profilingLevel, QnnLog_Level_ Qnn_DeviceHandle_t deviceHandle = nullptr; { // Check whether the device API is supported. - if (nullptr != qnnInterface.propertyHasCapability) { - auto qnnStatus = qnnInterface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnStatus) { - MLLM_WARN("Device property is not supported"); - return nullptr; - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnStatus) { - MLLM_ERROR("Device property is not known to backend"); + if (nullptr != qnnInterface.deviceCreate) { + auto status = qnnInterface.deviceCreate(logHandle, nullptr, &deviceHandle); + if (QNN_SUCCESS != status) { + MLLM_ERROR("Failed to create device, error: {}", (int)status); return nullptr; } + MLLM_INFO("Device created successfully"); } } @@ -269,9 +284,7 @@ QNNRuntime* QNNRuntime::initRuntime(ProfilingLevel profilingLevel, QnnLog_Level_ std::string target; }; - std::vector opPackages = { - {.path = "libQnnLLaMAPackage_CPU.so", .interfaceProvider = "LLaMAPackageInterfaceProvider", .target = "CPU"}, - {.path = "libQnnLLaMAPackage_HTP.so", .interfaceProvider = "LLaMAPackageInterfaceProvider", .target = "HTP"}}; + std::vector opPackages = {}; for (const auto& pkg : opPackages) { if (!qnnInterface.backendRegisterOpPackage) { @@ -298,6 +311,8 @@ QNNRuntime* QNNRuntime::initRuntime(ProfilingLevel profilingLevel, QnnLog_Level_ != QnnSystemInterface_getProviders((const QnnSystemInterface_t***)&systemInterfaceProviders, &numProviders)) { MLLM_ERROR("Failed to get system interface providers."); return nullptr; + } else { + MLLM_INFO("System interface providers found: {}", numProviders); } if (0 == numProviders) { MLLM_ERROR("Failed to get interface providers: 0 interface providers."); @@ -305,11 +320,17 @@ QNNRuntime* QNNRuntime::initRuntime(ProfilingLevel profilingLevel, QnnLog_Level_ } bool foundValidSystemInterface = false; for (size_t pIdx = 0; pIdx < numProviders; pIdx++) { - foundValidSystemInterface = true; if (QNN_SYSTEM_API_VERSION_MAJOR == systemInterfaceProviders[pIdx]->systemApiVersion.major && QNN_SYSTEM_API_VERSION_MINOR <= systemInterfaceProviders[pIdx]->systemApiVersion.minor) { qnnSystemInterface = systemInterfaceProviders[pIdx]->QNN_SYSTEM_INTERFACE_VER_NAME; + foundValidSystemInterface = true; break; + } else { + // Print system interface provider and self version + MLLM_WARN("System interface provider: {} version: {}", systemInterfaceProviders[pIdx]->systemApiVersion.major, + systemInterfaceProviders[pIdx]->systemApiVersion.minor); + MLLM_WARN("Self version: {} {}", QNN_SYSTEM_API_VERSION_MAJOR, QNN_SYSTEM_API_VERSION_MINOR); + MLLM_WARN("Unable to find a valid system interface."); } } if (!foundValidSystemInterface) { @@ -334,7 +355,14 @@ bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_Conte std::vector>& qnnModels, 
QnnContext_Config_t** contextConfig) { // Read the binary from qnn_context.bin and get the size in byte std::ifstream file(contextBinaryPath, std::ios::binary | std::ios::ate); + if (!file.is_open() || !file.good()) { + MLLM_ERROR("Could not open context binary file: {}", contextBinaryPath); + return false; + } else { + MLLM_INFO("Context binary file opened successfully: {}", contextBinaryPath); + } std::streamsize size = file.tellg(); + MLLM_INFO("Context binary file size: {} MB", size / 1024 / 1024); file.seekg(0, std::ios::beg); auto binaryBuffer = std::make_unique(size); @@ -344,17 +372,27 @@ bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_Conte // inspect binary info QnnSystemContext_Handle_t sysCtxHandle{nullptr}; + if (!qnnSystemInterface.systemContextCreate) { + MLLM_ERROR("systemContextCreate is nullptr."); + return false; + } if (QNN_SUCCESS != qnnSystemInterface.systemContextCreate(&sysCtxHandle)) { MLLM_ERROR("Could not create system handle."); return false; + } else { + MLLM_INFO("System context created successfully"); } + const QnnSystemContext_BinaryInfo_t* binaryInfo{nullptr}; Qnn_ContextBinarySize_t binaryInfoSize{0}; + if (QNN_SUCCESS != qnnSystemInterface.systemContextGetBinaryInfo(sysCtxHandle, static_cast(binaryBuffer.get()), size, &binaryInfo, &binaryInfoSize)) { MLLM_ERROR("Failed to get context binary info"); return false; + } else { + MLLM_INFO("Context binary info retrieved successfully"); } // Extract graph metadata to create QNNModels instead of GraphInfo_t @@ -365,13 +403,24 @@ bool QNNRuntime::retrieveContext(const std::string& contextBinaryPath, Qnn_Conte MLLM_ERROR("Failed to copy metadata."); return false; } - qnnSystemInterface.systemContextFree(sysCtxHandle); + if (QNN_SUCCESS != qnnSystemInterface.systemContextFree(sysCtxHandle)) { + MLLM_ERROR("Could not free system context."); + return false; + } else { + MLLM_INFO("System context freed successfully"); + } sysCtxHandle = nullptr; // Create context from binary Qnn_ContextBinarySize_t writtenSize = 0; - qnnInterface.contextCreateFromBinary(backendHandle, deviceHandle, (const QnnContext_Config_t**)contextConfig, - binaryBuffer.get(), size, &context, profileHandle); + if (QNN_CONTEXT_NO_ERROR + != qnnInterface.contextCreateFromBinary(backendHandle, deviceHandle, (const QnnContext_Config_t**)contextConfig, + binaryBuffer.get(), size, &context, profileHandle)) { + MLLM_ERROR("Could not create context from binary. Mostly due to binary's qnn version mismatch with backend's qnn version."); + return false; + } else { + MLLM_INFO("Context created from binary successfully"); + } // Create QNNModels for each graph and initialize from context qnnModels.clear(); diff --git a/mllm/backends/qnn/QNNBackend.hpp b/mllm/backends/qnn/QNNBackend.hpp index 49669c7c1..78953f32d 100644 --- a/mllm/backends/qnn/QNNBackend.hpp +++ b/mllm/backends/qnn/QNNBackend.hpp @@ -45,7 +45,7 @@ class QNNRuntime { ~QNNRuntime(); static std::unique_ptr create(ProfilingLevel profilingLevel = ProfilingLevel::OFF, - QnnLog_Level_t qnnLogLevel = QNN_LOG_LEVEL_WARN) { + QnnLog_Level_t qnnLogLevel = QNN_LOG_LEVEL_VERBOSE) { return std::unique_ptr(initRuntime(profilingLevel, qnnLogLevel)); } diff --git a/mllm/backends/qnn/Register.cpp b/mllm/backends/qnn/Register.cpp index 158294f35..88185921e 100644 --- a/mllm/backends/qnn/Register.cpp +++ b/mllm/backends/qnn/Register.cpp @@ -21,9 +21,18 @@ void initQnnBackend(const std::string& context_path) { // 1. 
Register backend auto backend = std::make_shared(); if (std::filesystem::exists(context_path)) { - if (!backend->loadContext(context_path)) { MLLM_ERROR_EXIT(1, "Failed to load QNN context from {}", context_path); } + MLLM_INFO("QNN context path exists: {}", context_path); + if (!backend->loadContext(context_path)) { + MLLM_ERROR_EXIT(1, "Failed to load QNN context from {}", context_path); + } else { + MLLM_INFO("QNN context loaded successfully from {}", context_path); + } } else { - if (!backend->createContext()) { MLLM_ERROR_EXIT(1, "Failed to create QNN context"); } + if (!backend->createContext()) { + MLLM_ERROR_EXIT(1, "Failed to create QNN context"); + } else { + MLLM_INFO("QNN context created successfully"); + } } ctx.registerBackend(backend); @@ -33,6 +42,8 @@ void initQnnBackend(const std::string& context_path) { .really_large_tensor_threshold = 0, .using_buddy_mem_pool = false, }); + MLLM_INFO("QNN memory manager registered"); + // 3. Initialize dispatcher manager ctx.dispatcherManager()->registerDispatcher( createQNNDispatcher(ctx.dispatcherManager()->getExecutor(), qnn::QNNDispatcherOptions())); diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp index b2b04fd78..23496591f 100644 --- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp +++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp @@ -107,6 +107,7 @@ std::string QnnAOTNodeTensor::parseQnnTensorNameFromIR(const ir::tensor::TensorV Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::tensor::TensorValue::ptr_t& v) { Qnn_QuantizeParams_t ret = QNN_QUANTIZE_PARAMS_INIT; + MLLM_RT_ASSERT(v); MLLM_RT_ASSERT(v->getAttr("quant_recipe")); auto quant_spec = v->getAttr("quant_recipe")->cast_()->spec_; @@ -120,6 +121,9 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten auto cfg = std::static_pointer_cast(quant_spec); ret.encodingDefinition = QNN_DEFINITION_DEFINED; ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; + if (!cfg->scale || !cfg->zero_point) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "AsymPerTensor quant recipe has no scale or zero point. tensor: {}", v->name()); + } ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item(), .offset = cfg->zero_point.item()}; break; } @@ -127,6 +131,9 @@ Qnn_QuantizeParams_t QnnAOTNodeTensor::parseQnnQuantizeParamFromIR(const ir::ten auto cfg = std::static_pointer_cast(quant_spec); ret.encodingDefinition = QNN_DEFINITION_DEFINED; ret.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; + if (!cfg->scale) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "SymPerTensor quant recipe has no scale. 
tensor: {}", v->name()); + } ret.scaleOffsetEncoding = Qnn_ScaleOffset_t{.scale = cfg->scale.item(), .offset = 0}; break; } diff --git a/pymllm/backends/qualcomm/transformers/core/embedding.py b/pymllm/backends/qualcomm/transformers/core/embedding.py new file mode 100644 index 000000000..84c4d61fe --- /dev/null +++ b/pymllm/backends/qualcomm/transformers/core/embedding.py @@ -0,0 +1,133 @@ +import torch +import torch.nn as nn +from torch.ao.quantization import FakeQuantize, MinMaxObserver + + +class QEmbedding(nn.Module): + def __init__( + self, + num_embeddings, + embedding_dim, + padding_idx=None, + quant_bits=16, + ): + super().__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.quant_bits = quant_bits + + self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim)) + nn.init.normal_(self.weight) + + if padding_idx is not None: + with torch.no_grad(): + self.weight[padding_idx].fill_(0) + + # Quantization configuration for Weight + self.weight_fake_quant = FakeQuantize( + observer=MinMaxObserver.with_args( + qscheme=torch.per_tensor_affine, + dtype=torch.qint32, + eps=0.0001 / 65535, + ), + quant_min=0, + quant_max=2 ** (quant_bits) - 1, + dtype=torch.qint32, + qscheme=torch.per_tensor_affine, + ) + + def forward(self, x): + # 1. Weight fake quantization + # If observer is not closed, this step will continuously update scale/zp + # If freeze_weight() is called, this will just use fixed scale/zp for quantization + w_q = self.weight_fake_quant(self.weight) + + # 2. Embedding lookup (Gather operation) + return nn.functional.embedding( + x, + w_q, + padding_idx=self.padding_idx, + ) + + @torch.no_grad() + def convert_to_deploy(self): + """ + In-place replacement of self.weight: + Float Parameter -> Int Buffer + """ + # 1. Ensure quantization parameters are ready + if self.weight_fake_quant.scale is None: + self.freeze_weight() + + scale = self.weight_fake_quant.scale + zero_point = self.weight_fake_quant.zero_point + quant_min = self.weight_fake_quant.quant_min + quant_max = self.weight_fake_quant.quant_max + + # 2. Calculate integer values + # w_int = round(w / s + zp) + w_int = torch.round(self.weight / scale + zero_point).clamp( + quant_min, quant_max + ) + + # 3. Set target integer type + if self.quant_bits <= 8: + target_dtype = torch.uint8 + elif self.quant_bits <= 16: + target_dtype = torch.uint16 + else: + target_dtype = torch.uint32 + + w_int = w_int.to(target_dtype) + + # === Key steps: Replacement operations === + + # A. Delete original Parameter 'weight' + # Must delete first, otherwise cannot register buffer with same name + del self.weight + + # B. Register Buffer with same name 'weight' + # This makes state_dict['weight'] become Int Tensor + self.register_buffer("weight", w_int) + + # C. Register Scale (usually needed by engine) + self.register_buffer("scale", scale) + self.register_buffer("zero_point", zero_point) + + # D. Clean up unnecessary modules + if hasattr(self, "weight_fake_quant"): + del self.weight_fake_quant + + class_name = self.__class__.__name__ + instance_class_name = type(self).__name__ + print( + f"Class: {class_name}, Instance: {instance_class_name}, Deploy Mode Activated. 'weight' is now {self.weight.dtype} buffer. zp is {zero_point}" + ) + + @torch.no_grad() + def freeze_weight(self): + """ + Manually trigger Observer to observe and calculate scale, then lock it. + Solve the problem of output being 0 on first run. 
+ """ + self.weight_fake_quant.activation_post_process(self.weight) + s, zp = self.weight_fake_quant.activation_post_process.calculate_qparams() + self.weight_fake_quant.scale.copy_(s) + self.weight_fake_quant.zero_point.copy_(zp) + self.weight_fake_quant.disable_observer() + class_name = self.__class__.__name__ + instance_class_name = type(self).__name__ + print( + f"Class: {class_name}, Instance: {instance_class_name}, Weight Quantized: scale={self.weight_fake_quant.scale}, zp={self.weight_fake_quant.zero_point}" + ) + + def disable_quant(self): + """Completely turn off quantization noise and return to floating point mode""" + self.weight_fake_quant.disable_fakequant() + + def extra_repr(self): + s = f"{self.num_embeddings}, {self.embedding_dim}" + if self.padding_idx is not None: + s += f", padding_idx={self.padding_idx}" + return s diff --git a/tasks/build_sdk_android_qnn_aot.yaml b/tasks/build_sdk_android_qnn_aot.yaml new file mode 100644 index 000000000..f0e983b75 --- /dev/null +++ b/tasks/build_sdk_android_qnn_aot.yaml @@ -0,0 +1,22 @@ +Tasks: + - CMakeConfigTask: + cmake_cfg_path: "build-android-arm64-v8a-qnn" + cmake_build_type: "ReleaseWithDebInfo" + cmake_toolchain_file: "$ANDROID_NDK_PATH/build/cmake/android.toolchain.cmake" + cmake_extra_args: + - "-DMLLM_CROSS_COMPILE=ON" + - "-DMLLM_BUILD_ARM_BACKEND=ON" + - "-DMLLM_BUILD_QNN_BACKEND=ON" + - "-DANDROID_PLATFORM=android-28" + - "-DANDROID_ABI=arm64-v8a" + - '-DMLLM_CPU_BACKEND_COMPILE_OPTIONS="-march=armv8.2-a+fp16+fp16fml+dotprod+i8mm;-ffast-math;-Wno-nan-infinity-disabled"' + - "-DCMAKE_INSTALL_PREFIX=mllm-install-android-arm64-v8a-qnn" + - "-DMLLM_KERNEL_USE_THREADS=ON" + - "-DMLLM_KERNEL_THREADS_VENDOR_OPENMP=ON" + - "-DMLLM_KERNEL_USE_THREADS_VENDOR_MLLM=OFF" + + - CMakeBuildTask: + cmake_cfg_path: "build-android-arm64-v8a-qnn" + + - CMakeInstallTask: + cmake_cfg_path: "build-android-arm64-v8a-qnn" diff --git a/tasks/build_sdk_x86_qnn_aot.yaml b/tasks/build_sdk_x86_qnn_aot.yaml index f33281616..fd9131d2e 100644 --- a/tasks/build_sdk_x86_qnn_aot.yaml +++ b/tasks/build_sdk_x86_qnn_aot.yaml @@ -1,7 +1,7 @@ Tasks: - CMakeConfigTask: cmake_cfg_path: "build-qnn-aot" - cmake_build_type: "Release" + cmake_build_type: "ReleaseWithDebInfo" cmake_extra_args: # Optional, If use Highway - "-DHWY_ENABLE_TESTS=OFF" From 224d68e0dcacb337239b4cc769594d4b66178a1f Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Fri, 23 Jan 2026 02:24:07 +0000 Subject: [PATCH 09/42] feat(qnn): Update quantization handling and embedding output data types; ensure position-independent code for flatbuffers. Enhance context creation with existing context checks and improve weight quantization specifications. 
--- CMakeLists.txt | 1 + mllm/backends/qnn/aot/QnnWrappersAPI.cpp | 6 +++++ .../qnn/aot/passes/LLMQuantRecipePass.cpp | 26 ++++++++++--------- mllm/backends/qnn/aot/passes/PTQPass.cpp | 23 +++++++++++++++- mllm/backends/qnn/aot_rt/PromptProcessor.cpp | 1 - mllm/core/aops/EmbeddingOp.cpp | 6 +++-- 6 files changed, 47 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fca470ee5..a19e80df3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -262,6 +262,7 @@ add_subdirectory(third_party/fmt) add_subdirectory(third_party/xxHash) set(FLATBUFFERS_BUILD_TESTS OFF) add_subdirectory(third_party/flatbuffers EXCLUDE_FROM_ALL) +set_target_properties(flatbuffers PROPERTIES POSITION_INDEPENDENT_CODE ON) add_subdirectory(mllm) if(MLLM_ENABLE_TEST) diff --git a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp index 23496591f..a79047e78 100644 --- a/mllm/backends/qnn/aot/QnnWrappersAPI.cpp +++ b/mllm/backends/qnn/aot/QnnWrappersAPI.cpp @@ -436,6 +436,12 @@ void QnnAOTEnv::_setup(const std::string& path) { } std::shared_ptr QnnAOTEnv::createContext(const std::string& name, bool weights_sharing) { + // Check if context with this name already exists + if (contexts_.count(name) > 0) { + MLLM_WARN("Context '{}' already exists, reusing the existing context", name); + return contexts_[name]; + } + std::shared_ptr context = std::make_shared(); context->name_ = name; diff --git a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp index 957fdf321..18bbb505c 100644 --- a/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp +++ b/mllm/backends/qnn/aot/passes/LLMQuantRecipePass.cpp @@ -986,6 +986,7 @@ bool LLMQuantRecipeEmbeddingPattern::rewrite(ir::IRWriter& writer, const ir::op_ auto annotation_attr = writer.create(); + // i_0 logic stays the same if (!i_0->getAttr("quant_recipe")) { auto i_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), i_0->cast_()); i_0->setAttr("quant_recipe", i_0_spec); @@ -996,16 +997,7 @@ bool LLMQuantRecipeEmbeddingPattern::rewrite(ir::IRWriter& writer, const ir::op_ i_0->getAttr("quant_recipe")->cast_()->spec_); } - if (!o_0->getAttr("quant_recipe")) { - auto o_0_spec = genSimpleQuantizationSpecAttr(writer.getContext(), o_0->cast_()); - o_0->setAttr("quant_recipe", o_0_spec); - annotation_attr->annotation_.outputs.emplace_back(o_0_spec->spec_); - } else { - annotation_attr->annotation_.outputs.emplace_back( - o_0->getAttr("quant_recipe")->cast_()->spec_); - } - - // Weights + // Weights - must be uint16, force set to kUInt16PerTensorAsy auto weight_name = embedding_op->getAOp()->getName() + ".weight"; auto weight_reg_tensor_ir = writer.getContext()->lookupSymbolTable(weight_name); MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir); @@ -1013,11 +1005,21 @@ bool LLMQuantRecipeEmbeddingPattern::rewrite(ir::IRWriter& writer, const ir::op_ MLLM_RETURN_FALSE_IF_NOT(weight_reg_tensor_ir->outputs().front()->isa_()); auto weight_tensor = weight_reg_tensor_ir->outputs().front()->cast_(); - // Embedding weight quantization method same as outputs, but not share, just same type - auto weight_spec_attr = genSimpleQuantizationSpecAttr(writer.getContext(), weight_tensor); + // Embedding weight dtype must be uint16, force set to kUInt16PerTensorAsy + MLLM_RETURN_FALSE_IF_NOT(weight_tensor->tensor_.dtype() == kUInt16 || weight_tensor->tensor_.dtype() == kUInt16PerTensorAsy); + weight_tensor->tensor_ = weight_tensor->tensor_.__unsafeSetDType(kUInt16PerTensorAsy); + + // Create weight 
spec with kUInt16PerTensorAsy (AsymPerTensor) + auto weight_spec = + ir::linalg::QuantizationSpecAsymPerTensor::create(0, 65535, kUInt16, kFloat32, kInt32, Tensor::nil(), Tensor::nil()); + auto weight_spec_attr = writer.getContext()->create(weight_spec); weight_reg_tensor_ir->outputs().front()->setAttr("quant_recipe", weight_spec_attr); annotation_attr->annotation_.weights.insert({"weight", weight_spec_attr->spec_}); + // o_0's quant recipe shares with weight + o_0->setAttr("quant_recipe", weight_spec_attr); + annotation_attr->annotation_.outputs.emplace_back(weight_spec_attr->spec_); + // Attach to quantize node node->setAttr("quant_recipe", annotation_attr); diff --git a/mllm/backends/qnn/aot/passes/PTQPass.cpp b/mllm/backends/qnn/aot/passes/PTQPass.cpp index 0d34a51b2..d9f1d97cb 100644 --- a/mllm/backends/qnn/aot/passes/PTQPass.cpp +++ b/mllm/backends/qnn/aot/passes/PTQPass.cpp @@ -111,6 +111,22 @@ void solveEmbeddingWeight(const ir::IRContext::ptr_t& ctx, const ParameterFile:: weight_spec->solved = true; break; } + case ir::linalg::QuantizationSpecType::kAsymPerTensor: { + auto this_spec = std::static_pointer_cast(weight_spec); + auto scale = pf->pull(mllm_op->getName() + ".scale"); + auto zero_point = pf->pull(mllm_op->getName() + ".zero_point"); + this_spec->scale = scale; + this_spec->zero_point = zero_point; + checkTypeLimits(pf->pull(mllm_op->getName() + ".weight"), this_spec->quant_min, this_spec->quant_max); + MLLM_RT_ASSERT(scale.dtype() == kFloat32); + MLLM_RT_ASSERT(scale.rank() == 1); + MLLM_RT_ASSERT(scale.item() > 0); + MLLM_RT_ASSERT(zero_point.dtype() == kInt32); + MLLM_RT_ASSERT(zero_point.rank() == 1); + MLLM_RT_ASSERT(zero_point.item() >= 0); + weight_spec->solved = true; + break; + } default: { NYI("quant recipe type not support"); } @@ -203,6 +219,9 @@ void _recursiveSolveNormalImpl(const ir::IRContext::ptr_t& ctx, const ir::Val::p auto _attr = ctx->create(std::vector{(uint16_t)ptq_constant_v}); tv->removeAttr("constant"); tv->setAttr("constant", _attr); + + MLLM_INFO("Constant tensor '{}' quantized (AsymPerTensor): before={}, after={}", tv->name(), constant_v, + ptq_constant_v); } this_spec->solved = true; @@ -262,6 +281,8 @@ void _recursiveSolveNormalImpl(const ir::IRContext::ptr_t& ctx, const ir::Val::p auto _attr = ctx->create(std::vector{(uint16_t)ptq_constant_v}); tv->removeAttr("constant"); tv->setAttr("constant", _attr); + + MLLM_INFO("Constant tensor '{}' quantized (SymPerTensor): before={}, after={}", tv->name(), constant_v, ptq_constant_v); } this_spec->solved = true; @@ -273,7 +294,7 @@ void _recursiveSolveNormalImpl(const ir::IRContext::ptr_t& ctx, const ir::Val::p break; } default: { - NYI("quant recipe type not support on tensor: {}", v->name()); + NYI("Quant recipe type not support on tensor: {}", v->name()); } } } diff --git a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp index 50396955d..99cd22db9 100644 --- a/mllm/backends/qnn/aot_rt/PromptProcessor.cpp +++ b/mllm/backends/qnn/aot_rt/PromptProcessor.cpp @@ -1,4 +1,3 @@ - // Copyright (c) MLLM Team. // Licensed under the MIT License. 
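
The EmbeddingOp change below keeps the looked-up rows in the weight's integer
domain (kUInt16PerTensorAsy) rather than hardcoding a float32 output, so
downstream consumers presumably dequantize with the stored scale/zero_point.
A minimal sketch of that dequantization, again with an illustrative helper
name rather than the mllm API:

    #include <cstdint>

    // w ≈ (q - zero_point) * scale
    inline float dequantize_u16(uint16_t q, float scale, int32_t zero_point) {
      return static_cast<float>(static_cast<int32_t>(q) - zero_point) * scale;
    }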
diff --git a/mllm/core/aops/EmbeddingOp.cpp b/mllm/core/aops/EmbeddingOp.cpp index b67eeff7a..a5a6400dd 100644 --- a/mllm/core/aops/EmbeddingOp.cpp +++ b/mllm/core/aops/EmbeddingOp.cpp @@ -70,8 +70,10 @@ void EmbeddingOp::reshape(const std::vector& inputs, std::vector std::vector o_shape{/*batch*/ shape[0], /*seq*/ shape[1], /*feat dim*/ options_.hidden_size}; - // FIXME: We should tell embedding output to use what kinds of data types. Currently it's hardcoded to float32. - outputs.emplace_back(Tensor::empty(o_shape, kFloat32, i.device())); + // Output dtype should match weight dtype (e.g., uint16 for AsymPerTensor quantization) + auto out_dtype = weight_.dtype(); + if (weight_.dtype() == kUInt16) { out_dtype = kUInt16PerTensorAsy; } + outputs.emplace_back(Tensor::empty(o_shape, out_dtype, i.device())); } void EmbeddingOp::setup(const std::vector& inputs, std::vector& outputs) { BaseOp::setup(inputs, outputs); } From d2d5c09ce56c74f38ffa3a38e4273e29e98af1eb Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Fri, 23 Jan 2026 02:42:09 +0000 Subject: [PATCH 10/42] feat(qwen3): Integrate QEmbedding for quantized embeddings and refine input layer normalization handling in Qwen3DecoderLayer. Update weight conversion logic in training script to address model compatibility issues. --- .../qualcomm/transformers/qwen3/modeling_qwen3.py | 12 +++++++----- .../backends/qualcomm/transformers/qwen3/runner.py | 3 +++ pymllm/backends/qualcomm/transformers/qwen3/train.py | 10 +++++++--- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 92efaa06d..cf71a48ba 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -54,6 +54,7 @@ ActivationQDQ, FixedActivationQDQ, ) +from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver @@ -393,7 +394,8 @@ def __init__(self, config: Qwen3Config, layer_idx: int): self.attention_type = config.layer_types[layer_idx] # QDQ - self.input_layernorm_input_qdq = ActivationQDQ(bits=16) + if self.layer_dix != 0: + self.input_layernorm_input_qdq = ActivationQDQ(bits=16) self.add_0_lhs_input_qdq = ActivationQDQ(bits=16) self.add_0_output_qdq = ActivationQDQ(bits=16) self.add_1_lhs_input_qdq = ActivationQDQ(bits=16) @@ -412,7 +414,8 @@ def forward( ] = None, # necessary, but kept here for BC **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: - hidden_states = self.input_layernorm_input_qdq(hidden_states) + if self.layer_dix != 0: + hidden_states = self.input_layernorm_input_qdq(hidden_states) residual = hidden_states hidden_states = self.input_layernorm(hidden_states) # Self Attention @@ -513,9 +516,8 @@ def __init__(self, config: Qwen3Config): super().__init__(config) self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding( - config.vocab_size, config.hidden_size, self.padding_idx + self.embed_tokens = QEmbedding( + config.vocab_size, config.hidden_size, self.padding_idx, quant_bits=16 ) self.layers = nn.ModuleList( [ diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/backends/qualcomm/transformers/qwen3/runner.py index 6565ca7e6..416816875 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ 
b/pymllm/backends/qualcomm/transformers/qwen3/runner.py @@ -11,6 +11,7 @@ QLinearLPBQ, QLinearW8A16_PerChannelSym, ) +from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding from pymllm.backends.qualcomm.transformers.qwen3.modeling_qwen3 import Qwen3ForCausalLM @@ -49,6 +50,8 @@ def convert_weight(m): m.convert_to_conv2d_deploy_hwio() if isinstance(m, QRMSNorm): m.convert_to_deploy() + if isinstance(m, QEmbedding): + m.convert_to_deploy() class Qwen3Quantizer: diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/backends/qualcomm/transformers/qwen3/train.py index 25361f372..9c4604d8f 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/backends/qualcomm/transformers/qwen3/train.py @@ -48,9 +48,13 @@ def main(): # Things below is for deploy. We will turn all fp32 weights and some buffers(rope) to quantized dtype. # !!! # This line maybe error. we need use quantized weight!!! not embed_tokens.weight!!! - m.model.lm_head.weight = torch.nn.Parameter( - m.model.model.embed_tokens.weight.clone() - ) + # m.model.lm_head.weight = torch.nn.Parameter( + # m.model.model.embed_tokens.weight.clone() + # ) + if "1.7B" in args.model_path: + raise ValueError( + "1.7B model is not supported for now due to tied embedding weights is not supported." + ) m.convert() os.makedirs(args.output_dir, exist_ok=True) From c4f230648ccce567ced2d7c8990d81757a64aa24 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Fri, 23 Jan 2026 15:35:15 +0800 Subject: [PATCH 11/42] fix --- examples/qwen2_5omni/config_qwen2_5omni.json | 495 ------------------- examples/qwen2_5omni/image_infer.cpp | 4 +- 2 files changed, 2 insertions(+), 497 deletions(-) delete mode 100644 examples/qwen2_5omni/config_qwen2_5omni.json diff --git a/examples/qwen2_5omni/config_qwen2_5omni.json b/examples/qwen2_5omni/config_qwen2_5omni.json deleted file mode 100644 index 633e1b2b1..000000000 --- a/examples/qwen2_5omni/config_qwen2_5omni.json +++ /dev/null @@ -1,495 +0,0 @@ -{ - "architectures": [ - "Qwen2_5OmniModel" - ], - "enable_audio_output": true, - "enable_talker": true, - "model_type": "qwen2_5_omni", - "talker_config": { - "_attn_implementation_autoset": true, - "_name_or_path": "Qwen2.5-Omni-7B/talker", - "architectures": [ - "Qwen2OmniTalkerForConditionalGeneration" - ], - "attention_dropout": 0.0, - "audio_end_token_id": 151648, - "audio_start_token_id": 151647, - "audio_token_index": 151646, - "embedding_size": 3584, - "head_dim": 128, - "hidden_act": "silu", - "hidden_size": 896, - "image_token_index": 151655, - "init_std": 0.02, - "initializer_range": 0.02, - "intermediate_size": 18944, - "max_position_embeddings": 32768, - "max_window_layers": 28, - "model_type": "qwen2_5_omni_talker", - "num_attention_heads": 12, - "num_hidden_layers": 24, - "num_key_value_heads": 4, - "position_id_per_seconds": 25, - "rms_norm_eps": 1e-06, - "rope_scaling": { - "mrope_section": [ - 16, - 24, - 24 - ], - "rope_type": "default", - "type": "default" - }, - "rope_theta": 1000000.0, - "seconds_per_chunk": 2, - "sliding_window": 32768, - "spatial_merge_size": 2, - "torch_dtype": "bfloat16", - "tts_codec_end_token_id": 8294, - "tts_codec_mask_token_id": 8296, - "tts_codec_pad_token_id": 8292, - "tts_codec_start_token_id": 8293, - "tts_text_end_token_id": 151861, - "tts_text_pad_token_id": 151859, - "tts_text_start_token_id": 151860, - "use_cache": true, - "use_sliding_window": false, - "video_token_index": 151656, - "vision_end_token_id": 151653, - "vision_start_token_id": 151652, - 
"vocab_size": 8448 - }, - "thinker_config": { - "_attn_implementation_autoset": true, - "_name_or_path": "Qwen2.5-Omni-7B/thinker", - "architectures": [ - "Qwen2OmniNaViTThinkerForConditionalGeneration" - ], - "audio_config": { - "_attn_implementation_autoset": true, - "_name_or_path": "", - "activation_dropout": 0.0, - "activation_function": "gelu", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0.0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "d_model": 1280, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "dropout": 0.0, - "early_stopping": false, - "encoder_attention_heads": 20, - "encoder_ffn_dim": 5120, - "encoder_layerdrop": 0.0, - "encoder_layers": 32, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "init_std": 0.02, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1.0, - "max_length": 20, - "max_source_positions": 1500, - "min_length": 0, - "model_type": "qwen2_5_omni_audio_encoder", - "n_window": 100, - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_hidden_layers": 32, - "num_mel_bins": 128, - "num_return_sequences": 1, - "output_attentions": false, - "output_dim": 3584, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "scale_embedding": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "typical_p": 1.0, - "use_bfloat16": false - }, - "text_config": { - "model_type": "qwen2_5_omni_text", - "hidden_act": "silu", - "hidden_size": 3584, - "init_std": 0.02, - "intermediate_size": 18944, - "vocab_size": 152064, - "num_attention_heads": 28, - "num_hidden_layers": 28, - "num_key_value_heads": 4, - "max_position_embeddings": 32768, - "max_window_layers": 28, - "rms_norm_eps": 1e-06, - "rope_scaling": { - "mrope_section": [ - 16, - 24, - 24 - ], - "rope_type": "default", - "type": "default" - }, - "use_cache": true, - "rope_theta": 1000000.0, - "use_sliding_window": false, - "sliding_window": 32768, - "attention_dropout": 0.0, - "tie_word_embeddings": false - }, - "audio_end_token_id": 151648, - "audio_start_token_id": 151647, - "audio_token_index": 151646, - "bos_token_id": 151644, - "eos_token_id": 151645, - "ignore_index": -100, - "image_token_index": 151655, - "init_std": 0.02, - "model_type": "qwen2_5_omni_thinker", - "pad_token_id": 151643, - "position_id_per_seconds": 25, - "seconds_per_chunk": 2, - "torch_dtype": "bfloat16", - "user_token_id": 872, - "video_token_index": 151656, - "vision_config": { - "_attn_implementation_autoset": true, - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 
0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "depth": 32, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "embed_dim": 1280, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "fullatt_block_indexes": [ - 7, - 15, - 23, - 31 - ], - "hidden_act": "silu", - "hidden_size": 1280, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "in_channels": 3, - "in_chans": 3, - "init_std": 0.02, - "intermediate_size": 3420, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1.0, - "max_length": 20, - "min_length": 0, - "model_type": "qwen2_5_omni_vision_encoder", - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_heads": 16, - "num_return_sequences": 1, - "out_hidden_size": 3584, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 14, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "spatial_merge_size": 2, - "spatial_patch_size": 14, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "temporal_patch_size": 2, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "tokens_per_second": 25, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "typical_p": 1.0, - "use_bfloat16": false, - "window_size": 112 - }, - "vision_end_token_id": 151653, - "vision_start_token_id": 151652, - "vision_token_id": 151654 - }, - "token2wav_config": { - "_attn_implementation_autoset": true, - "bigvgan_config": { - "_attn_implementation_autoset": true, - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0.0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1.0, - "max_length": 20, - "mel_dim": 80, - "min_length": 0, - "model_type": "qwen2_5_omni_bigvgan", - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1.0, - "resblock_dilation_sizes": [ - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ], - [ - 1, - 3, - 5 - ] - ], - "resblock_kernel_sizes": [ - 3, - 7, - 11 - ], - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, 
- "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": null, - "torchscript": false, - "typical_p": 1.0, - "upsample_initial_channel": 1536, - "upsample_kernel_sizes": [ - 11, - 7, - 4, - 4, - 4, - 4 - ], - "upsample_rates": [ - 5, - 3, - 2, - 2, - 2, - 2 - ], - "use_bfloat16": false, - "use_bias_at_final": false - }, - "dit_config": { - "_attn_implementation_autoset": true, - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "depth": 22, - "dim": 1024, - "diversity_penalty": 0.0, - "do_sample": false, - "dropout": 0.1, - "early_stopping": false, - "emb_dim": 512, - "enc_attention_channels": 64, - "enc_channels": [ - 256, - 256, - 256, - 256, - 768 - ], - "enc_dilations": [ - 1, - 2, - 3, - 4, - 1 - ], - "enc_dim": 128, - "enc_emb_dim": 192, - "enc_global_context": true, - "enc_kernel_sizes": [ - 5, - 3, - 3, - 3, - 1 - ], - "enc_lin_neurons": 192, - "enc_res2net_scale": 2, - "enc_se_channels": 64, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "ff_mult": 2, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "head_dim": 64, - "heads": 16, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1.0, - "max_length": 20, - "mel_dim": 80, - "min_length": 0, - "model_type": "qwen2_5_omni_dit", - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_embeds": 8193, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "pruned_heads": {}, - "remove_invalid_values": false, - "repeats": 2, - "repetition_penalty": 1.0, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1.0, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1.0, - "torch_dtype": "float32", - "torchscript": false, - "typical_p": 1.0, - "use_bfloat16": false - }, - "model_type": "qwen2_5_omni_token2wav" - }, - "torch_dtype": "bfloat16", - "transformers_version": "4.50.0.dev0" -} \ No newline at end of file diff --git a/examples/qwen2_5omni/image_infer.cpp b/examples/qwen2_5omni/image_infer.cpp index 41bf770b1..3c0bf214b 100644 --- a/examples/qwen2_5omni/image_infer.cpp +++ b/examples/qwen2_5omni/image_infer.cpp @@ -50,12 +50,12 @@ MLLM_MAIN({ std::string prompt_text; fmt::print("Image path (or 'exit/quit'): "); - image_path = "../../../mllm2-former/mllm/rsc/pics.jpg"; + image_path = ""; //std::getline(std::cin, image_path); if (image_path == "exit" || image_path == "quit") { return 0; } fmt::print("Prompt text: "); - prompt_text = "描述图片中物体"; + prompt_text = ""; //std::getline(std::cin, prompt_text); try { From a235a134ec63263c1bf58c4beb7c5731ac5eec56 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Fri, 23 Jan 2026 15:35:49 +0800 Subject: [PATCH 12/42] fix --- examples/qwen2_5omni/audio_infer.cpp | 4 +- .../qwen2_5omni/config_qwen2_5omni_7B.json | 495 ++++++++++++++++++ 2 files changed, 497 
insertions(+), 2 deletions(-) create mode 100644 examples/qwen2_5omni/config_qwen2_5omni_7B.json diff --git a/examples/qwen2_5omni/audio_infer.cpp b/examples/qwen2_5omni/audio_infer.cpp index 014b4688f..d159c2b3e 100644 --- a/examples/qwen2_5omni/audio_infer.cpp +++ b/examples/qwen2_5omni/audio_infer.cpp @@ -51,12 +51,12 @@ MLLM_MAIN({ fmt::print("Audio path (or 'exit/quit'): "); //std::getline(std::cin, audio_path); //if (audio_path == "exit" || audio_path == "quit") { return 0; } - audio_path = "/Users/kkkai/Desktop/mllm2-former/mllm/rsc/recognize.wav"; + audio_path = ""; fmt::print("Prompt text: "); //std::getline(std::cin, prompt_text); //if (prompt_text.empty()) { prompt_text = "Please describe the audio."; } - prompt_text = "复述这段音频"; + prompt_text = ""; try { fmt::print("Processing...\n"); diff --git a/examples/qwen2_5omni/config_qwen2_5omni_7B.json b/examples/qwen2_5omni/config_qwen2_5omni_7B.json new file mode 100644 index 000000000..8f27b94b9 --- /dev/null +++ b/examples/qwen2_5omni/config_qwen2_5omni_7B.json @@ -0,0 +1,495 @@ +{ + "architectures": [ + "Qwen2_5OmniModel" + ], + "enable_audio_output": true, + "enable_talker": true, + "model_type": "qwen2_5_omni", + "talker_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "Qwen2.5-Omni-7B/talker", + "architectures": [ + "Qwen2OmniTalkerForConditionalGeneration" + ], + "attention_dropout": 0.0, + "audio_end_token_id": 151648, + "audio_start_token_id": 151647, + "audio_token_index": 151646, + "embedding_size": 3584, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 896, + "image_token_index": 151655, + "init_std": 0.02, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2_5_omni_talker", + "num_attention_heads": 12, + "num_hidden_layers": 24, + "num_key_value_heads": 4, + "position_id_per_seconds": 25, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "rope_theta": 1000000.0, + "seconds_per_chunk": 2, + "sliding_window": 32768, + "spatial_merge_size": 2, + "torch_dtype": "bfloat16", + "tts_codec_end_token_id": 8294, + "tts_codec_mask_token_id": 8296, + "tts_codec_pad_token_id": 8292, + "tts_codec_start_token_id": 8293, + "tts_text_end_token_id": 151861, + "tts_text_pad_token_id": 151859, + "tts_text_start_token_id": 151860, + "use_cache": true, + "use_sliding_window": false, + "video_token_index": 151656, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vocab_size": 8448 + }, + "thinker_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "Qwen2.5-Omni-7B/thinker", + "architectures": [ + "Qwen2OmniNaViTThinkerForConditionalGeneration" + ], + "audio_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "activation_dropout": 0.0, + "activation_function": "gelu", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "d_model": 1280, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.0, + "early_stopping": false, + "encoder_attention_heads": 20, + "encoder_ffn_dim": 5120, + "encoder_layerdrop": 0.0, + "encoder_layers": 32, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + 
"finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "init_std": 0.02, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_source_positions": 1500, + "min_length": 0, + "model_type": "qwen2_5_omni_audio_encoder", + "n_window": 100, + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 32, + "num_mel_bins": 128, + "num_return_sequences": 1, + "output_attentions": false, + "output_dim": 3584, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "scale_embedding": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "text_config": { + "model_type": "qwen2_5_omni_text", + "hidden_act": "silu", + "hidden_size": 3584, + "init_std": 0.02, + "intermediate_size": 18944, + "vocab_size": 152064, + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "rms_norm_eps": 1e-06, + "rope_scaling": { + "mrope_section": [ + 16, + 24, + 24 + ], + "rope_type": "default", + "type": "default" + }, + "use_cache": true, + "rope_theta": 1000000.0, + "use_sliding_window": false, + "sliding_window": 32768, + "attention_dropout": 0.0, + "tie_word_embeddings": false + }, + "audio_end_token_id": 151648, + "audio_start_token_id": 151647, + "audio_token_index": 151646, + "bos_token_id": 151644, + "eos_token_id": 151645, + "ignore_index": -100, + "image_token_index": 151655, + "init_std": 0.02, + "model_type": "qwen2_5_omni_thinker", + "pad_token_id": 151643, + "position_id_per_seconds": 25, + "seconds_per_chunk": 2, + "torch_dtype": "bfloat16", + "user_token_id": 872, + "video_token_index": 151656, + "vision_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "depth": 32, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "embed_dim": 1280, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fullatt_block_indexes": [ + 7, + 15, + 23, + 31 + ], + "hidden_act": "silu", + "hidden_size": 1280, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "in_channels": 3, + "in_chans": 3, + "init_std": 0.02, + "intermediate_size": 3420, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "qwen2_5_omni_vision_encoder", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_heads": 16, + 
"num_return_sequences": 1, + "out_hidden_size": 3584, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "spatial_merge_size": 2, + "spatial_patch_size": 14, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "temporal_patch_size": 2, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "tokens_per_second": 25, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "window_size": 112 + }, + "vision_end_token_id": 151653, + "vision_start_token_id": 151652, + "vision_token_id": 151654 + }, + "token2wav_config": { + "_attn_implementation_autoset": true, + "bigvgan_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "mel_dim": 80, + "min_length": 0, + "model_type": "qwen2_5_omni_bigvgan", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "upsample_initial_channel": 1536, + "upsample_kernel_sizes": [ + 11, + 7, + 4, + 4, + 4, + 4 + ], + "upsample_rates": [ + 5, + 3, + 2, + 2, + 2, + 2 + ], + "use_bfloat16": false, + "use_bias_at_final": false + }, + "dit_config": { + "_attn_implementation_autoset": true, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "depth": 22, + "dim": 1024, + "diversity_penalty": 0.0, + "do_sample": false, + "dropout": 0.1, + "early_stopping": false, + "emb_dim": 512, + "enc_attention_channels": 64, + "enc_channels": [ + 256, + 256, + 256, + 256, + 768 + ], + 
"enc_dilations": [ + 1, + 2, + 3, + 4, + 1 + ], + "enc_dim": 128, + "enc_emb_dim": 192, + "enc_global_context": true, + "enc_kernel_sizes": [ + 5, + 3, + 3, + 3, + 1 + ], + "enc_lin_neurons": 192, + "enc_res2net_scale": 2, + "enc_se_channels": 64, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "ff_mult": 2, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 64, + "heads": 16, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "mel_dim": 80, + "min_length": 0, + "model_type": "qwen2_5_omni_dit", + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_embeds": 8193, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repeats": 2, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "model_type": "qwen2_5_omni_token2wav" + }, + "torch_dtype": "bfloat16", + "transformers_version": "4.50.0.dev0" +} From adc3b644af619085240f98ecb058f0cbf66da7fc Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Sun, 25 Jan 2026 01:51:00 +0800 Subject: [PATCH 13/42] add ConvTranspose1dOp & TanhOp --- mllm/backends/cpu/CPUBackend.cpp | 14 +-- mllm/backends/cpu/ops/ConvTranspose1DOp.cpp | 91 ++++++++++++++++++++ mllm/backends/cpu/ops/ConvTranspose1DOp.hpp | 25 ++++++ mllm/backends/cpu/ops/TanhOp.cpp | 42 +++++++++ mllm/backends/cpu/ops/TanhOp.hpp | 25 ++++++ mllm/compile/ir/GeneratedRTTIKind.hpp | 2 + mllm/compile/ir/NodeRTTIClassOfImpl.hpp | 6 ++ mllm/compile/ir/linalg/Op.cpp | 2 + mllm/compile/ir/linalg/Op.hpp | 4 + mllm/core/OpTypes.hpp | 4 + mllm/core/aops/ConvTranspose1DOp.cpp | 95 +++++++++++++++++++++ mllm/core/aops/ConvTranspose1DOp.hpp | 52 +++++++++++ mllm/core/aops/TanhOp.cpp | 37 ++++++++ mllm/core/aops/TanhOp.hpp | 33 +++++++ mllm/nn/Nn.hpp | 2 + mllm/nn/layers/ConvTranspose1D.cpp | 32 +++++++ mllm/nn/layers/ConvTranspose1D.hpp | 29 +++++++ mllm/nn/layers/Tanh.cpp | 12 +++ mllm/nn/layers/Tanh.hpp | 21 +++++ 19 files changed, 522 insertions(+), 6 deletions(-) create mode 100644 mllm/backends/cpu/ops/ConvTranspose1DOp.cpp create mode 100644 mllm/backends/cpu/ops/ConvTranspose1DOp.hpp create mode 100644 mllm/backends/cpu/ops/TanhOp.cpp create mode 100644 mllm/backends/cpu/ops/TanhOp.hpp create mode 100644 mllm/core/aops/ConvTranspose1DOp.cpp create mode 100644 mllm/core/aops/ConvTranspose1DOp.hpp create mode 100644 mllm/core/aops/TanhOp.cpp create mode 100644 mllm/core/aops/TanhOp.hpp create mode 100644 mllm/nn/layers/ConvTranspose1D.cpp create mode 100644 mllm/nn/layers/ConvTranspose1D.hpp create mode 100644 mllm/nn/layers/Tanh.cpp create mode 100644 mllm/nn/layers/Tanh.hpp diff --git a/mllm/backends/cpu/CPUBackend.cpp b/mllm/backends/cpu/CPUBackend.cpp index 0964cba0d..f4b909913 100644 --- a/mllm/backends/cpu/CPUBackend.cpp +++ 
b/mllm/backends/cpu/CPUBackend.cpp @@ -14,6 +14,7 @@ #include "mllm/backends/cpu/ops/ConcatOp.hpp" #include "mllm/backends/cpu/ops/ContiguousOp.hpp" #include "mllm/backends/cpu/ops/Conv1DOp.hpp" +#include "mllm/backends/cpu/ops/ConvTranspose1DOp.hpp" #include "mllm/backends/cpu/ops/Conv2DOp.hpp" #include "mllm/backends/cpu/ops/Conv3DOp.hpp" #include "mllm/backends/cpu/ops/CopyOp.hpp" @@ -52,6 +53,7 @@ #include "mllm/backends/cpu/ops/Scatter2ShardsOp.hpp" #include "mllm/backends/cpu/ops/SiLUOp.hpp" #include "mllm/backends/cpu/ops/SigmoidOp.hpp" +#include "mllm/backends/cpu/ops/TanhOp.hpp" #include "mllm/backends/cpu/ops/SliceOp.hpp" #include "mllm/backends/cpu/ops/SoftmaxOp.hpp" #include "mllm/backends/cpu/ops/SplitOp.hpp" @@ -78,12 +80,12 @@ CPUBackend::CPUBackend() : Backend(kCPU, createCPUAllocator()) { CPUSiLUOpFactory, CPUSigmoidOpFactory, CPURMSNormOpFactory, CPUGELUOpFactory, CPUQuickGELUOpFactory, CPUReLUOpFactory, CPUMatMulOpFactory, CPUFlashAttention2OpFactory, CPUSliceOpFactory, CPUVisionRoPEOpFactory, CPUParamOpFactory, CPUMultimodalRoPEOpFactory, CPURoPEOpFactory, CPUCausalMaskOpFactory, CPUConv1DOpFactory, - CPUConv3DOpFactory, CPUSTFTOpFactory, CPUISTFTOpFactory, CPUIndexOpFactory, CPUTopKOpFactory, CPUClipOpFactory, - CPUMeanOpFactory, CPUKVCacheOpFactory, CPUPagedAttnOpFactory, CPUScatter2ShardsOpFactory, CPURadixAttnOpFactory, - CPUConv2DOpFactory, CPULayerNorm2DOpFactory, CPUInterpolateOpFactory, CPUPadOpFactory, CPUMaskedScatterOpFactory, - CPUArgsortOpFactory, CPUCloneOpFactory, CPUAvgPool1dOpFactory, CPUFlashAttention2SwaSinkOpFactory, - CPURadixAttnRelaxOpFactory, CPURadixAttnSwaSinkOpFactory, CPUEqualOpFactory, CPUWhereOpFactory, - CPUGatherOpFactory>(); + CPUConvTranspose1DOpFactory, CPUConv3DOpFactory, CPUSTFTOpFactory, CPUISTFTOpFactory, CPUIndexOpFactory, + CPUTopKOpFactory, CPUClipOpFactory, CPUMeanOpFactory, CPUKVCacheOpFactory, CPUPagedAttnOpFactory, + CPUScatter2ShardsOpFactory, CPURadixAttnOpFactory, CPUConv2DOpFactory, CPULayerNorm2DOpFactory, + CPUInterpolateOpFactory, CPUPadOpFactory, CPUMaskedScatterOpFactory, CPUArgsortOpFactory, CPUCloneOpFactory, + CPUAvgPool1dOpFactory, CPUFlashAttention2SwaSinkOpFactory, CPURadixAttnRelaxOpFactory, + CPURadixAttnSwaSinkOpFactory, CPUEqualOpFactory, CPUWhereOpFactory, CPUGatherOpFactory, CPUTanhOpFactory>(); } CPUBackend::~CPUBackend() { diff --git a/mllm/backends/cpu/ops/ConvTranspose1DOp.cpp b/mllm/backends/cpu/ops/ConvTranspose1DOp.cpp new file mode 100644 index 000000000..cfa38bf34 --- /dev/null +++ b/mllm/backends/cpu/ops/ConvTranspose1DOp.cpp @@ -0,0 +1,91 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/backends/cpu/ops/ConvTranspose1DOp.hpp" +#include "mllm/core/Parallel.hpp" +#include "mllm/utils/Common.hpp" + +namespace mllm::cpu { + +CPUConvTranspose1DOp::CPUConvTranspose1DOp(const aops::ConvTranspose1DOpOptions& options) + : aops::ConvTranspose1DOp(options) {} + +void CPUConvTranspose1DOp::forward(const std::vector& inputs, std::vector& outputs) { + auto& input = inputs[0]; + auto& output = outputs[0]; + + auto i_shape = input.shape(); + auto o_shape = output.shape(); + + // input shape: [batch, in_channels, sequence] + // output shape: [batch, out_channels, out_sequence] + const int batch = i_shape[0]; + const int in_channels = i_shape[1]; + const int sequence = i_shape[2]; + + const int out_channels = o_shape[1]; + const int out_sequence = o_shape[2]; + + const int kernel_size = options_.kernel_size; + const int stride = options_.stride; + const int padding = options_.padding; + const int dilation = options_.dilation; + const int groups = options_.groups; + + const int in_channels_per_group = in_channels / groups; + const int out_channels_per_group = out_channels / groups; + + MLLM_RT_ASSERT(weight_.dtype() == kFloat32); + const auto* weight_ptr = weight_.ptr(); + const auto* input_ptr = input.ptr(); + auto* output_ptr = output.ptr(); + + float* bias_ptr = nullptr; + if (options_.bias && !bias_.isNil()) { bias_ptr = bias_.ptr(); } + + std::fill_n(output_ptr, output.numel(), 0.0f); + + const int total_iterations = batch * out_channels * out_sequence; + + switch (output.dtype()) { + case kFloat32: + MLLM_CONDITIONAL_PARALLEL_FOR(options_.getThreads() > 1, 4, idx, 0, total_iterations, 1, { + int b = idx / (out_channels * out_sequence); + int oc = (idx % (out_channels * out_sequence)) / out_sequence; + int out_pos = idx % out_sequence; + + const int group_idx = oc / out_channels_per_group; + const int oc_in_group = oc % out_channels_per_group; + + float sum = 0.0f; + + for (int ic_in_group = 0; ic_in_group < in_channels_per_group; ++ic_in_group) { + const int ic = group_idx * in_channels_per_group + ic_in_group; + const int base_input_idx = b * (in_channels * sequence) + ic * sequence; + + const int base_weight_idx = (ic * out_channels_per_group + oc_in_group) * kernel_size; + + for (int k = 0; k < kernel_size; ++k) { + int input_pos = out_pos + padding - k * dilation; + if (input_pos % stride != 0) { continue; } + input_pos /= stride; + if (input_pos < 0 || input_pos >= sequence) { continue; } + + const int input_idx = base_input_idx + input_pos; + const int weight_idx = base_weight_idx + k; + + sum += input_ptr[input_idx] * weight_ptr[weight_idx]; + } + } + + if (bias_ptr) { sum += bias_ptr[oc]; } + + const int output_idx = b * (out_channels * out_sequence) + oc * out_sequence + out_pos; + output_ptr[output_idx] = sum; + }); + break; + default: NYI("ConvTranspose1D: unsupported data type"); + } +} + +} // namespace mllm::cpu diff --git a/mllm/backends/cpu/ops/ConvTranspose1DOp.hpp b/mllm/backends/cpu/ops/ConvTranspose1DOp.hpp new file mode 100644 index 000000000..fd1163ed3 --- /dev/null +++ b/mllm/backends/cpu/ops/ConvTranspose1DOp.hpp @@ -0,0 +1,25 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/ConvTranspose1DOp.hpp" + +namespace mllm::cpu { + +class CPUConvTranspose1DOp final : public aops::ConvTranspose1DOp { + public: + explicit CPUConvTranspose1DOp(const aops::ConvTranspose1DOpOptions& options); + + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class CPUConvTranspose1DOpFactory : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::ConvTranspose1DOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::cpu diff --git a/mllm/backends/cpu/ops/TanhOp.cpp b/mllm/backends/cpu/ops/TanhOp.cpp new file mode 100644 index 000000000..3d8dc6af1 --- /dev/null +++ b/mllm/backends/cpu/ops/TanhOp.cpp @@ -0,0 +1,42 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include + +#include "mllm/backends/cpu/ops/TanhOp.hpp" +#include "mllm/core/Parallel.hpp" +#include "mllm/utils/Common.hpp" + +namespace mllm::cpu { + +CPUTanhOp::CPUTanhOp(const aops::TanhOpOptions& options) : aops::TanhOp(options) {} + +void CPUTanhOp::forward(const std::vector& inputs, std::vector& outputs) { + const auto& X = inputs[0]; + auto& Y = outputs[0]; + + const auto numel = X.numel(); + + switch (X.dtype()) { + case kFloat32: { + const auto* x_ptr = X.ptr(); + auto* y_ptr = Y.ptr(); + MLLM_CONDITIONAL_PARALLEL_FOR(options_.getThreads() > 1, 4, idx, 0, numel, 1, { + y_ptr[idx] = std::tanh(x_ptr[idx]); + }); + break; + } + case kFloat16: { + const auto* x_ptr = X.ptr(); + auto* y_ptr = Y.ptr(); + MLLM_CONDITIONAL_PARALLEL_FOR(options_.getThreads() > 1, 4, idx, 0, numel, 1, { + float v = static_cast(x_ptr[idx]); + y_ptr[idx] = static_cast(std::tanh(v)); + }); + break; + } + default: NYI("CPUTanhOp::forward not support dtype {}", nameOfType(X.dtype())); break; + } +} + +} // namespace mllm::cpu diff --git a/mllm/backends/cpu/ops/TanhOp.hpp b/mllm/backends/cpu/ops/TanhOp.hpp new file mode 100644 index 000000000..c88fae9ce --- /dev/null +++ b/mllm/backends/cpu/ops/TanhOp.hpp @@ -0,0 +1,25 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/aops/TanhOp.hpp" + +namespace mllm::cpu { + +class CPUTanhOp final : public aops::TanhOp { + public: + explicit CPUTanhOp(const aops::TanhOpOptions& options); + + void forward(const std::vector& inputs, std::vector& outputs) override; +}; + +class CPUTanhOpFactory : public TypedOpFactory { + public: + std::shared_ptr createOpImpl(const aops::TanhOpOptions& options) override { + return std::make_shared(options); + } +}; + +} // namespace mllm::cpu diff --git a/mllm/compile/ir/GeneratedRTTIKind.hpp b/mllm/compile/ir/GeneratedRTTIKind.hpp index d100dc621..2d83493d1 100644 --- a/mllm/compile/ir/GeneratedRTTIKind.hpp +++ b/mllm/compile/ir/GeneratedRTTIKind.hpp @@ -44,6 +44,7 @@ enum NodeKind : uint32_t { RK_Op_LinalgIROp_RepeatOp, RK_Op_LinalgIROp_PermuteOp, RK_Op_LinalgIROp_Conv1DOp, + RK_Op_LinalgIROp_ConvTranspose1DOp, RK_Op_LinalgIROp_Conv2DOp, RK_Op_LinalgIROp_Conv3DOp, RK_Op_LinalgIROp_GELUOp, @@ -86,6 +87,7 @@ enum NodeKind : uint32_t { RK_Op_LinalgIROp_EqualOp, RK_Op_LinalgIROp_WhereOp, RK_Op_LinalgIROp_SigmoidOp, + RK_Op_LinalgIROp_TanhOp, RK_Op_LinalgIROp_CustomizedOp, RK_Op_LinalgIROp_Last, RK_Op_GraphIROp, diff --git a/mllm/compile/ir/NodeRTTIClassOfImpl.hpp b/mllm/compile/ir/NodeRTTIClassOfImpl.hpp index 6a98797a9..4c3313cf9 100644 --- a/mllm/compile/ir/NodeRTTIClassOfImpl.hpp +++ b/mllm/compile/ir/NodeRTTIClassOfImpl.hpp @@ -102,6 +102,9 @@ struct NodeRTTIClassOfImpl { #define RTTI_RK_OP_LINALGIROP_CONV1DOP_IMPL(v) \ return (v)->getKind() >= RK_Op_LinalgIROp_Conv1DOp && (v)->getKind() <= RK_Op_LinalgIROp_Conv1DOp +#define RTTI_RK_OP_LINALGIROP_CONVTRANSPOSE1DOP_IMPL(v) \ + return (v)->getKind() >= RK_Op_LinalgIROp_ConvTranspose1DOp && (v)->getKind() <= RK_Op_LinalgIROp_ConvTranspose1DOp + #define RTTI_RK_OP_LINALGIROP_CONV2DOP_IMPL(v) \ return (v)->getKind() >= RK_Op_LinalgIROp_Conv2DOp && (v)->getKind() <= RK_Op_LinalgIROp_Conv2DOp @@ -229,6 +232,9 @@ struct NodeRTTIClassOfImpl { #define RTTI_RK_OP_LINALGIROP_SIGMOIDOP_IMPL(v) \ return (v)->getKind() >= RK_Op_LinalgIROp_SigmoidOp && (v)->getKind() <= RK_Op_LinalgIROp_SigmoidOp +#define RTTI_RK_OP_LINALGIROP_TANHOP_IMPL(v) \ + return (v)->getKind() >= RK_Op_LinalgIROp_TanhOp && (v)->getKind() <= RK_Op_LinalgIROp_TanhOp + #define RTTI_RK_OP_LINALGIROP_CUSTOMIZEDOP_IMPL(v) \ return (v)->getKind() >= RK_Op_LinalgIROp_CustomizedOp && (v)->getKind() <= RK_Op_LinalgIROp_CustomizedOp diff --git a/mllm/compile/ir/linalg/Op.cpp b/mllm/compile/ir/linalg/Op.cpp index bb4e2fb9d..ad05e9437 100644 --- a/mllm/compile/ir/linalg/Op.cpp +++ b/mllm/compile/ir/linalg/Op.cpp @@ -55,6 +55,7 @@ LINALG_AOPS_DECL(OpTypes::kTranspose, TransposeOp); LINALG_AOPS_DECL(OpTypes::kRMSNorm, RMSNormOp); LINALG_AOPS_DECL(OpTypes::kSiLU, SiLUOp); LINALG_AOPS_DECL(OpTypes::kSigmoid, SigmoidOp); +LINALG_AOPS_DECL(OpTypes::kTanh, TanhOp); LINALG_AOPS_DECL(OpTypes::kCastType, CastTypeOp); @@ -70,6 +71,7 @@ LINALG_AOPS_DECL(OpTypes::kRepeat, RepeatOp); LINALG_AOPS_DECL(OpTypes::kPermute, PermuteOp); LINALG_AOPS_DECL(OpTypes::kConv1D, Conv1DOp); +LINALG_AOPS_DECL(OpTypes::kConvTranspose1D, ConvTranspose1DOp); LINALG_AOPS_DECL(OpTypes::kConv2D, Conv2DOp); LINALG_AOPS_DECL(OpTypes::kConv3D, Conv3DOp); diff --git a/mllm/compile/ir/linalg/Op.hpp b/mllm/compile/ir/linalg/Op.hpp index 02d04400b..6e6de4785 100644 --- a/mllm/compile/ir/linalg/Op.hpp +++ b/mllm/compile/ir/linalg/Op.hpp @@ -29,6 +29,7 @@ class TransposeOp; class RMSNormOp; class SiLUOp; class SigmoidOp; +class TanhOp; class CausalMaskOp; class 
CastTypeOp; class X2XOp; @@ -38,6 +39,7 @@ class FlashAttention2Op; class RepeatOp; class PermuteOp; class Conv1DOp; +class ConvTranspose1DOp; class Conv2DOp; class Conv3DOp; class GELUOp; @@ -188,6 +190,7 @@ LINALG_AOPS_DEFINE(TransposeOp, TRANSPOSEOP); LINALG_AOPS_DEFINE(RMSNormOp, RMSNORMOP); LINALG_AOPS_DEFINE(SiLUOp, SILUOP); LINALG_AOPS_DEFINE(SigmoidOp, SIGMOIDOP); +LINALG_AOPS_DEFINE(TanhOp, TANHOP); LINALG_AOPS_DEFINE(CastTypeOp, CASTTYPEOP); @@ -201,6 +204,7 @@ LINALG_AOPS_DEFINE(RepeatOp, REPEATOP); LINALG_AOPS_DEFINE(PermuteOp, PERMUTEOP); LINALG_AOPS_DEFINE(Conv1DOp, CONV1DOP); +LINALG_AOPS_DEFINE(ConvTranspose1DOp, CONVTRANSPOSE1DOP); LINALG_AOPS_DEFINE(Conv2DOp, CONV2DOP); LINALG_AOPS_DEFINE(Conv3DOp, CONV3DOP); diff --git a/mllm/core/OpTypes.hpp b/mllm/core/OpTypes.hpp index 310b39cd0..d64d484fe 100644 --- a/mllm/core/OpTypes.hpp +++ b/mllm/core/OpTypes.hpp @@ -96,6 +96,8 @@ enum class OpTypes : int32_t { kWhere = 74, kSigmoid = 75, + kTanh = 76, + kConvTranspose1D = 77, // Dynamic Op Start for user to register there own ops. kDynamicOp_Start = 4096, @@ -181,6 +183,8 @@ inline std::string optype2Str(OpTypes type) { case OpTypes::kEqual: return "Equal"; case OpTypes::kWhere: return "Where"; case OpTypes::kSigmoid: return "Sigmoid"; + case OpTypes::kTanh: return "Tanh"; + case OpTypes::kConvTranspose1D: return "ConvTranspose1D"; case OpTypes::kDynamicOp_Start: return "DynamicOp_Start"; case OpTypes::kOpType_End: return "OpType_End"; default: return "Unknown"; diff --git a/mllm/core/aops/ConvTranspose1DOp.cpp b/mllm/core/aops/ConvTranspose1DOp.cpp new file mode 100644 index 000000000..25d1b5935 --- /dev/null +++ b/mllm/core/aops/ConvTranspose1DOp.cpp @@ -0,0 +1,95 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/core/aops/ConvTranspose1DOp.hpp" +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/compile/ir/linalg/Op.hpp" +#include "mllm/compile/ir/graph/Op.hpp" +#include "mllm/compile/ir/tensor/Op.hpp" + +namespace mllm::aops { + +ConvTranspose1DOp::ConvTranspose1DOp(const ConvTranspose1DOpOptions& options) + : BaseOp(OpTypes::kConvTranspose1D), options_(options) {} + +void ConvTranspose1DOp::load(const ParameterFile::ptr_t& ploader) { + switch (ploader->version()) { + case ModelFileVersion::kV1: { + weight_ = ploader->pull(getName() + ".weight"); + if (options_.bias) { bias_ = ploader->pull(getName() + ".bias"); } + weight_ = weight_.view({options_.in_channels, options_.out_channels / options_.groups, options_.kernel_size}); + if (options_.bias) { bias_ = bias_.view({options_.out_channels}); } + break; + } + case ModelFileVersion::kUserTemporary: + case ModelFileVersion::kV2: { + weight_ = ploader->pull(getName() + ".weight"); + if (options_.bias) { bias_ = ploader->pull(getName() + ".bias"); } + break; + } + default: NYI("Unsupported model file version") + } +} + +void ConvTranspose1DOp::trace(void* trace_context, const std::vector& inputs, std::vector& outputs) { + auto ir_ctx = (ir::IRContext*)trace_context; + + if (weight_ && !ir_ctx->lookupSymbolTable(getName() + ".weight")) { + ir::IRWriterGuard guard(ir_ctx, ir_ctx->lookupSymbolTable("init")->cast_()->getTopRegion()); + ir_ctx->create(ir_ctx->create(weight_)); + if (options_.bias) { ir_ctx->create(ir_ctx->create(bias_)); } + } + + auto i_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, inputs); + auto o_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, outputs); + ir_ctx->create(shared_from_this(), i_irs, o_irs); +} + +void 
ConvTranspose1DOp::forward(const std::vector& inputs, std::vector& outputs) { + NYI("ConvTranspose1DOp::forward not implemented in aops base."); +} + +void ConvTranspose1DOp::reshape(const std::vector& inputs, std::vector& outputs) { + const auto& i = inputs[0]; + const auto& ishape = i.shape(); + + if (ishape.size() != 3) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "ConvTranspose1DOp expects 3D input, got {} D", ishape.size()); + outputs.emplace_back(Tensor::empty(i.shape(), i.dtype(), i.device())); + return; + } + + const int batch = ishape[0]; + const int in_channels = ishape[1]; + const int sequence = ishape[2]; + + MLLM_RT_ASSERT_EQ(in_channels, options_.in_channels); + MLLM_RT_ASSERT_EQ(in_channels % options_.groups, 0); + MLLM_RT_ASSERT_EQ(options_.out_channels % options_.groups, 0); + + const int kernel_size = options_.kernel_size; + const int stride = options_.stride; + const int dilation = options_.dilation; + const int padding = options_.padding; + const int output_padding = options_.output_padding; + + const int seq_out = (sequence - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1; + + auto new_shape = std::vector{batch, options_.out_channels, seq_out}; + outputs.emplace_back(Tensor::empty(new_shape, i.dtype(), i.device())); +} + +void ConvTranspose1DOp::setup(const std::vector& inputs, std::vector& outputs) { + BaseOp::setup(inputs, outputs); +} + +ParameterFile::ptr_t ConvTranspose1DOp::getParams() { + auto p = ParameterFile::create(); + p->push(getName() + ".weight", weight_); + if (options_.bias) { p->push(getName() + ".bias", bias_); } + return p; +} + +} // namespace mllm::aops diff --git a/mllm/core/aops/ConvTranspose1DOp.hpp b/mllm/core/aops/ConvTranspose1DOp.hpp new file mode 100644 index 000000000..daeda0b8e --- /dev/null +++ b/mllm/core/aops/ConvTranspose1DOp.hpp @@ -0,0 +1,52 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/ParameterFile.hpp" + +namespace mllm::aops { + +struct ConvTranspose1DOpOptions : public BaseOpOptions { + int32_t in_channels; + int32_t out_channels; + int32_t kernel_size; + int32_t stride = 1; + int32_t padding = 0; + int32_t output_padding = 0; + int32_t dilation = 1; + int32_t groups = 1; + bool bias = true; +}; + +class ConvTranspose1DOp : public BaseOp { + public: + explicit ConvTranspose1DOp(const ConvTranspose1DOpOptions& options); + + void load(const ParameterFile::ptr_t& ploader) override; + + void trace(void* trace_context, const std::vector& inputs, std::vector& outputs) override; + + void forward(const std::vector& inputs, std::vector& outputs) override; + + void reshape(const std::vector& inputs, std::vector& outputs) override; + + void setup(const std::vector& inputs, std::vector& outputs) override; + + ParameterFile::ptr_t getParams() override; + + inline Tensor& weight() { return weight_; } + + inline Tensor& bias() { return bias_; } + + inline ConvTranspose1DOpOptions& options() { return options_; } + + protected: + Tensor weight_; + Tensor bias_; + ConvTranspose1DOpOptions options_; +}; + +} // namespace mllm::aops diff --git a/mllm/core/aops/TanhOp.cpp b/mllm/core/aops/TanhOp.cpp new file mode 100644 index 000000000..c0938d82f --- /dev/null +++ b/mllm/core/aops/TanhOp.cpp @@ -0,0 +1,37 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/core/aops/TanhOp.hpp" +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/Tensor.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/compile/ir/linalg/Op.hpp" + +namespace mllm::aops { + +TanhOp::TanhOp(const TanhOpOptions& options) : BaseOp(OpTypes::kTanh), options_(options) {} + +void TanhOp::load(const ParameterFile::ptr_t& ploader) { MLLM_EMPTY_SCOPE; } + +void TanhOp::trace(void* trace_context, const std::vector& inputs, std::vector& outputs) { + auto ir_ctx = (ir::IRContext*)trace_context; + auto i_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, inputs); + auto o_irs = ir::tensor::wrapTensors2TensorIR(ir_ctx, outputs); + ir_ctx->create(shared_from_this(), i_irs, o_irs); +} + +void TanhOp::forward(const std::vector& inputs, std::vector& outputs) { + NYI("TanhOp::forward not implemented in aops base."); +} + +void TanhOp::reshape(const std::vector& inputs, std::vector& outputs) { + if (options_.isInplace()) { + outputs.emplace_back(inputs[0]); + } else { + outputs.emplace_back(Tensor::empty(inputs[0].shape(), inputs[0].dtype(), inputs[0].device())); + } +} + +void TanhOp::setup(const std::vector& inputs, std::vector& outputs) { BaseOp::setup(inputs, outputs); } + +} // namespace mllm::aops diff --git a/mllm/core/aops/TanhOp.hpp b/mllm/core/aops/TanhOp.hpp new file mode 100644 index 000000000..8b2ce4f43 --- /dev/null +++ b/mllm/core/aops/TanhOp.hpp @@ -0,0 +1,33 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include "mllm/core/BaseOp.hpp" +#include "mllm/core/ParameterFile.hpp" + +namespace mllm::aops { + +struct TanhOpOptions : public BaseOpOptions {}; + +class TanhOp : public BaseOp { + public: + explicit TanhOp(const TanhOpOptions& options); + + void load(const ParameterFile::ptr_t& ploader) override; + + void trace(void* trace_context, const std::vector& inputs, std::vector& outputs) override; + + void forward(const std::vector& inputs, std::vector& outputs) override; + + void reshape(const std::vector& inputs, std::vector& outputs) override; + + void setup(const std::vector& inputs, std::vector& outputs) override; + + inline TanhOpOptions& options() { return options_; } + + protected: + TanhOpOptions options_; +}; + +} // namespace mllm::aops diff --git a/mllm/nn/Nn.hpp b/mllm/nn/Nn.hpp index fdb0edc82..160e1cb43 100644 --- a/mllm/nn/Nn.hpp +++ b/mllm/nn/Nn.hpp @@ -11,6 +11,7 @@ #include "mllm/nn/layers/RMSNorm.hpp" // IWYU pragma: export #include "mllm/nn/layers/SiLU.hpp" // IWYU pragma: export #include "mllm/nn/layers/Sigmoid.hpp" // IWYU pragma: export +#include "mllm/nn/layers/Tanh.hpp" // IWYU pragma: export #include "mllm/nn/layers/Embedding.hpp" // IWYU pragma: export #include "mllm/nn/layers/GELU.hpp" // IWYU pragma: export #include "mllm/nn/layers/QuickGELU.hpp" // IWYU pragma: export @@ -26,6 +27,7 @@ #include "mllm/nn/layers/Param.hpp" // IWYU pragma: export #include "mllm/nn/layers/KVCache.hpp" // IWYU pragma: export #include "mllm/nn/layers/Conv1D.hpp" // IWYU pragma: export +#include "mllm/nn/layers/ConvTranspose1D.hpp" // IWYU pragma: export #include "mllm/nn/layers/AvgPool1d.hpp" // IWYU pragma: export #include "mllm/nn/layers/STFT.hpp" // IWYU pragma: export #include "mllm/nn/layers/PagedAttn.hpp" // IWYU pragma: export diff --git a/mllm/nn/layers/ConvTranspose1D.cpp b/mllm/nn/layers/ConvTranspose1D.cpp new file mode 100644 index 000000000..de2a7a5c7 --- /dev/null +++ b/mllm/nn/layers/ConvTranspose1D.cpp @@ -0,0 +1,32 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#include "mllm/nn/layers/ConvTranspose1D.hpp" + +namespace mllm::nn { + +ConvTranspose1D::ConvTranspose1D() : Layer(OpTypes::kConvTranspose1D, aops::ConvTranspose1DOpOptions{}) {} + +ConvTranspose1D::ConvTranspose1D(int32_t in_channels, int32_t out_channels, int32_t kernel_size, int32_t stride_size, + int32_t padding, int32_t output_padding, int32_t dilation, int32_t groups, bool bias) + : Layer(OpTypes::kConvTranspose1D, aops::ConvTranspose1DOpOptions{.in_channels = in_channels, + .out_channels = out_channels, + .kernel_size = kernel_size, + .stride = stride_size, + .padding = padding, + .output_padding = output_padding, + .dilation = dilation, + .groups = groups, + .bias = bias}) {} + +ConvTranspose1D::ConvTranspose1D(const aops::ConvTranspose1DOpOptions& options) : Layer(OpTypes::kConvTranspose1D, options) {} + +Tensor ConvTranspose1D::weight() const { + return std::static_pointer_cast(impl()->getInstancedOp())->weight(); +} + +Tensor ConvTranspose1D::bias() const { + return std::static_pointer_cast(impl()->getInstancedOp())->bias(); +} + +} // namespace mllm::nn diff --git a/mllm/nn/layers/ConvTranspose1D.hpp b/mllm/nn/layers/ConvTranspose1D.hpp new file mode 100644 index 000000000..6ddc2fac3 --- /dev/null +++ b/mllm/nn/layers/ConvTranspose1D.hpp @@ -0,0 +1,29 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#pragma once + +#include +#include "mllm/nn/Layer.hpp" +#include "mllm/core/aops/ConvTranspose1DOp.hpp" + +namespace mllm::nn { + +class ConvTranspose1D : public Layer { + public: + ConvTranspose1D(); + + ConvTranspose1D(int32_t in_channels, int32_t out_channels, int32_t kernel_size, int32_t stride_size = 1, + int32_t padding = 0, int32_t output_padding = 0, int32_t dilation = 1, int32_t groups = 1, + bool bias = true); + + explicit ConvTranspose1D(const aops::ConvTranspose1DOpOptions& options); + + [[nodiscard]] Tensor weight() const; + + [[nodiscard]] Tensor bias() const; + + MLLM_LAYER_ANY_INPUTS_1_OUTPUTS_FORWARD +}; + +} // namespace mllm::nn diff --git a/mllm/nn/layers/Tanh.cpp b/mllm/nn/layers/Tanh.cpp new file mode 100644 index 000000000..dda95f7ae --- /dev/null +++ b/mllm/nn/layers/Tanh.cpp @@ -0,0 +1,12 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include "mllm/nn/layers/Tanh.hpp" + +namespace mllm::nn { + +Tanh::Tanh() : Layer(OpTypes::kTanh, aops::TanhOpOptions{}) {} + +Tanh::Tanh(const aops::TanhOpOptions& options) : Layer(OpTypes::kTanh, options) {} + +} // namespace mllm::nn diff --git a/mllm/nn/layers/Tanh.hpp b/mllm/nn/layers/Tanh.hpp new file mode 100644 index 000000000..ab84e7eeb --- /dev/null +++ b/mllm/nn/layers/Tanh.hpp @@ -0,0 +1,21 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
+ +#pragma once + +#include "mllm/nn/Layer.hpp" +#include "mllm/core/aops/TanhOp.hpp" + +namespace mllm::nn { + +class Tanh : public Layer { + public: + Tanh(); + + explicit Tanh(const aops::TanhOpOptions& options); + + MLLM_LAYER_ANY_INPUTS_1_OUTPUTS_FORWARD + MLLM_LAYER_ENABLE_INPLACE_ATTRIBUTE(Tanh) +}; + +} // namespace mllm::nn From 674f97c4cb02b07a53cb182d27799745c12fa5a5 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Sun, 25 Jan 2026 19:25:41 +0800 Subject: [PATCH 14/42] fix: fix Tanh op and add test for Tanh Op and ConvTranspose1d Op --- mllm/backends/cpu/ops/ConvTranspose1DOp.cpp | 2 + tests/cpu/ConvTranspose1DKernelTest.hpp | 134 ++++++++++++++++++++ tests/cpu/KernelTest.cpp | 42 ++++++ tests/cpu/TanhKernelTest.hpp | 49 +++++++ 4 files changed, 227 insertions(+) create mode 100644 tests/cpu/ConvTranspose1DKernelTest.hpp create mode 100644 tests/cpu/TanhKernelTest.hpp diff --git a/mllm/backends/cpu/ops/ConvTranspose1DOp.cpp b/mllm/backends/cpu/ops/ConvTranspose1DOp.cpp index cfa38bf34..15a8097d1 100644 --- a/mllm/backends/cpu/ops/ConvTranspose1DOp.cpp +++ b/mllm/backends/cpu/ops/ConvTranspose1DOp.cpp @@ -35,6 +35,8 @@ void CPUConvTranspose1DOp::forward(const std::vector& inputs, std::vecto const int in_channels_per_group = in_channels / groups; const int out_channels_per_group = out_channels / groups; + MLLM_RT_ASSERT_EQ(input.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(output.dtype(), kFloat32); MLLM_RT_ASSERT(weight_.dtype() == kFloat32); const auto* weight_ptr = weight_.ptr(); const auto* input_ptr = input.ptr(); diff --git a/tests/cpu/ConvTranspose1DKernelTest.hpp b/tests/cpu/ConvTranspose1DKernelTest.hpp new file mode 100644 index 000000000..d7657baf1 --- /dev/null +++ b/tests/cpu/ConvTranspose1DKernelTest.hpp @@ -0,0 +1,134 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
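+//
+// Both the naive reference below and ConvTranspose1DOp::reshape use the standard
+// transposed-convolution length relation
+//
+//   L_out = (L_in - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1
+//
+// e.g. the first "Basic" test case (L_in = 4, kernel_size = 3, stride = 2,
+// padding = 1, dilation = 1, output_padding = 0) gives
+//   L_out = (4 - 1) * 2 - 2 + 1 * (3 - 1) + 0 + 1 = 7.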
+#pragma once + +#include +#include + +#include "KernelTestHelper.hpp" +#include "mllm/core/ParameterFile.hpp" +#include "mllm/mllm.hpp" +#include "mllm/nn/Nn.hpp" + +using namespace mllm; // NOLINT + +void naive_conv_transpose1d(const float* input_data, const float* weight_data, const float* bias_data, float* output_data, + int batch, int in_channels, int sequence, int out_channels, int kernel_size, int stride, + int padding, int dilation, int output_padding, int groups) { + const int out_sequence = (sequence - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + output_padding + 1; + std::fill_n(output_data, batch * out_channels * out_sequence, 0.0f); + + const int in_channels_per_group = in_channels / groups; + const int out_channels_per_group = out_channels / groups; + + for (int b = 0; b < batch; ++b) { + for (int oc = 0; oc < out_channels; ++oc) { + const int group_idx = oc / out_channels_per_group; + const int oc_in_group = oc % out_channels_per_group; + for (int out_pos = 0; out_pos < out_sequence; ++out_pos) { + float sum = 0.0f; + for (int ic_in_group = 0; ic_in_group < in_channels_per_group; ++ic_in_group) { + const int ic = group_idx * in_channels_per_group + ic_in_group; + const int base_input_idx = b * (in_channels * sequence) + ic * sequence; + const int base_weight_idx = (ic * out_channels_per_group + oc_in_group) * kernel_size; + + for (int k = 0; k < kernel_size; ++k) { + int input_pos = out_pos + padding - k * dilation; + if (input_pos % stride != 0) { continue; } + input_pos /= stride; + if (input_pos < 0 || input_pos >= sequence) { continue; } + + const int input_idx = base_input_idx + input_pos; + const int weight_idx = base_weight_idx + k; + sum += input_data[input_idx] * weight_data[weight_idx]; + } + } + if (bias_data != nullptr) { sum += bias_data[oc]; } + const int output_idx = b * (out_channels * out_sequence) + oc * out_sequence + out_pos; + output_data[output_idx] = sum; + } + } + } +} + +class ConvTranspose1DModule : public nn::Module { + nn::ConvTranspose1D conv_; + + public: + ConvTranspose1DModule(int in_channel, int out_channel, int kernel_size, int stride, int padding, int output_padding, + int dilation, int groups, bool bias) { + conv_ = reg("conv", in_channel, out_channel, kernel_size, stride, padding, output_padding, dilation, + groups, bias); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + return {conv_(inputs[0])}; + } +}; + +class ConvTranspose1DKernelTest : public KernelTest { + public: + bool testConvTranspose1DOnce(const std::unordered_map& cfg) { + auto batch = cfg.at("batch"); + auto in_channel = cfg.at("in_channel"); + auto out_channel = cfg.at("out_channel"); + auto sequence = cfg.at("sequence"); + auto kernel_size = cfg.at("kernel_size"); + auto stride = cfg.at("stride"); + auto padding = cfg.at("padding"); + auto output_padding = cfg.at("output_padding"); + auto dilation = cfg.at("dilation"); + auto groups = cfg.at("groups"); + auto bias = cfg.at("bias"); + + auto module = ConvTranspose1DModule(in_channel, out_channel, kernel_size, stride, padding, output_padding, dilation, + groups, bias); + + auto weight_param = + Tensor::random({in_channel, out_channel / groups, kernel_size}, -1, 1, kFloat32, kCPU); + auto bias_param = Tensor::random({out_channel}, -1, 1, kFloat32, kCPU); + weight_param.setName("conv.weight"); + bias_param.setName("conv.bias"); + + auto param = ParameterFile::create(); + param->push("conv.weight", weight_param); + if (bias) { param->push("conv.bias", bias_param); } + 
module.load(param); + + auto input = Tensor::random({batch, in_channel, sequence}, -1, 1, kFloat32, kCPU); + auto predict = module(input)[0]; + + auto expected = Tensor::zeros(predict.shape(), kFloat32, kCPU); + naive_conv_transpose1d(input.ptr(), weight_param.ptr(), bias ? bias_param.ptr() : nullptr, + expected.ptr(), batch, in_channel, sequence, out_channel, kernel_size, stride, padding, + dilation, output_padding, groups); + + auto result = test::allClose(expected, predict, 1e-4f, 1e-4f); + if (!result) { + print(result); + return false; + } + return true; + } + + bool testConvTranspose1D(const std::vector>& cfgs) { + for (auto& cfg : cfgs) { + if (!testConvTranspose1DOnce(cfg)) { + auto batch = cfg.at("batch"); + auto in_channel = cfg.at("in_channel"); + auto out_channel = cfg.at("out_channel"); + auto sequence = cfg.at("sequence"); + auto kernel_size = cfg.at("kernel_size"); + auto stride = cfg.at("stride"); + auto padding = cfg.at("padding"); + auto output_padding = cfg.at("output_padding"); + auto dilation = cfg.at("dilation"); + auto groups = cfg.at("groups"); + auto bias = cfg.at("bias"); + print(batch, in_channel, out_channel, sequence, kernel_size, stride, padding, output_padding, dilation, groups, bias); + return false; + } + } + return true; + } +}; diff --git a/tests/cpu/KernelTest.cpp b/tests/cpu/KernelTest.cpp index 9f8d613ee..575360703 100644 --- a/tests/cpu/KernelTest.cpp +++ b/tests/cpu/KernelTest.cpp @@ -857,6 +857,48 @@ TEST_F(FlashAttn2KernelTest, fwd_bshd) { } #endif +//===----------------------------------------------------------------------===// +// Tanh +//===----------------------------------------------------------------------===// +#include "TanhKernelTest.hpp" +TEST_F(TanhKernelTest, TanhFloat32) { EXPECT_EQ(testTanh({{8}, {2, 3, 4}}), true); } + +//===----------------------------------------------------------------------===// +// ConvTranspose1D +//===----------------------------------------------------------------------===// +#include "ConvTranspose1DKernelTest.hpp" +TEST_F(ConvTranspose1DKernelTest, Basic) { + EXPECT_EQ(testConvTranspose1D({ + { + {"batch", 1}, + {"in_channel", 2}, + {"out_channel", 3}, + {"sequence", 4}, + {"kernel_size", 3}, + {"stride", 2}, + {"padding", 1}, + {"output_padding", 0}, + {"dilation", 1}, + {"groups", 1}, + {"bias", 1}, + }, + { + {"batch", 2}, + {"in_channel", 1}, + {"out_channel", 2}, + {"sequence", 5}, + {"kernel_size", 2}, + {"stride", 1}, + {"padding", 0}, + {"output_padding", 0}, + {"dilation", 1}, + {"groups", 1}, + {"bias", 0}, + }, + }), + true); +} + //===----------------------------------------------------------------------===// // Conv2D Test // diff --git a/tests/cpu/TanhKernelTest.hpp b/tests/cpu/TanhKernelTest.hpp new file mode 100644 index 000000000..ff6762170 --- /dev/null +++ b/tests/cpu/TanhKernelTest.hpp @@ -0,0 +1,49 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
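+//
+// The reference is plain element-wise std::tanh over random inputs in [-3, 3],
+// compared against the nn::Tanh module output with 1e-5 absolute/relative tolerances.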
+#pragma once + +#include + +#include "KernelTestHelper.hpp" +#include "mllm/mllm.hpp" +#include "mllm/nn/Nn.hpp" + +class TanhModule : public mllm::nn::Module { + mllm::nn::Tanh tanh_; + + public: + TanhModule() { tanh_ = reg("tanh"); } + + std::vector forward(const std::vector& inputs, + const std::vector& args) override { + return {tanh_(inputs[0])}; + } +}; + +class TanhKernelTest : public KernelTest { + public: + bool testTanh(const std::vector& shapes) { + using mllm::Tensor; + using mllm::kCPU; + using mllm::kFloat32; + TanhModule module; + + for (auto& s : shapes) { + auto input = Tensor::random(s, -3, 3, kFloat32, kCPU); + auto output = module(input)[0]; + auto expected = Tensor::empty(s, kFloat32, kCPU).alloc(); + + const auto* in_ptr = input.ptr(); + auto* out_ptr = expected.ptr(); + const auto numel = input.numel(); + for (size_t i = 0; i < numel; ++i) { out_ptr[i] = std::tanh(in_ptr[i]); } + + auto result = mllm::test::allClose(expected, output, 1e-5f, 1e-5f); + if (!result) { + mllm::print(result); + return false; + } + } + return true; + } +}; From af574aec2c8adbbe3e8db0cbe7e13c915bcbd416 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Tue, 24 Feb 2026 15:42:37 +0800 Subject: [PATCH 15/42] add --- examples/minicpm_o45/CMakeLists.txt | 6 +- examples/qwen2_5omni/audio_out_infer.cpp | 93 ++++++++++++++++++++++++ examples/qwen2_5omni/image_infer_dbg.cpp | 91 +++++++++++++++++++++++ 3 files changed, 187 insertions(+), 3 deletions(-) create mode 100644 examples/qwen2_5omni/audio_out_infer.cpp create mode 100644 examples/qwen2_5omni/image_infer_dbg.cpp diff --git a/examples/minicpm_o45/CMakeLists.txt b/examples/minicpm_o45/CMakeLists.txt index a866fb4ec..a755efda1 100644 --- a/examples/minicpm_o45/CMakeLists.txt +++ b/examples/minicpm_o45/CMakeLists.txt @@ -2,6 +2,6 @@ add_executable(mllm-minicpm-o45-runner main.cpp) target_link_libraries(mllm-minicpm-o45-runner PRIVATE MllmRT MllmCPUBackend) target_include_directories(mllm-minicpm-o45-runner PRIVATE ${MLLM_INCLUDE_DIR}) -add_executable(mllm-minicpm-o45-runner-python main_python.cpp) -target_link_libraries(mllm-minicpm-o45-runner-python PRIVATE MllmRT MllmCPUBackend) -target_include_directories(mllm-minicpm-o45-runner-python PRIVATE ${MLLM_INCLUDE_DIR}) +# add_executable(mllm-minicpm-o45-runner-python main_python.cpp) +# target_link_libraries(mllm-minicpm-o45-runner-python PRIVATE MllmRT MllmCPUBackend) +# target_include_directories(mllm-minicpm-o45-runner-python PRIVATE ${MLLM_INCLUDE_DIR}) diff --git a/examples/qwen2_5omni/audio_out_infer.cpp b/examples/qwen2_5omni/audio_out_infer.cpp new file mode 100644 index 000000000..9e46fcd0e --- /dev/null +++ b/examples/qwen2_5omni/audio_out_infer.cpp @@ -0,0 +1,93 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
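+//
+// End-to-end speech-output example. A typical invocation, assuming the flags defined
+// in main() below (the binary name depends on the CMake target, which is not part of
+// this change, so it is only a placeholder here):
+//
+//   ./qwen2_5omni_audio_out \
+//       -m <model.mllm> -mv v2 -t <tokenizer_dir> -c <config_qwen2_5omni.json> \
+//       -s <spk_dict.json> -p "Hello there" -o out.wav
+//
+// At most one of -i/--image_path and -a/--audio_path may be given in addition; the
+// generated waveform is written as a 24 kHz, 16-bit mono WAV file.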
+ +#include +#include +#include +#include +#include +#include +#include "wenet_audio/wav.h" + +using mllm::Argparse; + +MLLM_MAIN({ + mllm::Logger::level() = mllm::LogLevel::kError; + + auto& help = Argparse::add("-h|--help").help("Show help message"); + auto& model_path = Argparse::add("-m|--model_path").help("Model path").required(true); + auto& model_version = Argparse::add("-mv|--model_version").help("Model version").required(true); + auto& tokenizer_path = Argparse::add("-t|--tokenizer_path").help("Tokenizer directory").required(true); + auto& config_path = Argparse::add("-c|--config_path").help("Config path").required(true); + auto& spk_dict_path = Argparse::add("-s|--spk_dict_path").help("Speaker json path").required(true); + auto& prompt = Argparse::add("-p|--prompt").help("Prompt text").def(""); + auto& image_path = Argparse::add("-i|--image_path").help("Image path").def(""); + auto& audio_path = Argparse::add("-a|--audio_path").help("Audio path").def(""); + auto& speaker = Argparse::add("-sp|--speaker").help("Speaker name (default: first entry)").def(""); + auto& output_path = Argparse::add("-o|--output_path").help("Output wav path").def("./qwen2_5omni.wav"); + + Argparse::parse(argc, argv); + + mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1; + if (model_version.get() == "v1") { + file_version = mllm::ModelFileVersion::kV1; + } else if (model_version.get() == "v2") { + file_version = mllm::ModelFileVersion::kV2; + } + + if (help.isSet()) { + Argparse::printHelp(); + mllm::shutdownContext(); + return 0; + } + + if (!image_path.get().empty() && !audio_path.get().empty()) { + MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "Only one of --image_path or --audio_path can be set."); + } + + auto qwen_cfg = mllm::models::qwen2_5omni::Qwen2_5OmniConfig(config_path.get()); + auto qwen_tokenizer = mllm::models::qwen2_5omni::Qwen2_5OmniTokenizer(tokenizer_path.get()); + auto qwen_omni = mllm::models::qwen2_5omni::Qwen2_5OmniForConditionalGeneration(qwen_cfg); + + auto param = mllm::load(model_path.get(), file_version); + qwen_omni.load(param); + qwen_omni.loadSpeakers(spk_dict_path.get()); + + std::string prompt_text = prompt.get(); + if (prompt_text.empty()) { + fmt::print("Prompt text: "); + std::getline(std::cin, prompt_text); + if (prompt_text.empty()) { prompt_text = "Please respond."; } + } + + mllm::models::ARGenerationOutputPast inputs; + if (!image_path.get().empty()) { + inputs = qwen_tokenizer.convertVisionMessage({.prompt = prompt_text, .img_file_path = image_path.get()}); + } else if (!audio_path.get().empty()) { + inputs = qwen_tokenizer.convertAudioMessage({.prompt = prompt_text, .audio_file_path = audio_path.get()}); + } else { + inputs = qwen_tokenizer.convertMessage({.prompt = prompt_text}); + } + + mllm::models::qwen2_5omni::Qwen2_5OmniAudioGenerationConfig gen_cfg; + auto output = qwen_omni.generateAudio(inputs, gen_cfg, speaker.get()); + + auto input_len = inputs["sequence"].shape()[1]; + auto total_len = output.sequences.shape()[1]; + fmt::print("\nResponse: "); + for (int i = input_len; i < total_len; ++i) { + std::wcout << qwen_tokenizer.detokenize(output.sequences.at({0, i})) << std::flush; + } + fmt::print("\n"); + + auto wav = output.wav * 32767.0f; + wenet::WavWriter wav_writer(wav.ptr(), wav.shape().back(), 1, 24000, 16); + wav_writer.Write(output_path.get()); + + fmt::print("Saved audio to {}\n", output_path.get()); + + qwen_omni.thinker_.perfSummary(); + + mllm::print("\n"); + mllm::memoryReport(); +}) diff --git 
a/examples/qwen2_5omni/image_infer_dbg.cpp b/examples/qwen2_5omni/image_infer_dbg.cpp new file mode 100644 index 000000000..de21c8ec7 --- /dev/null +++ b/examples/qwen2_5omni/image_infer_dbg.cpp @@ -0,0 +1,91 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include +#include + +using mllm::Argparse; + +//MLLM_MAIN({ +int main(int argc, char** argv) { + ::mllm::__setup_signal_handler(); + ::mllm::initializeContext(); + + mllm::Logger::level() = mllm::LogLevel::kError; + + auto& help = Argparse::add("-h|--help").help("Show help message"); + auto& model_path = Argparse::add("-m|--model_path").help("Model path").required(true); + auto& model_version = Argparse::add("-mv|--model_version").help("Model version").required(true); + auto& tokenizer_path = Argparse::add("-t|--tokenizer_path").help("Tokenizer directory").required(true); + auto& config_path = Argparse::add("-c|--config_path").help("Config path").required(true); + + Argparse::parse(argc, argv); + + mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1; + if (model_version.get() == "v1") { + file_version = mllm::ModelFileVersion::kV1; + } else if (model_version.get() == "v2") { + file_version = mllm::ModelFileVersion::kV2; + } + + if (help.isSet()) { + Argparse::printHelp(); + mllm::shutdownContext(); + return 0; + } + + { + auto qwen2_5omni_cfg = mllm::models::qwen2_5omni::Qwen2_5OmniConfig(config_path.get()); + auto qwen2_5omni_tokenizer = + mllm::models::qwen2_5omni::Qwen2_5OmniTokenizer(tokenizer_path.get(), qwen2_5omni_cfg.visual_spatial_merge_size); + auto qwen2_5omni = mllm::models::qwen2_5omni::Qwen2_5OmniForCausalLM(qwen2_5omni_cfg); + + auto param = mllm::load(model_path.get(), file_version); + qwen2_5omni.thinker_.load(param); + + fmt::print("\n{:*^60}\n", " Qwen2.5-Omni Image CLI "); + fmt::print("Enter 'exit' or 'quit' to end the session\n\n"); + + std::string image_path; + std::string prompt_text; + + fmt::print("Image path (or 'exit/quit'): "); + image_path = "../../rsc/pics.jpg"; + //std::getline(std::cin, image_path); + if (image_path == "exit" || image_path == "quit") { return 0; } + + fmt::print("Prompt text: "); + prompt_text = "描述图片中物体"; + //std::getline(std::cin, prompt_text); + + try { + fmt::print("Processing...\n"); + auto inputs = qwen2_5omni_tokenizer.convertVisionMessage({.prompt = prompt_text, .img_file_path = image_path}); + + fmt::print("\nResponse: "); + qwen2_5omni.streamGenerate(inputs, + { + {"do_sample", mllm::AnyValue(false)}, + {"max_length", mllm::AnyValue(qwen2_5omni_cfg.max_cache_length)}, + }, + [&](int64_t token_id) { + auto str = qwen2_5omni_tokenizer.detokenize(token_id); + std::wcout << str << std::flush; + }); + + fmt::print("\n{}\n", std::string(60, '-')); + } catch (const std::exception& e) { fmt::print("\nError: {}\n{}\n", e.what(), std::string(60, '-')); } + + qwen2_5omni.perfSummary(); + } + + mllm::print("\n"); + mllm::memoryReport(); + + ::mllm::shutdownContext(); + return 0; +} From 06b754c4892cc84e0a4ee2878678ac7a0afeb142 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Thu, 5 Mar 2026 14:37:06 +0800 Subject: [PATCH 16/42] add qwen2.5o talker --- .../modeling_qwen2_5omni_talker.hpp | 626 +++++++ .../modeling_qwen2_5omni_token2wav.hpp | 1508 +++++++++++++++++ 2 files changed, 2134 insertions(+) create mode 100644 mllm/models/qwen2_5omni/modeling_qwen2_5omni_talker.hpp create mode 100644 mllm/models/qwen2_5omni/modeling_qwen2_5omni_token2wav.hpp diff --git 
a/mllm/models/qwen2_5omni/modeling_qwen2_5omni_talker.hpp b/mllm/models/qwen2_5omni/modeling_qwen2_5omni_talker.hpp new file mode 100644 index 000000000..df8019a84 --- /dev/null +++ b/mllm/models/qwen2_5omni/modeling_qwen2_5omni_talker.hpp @@ -0,0 +1,626 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mllm/core/Parallel.hpp" +#include "mllm/core/SlicePrimitives.hpp" +#include "mllm/mllm.hpp" +#include "mllm/nn/Functional.hpp" +#include "mllm/nn/Module.hpp" +#include "mllm/nn/Nn.hpp" +#include "mllm/nn/lmcache/StaticCache.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/utils/Enumerate.hpp" + +#include "mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp" + +namespace mllm::models::qwen2_5omni { + +constexpr float kPi = 3.14159265358979323846f; + +inline auto makeTalkerRoPEInvFreq(int output_dim, float rope_theta) -> Tensor { + auto inv_freq = Tensor::empty({output_dim / 2}, kFloat32, kCPU).alloc(); + auto inv_freq_ptr = inv_freq.ptr(); + for (int i = 0; i < output_dim / 2; i++) { inv_freq_ptr[i] = 1.0f / std::pow(rope_theta, 2.0f * i / output_dim); } + return inv_freq; +} + +inline auto makeTalkerPositionEmbedding(Tensor& position_ids, const Tensor& inv_freq, const std::vector& mrope_section) + -> std::pair { + MLLM_RT_ASSERT_EQ(position_ids.shape().size(), 3); + MLLM_RT_ASSERT_EQ(position_ids.shape()[1], 1); + + Tensor tmp_sin = Tensor::empty({3, position_ids.shape()[2], inv_freq.shape()[0] * 2}).alloc(); + Tensor tmp_cos = Tensor::empty({3, position_ids.shape()[2], inv_freq.shape()[0] * 2}).alloc(); + + for (int b = 0; b < 3; ++b) { + for (int d = 0; d < inv_freq.shape()[0]; ++d) { + for (int s = 0; s < position_ids.shape()[2]; ++s) { + auto value = inv_freq.ptr()[d] * (*position_ids.offsettedPtr({b, 0, s})); + *tmp_cos.offsettedPtr({b, s, d}) = cosf(value); + *tmp_cos.offsettedPtr({b, s, d + inv_freq.shape()[0]}) = cosf(value); + *tmp_sin.offsettedPtr({b, s, d}) = sinf(value); + *tmp_sin.offsettedPtr({b, s, d + inv_freq.shape()[0]}) = sinf(value); + } + } + } + + Tensor sin = Tensor::nil(); + Tensor cos = Tensor::nil(); + + if (!mrope_section.empty()) { + auto double_rope_section = mrope_section; + for (int i : mrope_section) { double_rope_section.push_back(i); } + + int num_rows = tmp_sin.shape()[1]; + int num_cols = tmp_sin.shape()[2]; + + sin = Tensor::empty({num_rows, num_cols}, kFloat32, kCPU).alloc(); + cos = Tensor::empty({num_rows, num_cols}, kFloat32, kCPU).alloc(); + + std::vector start_cols; + int current_start = 0; + start_cols.push_back(current_start); + for (int s : double_rope_section) { + current_start += s; + start_cols.push_back(current_start); + } + + for (int j = 0; j < static_cast(double_rope_section.size()); ++j) { + int layer = j % 3; + int s_j = double_rope_section[j]; + int start_col_in = start_cols[j]; + int start_col_out = start_cols[j]; + for (int row = 0; row < num_rows; ++row) { + auto in_cos_row_ptr = tmp_cos.offsettedPtr({layer, row, 0}); + auto out_cos_row_ptr = cos.offsettedPtr({row, 0}); + for (int c = 0; c < s_j; ++c) { out_cos_row_ptr[start_col_out + c] = in_cos_row_ptr[start_col_in + c]; } + + auto in_sin_row_ptr = tmp_sin.offsettedPtr({layer, row, 0}); + auto out_sin_row_ptr = sin.offsettedPtr({row, 0}); + for (int c = 0; c < s_j; ++c) { out_sin_row_ptr[start_col_out + c] = in_sin_row_ptr[start_col_in + c]; } + } + } + } else { + sin = tmp_sin; + cos = tmp_cos; + } + + return {sin, 
cos}; +} + +struct Qwen2_5OmniSpeakerParams { + int64_t bos_token = 0; + Tensor cond = Tensor::nil(); + Tensor ref_mel = Tensor::nil(); +}; + +struct Qwen2_5OmniSpeakerMap { + std::unordered_map speakers; + std::string default_speaker; +}; + +inline Tensor tensorFromJson(const nlohmann::ordered_json& obj) { + if (!obj.contains("shape") || !obj.contains("data")) { + MLLM_ERROR_EXIT(ExitCode::kIOError, "Invalid speaker json entry: missing shape/data."); + } + auto shape = obj["shape"].get>(); + auto data = obj["data"].get>(); + + int64_t expected = 1; + for (auto dim : shape) { expected *= dim; } + MLLM_RT_ASSERT_EQ(expected, static_cast(data.size())); + + Tensor out = Tensor::empty(shape, kFloat32, kCPU).alloc(); + std::copy(data.begin(), data.end(), out.ptr()); + return out; +} + +inline Qwen2_5OmniSpeakerMap loadSpeakerMap(const std::string& path) { + std::ifstream in(path); + if (!in.is_open()) { MLLM_ERROR_EXIT(ExitCode::kIOError, "Failed to open spk_dict.json at {}", path); } + + nlohmann::ordered_json root; + in >> root; + + Qwen2_5OmniSpeakerMap map; + bool first = true; + for (auto it = root.begin(); it != root.end(); ++it) { + const auto& name = it.key(); + const auto& entry = it.value(); + Qwen2_5OmniSpeakerParams params; + params.bos_token = entry.value("bos_token", 0); + params.cond = tensorFromJson(entry["cond"]); + params.ref_mel = tensorFromJson(entry["ref_mel"]); + map.speakers.emplace(name, std::move(params)); + if (first) { + map.default_speaker = name; + first = false; + } + } + + if (map.speakers.empty()) { MLLM_ERROR_EXIT(ExitCode::kIOError, "Empty speaker map in {}", path); } + return map; +} + +class Qwen2_5OmniTalkerMLP final : public nn::Module { + nn::Linear gate_proj_; + nn::Linear up_proj_; + nn::Linear down_proj_; + nn::SiLU silu_; + + public: + Qwen2_5OmniTalkerMLP() = default; + Qwen2_5OmniTalkerMLP(const std::string& name, const Qwen2_5OmniTalkerConfig& cfg) : nn::Module(name) { + gate_proj_ = reg("gate_proj", cfg.hidden_size, cfg.intermediate_size, false); + silu_ = reg("act"); + up_proj_ = reg("up_proj", cfg.hidden_size, cfg.intermediate_size, false); + down_proj_ = reg("down_proj", cfg.intermediate_size, cfg.hidden_size, false); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto x = gate_proj_(inputs[0]); + x = silu_(x); + auto y = up_proj_(inputs[0]); + x = x * y; + x = down_proj_(x); + return {x}; + } +}; + +class Qwen2_5OmniTalkerAttention final : public nn::Module { + nn::Linear q_proj_; + nn::Linear k_proj_; + nn::Linear v_proj_; + nn::Linear o_proj_; + nn::MultimodalRoPE q_rope_; + nn::MultimodalRoPE k_rope_; + nn::CausalMask mask_; + nn::Softmax softmax_; + + int hidden_size_; + int head_dim_; + int num_attention_heads_; + int num_key_value_heads_; + int num_key_value_groups_; + + public: + Qwen2_5OmniTalkerAttention() = default; + + Qwen2_5OmniTalkerAttention(const std::string& name, const Qwen2_5OmniTalkerConfig& cfg) : nn::Module(name) { + hidden_size_ = cfg.hidden_size; + head_dim_ = cfg.head_dim; + num_attention_heads_ = cfg.num_attention_heads; + num_key_value_heads_ = cfg.num_key_value_heads; + num_key_value_groups_ = num_attention_heads_ / num_key_value_heads_; + + q_proj_ = reg("q_proj", hidden_size_, head_dim_ * num_attention_heads_, true); + k_proj_ = reg("k_proj", hidden_size_, head_dim_ * num_key_value_heads_, true); + v_proj_ = reg("v_proj", hidden_size_, head_dim_ * num_key_value_heads_, true); + o_proj_ = reg("o_proj", head_dim_ * num_attention_heads_, hidden_size_, false); + + q_rope_ = 
reg( + "q_rope", aops::Qwen2VLMultimodalRoPEOpOptions{.rope_theta = cfg.rope_theta, + .max_position_embeddings = cfg.max_position_embeddings, + .mrope_section = cfg.mrope_section}); + k_rope_ = reg( + "k_rope", aops::Qwen2VLMultimodalRoPEOpOptions{.rope_theta = cfg.rope_theta, + .max_position_embeddings = cfg.max_position_embeddings, + .mrope_section = cfg.mrope_section}); + + mask_ = reg("mask"); + softmax_ = reg("softmax", -1); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto x = inputs[0]; + auto llm_embedding_sin = inputs[1]; + auto llm_embedding_cos = inputs[2]; + auto past_kv_cache = args[0].get(); + + auto query_states = q_proj_(x); + auto key_states = k_proj_(x); + auto value_states = v_proj_(x); + + int B = inputs[0].shape()[0]; + int S = inputs[0].shape()[1]; + + query_states = query_states.view({B, S, num_attention_heads_, head_dim_}); + key_states = key_states.view({B, S, num_key_value_heads_, head_dim_}); + value_states = value_states.view({B, S, num_key_value_heads_, head_dim_}); + + query_states = query_states.transpose(1, 2); + key_states = key_states.transpose(1, 2); + value_states = value_states.transpose(1, 2); + + query_states = q_rope_(query_states, llm_embedding_sin, llm_embedding_cos); + key_states = k_rope_(key_states, llm_embedding_sin, llm_embedding_cos); + + auto [k, v] = past_kv_cache->updateKVCache(layer_idx_, key_states, value_states); + key_states = k; + value_states = v; + + Tensor attn; + if (key_states.dtype() == kFloat32) { + attn = nn::functional::matmul(query_states, key_states, false, true) * (1.f / sqrtf(head_dim_)); + attn = mask_(attn); + attn = softmax_(attn); + } else if (key_states.dtype() == kFloat16) { + attn = nn::functional::matmul(query_states.to(kFloat32), key_states.to(kFloat32), false, true) * (1.f / sqrtf(head_dim_)); + attn = mask_(attn); + attn = softmax_(attn); + attn = attn.to(kFloat16); + } + + auto output = nn::functional::matmul(attn, value_states); + output = output.transpose(1, 2).view({B, S, num_attention_heads_ * head_dim_}); + output = o_proj_(output); + return {output}; + } + + int layer_idx_ = 0; +}; + +class Qwen2_5OmniTalkerDecoder final : public nn::Module { + public: + Qwen2_5OmniTalkerAttention self_attn_; + Qwen2_5OmniTalkerMLP mlp_; + nn::RMSNorm input_layer_norm_; + nn::RMSNorm post_attention_layer_norm_; + + Qwen2_5OmniTalkerDecoder() = default; + + Qwen2_5OmniTalkerDecoder(const std::string& name, const Qwen2_5OmniTalkerConfig& cfg) : nn::Module(name) { + self_attn_ = reg("self_attn", cfg); + mlp_ = reg("mlp", cfg); + input_layer_norm_ = reg("input_layernorm", cfg.rms_norm_eps); + post_attention_layer_norm_ = reg("post_attention_layernorm", cfg.rms_norm_eps); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto llm_embedding_sin = inputs[1]; + auto llm_embedding_cos = inputs[2]; + auto& kv_cache = args[0]; + + auto x = input_layer_norm_(inputs[0]); + x = self_attn_(x, llm_embedding_sin, llm_embedding_cos, kv_cache)[0]; + auto tmp = x + inputs[0]; + x = post_attention_layer_norm_(tmp); + x = mlp_(x)[0]; + x = x + tmp; + return {x}; + } +}; + +class Qwen2_5OmniTalkerModel final : public nn::Module { + nn::ModuleList decode_blocks_; + nn::RMSNorm norm_; + + public: + Qwen2_5OmniTalkerModel() = default; + + Qwen2_5OmniTalkerModel(const std::string& name, const Qwen2_5OmniTalkerConfig& cfg) : nn::Module(name) { + decode_blocks_ = reg>("layers", cfg.num_hidden_layers, cfg); + for (auto [idx, b] : 
enumerate(decode_blocks_.list())) { b.self_attn_.layer_idx_ = idx; } + + norm_ = reg("norm", cfg.rms_norm_eps); + embedding_ = reg("embed_tokens", cfg.vocab_size, cfg.embedding_size); + + auto inv = makeTalkerRoPEInvFreq(cfg.head_dim, cfg.rope_theta); + registerBuffer("inv_freq", inv); + } + + std::vector forward(const std::vector& inputs, const std::vector& args) override { + auto& blocks = decode_blocks_.list(); + auto x = inputs[0]; + auto llm_embedding_sin = inputs[1]; + auto llm_embedding_cos = inputs[2]; + auto& kv_cache = args[0]; + + for (auto& block : blocks) { x = block(x, llm_embedding_sin, llm_embedding_cos, kv_cache)[0]; } + x = norm_(x); + + return {x}; + } + + nn::Embedding embedding_; +}; + +struct Qwen2_5OmniTalkerOutput { + Tensor logits = Tensor::nil(); + Tensor thinker_reply_part = Tensor::nil(); + Tensor position_ids = Tensor::nil(); +}; + +class Qwen2_5OmniTalker final : public nn::Module { + public: + Qwen2_5OmniTalker() = delete; + Qwen2_5OmniTalker(const std::string& name, const Qwen2_5OmniTalkerConfig& cfg) : nn::Module(name), cfg_(cfg) { + thinker_to_talker_proj_ = reg("thinker_to_talker_proj", cfg.embedding_size, cfg.hidden_size, true); + model_ = reg("model", cfg); + codec_head_ = reg("codec_head", cfg.hidden_size, cfg.vocab_size, false); + + kv_cache_ = nn::StaticCache(cfg.max_position_embeddings, cfg.num_hidden_layers, cfg.num_attention_heads, cfg.num_key_value_heads, + cfg.head_dim, kFloat32, kFloat32, kCPU, false); + + codec_bos_token_ = cfg.tts_codec_start_token_id; + codec_eos_token_ = cfg.tts_codec_end_token_id; + codec_pad_token_ = cfg.tts_codec_pad_token_id; + codec_mask_token_ = cfg.tts_codec_mask_token_id; + text_bos_token_ = cfg.tts_text_start_token_id; + text_eos_token_ = cfg.tts_text_end_token_id; + text_pad_token_ = cfg.tts_text_pad_token_id; + } + + void clearCache() { + kv_cache_.clearCache(); + rope_deltas_ = Tensor::nil(); + } + + Qwen2_5OmniTalkerOutput forward(const Tensor& input_ids, const Tensor& input_text_ids, Tensor thinker_reply_part, + Tensor inputs_embeds, const Tensor& attention_mask, const Tensor& image_grid_thw, + Tensor position_ids) { + Tensor ids_for_pos = input_text_ids.isNil() ? 
input_ids : input_text_ids; + position_ids = getPositionIds(ids_for_pos, image_grid_thw, position_ids); + + const bool prefill = kv_cache_.getCurrentSeqCnt(0) == 0; + if (!inputs_embeds.isNil() && prefill) { + const auto S = inputs_embeds.shape()[1]; + MLLM_RT_ASSERT(S >= 2); + + auto bos_token = Tensor::empty({1, 1}, kInt64, kCPU).alloc(); + bos_token.at({0, 0}) = codec_bos_token_; + auto bos_embed = model_.embedding_(bos_token); + + auto pad_token = Tensor::empty({1, 1}, kInt64, kCPU).alloc(); + pad_token.at({0, 0}) = codec_pad_token_; + auto pad_embed = model_.embedding_(pad_token); + + auto embed_dim = inputs_embeds.shape()[2]; + if (inputs_embeds.dtype() == kFloat32) { + auto* out_ptr = inputs_embeds.offsettedPtr({0, S - 1, 0}); + auto* pad_ptr = inputs_embeds.offsettedPtr({0, S - 2, 0}); + auto* bos_ptr = bos_embed.ptr(); + auto* pad_src_ptr = pad_embed.ptr(); + for (int d = 0; d < embed_dim; ++d) { + out_ptr[d] += bos_ptr[d]; + pad_ptr[d] += pad_src_ptr[d]; + } + } else if (inputs_embeds.dtype() == kFloat16) { + auto* out_ptr = inputs_embeds.offsettedPtr({0, S - 1, 0}); + auto* pad_ptr = inputs_embeds.offsettedPtr({0, S - 2, 0}); + auto* bos_ptr = bos_embed.ptr(); + auto* pad_src_ptr = pad_embed.ptr(); + for (int d = 0; d < embed_dim; ++d) { + out_ptr[d] = static_cast(static_cast(out_ptr[d]) + static_cast(bos_ptr[d])); + pad_ptr[d] = static_cast(static_cast(pad_ptr[d]) + static_cast(pad_src_ptr[d])); + } + } + } + + if (inputs_embeds.isNil()) { + auto codec_embeds = model_.embedding_(input_ids); + inputs_embeds = codec_embeds + thinker_reply_part[{kAll, {0, 1}, kAll}]; + if (thinker_reply_part.shape()[1] > 1) { + thinker_reply_part = thinker_reply_part[{kAll, {1, thinker_reply_part.shape()[1]}, kAll}]; + } + } + + auto [llm_embedding_sin, llm_embedding_cos] = + makeTalkerPositionEmbedding(position_ids, model_.getBuffer("inv_freq"), cfg_.mrope_section); + + auto talker_lm_input = thinker_to_talker_proj_(inputs_embeds); + auto hidden_states = model_(talker_lm_input, llm_embedding_sin, llm_embedding_cos, AnyValue(&kv_cache_))[0]; + auto logits = codec_head_(hidden_states).to(kFloat32); + + return { + .logits = logits, + .thinker_reply_part = thinker_reply_part, + .position_ids = position_ids, + }; + } + + int64_t codec_bos_token() const { return codec_bos_token_; } + int64_t codec_eos_token() const { return codec_eos_token_; } + int64_t codec_pad_token() const { return codec_pad_token_; } + int64_t codec_mask_token() const { return codec_mask_token_; } + int64_t text_eos_token() const { return text_eos_token_; } + int64_t text_pad_token() const { return text_pad_token_; } + int64_t text_bos_token() const { return text_bos_token_; } + + Qwen2_5OmniTalkerModel model_; + + private: + Tensor getPositionIds(const Tensor& input_ids, const Tensor& image_grid_thw, const Tensor& position_ids) const { + MLLM_RT_ASSERT_EQ(input_ids.shape().size(), 2); + + bool has_multimodal = false; + auto input_ids_ptr = input_ids.ptr(); + auto seq_len = input_ids.shape()[1]; + for (int s = 0; s < seq_len; ++s) { + if (input_ids_ptr[s] == cfg_.vision_start_token_id || input_ids_ptr[s] == cfg_.audio_start_token_id) { + has_multimodal = true; + break; + } + } + + if (has_multimodal) { return getPositionIdsPrefill(input_ids, image_grid_thw); } + + if (!position_ids.isNil()) { + auto last_pos = position_ids.constAt({0, 0, position_ids.shape()[2] - 1}); + auto ret_position_ids = Tensor::empty({3, 1, 1}, kInt64, kCPU).alloc(); + *ret_position_ids.offsettedPtr({0, 0, 0}) = last_pos + 1; + 
*ret_position_ids.offsettedPtr({1, 0, 0}) = last_pos + 1; + *ret_position_ids.offsettedPtr({2, 0, 0}) = last_pos + 1; + return ret_position_ids; + } + + auto B = input_ids.shape()[0]; + auto S = seq_len; + MLLM_RT_ASSERT_EQ(B, 1); + + Tensor out = Tensor::empty({3, B, S}, kInt64, kCPU).alloc(); + for (int d = 0; d < 3; ++d) { + auto out_ptr = out.offsettedPtr({d, 0, 0}); + for (int64_t s = 0; s < S; ++s) { out_ptr[s] = s; } + } + return out; + } + + Tensor getPositionIdsPrefill(const Tensor& input_ids, const Tensor& image_grid_thw) const { + MLLM_RT_ASSERT_EQ(input_ids.shape().size(), 2); + + auto B = input_ids.shape()[0]; + auto S = input_ids.shape()[1]; + MLLM_RT_ASSERT_EQ(B, 1); + + Tensor position_ids = Tensor::empty({3, B, S}, kInt64, kCPU).alloc(); + auto input_ids_ptr = input_ids.ptr(); + + auto fill_text_positions = [&](int start_seq, int len, int64_t start_id) { + for (int d = 0; d < 3; ++d) { + auto out_ptr = position_ids.offsettedPtr({d, 0, 0}); + for (int i = 0; i < len; ++i) { out_ptr[start_seq + i] = start_id + i; } + } + }; + + int seq_idx = 0; + int image_idx = 0; + int64_t current_max_position_id = -1; + const int total_images = image_grid_thw.isNil() ? 0 : image_grid_thw.shape()[0]; + + while (seq_idx < S) { + int next_vision = -1; + int next_audio = -1; + for (int i = seq_idx; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.vision_start_token_id) { + next_vision = i; + break; + } + } + for (int i = seq_idx; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.audio_start_token_id) { + next_audio = i; + break; + } + } + + if (next_vision == -1 && next_audio == -1) { + const int text_len = S - seq_idx; + if (text_len > 0) { fill_text_positions(seq_idx, text_len, current_max_position_id + 1); } + break; + } + + const bool is_vision = (next_vision != -1) && (next_audio == -1 || next_vision < next_audio); + const int segment_start = is_vision ? 
next_vision : next_audio; + + const int text_len = segment_start - seq_idx; + if (text_len > 0) { + fill_text_positions(seq_idx, text_len, current_max_position_id + 1); + current_max_position_id += text_len; + } + + if (is_vision) { + fill_text_positions(segment_start, 1, current_max_position_id + 1); + current_max_position_id += 1; + + int vision_end = -1; + for (int i = segment_start + 1; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.vision_end_token_id) { + vision_end = i; + break; + } + } + MLLM_RT_ASSERT(vision_end != -1); + + if (image_idx >= total_images) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "Image index out of range."); } + + auto grid_t = image_grid_thw.ptr()[image_idx * 3]; + auto grid_h = image_grid_thw.ptr()[image_idx * 3 + 1]; + auto grid_w = image_grid_thw.ptr()[image_idx * 3 + 2]; + int vision_len = grid_t * grid_h * grid_w; + vision_len /= (cfg_.spatial_merge_size * cfg_.spatial_merge_size); + + for (int i = 0; i < vision_len; ++i) { + const int pos = segment_start + 1 + i; + if (pos >= S) { break; } + for (int d = 0; d < 3; ++d) { + *position_ids.offsettedPtr({d, 0, pos}) = current_max_position_id + 1 + i; + } + } + current_max_position_id += vision_len; + + fill_text_positions(vision_end, 1, current_max_position_id + 1); + current_max_position_id += 1; + + seq_idx = vision_end + 1; + image_idx += 1; + } else { + fill_text_positions(segment_start, 1, current_max_position_id + 1); + current_max_position_id += 1; + + int audio_end = -1; + for (int i = segment_start + 1; i < S; ++i) { + if (input_ids_ptr[i] == cfg_.audio_end_token_id) { + audio_end = i; + break; + } + } + MLLM_RT_ASSERT(audio_end != -1); + + std::vector audio_positions; + for (int i = segment_start + 1; i < audio_end; ++i) { + if (input_ids_ptr[i] == cfg_.audio_token_id) { + audio_positions.push_back(i); + } else { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "Unsupported token inside audio segment."); + } + } + const int audio_len = static_cast(audio_positions.size()); + if (audio_len == 0) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "Empty audio tokens inside audio segment."); } + const int64_t audio_start_id = current_max_position_id + 1; + for (int i = 0; i < audio_len; ++i) { + const int64_t pos_id = audio_start_id + i; + for (int d = 0; d < 3; ++d) { + *position_ids.offsettedPtr({d, 0, audio_positions[i]}) = pos_id; + } + } + current_max_position_id += audio_len; + fill_text_positions(audio_end, 1, current_max_position_id + 1); + current_max_position_id += 1; + seq_idx = audio_end + 1; + } + } + + return position_ids; + } + + const Qwen2_5OmniTalkerConfig& cfg_; + nn::Linear thinker_to_talker_proj_; + nn::Linear codec_head_; + nn::StaticCache kv_cache_; + Tensor rope_deltas_ = Tensor::nil(); + + int64_t codec_bos_token_ = 0; + int64_t codec_eos_token_ = 0; + int64_t codec_pad_token_ = 0; + int64_t codec_mask_token_ = 0; + int64_t text_bos_token_ = 0; + int64_t text_eos_token_ = 0; + int64_t text_pad_token_ = 0; +}; + +} // namespace mllm::models::qwen2_5omni diff --git a/mllm/models/qwen2_5omni/modeling_qwen2_5omni_token2wav.hpp b/mllm/models/qwen2_5omni/modeling_qwen2_5omni_token2wav.hpp new file mode 100644 index 000000000..6e5939a44 --- /dev/null +++ b/mllm/models/qwen2_5omni/modeling_qwen2_5omni_token2wav.hpp @@ -0,0 +1,1508 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
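+//
+// This header gathers the token2wav (codec-token-to-waveform) building blocks driven
+// by Qwen2_5OmniDiTConfig. Two of the numeric helpers defined below, for reference:
+//   * SnakeBeta activation (alpha/beta stored per channel in log scale):
+//       y = x + sin^2(exp(alpha) * x) / (exp(beta) + eps)
+//   * kaiserSincFilter1d: a Kaiser-windowed sinc low-pass (window built from the
+//     zeroth-order modified Bessel function I0); TorchActivation1d uses it to
+//     upsample, apply SnakeBeta, then downsample with anti-aliasing.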
+#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mllm/core/Parallel.hpp" +#include "mllm/core/SlicePrimitives.hpp" +#include "mllm/mllm.hpp" +#include "mllm/nn/Functional.hpp" +#include "mllm/nn/Module.hpp" +#include "mllm/nn/Nn.hpp" +#include "mllm/utils/Common.hpp" +#include "mllm/utils/Enumerate.hpp" + +#include "mllm/models/qwen2_5omni/configuration_qwen2_5omni.hpp" + +namespace mllm::models::qwen2_5omni { + +namespace token2wav { + +constexpr float kPi = 3.14159265358979323846f; + +inline Tensor pad1dReflect(const Tensor& x, int32_t pad_left, int32_t pad_right) { + if (pad_left == 0 && pad_right == 0) { return x; } + return nn::functional::pad(x, {pad_left, pad_right}, aops::PadMode::kReflect); +} + +inline Tensor pad1dReplicate(const Tensor& x, int32_t pad_left, int32_t pad_right) { + if (pad_left == 0 && pad_right == 0) { return x; } + return nn::functional::pad(x, {pad_left, pad_right}, aops::PadMode::kReplicate); +} + +inline Tensor clampTensor(const Tensor& x, float min_val, float max_val) { + MLLM_RT_ASSERT_EQ(x.device(), kCPU); + MLLM_RT_ASSERT_EQ(x.dtype(), kFloat32); + + auto out = Tensor::empty(x.shape(), x.dtype(), x.device()).alloc(); + const auto* src = x.ptr(); + auto* dst = out.ptr(); + const auto numel = x.numel(); + + MLLM_CONDITIONAL_PARALLEL_FOR(numel > 1024, 4, idx, 0, numel, 1, { + float v = src[idx]; + v = std::min(std::max(v, min_val), max_val); + dst[idx] = v; + }); + return out; +} + +inline Tensor amplitudeToDb(const Tensor& amplitude, float min_db_level) { + MLLM_RT_ASSERT_EQ(amplitude.device(), kCPU); + MLLM_RT_ASSERT_EQ(amplitude.dtype(), kFloat32); + + const float min_level = std::exp(min_db_level / 20.0f * std::log(10.0f)); + const float log10_scale = 1.0f / std::log(10.0f); + + auto out = Tensor::empty(amplitude.shape(), amplitude.dtype(), amplitude.device()).alloc(); + const auto* src = amplitude.ptr(); + auto* dst = out.ptr(); + const auto numel = amplitude.numel(); + + MLLM_CONDITIONAL_PARALLEL_FOR(numel > 1024, 4, idx, 0, numel, 1, { + float v = std::max(src[idx], min_level); + dst[idx] = 20.0f * std::log(v) * log10_scale; + }); + + return out; +} + +inline Tensor normalizeSpectrogram(const Tensor& spectrogram, float max_value, float min_db) { + MLLM_RT_ASSERT_EQ(spectrogram.device(), kCPU); + MLLM_RT_ASSERT_EQ(spectrogram.dtype(), kFloat32); + + auto out = Tensor::empty(spectrogram.shape(), spectrogram.dtype(), spectrogram.device()).alloc(); + const auto* src = spectrogram.ptr(); + auto* dst = out.ptr(); + const auto numel = spectrogram.numel(); + + const float scale = (2.0f * max_value) / (-min_db); + MLLM_CONDITIONAL_PARALLEL_FOR(numel > 1024, 4, idx, 0, numel, 1, { + float v = scale * (src[idx] - min_db) - max_value; + v = std::min(std::max(v, -max_value), max_value); + dst[idx] = v; + }); + return out; +} + +inline float besselI0(float x) { + const float ax = std::abs(x); + if (ax < 3.75f) { + const float y = (ax / 3.75f); + const float y2 = y * y; + return 1.0f + y2 * (3.5156229f + + y2 * (3.0899424f + + y2 * (1.2067492f + + y2 * (0.2659732f + + y2 * (0.0360768f + + y2 * 0.0045813f))))); + } + + const float y = 3.75f / ax; + const float exp_ax = std::exp(ax); + return (exp_ax / std::sqrt(ax)) * + (0.39894228f + + y * (0.01328592f + + y * (0.00225319f + + y * (-0.00157565f + + y * (0.00916281f + + y * (-0.02057706f + + y * (0.02635537f + + y * (-0.01647633f + + y * 0.00392377f)))))))); +} + +inline Tensor kaiserSincFilter1d(float cutoff, float half_width, 
int32_t kernel_size) { + const bool is_even = (kernel_size % 2) == 0; + const int32_t half_size = kernel_size / 2; + + if (cutoff == 0.0f) { return Tensor::zeros({1, 1, kernel_size}, kFloat32, kCPU); } + + const float delta_f = 4.0f * half_width; + const float attenuation = 2.285f * static_cast(half_size - 1) * kPi * delta_f + 7.95f; + + float beta = 0.0f; + if (attenuation > 50.0f) { + beta = 0.1102f * (attenuation - 8.7f); + } else if (attenuation >= 21.0f) { + beta = 0.5842f * std::pow(attenuation - 21.0f, 0.4f) + 0.07886f * (attenuation - 21.0f); + } + + const float denom = besselI0(beta); + std::vector window(kernel_size, 1.0f); + for (int32_t n = 0; n < kernel_size; ++n) { + const float ratio = (2.0f * static_cast(n) / static_cast(kernel_size - 1)) - 1.0f; + const float val = std::sqrt(std::max(0.0f, 1.0f - ratio * ratio)); + window[n] = besselI0(beta * val) / denom; + } + + std::vector filter(kernel_size, 0.0f); + float sum = 0.0f; + for (int32_t n = 0; n < kernel_size; ++n) { + float t = static_cast(n) - static_cast(half_size); + if (is_even) { t += 0.5f; } + const float arg = 2.0f * cutoff * t; + const float sinc = (arg == 0.0f) ? 1.0f : std::sin(kPi * arg) / (kPi * arg); + const float v = 2.0f * cutoff * window[n] * sinc; + filter[n] = v; + sum += v; + } + + if (sum != 0.0f) { + for (auto& v : filter) { v /= sum; } + } + + auto out = Tensor::empty({1, 1, kernel_size}, kFloat32, kCPU).alloc(); + std::copy(filter.begin(), filter.end(), out.ptr()); + return out; +} + +inline Tensor convTranspose1dDepthwise(const Tensor& input, const Tensor& filter, int32_t stride) { + MLLM_RT_ASSERT_EQ(input.device(), kCPU); + MLLM_RT_ASSERT_EQ(input.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(filter.device(), kCPU); + MLLM_RT_ASSERT_EQ(filter.dtype(), kFloat32); + + const auto& in_shape = input.shape(); + const int32_t batch = in_shape[0]; + const int32_t channels = in_shape[1]; + const int32_t in_len = in_shape[2]; + const int32_t kernel = filter.shape()[2]; + + const int32_t out_len = (in_len - 1) * stride + kernel; + auto out = Tensor::zeros({batch, channels, out_len}, kFloat32, kCPU); + + const auto* in_ptr = input.ptr(); + const auto* filt_ptr = filter.ptr(); + auto* out_ptr = out.ptr(); + + const int32_t in_step = channels * in_len; + const int32_t out_step = channels * out_len; + + for (int32_t b = 0; b < batch; ++b) { + const float* in_b = in_ptr + b * in_step; + float* out_b = out_ptr + b * out_step; + for (int32_t c = 0; c < channels; ++c) { + const float* in_c = in_b + c * in_len; + float* out_c = out_b + c * out_len; + const float* f = filt_ptr; + for (int32_t i = 0; i < in_len; ++i) { + const float v = in_c[i]; + const int32_t base = i * stride; + for (int32_t k = 0; k < kernel; ++k) { out_c[base + k] += v * f[k]; } + } + } + } + + return out; +} + +inline Tensor conv1dDepthwise(const Tensor& input, const Tensor& filter, int32_t stride) { + MLLM_RT_ASSERT_EQ(input.device(), kCPU); + MLLM_RT_ASSERT_EQ(input.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(filter.device(), kCPU); + MLLM_RT_ASSERT_EQ(filter.dtype(), kFloat32); + + const auto& in_shape = input.shape(); + const int32_t batch = in_shape[0]; + const int32_t channels = in_shape[1]; + const int32_t in_len = in_shape[2]; + const int32_t kernel = filter.shape()[2]; + + const int32_t out_len = (in_len - kernel) / stride + 1; + auto out = Tensor::zeros({batch, channels, out_len}, kFloat32, kCPU); + + const auto* in_ptr = input.ptr(); + const auto* filt_ptr = filter.ptr(); + auto* out_ptr = out.ptr(); + + const int32_t in_step = channels * 
in_len; + const int32_t out_step = channels * out_len; + + for (int32_t b = 0; b < batch; ++b) { + const float* in_b = in_ptr + b * in_step; + float* out_b = out_ptr + b * out_step; + for (int32_t c = 0; c < channels; ++c) { + const float* in_c = in_b + c * in_len; + float* out_c = out_b + c * out_len; + const float* f = filt_ptr; + for (int32_t o = 0; o < out_len; ++o) { + float sum = 0.0f; + const int32_t base = o * stride; + for (int32_t k = 0; k < kernel; ++k) { sum += in_c[base + k] * f[k]; } + out_c[o] = sum; + } + } + } + + return out; +} + +inline Tensor randomNormal(const std::vector& shape, float mean = 0.0f, float std = 1.0f) { + auto out = Tensor::empty(shape, kFloat32, kCPU).alloc(); + auto* ptr = out.ptr(); + const int64_t numel = out.numel(); + std::mt19937 gen(static_cast(mllm::Context::instance().getRandomState())); + std::normal_distribution dist(mean, std); + for (int64_t i = 0; i < numel; ++i) { ptr[i] = dist(gen); } + return out; +} + +inline Tensor linspace(float start, float end, int32_t steps) { + auto out = Tensor::empty({steps}, kFloat32, kCPU).alloc(); + auto* ptr = out.ptr(); + if (steps <= 1) { + if (steps == 1) { ptr[0] = start; } + return out; + } + const float step = (end - start) / static_cast(steps - 1); + for (int32_t i = 0; i < steps; ++i) { ptr[i] = start + step * static_cast(i); } + return out; +} + +inline Tensor repeatInterleave(const Tensor& input, int32_t repeats, int32_t dim) { + MLLM_RT_ASSERT_EQ(input.device(), kCPU); + MLLM_RT_ASSERT_EQ(input.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(dim, 1); + + if (repeats == 1) { return input; } + + const auto& shape = input.shape(); + const int32_t batch = shape[0]; + const int32_t seq_len = shape[1]; + const int32_t channels = shape[2]; + + auto out = Tensor::empty({batch, seq_len * repeats, channels}, kFloat32, kCPU).alloc(); + const auto* src = input.ptr(); + auto* dst = out.ptr(); + + const int64_t in_stride_b = static_cast(seq_len) * channels; + const int64_t out_stride_b = static_cast(seq_len) * repeats * channels; + + for (int32_t b = 0; b < batch; ++b) { + const float* src_b = src + b * in_stride_b; + float* dst_b = dst + b * out_stride_b; + for (int32_t s = 0; s < seq_len; ++s) { + const float* src_s = src_b + static_cast(s) * channels; + for (int32_t r = 0; r < repeats; ++r) { + float* dst_s = dst_b + (static_cast(s) * repeats + r) * channels; + std::memcpy(dst_s, src_s, sizeof(float) * channels); + } + } + } + + return out; +} + +class SnakeBeta final : public nn::Module { + nn::Param alpha_; + nn::Param beta_; + float no_div_by_zero_ = 1e-9f; + + public: + SnakeBeta() = default; + SnakeBeta(const std::string& name, int32_t in_features) : nn::Module(name) { + alpha_ = reg("alpha", getModuleName() + ".alpha", std::vector{in_features}); + beta_ = reg("beta", getModuleName() + ".beta", std::vector{in_features}); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto x = inputs[0]; + MLLM_RT_ASSERT_EQ(x.device(), kCPU); + MLLM_RT_ASSERT_EQ(x.dtype(), kFloat32); + if (!x.isContiguous()) { x = x.contiguous(); } + + const auto& shape = x.shape(); + const int32_t batch = shape[0]; + const int32_t channels = shape[1]; + const int32_t seq_len = shape[2]; + + auto y = Tensor::empty(shape, kFloat32, kCPU).alloc(); + const auto* x_ptr = x.ptr(); + auto* y_ptr = y.ptr(); + + auto alpha = alpha_.weight(); + auto beta = beta_.weight(); + const auto* alpha_ptr = alpha.ptr(); + const auto* beta_ptr = beta.ptr(); + + const int32_t stride_c = seq_len; + const int32_t stride_b = 
channels * seq_len; + + for (int32_t b = 0; b < batch; ++b) { + for (int32_t c = 0; c < channels; ++c) { + const float a = std::exp(alpha_ptr[c]); + const float bb = std::exp(beta_ptr[c]); + const float inv_b = 1.0f / (bb + no_div_by_zero_); + const int32_t base = b * stride_b + c * stride_c; + for (int32_t t = 0; t < seq_len; ++t) { + float v = x_ptr[base + t]; + const float s = std::sin(v * a); + v = v + inv_b * (s * s); + y_ptr[base + t] = v; + } + } + } + + return {y}; + } + +}; + +class TorchActivation1d final : public nn::Module { + public: + TorchActivation1d() = default; + TorchActivation1d(const std::string& name, int32_t channels, int32_t up_ratio = 2, int32_t down_ratio = 2, + int32_t up_kernel_size = 12, int32_t down_kernel_size = 12) + : nn::Module(name), + up_ratio_(up_ratio), + down_ratio_(down_ratio), + up_kernel_size_(up_kernel_size), + down_kernel_size_(down_kernel_size) { + act_ = reg("act", channels); + + up_kernel_size_ = (up_kernel_size_ <= 0) ? static_cast(int(6 * up_ratio_ / 2) * 2) : up_kernel_size_; + up_stride_ = up_ratio_; + up_pad_ = up_kernel_size_ / up_ratio_ - 1; + up_pad_left_ = up_pad_ * up_stride_ + (up_kernel_size_ - up_stride_) / 2; + up_pad_right_ = up_pad_ * up_stride_ + (up_kernel_size_ - up_stride_ + 1) / 2; + + down_kernel_size_ = (down_kernel_size_ <= 0) ? static_cast(int(6 * down_ratio_ / 2) * 2) : down_kernel_size_; + down_stride_ = down_ratio_; + down_even_ = (down_kernel_size_ % 2) == 0; + down_pad_left_ = down_kernel_size_ / 2 - (down_even_ ? 1 : 0); + down_pad_right_ = down_kernel_size_ / 2; + + up_filter_ = kaiserSincFilter1d(0.5f / static_cast(up_ratio_), 0.6f / static_cast(up_ratio_), up_kernel_size_); + down_filter_ = + kaiserSincFilter1d(0.5f / static_cast(down_ratio_), 0.6f / static_cast(down_ratio_), down_kernel_size_); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto x = inputs[0]; + x = upsample(x); + x = act_(x)[0]; + x = downsample(x); + return {x}; + } + + private: + Tensor upsample(const Tensor& input) const { + auto padded = pad1dReplicate(input, up_pad_, up_pad_); + auto out = convTranspose1dDepthwise(padded, up_filter_, up_stride_); + out = out * static_cast(up_ratio_); + if (up_pad_left_ > 0 || up_pad_right_ > 0) { + auto length = out.shape()[2]; + auto start = up_pad_left_; + auto end = length - up_pad_right_; + out = out[{kAll, kAll, {start, end}}]; + } + return out; + } + + Tensor downsample(const Tensor& input) const { + auto padded = pad1dReplicate(input, down_pad_left_, down_pad_right_); + auto out = conv1dDepthwise(padded, down_filter_, down_stride_); + return out; + } + + SnakeBeta act_; + int32_t up_ratio_ = 2; + int32_t down_ratio_ = 2; + int32_t up_kernel_size_ = 12; + int32_t down_kernel_size_ = 12; + int32_t up_stride_ = 2; + int32_t down_stride_ = 2; + int32_t up_pad_ = 0; + int32_t up_pad_left_ = 0; + int32_t up_pad_right_ = 0; + int32_t down_pad_left_ = 0; + int32_t down_pad_right_ = 0; + bool down_even_ = false; + Tensor up_filter_ = Tensor::nil(); + Tensor down_filter_ = Tensor::nil(); +}; + +class TimeDelayNetBlock final : public nn::Module { + public: + TimeDelayNetBlock() = default; + TimeDelayNetBlock(const std::string& name, int32_t in_channels, int32_t out_channels, int32_t kernel_size, int32_t dilation) + : nn::Module(name), kernel_size_(kernel_size), dilation_(dilation) { + conv_ = reg("conv", in_channels, out_channels, kernel_size_, 1, 0, dilation_, 1, true); + relu_ = reg("relu"); + } + + std::vector forward(const std::vector& inputs, const std::vector&) 
override { + auto x = inputs[0]; + const int32_t pad_total = dilation_ * (kernel_size_ - 1); + const int32_t pad_left = pad_total / 2; + const int32_t pad_right = pad_total - pad_left; + if (pad_total > 0) { x = pad1dReflect(x, pad_left, pad_right); } + x = conv_(x); + x = relu_(x); + return {x}; + } + + private: + nn::Conv1D conv_; + nn::ReLU relu_; + int32_t kernel_size_ = 1; + int32_t dilation_ = 1; +}; + +class Res2NetBlock final : public nn::Module { + public: + Res2NetBlock() = default; + Res2NetBlock(const std::string& name, int32_t in_channels, int32_t out_channels, int32_t scale, int32_t kernel_size, int32_t dilation) + : nn::Module(name), scale_(scale) { + const int32_t in_channel = in_channels / scale; + const int32_t hidden_channel = out_channels / scale; + blocks_ = reg>("blocks", scale_ - 1, in_channel, hidden_channel, kernel_size, dilation); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto x = inputs[0]; + const int32_t channels = x.shape()[1]; + const int32_t split = channels / scale_; + + std::vector outputs; + outputs.reserve(scale_); + Tensor output_part = Tensor::nil(); + + for (int32_t i = 0; i < scale_; ++i) { + auto hidden_part = x[{kAll, {i * split, (i + 1) * split}, kAll}]; + if (i == 0) { + output_part = hidden_part; + } else if (i == 1) { + output_part = blocks_.list()[i - 1](hidden_part)[0]; + } else { + output_part = blocks_.list()[i - 1](hidden_part + output_part)[0]; + } + outputs.push_back(output_part); + } + + auto out = nn::functional::concat(outputs, 1); + return {out}; + } + + private: + int32_t scale_ = 1; + nn::ModuleList blocks_; +}; + +class SqueezeExcitationBlock final : public nn::Module { + public: + SqueezeExcitationBlock() = default; + SqueezeExcitationBlock(const std::string& name, int32_t in_channels, int32_t se_channels, int32_t out_channels) + : nn::Module(name) { + conv1_ = reg("conv1", in_channels, se_channels, 1, 1, 0, 1, 1, true); + conv2_ = reg("conv2", se_channels, out_channels, 1, 1, 0, 1, 1, true); + relu_ = reg("relu"); + sigmoid_ = reg("sigmoid"); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto hidden_states = inputs[0]; + auto hidden_mean = nn::functional::mean(hidden_states, 2, true); + hidden_mean = relu_(conv1_(hidden_mean)); + hidden_mean = sigmoid_(conv2_(hidden_mean)); + hidden_states = hidden_states * hidden_mean; + return {hidden_states}; + } + + private: + nn::Conv1D conv1_; + nn::Conv1D conv2_; + nn::ReLU relu_; + nn::Sigmoid sigmoid_; +}; + +class AttentiveStatisticsPooling final : public nn::Module { + public: + AttentiveStatisticsPooling() = default; + AttentiveStatisticsPooling(const std::string& name, int32_t channels, int32_t attention_channels) + : nn::Module(name), channels_(channels) { + tdnn_ = reg("tdnn", channels * 3, attention_channels, 1, 1); + tanh_ = reg("tanh"); + conv_ = reg("conv", attention_channels, channels, 1, 1, 0, 1, 1, true); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto hidden_states = inputs[0]; + MLLM_RT_ASSERT_EQ(hidden_states.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(hidden_states.device(), kCPU); + + const int32_t batch = hidden_states.shape()[0]; + const int32_t channels = hidden_states.shape()[1]; + const int32_t seq_len = hidden_states.shape()[2]; + + auto mean = Tensor::empty({batch, channels}, kFloat32, kCPU).alloc(); + auto std = Tensor::empty({batch, channels}, kFloat32, kCPU).alloc(); + + const auto* x_ptr = hidden_states.ptr(); + auto* mean_ptr = 
mean.ptr(); + auto* std_ptr = std.ptr(); + + const int32_t stride_c = seq_len; + const int32_t stride_b = channels * seq_len; + + for (int32_t b = 0; b < batch; ++b) { + for (int32_t c = 0; c < channels; ++c) { + const int32_t base = b * stride_b + c * stride_c; + float sum = 0.0f; + for (int32_t t = 0; t < seq_len; ++t) { sum += x_ptr[base + t]; } + float m = sum / static_cast(seq_len); + mean_ptr[b * channels + c] = m; + + float var = 0.0f; + for (int32_t t = 0; t < seq_len; ++t) { + float diff = x_ptr[base + t] - m; + var += diff * diff; + } + var /= static_cast(seq_len); + std_ptr[b * channels + c] = std::sqrt(std::max(var, 1e-12f)); + } + } + + auto mean_rep = mean.view({batch, channels, 1}).repeat(seq_len, 2); + auto std_rep = std.view({batch, channels, 1}).repeat(seq_len, 2); + + auto attention = nn::functional::concat({hidden_states, mean_rep, std_rep}, 1); + attention = tdnn_(attention)[0]; + attention = tanh_(attention); + attention = conv_(attention); + attention = nn::functional::softmax(attention, 2); + + auto out_mean = Tensor::empty({batch, channels}, kFloat32, kCPU).alloc(); + auto out_std = Tensor::empty({batch, channels}, kFloat32, kCPU).alloc(); + auto* out_mean_ptr = out_mean.ptr(); + auto* out_std_ptr = out_std.ptr(); + const auto* attn_ptr = attention.ptr(); + + for (int32_t b = 0; b < batch; ++b) { + for (int32_t c = 0; c < channels; ++c) { + const int32_t base = b * stride_b + c * stride_c; + float m = 0.0f; + for (int32_t t = 0; t < seq_len; ++t) { m += attn_ptr[base + t] * x_ptr[base + t]; } + out_mean_ptr[b * channels + c] = m; + + float var = 0.0f; + for (int32_t t = 0; t < seq_len; ++t) { + float diff = x_ptr[base + t] - m; + var += attn_ptr[base + t] * diff * diff; + } + out_std_ptr[b * channels + c] = std::sqrt(std::max(var, 1e-12f)); + } + } + + auto pooled = nn::functional::concat({out_mean, out_std}, 1).view({batch, channels * 2, 1}); + return {pooled}; + } + + private: + int32_t channels_ = 0; + TimeDelayNetBlock tdnn_; + nn::Tanh tanh_; + nn::Conv1D conv_; +}; + +class SqueezeExcitationRes2NetBlock final : public nn::Module { + public: + SqueezeExcitationRes2NetBlock() = default; + SqueezeExcitationRes2NetBlock(const std::string& name, int32_t in_channels, int32_t out_channels, int32_t res2net_scale, + int32_t se_channels, int32_t kernel_size, int32_t dilation) + : nn::Module(name), out_channels_(out_channels) { + tdnn1_ = reg("tdnn1", in_channels, out_channels, 1, 1); + res2net_block_ = reg("res2net_block", out_channels, out_channels, res2net_scale, kernel_size, dilation); + tdnn2_ = reg("tdnn2", out_channels, out_channels, 1, 1); + se_block_ = reg("se_block", out_channels, se_channels, out_channels); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto hidden_state = inputs[0]; + auto residual = hidden_state; + + hidden_state = tdnn1_(hidden_state)[0]; + hidden_state = res2net_block_(hidden_state)[0]; + hidden_state = tdnn2_(hidden_state)[0]; + hidden_state = se_block_(hidden_state)[0]; + hidden_state = hidden_state + residual; + return {hidden_state}; + } + + private: + int32_t out_channels_ = 0; + TimeDelayNetBlock tdnn1_; + Res2NetBlock res2net_block_; + TimeDelayNetBlock tdnn2_; + SqueezeExcitationBlock se_block_; +}; + +class ECAPA_TimeDelayNet final : public nn::Module { + public: + ECAPA_TimeDelayNet() = default; + explicit ECAPA_TimeDelayNet(const std::string& name, const Qwen2_5OmniDiTConfig& cfg) : nn::Module(name) { + if (cfg.enc_channels.size() != cfg.enc_kernel_sizes.size() || cfg.enc_channels.size() 
!= cfg.enc_dilations.size()) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "enc_channels, enc_kernel_sizes and enc_dilations should have same length"); + } + + if (cfg.enc_channels.empty()) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "enc_channels should not be empty"); + } + + const int32_t num_blocks = static_cast(cfg.enc_channels.size()); + tdnn0_ = reg("blocks.0", cfg.mel_dim, cfg.enc_channels[0], cfg.enc_kernel_sizes[0], cfg.enc_dilations[0]); + + for (int32_t i = 1; i < num_blocks - 1; ++i) { + se_blocks_.emplace_back(reg( + "blocks." + std::to_string(i), + cfg.enc_channels[i - 1], + cfg.enc_channels[i], + cfg.enc_res2net_scale, + cfg.enc_se_channels, + cfg.enc_kernel_sizes[i], + cfg.enc_dilations[i])); + } + + mfa_ = reg("mfa", cfg.enc_channels.back(), cfg.enc_channels.back(), cfg.enc_kernel_sizes.back(), + cfg.enc_dilations.back()); + asp_ = reg("asp", cfg.enc_channels.back(), cfg.enc_attention_channels); + fc_ = reg("fc", cfg.enc_channels.back() * 2, cfg.enc_dim, 1, 1, 0, 1, 1, true); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto hidden_states = inputs[0]; + MLLM_RT_ASSERT_EQ(hidden_states.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(hidden_states.device(), kCPU); + + hidden_states = hidden_states.transpose(1, 2); + + std::vector hidden_states_list; + hidden_states = tdnn0_(hidden_states)[0]; + hidden_states_list.push_back(hidden_states); + + for (auto& block : se_blocks_) { + hidden_states = block(hidden_states)[0]; + hidden_states_list.push_back(hidden_states); + } + + if (hidden_states_list.size() <= 1) { + MLLM_ERROR_EXIT(ExitCode::kCoreError, "ECAPA_TimeDelayNet expects at least 2 blocks."); + } + + std::vector mfa_inputs; + for (size_t i = 1; i < hidden_states_list.size(); ++i) { mfa_inputs.push_back(hidden_states_list[i]); } + hidden_states = nn::functional::concat(mfa_inputs, 1); + hidden_states = mfa_(hidden_states)[0]; + hidden_states = asp_(hidden_states)[0]; + hidden_states = fc_(hidden_states); + hidden_states = hidden_states.squeeze(-1); + + return {hidden_states}; + } + + private: + TimeDelayNetBlock tdnn0_; + std::vector se_blocks_; + TimeDelayNetBlock mfa_; + AttentiveStatisticsPooling asp_; + nn::Conv1D fc_; +}; + +class DiTInputEmbedding final : public nn::Module { + public: + DiTInputEmbedding() = default; + explicit DiTInputEmbedding(const std::string& name, const Qwen2_5OmniDiTConfig& cfg) : nn::Module(name) { + const int32_t in_dim = cfg.mel_dim + cfg.enc_dim + cfg.enc_emb_dim + cfg.emb_dim; + proj_ = reg("proj", in_dim, cfg.hidden_size, true); + spk_encoder_ = reg("spk_encoder", cfg); + } + + Tensor forward(const Tensor& hidden_states, const Tensor& speaker_embedding, const Tensor& condition_vector, const Tensor& code_embed, + bool drop_audio_cond, const Tensor& code_embed_uncond, bool apply_cfg) { + auto x = hidden_states; + auto spk = speaker_embedding; + auto cond = condition_vector; + auto code = code_embed; + + if (apply_cfg) { + x = nn::functional::concat({x, x}, 0); + spk = nn::functional::concat({spk, Tensor::zeros(spk.shape(), spk.dtype(), spk.device())}, 0); + cond = nn::functional::concat({cond, Tensor::zeros(cond.shape(), cond.dtype(), cond.device())}, 0); + code = nn::functional::concat({code, code_embed_uncond}, 0); + } else if (drop_audio_cond) { + cond = Tensor::zeros(cond.shape(), cond.dtype(), cond.device()); + spk = Tensor::zeros(spk.shape(), spk.dtype(), spk.device()); + } + + auto cond_embed = spk_encoder_(cond)[0]; + const int32_t seq_len = x.shape()[1]; + cond_embed = 
cond_embed.view({cond_embed.shape()[0], 1, cond_embed.shape()[1]}).repeat(seq_len, 1); + + auto merged = nn::functional::concat({x, cond_embed, code, spk}, -1); + auto out = proj_(merged); + return out; + } + + private: + nn::Linear proj_; + ECAPA_TimeDelayNet spk_encoder_; +}; + +class DiTCodecEmbedding final : public nn::Module { + public: + DiTCodecEmbedding() = default; + DiTCodecEmbedding(const std::string& name, int32_t codec_num_embeds, int32_t codec_dim, int32_t repeats) + : nn::Module(name), repeats_(repeats) { + codec_embed_ = reg("codec_embed", codec_num_embeds + 1, codec_dim); + } + + Tensor forward(const Tensor& code, bool drop_code) { + Tensor code_ids = code; + if (drop_code) { code_ids = Tensor::zeros(code.shape(), code.dtype(), code.device()); } + auto code_embed = codec_embed_(code_ids); + return repeatInterleave(code_embed, repeats_, 1); + } + + private: + int32_t repeats_ = 1; + nn::Embedding codec_embed_; +}; + +class Qwen2_5_OmniAdaLayerNormZero final : public nn::Module { + public: + Qwen2_5_OmniAdaLayerNormZero() = default; + Qwen2_5_OmniAdaLayerNormZero(const std::string& name, int32_t dim) : nn::Module(name) { + silu_ = reg("silu"); + linear_ = reg("linear", dim, dim * 6, true); + norm_ = reg("norm", std::vector{dim}, false, false, 1e-6f); + } + + std::vector forward(const std::vector& inputs, const std::vector&) override { + auto hidden_states = inputs[0]; + auto emb = inputs[1]; + emb = linear_(silu_(emb)); + + auto chunks = nn::functional::chunk<6>(emb, 1); + auto shift_msa = chunks[0]; + auto scale_msa = chunks[1]; + auto gate_msa = chunks[2]; + auto shift_mlp = chunks[3]; + auto scale_mlp = chunks[4]; + auto gate_mlp = chunks[5]; + + auto normed = norm_(hidden_states); + const int32_t seq_len = hidden_states.shape()[1]; + auto scale = scale_msa.view({scale_msa.shape()[0], 1, scale_msa.shape()[1]}).repeat(seq_len, 1); + auto shift = shift_msa.view({shift_msa.shape()[0], 1, shift_msa.shape()[1]}).repeat(seq_len, 1); + normed = normed * (scale + 1.0f) + shift; + + return {normed, gate_msa, shift_mlp, scale_mlp, gate_mlp}; + } + + private: + nn::SiLU silu_; + nn::Linear linear_; + nn::LayerNorm norm_; +}; + +class Qwen2_5_OmniAdaLayerNormZero_Final final : public nn::Module { + public: + Qwen2_5_OmniAdaLayerNormZero_Final() = default; + Qwen2_5_OmniAdaLayerNormZero_Final(const std::string& name, int32_t dim) : nn::Module(name) { + silu_ = reg("silu"); + linear_ = reg("linear", dim, dim * 2, true); + norm_ = reg("norm", std::vector{dim}, false, false, 1e-6f); + } + + Tensor forward(const Tensor& hidden_states, const Tensor& emb) { + auto emb_out = linear_(silu_(emb)); + auto chunks = nn::functional::chunk<2>(emb_out, 1); + auto scale = chunks[0]; + auto shift = chunks[1]; + + auto normed = norm_(hidden_states); + const int32_t seq_len = hidden_states.shape()[1]; + scale = scale.view({scale.shape()[0], 1, scale.shape()[1]}).repeat(seq_len, 1); + shift = shift.view({shift.shape()[0], 1, shift.shape()[1]}).repeat(seq_len, 1); + normed = normed * (scale + 1.0f) + shift; + return normed; + } + + private: + nn::SiLU silu_; + nn::Linear linear_; + nn::LayerNorm norm_; +}; + +class DiTMLP final : public nn::Module { + public: + DiTMLP() = default; + DiTMLP(const std::string& name, int32_t dim, int32_t mult) : nn::Module(name) { + const int32_t inner_dim = dim * mult; + fc1_ = reg("ff.0", dim, inner_dim, true); + act_ = reg("ff.1"); + fc2_ = reg("ff.3", inner_dim, dim, true); + } + + Tensor forward(const Tensor& hidden_states) { + auto x = fc1_(hidden_states); + x = 
act_(x); + x = fc2_(x); + return x; + } + + private: + nn::Linear fc1_; + nn::GELU act_; + nn::Linear fc2_; +}; + +inline void applyRotaryPosEmbFirstHead(Tensor& q, Tensor& k, const Tensor& cos, const Tensor& sin) { + MLLM_RT_ASSERT_EQ(q.device(), kCPU); + MLLM_RT_ASSERT_EQ(k.device(), kCPU); + MLLM_RT_ASSERT_EQ(cos.device(), kCPU); + MLLM_RT_ASSERT_EQ(sin.device(), kCPU); + MLLM_RT_ASSERT_EQ(q.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(k.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(cos.dtype(), kFloat32); + MLLM_RT_ASSERT_EQ(sin.dtype(), kFloat32); + + const int32_t batch = q.shape()[0]; + const int32_t heads = q.shape()[1]; + const int32_t seq_len = q.shape()[2]; + const int32_t head_dim = q.shape()[3]; + MLLM_RT_ASSERT_EQ(head_dim % 2, 0); + MLLM_RT_ASSERT_EQ(cos.shape()[0], batch); + MLLM_RT_ASSERT_EQ(cos.shape()[1], seq_len); + MLLM_RT_ASSERT_EQ(cos.shape()[2], head_dim); + + const auto* cos_ptr = cos.ptr(); + const auto* sin_ptr = sin.ptr(); + auto* q_ptr = q.ptr(); + auto* k_ptr = k.ptr(); + + const int64_t stride_q_b = static_cast(heads) * seq_len * head_dim; + const int64_t stride_q_h = static_cast(seq_len) * head_dim; + const int64_t stride_q_s = head_dim; + + const int64_t stride_cos_b = static_cast(seq_len) * head_dim; + const int64_t stride_cos_s = head_dim; + + for (int32_t b = 0; b < batch; ++b) { + const int64_t q_base_b = static_cast(b) * stride_q_b; + const int64_t cos_base_b = static_cast(b) * stride_cos_b; + for (int32_t s = 0; s < seq_len; ++s) { + float* q_row = q_ptr + q_base_b + 0 * stride_q_h + static_cast(s) * stride_q_s; + float* k_row = k_ptr + q_base_b + 0 * stride_q_h + static_cast(s) * stride_q_s; + const float* cos_row = cos_ptr + cos_base_b + static_cast(s) * stride_cos_s; + const float* sin_row = sin_ptr + cos_base_b + static_cast(s) * stride_cos_s; + for (int32_t d = 0; d < head_dim; d += 2) { + const float c = cos_row[d]; + const float ss = sin_row[d]; + const float q1 = q_row[d]; + const float q2 = q_row[d + 1]; + const float k1 = k_row[d]; + const float k2 = k_row[d + 1]; + q_row[d] = q1 * c - q2 * ss; + q_row[d + 1] = q1 * ss + q2 * c; + k_row[d] = k1 * c - k2 * ss; + k_row[d + 1] = k1 * ss + k2 * c; + } + } + } +} + +inline Tensor makeBlockDiff(int32_t batch, int32_t heads, int32_t seq_len, int32_t block_size) { + (void)heads; + MLLM_RT_ASSERT(block_size > 0); + std::vector block_indices(seq_len, 0); + for (int32_t i = 0; i < seq_len; ++i) { block_indices[i] = i / block_size; } + + std::vector base(static_cast(seq_len) * seq_len, 0.0f); + for (int32_t i = 0; i < seq_len; ++i) { + for (int32_t j = 0; j < seq_len; ++j) { + base[static_cast(i) * seq_len + j] = static_cast(block_indices[j] - block_indices[i]); + } + } + + // Use a broadcast-friendly shape to avoid materializing head copies while keeping naive broadcast support. 
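+  // Entry (i, j) holds block_index(j) - block_index(i), where block_index = position / block_size.
+  // makeBlockMask() later keeps entries within [-look_backward_block, look_ahead_block] and sets
+  // everything else to -1e4; DiTAttention passes that tensor as the attention mask.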
+ auto out = Tensor::empty({batch, 1, seq_len, seq_len}, kFloat32, kCPU).alloc(); + const int64_t block_stride = static_cast(seq_len) * seq_len; + auto* out_ptr = out.ptr(); + for (int32_t b = 0; b < batch; ++b) { + float* dst = out_ptr + static_cast(b) * block_stride; + std::memcpy(dst, base.data(), sizeof(float) * base.size()); + } + return out; +} + +inline Tensor makeBlockMask(const Tensor& block_diff, int32_t look_backward_block, int32_t look_ahead_block) { + MLLM_RT_ASSERT_EQ(block_diff.device(), kCPU); + MLLM_RT_ASSERT_EQ(block_diff.dtype(), kFloat32); + + auto mask = Tensor::empty(block_diff.shape(), kFloat32, kCPU).alloc(); + const auto* src = block_diff.ptr(); + auto* dst = mask.ptr(); + const int64_t numel = block_diff.numel(); + const float lower = -static_cast(look_backward_block); + const float upper = static_cast(look_ahead_block); + + MLLM_CONDITIONAL_PARALLEL_FOR(numel > 1024, 4, idx, 0, numel, 1, { + const float v = src[idx]; + dst[idx] = (v >= lower && v <= upper) ? 0.0f : -1e4f; + }); + return mask; +} + +class DiTAttention final : public nn::Module { + public: + DiTAttention() = default; + explicit DiTAttention(const std::string& name, const Qwen2_5OmniDiTConfig& cfg) : nn::Module(name), cfg_(cfg) { + dim_ = cfg.hidden_size; + heads_ = cfg.num_attention_heads; + head_dim_ = cfg.head_dim; + inner_dim_ = head_dim_ * heads_; + + to_q_ = reg("to_q", dim_, inner_dim_, true); + to_k_ = reg("to_k", dim_, inner_dim_, true); + to_v_ = reg("to_v", dim_, inner_dim_, true); + to_out_ = reg("to_out.0", inner_dim_, dim_, true); + } + + Tensor forward(const Tensor& hidden_states, const std::pair& position_embeddings, const Tensor& attention_mask) { + auto query = to_q_(hidden_states); + auto key = to_k_(hidden_states); + auto value = to_v_(hidden_states); + + const int32_t batch = hidden_states.shape()[0]; + const int32_t seq_len = hidden_states.shape()[1]; + + query = query.view({batch, seq_len, heads_, head_dim_}).transpose(1, 2); + key = key.view({batch, seq_len, heads_, head_dim_}).transpose(1, 2); + value = value.view({batch, seq_len, heads_, head_dim_}).transpose(1, 2); + + if (!position_embeddings.first.isNil()) { + applyRotaryPosEmbFirstHead(query, key, position_embeddings.first, position_embeddings.second); + } + + auto attn_output = nn::functional::scaledDotProductAttention(query, key, value, attention_mask); + attn_output = attn_output.transpose(1, 2).view({batch, seq_len, inner_dim_}); + attn_output = to_out_(attn_output); + return attn_output; + } + + private: + Qwen2_5OmniDiTConfig cfg_; + int32_t dim_ = 0; + int32_t heads_ = 0; + int32_t head_dim_ = 0; + int32_t inner_dim_ = 0; + nn::Linear to_q_; + nn::Linear to_k_; + nn::Linear to_v_; + nn::Linear to_out_; +}; + +class SinusPositionEmbedding final : public nn::Module { + public: + SinusPositionEmbedding() = default; + explicit SinusPositionEmbedding(const std::string& name, int32_t dim) : nn::Module(name), dim_(dim) {} + + Tensor forward(const Tensor& hidden_states, float scale = 1000.0f) { + MLLM_RT_ASSERT_EQ(hidden_states.device(), kCPU); + MLLM_RT_ASSERT_EQ(hidden_states.dtype(), kFloat32); + + const int32_t batch = hidden_states.shape()[0]; + const int32_t half_dim = dim_ / 2; + auto out = Tensor::empty({batch, dim_}, kFloat32, kCPU).alloc(); + auto* out_ptr = out.ptr(); + const auto* hs_ptr = hidden_states.ptr(); + + const float emb = std::log(10000.0f) / static_cast(half_dim - 1); + std::vector freqs(half_dim); + for (int32_t i = 0; i < half_dim; ++i) { freqs[i] = std::exp(-emb * static_cast(i)); } + + for 
(int32_t b = 0; b < batch; ++b) { + const float t = hs_ptr[b] * scale; + float* row = out_ptr + static_cast(b) * dim_; + for (int32_t i = 0; i < half_dim; ++i) { + const float val = t * freqs[i]; + row[i] = std::sin(val); + row[i + half_dim] = std::cos(val); + } + } + + return out; + } + + private: + int32_t dim_ = 0; +}; + +class DiTTimestepEmbedding final : public nn::Module { + public: + DiTTimestepEmbedding() = default; + explicit DiTTimestepEmbedding(const std::string& name, int32_t dim, int32_t freq_embed_dim = 256) + : nn::Module(name), freq_embed_dim_(freq_embed_dim) { + time_embed_ = reg("time_embed", freq_embed_dim_); + fc1_ = reg("time_mlp.0", freq_embed_dim_, dim, true); + act_ = reg("time_mlp.1"); + fc2_ = reg("time_mlp.2", dim, dim, true); + } + + Tensor forward(const Tensor& timestep) { + auto time_hidden = time_embed_.forward(timestep); + time_hidden = fc1_(time_hidden); + time_hidden = act_(time_hidden); + time_hidden = fc2_(time_hidden); + return time_hidden; + } + + private: + int32_t freq_embed_dim_ = 256; + SinusPositionEmbedding time_embed_; + nn::Linear fc1_; + nn::SiLU act_; + nn::Linear fc2_; +}; + +class DiTDecoderLayer final : public nn::Module { + public: + DiTDecoderLayer() = default; + DiTDecoderLayer(const std::string& name, const Qwen2_5OmniDiTConfig& cfg, int32_t look_ahead_block, int32_t look_backward_block) + : nn::Module(name), look_ahead_block_(look_ahead_block), look_backward_block_(look_backward_block) { + attn_norm_ = reg("attn_norm", cfg.hidden_size); + attn_ = reg("attn", cfg); + ff_norm_ = reg("ff_norm", std::vector{cfg.hidden_size}, false, false, 1e-6f); + ff_ = reg("ff", cfg.hidden_size, cfg.ff_mult); + } + + Tensor forward(const Tensor& hidden_states, const Tensor& timestep, const std::pair& position_embeddings, + const Tensor& block_diff) { + auto attn_norm_out = attn_norm_(hidden_states, timestep); + auto norm = attn_norm_out[0]; + auto gate_msa = attn_norm_out[1]; + auto shift_mlp = attn_norm_out[2]; + auto scale_mlp = attn_norm_out[3]; + auto gate_mlp = attn_norm_out[4]; + + Tensor attn_mask = Tensor::nil(); + if (!block_diff.isNil()) { attn_mask = makeBlockMask(block_diff, look_backward_block_, look_ahead_block_); } + auto attn_output = attn_.forward(norm, position_embeddings, attn_mask); + + auto gate_msa_rep = gate_msa.view({gate_msa.shape()[0], 1, gate_msa.shape()[1]}).repeat(hidden_states.shape()[1], 1); + auto x = Tensor(hidden_states); + x = x + gate_msa_rep * attn_output; + + auto norm_ff = ff_norm_(x); + auto scale_rep = scale_mlp.view({scale_mlp.shape()[0], 1, scale_mlp.shape()[1]}).repeat(x.shape()[1], 1); + auto shift_rep = shift_mlp.view({shift_mlp.shape()[0], 1, shift_mlp.shape()[1]}).repeat(x.shape()[1], 1); + norm_ff = norm_ff * (scale_rep + 1.0f) + shift_rep; + auto ff_output = ff_.forward(norm_ff); + auto gate_mlp_rep = gate_mlp.view({gate_mlp.shape()[0], 1, gate_mlp.shape()[1]}).repeat(x.shape()[1], 1); + x = x + gate_mlp_rep * ff_output; + return x; + } + + private: + Qwen2_5_OmniAdaLayerNormZero attn_norm_; + DiTAttention attn_; + nn::LayerNorm ff_norm_; + DiTMLP ff_; + int32_t look_ahead_block_ = 0; + int32_t look_backward_block_ = 0; +}; + +class Qwen2_5OmniDiTRotaryEmbedding final : public nn::Module { + public: + Qwen2_5OmniDiTRotaryEmbedding() = default; + explicit Qwen2_5OmniDiTRotaryEmbedding(const std::string& name, const Qwen2_5OmniDiTConfig& cfg) : nn::Module(name), cfg_(cfg) { + const int32_t dim = cfg.head_dim; + inv_freq_ = reg("inv_freq", getModuleName() + ".inv_freq", std::vector{dim / 2}); + 
attention_scaling_ = 1.0f; + + auto inv = inv_freq_.weight(); + if (!inv.isNil() && inv.numel() == 0) { + inv = Tensor::empty({dim / 2}, kFloat32, kCPU).alloc(); + inv_freq_.weight().copy2(inv); + } + } + + std::pair forward(const Tensor& x, const Tensor& position_ids) { + MLLM_RT_ASSERT_EQ(x.device(), kCPU); + MLLM_RT_ASSERT_EQ(position_ids.device(), kCPU); + MLLM_RT_ASSERT_EQ(position_ids.dtype(), kInt64); + + const int32_t batch = position_ids.shape()[0]; + const int32_t seq_len = position_ids.shape()[1]; + auto inv_freq = inv_freq_.weight(); + if (inv_freq.isNil() || inv_freq.numel() == 0) { + const int32_t dim = cfg_.head_dim; + inv_freq = Tensor::empty({dim / 2}, kFloat32, kCPU).alloc(); + auto* ptr = inv_freq.ptr(); + for (int32_t i = 0; i < dim / 2; ++i) { + ptr[i] = 1.0f / std::pow(cfg_.rope_theta, 2.0f * i / static_cast(dim)); + } + } + + const int32_t half_dim = inv_freq.shape()[0]; + auto cos = Tensor::empty({batch, seq_len, half_dim * 2}, kFloat32, kCPU).alloc(); + auto sin = Tensor::empty({batch, seq_len, half_dim * 2}, kFloat32, kCPU).alloc(); + + const auto* inv_ptr = inv_freq.ptr(); + const auto* pos_ptr = position_ids.ptr(); + auto* cos_ptr = cos.ptr(); + auto* sin_ptr = sin.ptr(); + + const int64_t stride_pos_b = seq_len; + const int64_t stride_cos_b = static_cast(seq_len) * half_dim * 2; + const int64_t stride_cos_s = half_dim * 2; + + for (int32_t b = 0; b < batch; ++b) { + const int64_t pos_base = static_cast(b) * stride_pos_b; + const int64_t out_base = static_cast(b) * stride_cos_b; + for (int32_t s = 0; s < seq_len; ++s) { + const float position = static_cast(pos_ptr[pos_base + s]); + float* cos_row = cos_ptr + out_base + static_cast(s) * stride_cos_s; + float* sin_row = sin_ptr + out_base + static_cast(s) * stride_cos_s; + for (int32_t d = 0; d < half_dim; ++d) { + const float freq = inv_ptr[d] * position; + const float c = std::cos(freq) * attention_scaling_; + const float ss = std::sin(freq) * attention_scaling_; + cos_row[d] = c; + cos_row[d + half_dim] = c; + sin_row[d] = ss; + sin_row[d + half_dim] = ss; + } + } + } + + return {cos, sin}; + } + + private: + Qwen2_5OmniDiTConfig cfg_; + nn::Param inv_freq_; + float attention_scaling_ = 1.0f; +}; + +class RungeKutta4ODESolver { + public: + using Function = std::function; + + RungeKutta4ODESolver(Function function, Tensor initial_value) + : function_(std::move(function)), initial_value_(std::move(initial_value)) {} + + Tensor integrate(const std::vector& time_points) { + auto current_value = initial_value_; + if (time_points.size() < 2) { return current_value; } + + for (size_t i = 0; i + 1 < time_points.size(); ++i) { + const float time_start = time_points[i]; + const float time_end = time_points[i + 1]; + const float time_step = time_end - time_start; + + auto k1 = function_(time_start, current_value); + auto k2 = function_(time_start + time_step * one_third_, current_value + k1 * (time_step * one_third_)); + auto k3 = function_(time_start + time_step * two_thirds_, + current_value + (k2 - k1 * one_third_) * time_step); + auto k4 = function_(time_end, current_value + (k1 - k2 + k3) * time_step); + + auto delta = (k1 + (k2 + k3) * 3.0f + k4) * (time_step / 8.0f); + current_value = current_value + delta; + } + + return current_value; + } + + private: + Function function_; + Tensor initial_value_; + float one_third_ = 1.0f / 3.0f; + float two_thirds_ = 2.0f / 3.0f; +}; + +class Qwen2_5OmniToken2WavDiTModel final : public nn::Module { + public: + Qwen2_5OmniToken2WavDiTModel() = default; + explicit 
Qwen2_5OmniToken2WavDiTModel(const std::string& name, const Qwen2_5OmniDiTConfig& cfg) : nn::Module(name), cfg_(cfg) {
+    mel_dim_ = cfg.mel_dim;
+    repeats_ = cfg.repeats;
+    block_size_ = cfg.block_size;
+    num_attention_heads_ = cfg.num_attention_heads;
+
+    time_embed_ = reg<DiTTimestepEmbedding>("time_embed", cfg.hidden_size);
+    text_embed_ = reg<DiTCodecEmbedding>("text_embed", cfg.num_embeds, cfg.emb_dim, cfg.repeats);
+    input_embed_ = reg<DiTInputEmbedding>("input_embed", cfg);
+    rotary_embed_ = reg<Qwen2_5OmniDiTRotaryEmbedding>("rotary_embed", cfg);
+
+    for (int32_t i = 0; i < cfg.num_hidden_layers; ++i) {
+      const bool look_ahead = std::find(cfg.look_ahead_layers.begin(), cfg.look_ahead_layers.end(), i) != cfg.look_ahead_layers.end();
+      const bool look_backward =
+          std::find(cfg.look_backward_layers.begin(), cfg.look_backward_layers.end(), i) != cfg.look_backward_layers.end();
+      transformer_blocks_.emplace_back(reg<DiTDecoderLayer>("transformer_blocks." + std::to_string(i), cfg, look_ahead ? 1 : 0,
+                                                            look_backward ? 1 : 0));
+    }
+
+    norm_out_ = reg<Qwen2_5_OmniAdaLayerNormZero_Final>("norm_out", cfg.hidden_size);
+    proj_out_ = reg<nn::Linear>("proj_out", cfg.hidden_size, cfg.mel_dim, true);
+  }
+
+  Tensor forward(const Tensor& hidden_states, const Tensor& condition_vector, const Tensor& speaker_embedding, const Tensor& quantized_code,
+                 const Tensor& time_step, bool drop_audio_conditioning, bool drop_code, bool apply_cfg) {
+    Tensor timestep = time_step;
+    if (timestep.shape().empty()) { timestep = timestep.view({1}); }
+    if (timestep.shape().size() == 1 && timestep.shape()[0] == 1 && hidden_states.shape()[0] > 1) {
+      timestep = timestep.repeat(hidden_states.shape()[0], 0);
+    }
+
+    auto time_embedding = time_embed_.forward(timestep);
+    auto text_embedding = text_embed_.forward(quantized_code, apply_cfg ? false : drop_code);
+    Tensor text_embedding_uncond = Tensor::nil();
+    if (apply_cfg) { text_embedding_uncond = text_embed_.forward(quantized_code, true); }
+
+    auto x = input_embed_.forward(hidden_states, speaker_embedding, condition_vector, text_embedding, drop_audio_conditioning,
+                                  text_embedding_uncond, apply_cfg);
+
+    const int32_t seq_len = x.shape()[1];
+    auto position_ids = Tensor::empty({x.shape()[0], seq_len}, kInt64, kCPU).alloc();
+    auto* pos_ptr = position_ids.ptr<int64_t>();
+    for (int32_t b = 0; b < position_ids.shape()[0]; ++b) {
+      for (int32_t s = 0; s < seq_len; ++s) { pos_ptr[b * seq_len + s] = s; }
+    }
+
+    auto position_embeddings = rotary_embed_.forward(x, position_ids);
+    auto block_diff = makeBlockDiff(x.shape()[0], num_attention_heads_, seq_len, block_size_);
+
+    for (auto& block : transformer_blocks_) { x = block.forward(x, time_embedding, position_embeddings, block_diff); }
+
+    x = norm_out_.forward(x, time_embedding);
+    x = proj_out_(x);
+    return x;
+  }
+
+  Tensor sample(const Tensor& conditioning_vector, const Tensor& reference_mel, const Tensor& quantized_code, int32_t num_steps,
+                float guidance_scale, float sway_coefficient) {
+    const int32_t max_duration = quantized_code.shape()[1] * repeats_;
+    auto initial_state = randomNormal({1, max_duration, mel_dim_});
+
+    const int32_t batch = reference_mel.shape()[0];
+    if (batch != 1) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "Only batch size = 1 is supported for Qwen2.5-Omni token2wav."); }
+
+    auto cond = Tensor(conditioning_vector);
+    cond = cond.view({batch, 1, conditioning_vector.shape()[1]}).repeat(max_duration, 1);
+
+    auto ode_function = [&](float time_step, const Tensor& hidden) -> Tensor {
+      auto t = Tensor::empty({1}, kFloat32, kCPU).alloc();
+      t.ptr<float>()[0] = time_step;
+
+      if (guidance_scale < 1e-5f) {
+        return forward(hidden, reference_mel, cond, quantized_code, t, false, false, false);
+      }
+
+      auto model_output = forward(hidden, reference_mel, cond, quantized_code, t, false, false, true);
+      auto outputs = nn::functional::chunk<2>(model_output, 0);
+      return outputs[0] + (outputs[0] - outputs[1]) * guidance_scale;
+    };
+
+    auto time_points_tensor = linspace(0.0f, 1.0f, num_steps);
+    std::vector<float> time_points(static_cast<size_t>(num_steps));
+    const auto* tp_ptr = time_points_tensor.ptr<float>();
+    for (int32_t i = 0; i < num_steps; ++i) { time_points[i] = tp_ptr[i]; }
+
+    if (sway_coefficient != 0.0f) {
+      for (auto& t : time_points) {
+        t = t + sway_coefficient * (std::cos(kPi / 2.0f * t) - 1.0f + t);
+      }
+    }
+
+    RungeKutta4ODESolver solver(ode_function, initial_state);
+    auto generated = solver.integrate(time_points);
+    auto mel = generated.permute({0, 2, 1});
+    if (!mel.isContiguous()) { mel = mel.contiguous(); }
+    return mel;
+  }
+
+ private:
+  Qwen2_5OmniDiTConfig cfg_;
+  int32_t mel_dim_ = 0;
+  int32_t repeats_ = 1;
+  int32_t block_size_ = 1;
+  int32_t num_attention_heads_ = 1;
+
+  DiTTimestepEmbedding time_embed_;
+  DiTCodecEmbedding text_embed_;
+  DiTInputEmbedding input_embed_;
+  Qwen2_5OmniDiTRotaryEmbedding rotary_embed_;
+  std::vector<DiTDecoderLayer> transformer_blocks_;
+  Qwen2_5_OmniAdaLayerNormZero_Final norm_out_;
+  nn::Linear proj_out_;
+};
+
+class AMPBlock final : public nn::Module {
+ public:
+  AMPBlock() = default;
+  AMPBlock(const std::string& name, int32_t channels, int32_t kernel_size, const std::vector<int32_t>& dilations)
+      : nn::Module(name) {
+    if (dilations.size() != 3) { MLLM_ERROR_EXIT(ExitCode::kCoreError, "AMPBlock expects 3 dilation values."); }
+
+    convs1_.emplace_back(reg<nn::Conv1D>("convs1.0", channels, channels, kernel_size, 1, getPadding(kernel_size, dilations[0]),
+                                         dilations[0], 1, true));
+    convs1_.emplace_back(reg<nn::Conv1D>("convs1.1", channels, channels, kernel_size, 1, getPadding(kernel_size, dilations[1]),
+                                         dilations[1], 1, true));
+    convs1_.emplace_back(reg<nn::Conv1D>("convs1.2", channels, channels, kernel_size, 1, getPadding(kernel_size, dilations[2]),
+                                         dilations[2], 1, true));
+
+    convs2_.emplace_back(reg<nn::Conv1D>("convs2.0", channels, channels, kernel_size, 1, getPadding(kernel_size, 1), 1, 1, true));
+    convs2_.emplace_back(reg<nn::Conv1D>("convs2.1", channels, channels, kernel_size, 1, getPadding(kernel_size, 1), 1, 1, true));
+    convs2_.emplace_back(reg<nn::Conv1D>("convs2.2", channels, channels, kernel_size, 1, getPadding(kernel_size, 1), 1, 1, true));
+
+    const int32_t num_layers = static_cast<int32_t>(convs1_.size() + convs2_.size());
+    for (int32_t i = 0; i < num_layers; ++i) {
+      activations_.emplace_back(reg<TorchActivation1d>("activations." + std::to_string(i), channels));
+    }
+  }
+
+  Tensor forward(const Tensor& hidden_states) {
+    auto out = hidden_states;
+    const int32_t num_blocks = static_cast<int32_t>(convs1_.size());
+    for (int32_t i = 0; i < num_blocks; ++i) {
+      auto residual = out;
+      auto x = activations_[i * 2].forward({out}, {})[0];
+      x = convs1_[i](x);
+      x = activations_[i * 2 + 1].forward({x}, {})[0];
+      x = convs2_[i](x);
+      out = residual + x;
+    }
+    return out;
+  }
+
+ private:
+  static int32_t getPadding(int32_t kernel_size, int32_t dilation) {
+    return static_cast<int32_t>((kernel_size * dilation - dilation) / 2);
+  }
+
+  std::vector<nn::Conv1D> convs1_;
+  std::vector<nn::Conv1D> convs2_;
+  std::vector<TorchActivation1d> activations_;
+};
+
+class Qwen2_5OmniToken2WavBigVGANModel final : public nn::Module {
+ public:
+  Qwen2_5OmniToken2WavBigVGANModel() = default;
+  explicit Qwen2_5OmniToken2WavBigVGANModel(const std::string& name, const Qwen2_5OmniBigVGANConfig& cfg) : nn::Module(name), cfg_(cfg) {
+    num_residual_blocks_ = static_cast<int32_t>(cfg.resblock_kernel_sizes.size());
+    num_upsample_layers_ = static_cast<int32_t>(cfg.upsample_rates.size());
+
+    conv_pre_ = reg<nn::Conv1D>("conv_pre", cfg.mel_dim, cfg.upsample_initial_channel, 7, 1, 3, 1, 1, true);
+
+    for (int32_t layer_idx = 0; layer_idx < num_upsample_layers_; ++layer_idx) {
+      const int32_t stride = cfg.upsample_rates[layer_idx];
+      const int32_t kernel = cfg.upsample_kernel_sizes[layer_idx];
+      const int32_t in_ch = cfg.upsample_initial_channel / static_cast<int32_t>(std::pow(2, layer_idx));
+      const int32_t out_ch = cfg.upsample_initial_channel / static_cast<int32_t>(std::pow(2, layer_idx + 1));
+      const int32_t padding = (kernel - stride) / 2;
+      ups_.emplace_back(reg("ups." + std::to_string(layer_idx) + ".0", in_ch, out_ch, kernel, stride,
+                            padding, 0, 1, 1, true));
+    }
+
+    for (int32_t layer_idx = 0; layer_idx < num_upsample_layers_; ++layer_idx) {
+      const int32_t channels = cfg.upsample_initial_channel / static_cast<int32_t>(std::pow(2, layer_idx + 1));
+      for (size_t i = 0; i < cfg.resblock_kernel_sizes.size(); ++i) {
+        resblocks_.emplace_back(reg<AMPBlock>("resblocks."
+ std::to_string(resblocks_.size()), channels, + cfg.resblock_kernel_sizes[i], cfg.resblock_dilation_sizes[i])); + } + } + + activation_post_ = + reg("activation_post", cfg.upsample_initial_channel / static_cast(std::pow(2, num_upsample_layers_))); + conv_post_ = reg("conv_post", + cfg.upsample_initial_channel / static_cast(std::pow(2, num_upsample_layers_)), 1, 7, 1, 3, 1, 1, + false); + } + + Tensor forward(const Tensor& mel_spectrogram) { + auto mel = mel_spectrogram; + if (!mel.isContiguous()) { mel = mel.contiguous(); } + auto processed = processMelSpectrogram(mel); + return forwardProcessed(processed); + } + + private: + Tensor forwardProcessed(const Tensor& processed) { + auto hidden = conv_pre_(processed); + + for (int32_t layer_idx = 0; layer_idx < num_upsample_layers_; ++layer_idx) { + hidden = ups_[layer_idx](hidden); + Tensor residual_sum = Tensor::zeros(hidden.shape(), hidden.dtype(), hidden.device()); + for (int32_t block_idx = 0; block_idx < num_residual_blocks_; ++block_idx) { + residual_sum = residual_sum + resblocks_[layer_idx * num_residual_blocks_ + block_idx].forward(hidden); + } + hidden = residual_sum * (1.0f / static_cast(num_residual_blocks_)); + } + + hidden = activation_post_.forward({hidden}, {})[0]; + auto output = conv_post_(hidden); + output = clampTensor(output, -1.0f, 1.0f); + return output.squeeze(); + } + Tensor processMelSpectrogram(const Tensor& mel_spectrogram) const { + auto amplitude = nn::functional::exp(mel_spectrogram); + auto decibel = amplitudeToDb(amplitude, -115.0f) + (-20.0f); + return normalizeSpectrogram(decibel, 1.0f, -115.0f); + } + + Qwen2_5OmniBigVGANConfig cfg_; + int32_t num_residual_blocks_ = 0; + int32_t num_upsample_layers_ = 0; + nn::Conv1D conv_pre_; + std::vector ups_; + std::vector resblocks_; + TorchActivation1d activation_post_; + nn::Conv1D conv_post_; +}; + +class Qwen2_5OmniToken2WavModel final : public nn::Module { + public: + Qwen2_5OmniToken2WavModel() = default; + explicit Qwen2_5OmniToken2WavModel(const std::string& name, const Qwen2_5OmniToken2WavConfig& cfg) : nn::Module(name), cfg_(cfg) { + code2wav_dit_model_ = reg("code2wav_dit_model", cfg.dit_config); + code2wav_bigvgan_model_ = reg("code2wav_bigvgan_model", cfg.bigvgan_config); + } + + Tensor forward(const Tensor& code, const Tensor& conditioning, const Tensor& reference_mel, int32_t num_steps = 10, + float guidance_scale = 0.5f, float sway_coefficient = -1.0f) { + auto mel = code2wav_dit_model_.sample(conditioning, reference_mel, code, num_steps, guidance_scale, sway_coefficient); + if (!mel.isContiguous()) { mel = mel.contiguous(); } + return code2wav_bigvgan_model_.forward(mel); + } + + Tensor vocodeMel(const Tensor& mel) { + return code2wav_bigvgan_model_.forward(mel); + } + + private: + Qwen2_5OmniToken2WavConfig cfg_; + Qwen2_5OmniToken2WavDiTModel code2wav_dit_model_; + Qwen2_5OmniToken2WavBigVGANModel code2wav_bigvgan_model_; +}; + +} // namespace token2wav + +using token2wav::Qwen2_5OmniToken2WavBigVGANModel; +using token2wav::Qwen2_5OmniToken2WavDiTModel; +using token2wav::Qwen2_5OmniToken2WavModel; + +} // namespace mllm::models::qwen2_5omni From 5676edc00f354a96fad3c1cbc7c3bebd690dd90c Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Thu, 5 Mar 2026 15:32:14 +0800 Subject: [PATCH 17/42] add --- examples/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e5501e8cd..0f025fcf6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -2,7 +2,6 @@ 
add_subdirectory(qwen2vl) add_subdirectory(qwen2vl_tracer) add_subdirectory(qwen2_5vl) add_subdirectory(qwen2_5vl_tracer) -add_subdirectory(qwen2_5omni) add_subdirectory(minicpm_o45) add_subdirectory(llama) add_subdirectory(minicpm_o) From 3bdf6e0b03c1c66e0ee91e42e220738f73406f31 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Thu, 12 Mar 2026 19:01:35 +0800 Subject: [PATCH 18/42] fix --- examples/minicpm_o45/CMakeLists.txt | 4 + examples/minicpm_o45/main_dbg.cpp | 309 ++++++++++++++++++ .../minicpm_o45/tokenization_minicpm_o45.hpp | 12 +- 3 files changed, 319 insertions(+), 6 deletions(-) create mode 100644 examples/minicpm_o45/main_dbg.cpp diff --git a/examples/minicpm_o45/CMakeLists.txt b/examples/minicpm_o45/CMakeLists.txt index a755efda1..bf30aa52b 100644 --- a/examples/minicpm_o45/CMakeLists.txt +++ b/examples/minicpm_o45/CMakeLists.txt @@ -2,6 +2,10 @@ add_executable(mllm-minicpm-o45-runner main.cpp) target_link_libraries(mllm-minicpm-o45-runner PRIVATE MllmRT MllmCPUBackend) target_include_directories(mllm-minicpm-o45-runner PRIVATE ${MLLM_INCLUDE_DIR}) +add_executable(mllm-minicpm-o45-runner-dbg main_dbg.cpp) +target_link_libraries(mllm-minicpm-o45-runner-dbg PRIVATE MllmRT MllmCPUBackend) +target_include_directories(mllm-minicpm-o45-runner-dbg PRIVATE ${MLLM_INCLUDE_DIR}) + # add_executable(mllm-minicpm-o45-runner-python main_python.cpp) # target_link_libraries(mllm-minicpm-o45-runner-python PRIVATE MllmRT MllmCPUBackend) # target_include_directories(mllm-minicpm-o45-runner-python PRIVATE ${MLLM_INCLUDE_DIR}) diff --git a/examples/minicpm_o45/main_dbg.cpp b/examples/minicpm_o45/main_dbg.cpp new file mode 100644 index 000000000..899826c33 --- /dev/null +++ b/examples/minicpm_o45/main_dbg.cpp @@ -0,0 +1,309 @@ +// Copyright (c) MLLM Team. +// Licensed under the MIT License. 
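+//
+// Debug variant of the MiniCPM-o-4.5 example runner: it mirrors the main runner's
+// text / TTS-token / native token2wav pipeline and adds --debug_progress and
+// --debug_interval flags for step-level timing logs, plus a --tts_tokens_in path
+// that synthesizes audio directly from pre-generated TTS token ids.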
+ +#include +#include +#include +#include +#include +#include + +#include + +#include "mllm/mllm.hpp" +#include "mllm/models/minicpm_o45/configuration_minicpm_o45.hpp" +#include "mllm/models/minicpm_o45/modeling_minicpm_o45.hpp" +#include "mllm/models/minicpm_o45/modeling_minicpm_o45_token2wav.hpp" +#include "mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp" +#include "mllm/models/minicpm_o45/token2wav_prompt_cache.hpp" + +#include "wenet_audio/wav.h" + +using mllm::Argparse; + +//MLLM_MAIN({ +int main(int argc, char** argv) { + ::mllm::__setup_signal_handler(); + ::mllm::initializeContext(); + + mllm::Logger::level() = mllm::LogLevel::kError; + + auto& help = Argparse::add("-h|--help").help("Show help message"); + auto& model_path = Argparse::add("-m|--model_path").help("Model path").def(""); + auto& model_version = Argparse::add("-mv|--model_version").help("Model version: v1/v2").def("v1"); + auto& tokenizer_path = Argparse::add("-t|--tokenizer_path").help("Tokenizer path (tokenizer.json)").def(""); + auto& config_path = Argparse::add("-c|--config_path").help("Config path").def(""); + auto& prompt = Argparse::add("-p|--prompt").help("Prompt text").def("Describe the input."); + auto& image_path = Argparse::add("-i|--image").help("Optional image path").def(""); + auto& audio_path = Argparse::add("-a|--audio").help("Optional audio path (wav)").def(""); + auto& generate_tts_tokens = Argparse::add("-gt|--generate_tts_tokens") + .help("Generate TTS tokens (text->tts-token stage, no waveform)") + .def(false); + auto& text_max_new_tokens = Argparse::add("--text_max_new_tokens").help("Max new text tokens").def(512); + auto& tts_max_new_tokens = Argparse::add("--tts_max_new_tokens").help("Max new TTS tokens").def(1024); + auto& tts_min_new_tokens = Argparse::add("--tts_min_new_tokens").help("Min new TTS tokens").def(50); + auto& tts_force_no_stop = Argparse::add("--tts_force_no_stop").help("Disable TTS EOS stopping").def(false); + auto& tts_temperature = Argparse::add("--tts_temperature").help("TTS sampling temperature").def(0.8f); + auto& tts_top_k = Argparse::add("--tts_top_k").help("TTS top-k sampling (<=0 disables)").def(25); + auto& tts_top_p = Argparse::add("--tts_top_p").help("TTS top-p sampling (<=0 or >=1 disables)").def(0.85f); + auto& tts_repetition_penalty = + Argparse::add("--tts_repetition_penalty").help("TTS repetition penalty (1.0 disables)").def(1.05f); + auto& tts_repetition_window = + Argparse::add("--tts_repetition_window").help("TTS repetition window size in generated tokens").def(16); + auto& tts_greedy = Argparse::add("--tts_greedy").help("Use greedy decoding for TTS tokens").def(false); + auto& tts_tokens_out = Argparse::add("--tts_tokens_out").help("Output path for generated TTS token ids").def(""); + auto& tts_tokens_in = + Argparse::add("--tts_tokens_in").help("Input path for pre-generated TTS token ids (one per line or whitespace).").def(""); + auto& tts_wav_out = Argparse::add("--tts_wav_out") + .help("Output wav path. 
If set, run native C++ token2wav.")
+                          .def("");
+  auto& tts_token2wav_model_path = Argparse::add<std::string>("--tts_token2wav_model_path")
+                                       .help("Path to token2wav .mllm (if empty, fallback to --model_path).")
+                                       .def("");
+  auto& tts_token2wav_model_version = Argparse::add<std::string>("--tts_token2wav_model_version")
+                                          .help("token2wav model version: v1/v2")
+                                          .def("v1");
+  auto& tts_prompt_cache = Argparse::add<std::string>("--tts_prompt_cache")
+                               .help("Path to fixed prompt cache generated by export_prompt_cache.py")
+                               .def("");
+  auto& tts_token2wav_n_timesteps = Argparse::add<int32_t>("--tts_token2wav_n_timesteps")
+                                        .help("Flow diffusion steps for native token2wav")
+                                        .def(10);
+  auto& debug_progress = Argparse::add<bool>("--debug_progress").help("Print step-level debug progress.").def(false);
+  auto& debug_interval =
+      Argparse::add<int32_t>("--debug_interval").help("Token step interval for debug progress logs.").def(16);
+
+  Argparse::parse(argc, argv);
+
+  if (help.isSet()) {
+    Argparse::printHelp();
+    mllm::shutdownContext();
+    return 0;
+  }
+
+  mllm::ModelFileVersion file_version = mllm::ModelFileVersion::kV1;
+  if (model_version.get() == "v2") { file_version = mllm::ModelFileVersion::kV2; }
+
+  auto token2wav_model_path = tts_token2wav_model_path.get().empty() ? model_path.get() : tts_token2wav_model_path.get();
+  mllm::ModelFileVersion token2wav_file_version = mllm::ModelFileVersion::kV1;
+  if (tts_token2wav_model_version.get() == "v2") { token2wav_file_version = mllm::ModelFileVersion::kV2; }
+
+  auto run_native_token2wav = !tts_wav_out.get().empty();
+  if (run_native_token2wav && tts_prompt_cache.get().empty()) {
+    MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "--tts_prompt_cache is required when --tts_wav_out is set.");
+  }
+
+  auto debug_t0 = std::chrono::steady_clock::now();
+  auto debug_log = [&](const std::string& msg) {
+    if (!debug_progress.get()) { return; }
+    auto now = std::chrono::steady_clock::now();
+    auto sec = std::chrono::duration_cast<std::chrono::milliseconds>(now - debug_t0).count() / 1000.0;
+    fmt::print("[debug +{:.3f}s] {}\n", sec, msg);
+  };
+
+  if (!tts_tokens_in.get().empty()) {
+    if (!run_native_token2wav) {
+      MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "--tts_wav_out is required when --tts_tokens_in is set.");
+    }
+    if (token2wav_model_path.empty()) {
+      MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "Missing token2wav model path (--tts_token2wav_model_path or --model_path).");
+    }
+
+    std::ifstream ifs(tts_tokens_in.get());
+    if (!ifs.is_open()) { MLLM_ERROR_EXIT(mllm::ExitCode::kIOError, "Failed to open token file: {}", tts_tokens_in.get()); }
+    std::vector<int64_t> token_ids;
+    for (std::string line; std::getline(ifs, line);) {
+      if (line.empty()) { continue; }
+      std::stringstream ss(line);
+      while (!ss.eof()) {
+        int64_t token = 0;
+        ss >> token;
+        if (!ss.fail()) { token_ids.push_back(token); }
+      }
+    }
+    if (token_ids.empty()) { MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "No token id found in {}", tts_tokens_in.get()); }
+
+    fmt::print("Loaded {} TTS token IDs from {}\n", token_ids.size(), tts_tokens_in.get());
+    debug_log("Loading token2wav model and prompt cache...");
+    auto token2wav_param = mllm::load(token2wav_model_path, token2wav_file_version);
+    auto prompt_cache = mllm::models::minicpm_o45::loadMiniCPMO45Token2WavPromptCache(tts_prompt_cache.get());
+
+    mllm::models::minicpm_o45::MiniCPMO45Token2WavModel token2wav("token2wav", {});
+    token2wav.loadFromParameter(token2wav_param);
+    debug_log("Native token2wav model loaded.");
+
+    debug_log("Running native flow + HiFT...");
+    auto wav = token2wav.infer(token_ids, prompt_cache,
std::max(1, tts_token2wav_n_timesteps.get())); + auto wav_i16 = wav * 32767.0f; + wenet::WavWriter wav_writer(wav_i16.ptr(), wav_i16.shape().back(), 1, 24000, 16); + wav_writer.Write(tts_wav_out.get()); + fmt::print("Saved TTS waveform to {}\n", tts_wav_out.get()); + debug_log("Native token2wav finished."); + mllm::shutdownContext(); + return 0; + } + + if (model_path.get().empty() || tokenizer_path.get().empty() || config_path.get().empty()) { + Argparse::printHelp(); + MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, + "Missing required arguments: --model_path, --tokenizer_path, --config_path"); + } + + auto cfg = mllm::models::minicpm_o45::MiniCPMO45Config(config_path.get()); + + debug_log("Loading tokenizer and model modules..."); + auto tokenizer = mllm::models::minicpm_o45::MiniCPMO45Tokenizer(tokenizer_path.get(), cfg.vision_patch_size, cfg.audio_pool_step); + auto model = mllm::models::minicpm_o45::MiniCPMO45ForCausalLM(cfg); + + debug_log("Loading model parameters..."); + auto param = mllm::load(model_path.get(), file_version); + model.llm_.load(param); + model.vpm_.load(param); + model.resampler_.load(param); + model.apm_.load(param); + model.audio_projection_layer_.load(param); + if (generate_tts_tokens.get()) { model.tts_.loadFromParameter(param); } + debug_log("Model parameters loaded."); + + mllm::models::minicpm_o45::MiniCPMO45Message message; + message.prompt = prompt.get(); + message.img_file_path = image_path.get(); + message.audio_file_path = audio_path.get(); + + auto inputs = tokenizer.convertMessage(message, generate_tts_tokens.get()); + debug_log("Tokenizer convertMessage finished."); + + fmt::print("\n{:*^60}\n", " MiniCPM-o-4_5 CLI "); + fmt::print("Prompt: {}\n", message.prompt); + if (!message.img_file_path.empty()) { fmt::print("Image : {}\n", message.img_file_path); } + if (!message.audio_file_path.empty()) { fmt::print("Audio : {}\n", message.audio_file_path); } + + if (!generate_tts_tokens.get()) { + fmt::print("\nResponse: "); + for (auto& step : model.chat(inputs)) { + std::wcout << tokenizer.detokenize(step.cur_token_id) << std::flush; + } + fmt::print("\n"); + } else { + auto tts_eos_id = tokenizer.lookupTokenId(L"<|tts_eos|>"); + auto im_end_id = tokenizer.lookupTokenId(L"<|im_end|>"); + auto eot_id = tokenizer.lookupTokenId(L"<|endoftext|>"); + + std::vector stop_token_ids = { + tts_eos_id, + im_end_id, + eot_id, + cfg.eos_token_id, + }; + + debug_log("Start text generation for TTS conditioning..."); + auto text_out = model.generateTextWithHidden( + inputs, text_max_new_tokens.get(), stop_token_ids, false, 1.0f, 0, 0.0f, + [&](int32_t step, int64_t token_id) { + auto interval = std::max(debug_interval.get(), 1); + if (debug_progress.get() && (step == 1 || (step % interval) == 0)) { + debug_log(fmt::format("Text generation step {} (token_id={})", step, token_id)); + } + }); + debug_log(fmt::format("Text generation done, generated_tokens={}", text_out.generated_tokens.size())); + + fmt::print("\nGenerated text tokens: {}\n", text_out.generated_tokens.size()); + fmt::print("Text (for TTS conditioning): "); + + std::vector tts_text_tokens; + std::vector tts_hidden_states; + for (size_t i = 0; i < text_out.aligned_tokens.size() && i < text_out.aligned_hidden_states.size(); ++i) { + auto token_id = text_out.aligned_tokens[i]; + if (token_id == tts_eos_id || token_id == im_end_id || token_id == eot_id || token_id == cfg.eos_token_id) { break; } + tts_text_tokens.push_back(token_id); + tts_hidden_states.push_back(text_out.aligned_hidden_states[i]); + std::wcout << 
tokenizer.detokenize(token_id) << std::flush; + } + fmt::print("\n"); + + if (tts_text_tokens.empty()) { + MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, + "No text token available before <|tts_eos|>/<|im_end|>; cannot build TTS condition."); + } + + auto condition_embeds = model.tts_.makeConditionEmbeddings(tts_text_tokens, tts_hidden_states); + if (condition_embeds.isNil()) { + MLLM_ERROR_EXIT(mllm::ExitCode::kCoreError, "Failed to build TTS conditioning embeddings."); + } + debug_log(fmt::format("Built TTS condition embeddings from {} text tokens.", tts_text_tokens.size())); + + mllm::models::minicpm_o45::MiniCPMO45TTSGenerationConfig tts_cfg; + tts_cfg.max_new_tokens = tts_max_new_tokens.get(); + tts_cfg.min_new_tokens = tts_min_new_tokens.get(); + tts_cfg.force_no_stop = tts_force_no_stop.get(); + tts_cfg.do_sample = !tts_greedy.get(); + tts_cfg.temperature = {tts_temperature.get()}; + tts_cfg.top_k = tts_top_k.get(); + tts_cfg.top_p = tts_top_p.get(); + tts_cfg.repetition_penalty = tts_repetition_penalty.get(); + tts_cfg.repetition_penalty_window = tts_repetition_window.get(); + tts_cfg.debug_interval = std::max(debug_interval.get(), 1); + if (debug_progress.get()) { + tts_cfg.step_callback = [&](int32_t step, const std::vector& tokens, bool has_eos) { + auto first_token = tokens.empty() ? -1 : tokens[0]; + debug_log(fmt::format("TTS generation step {} (first_vq_token={}, has_eos={})", step, first_token, + has_eos ? "true" : "false")); + }; + } + + debug_log("Start TTS token generation..."); + auto tts_out = model.tts_.generate(condition_embeds, tts_cfg); + debug_log("TTS token generation finished."); + if (tts_out.new_ids.isNil()) { + fmt::print("Generated TTS tokens: 0\n"); + } else { + auto token_count = tts_out.new_ids.shape()[1]; + fmt::print("Generated TTS tokens: {} (finished={})\n", token_count, tts_out.finished ? "true" : "false"); + + std::vector token_ids; + token_ids.reserve(token_count); + for (int32_t i = 0; i < token_count; ++i) { token_ids.push_back(tts_out.new_ids.at({0, i, 0})); } + + fmt::print("TTS token IDs:\n"); + for (size_t i = 0; i < token_ids.size(); ++i) { + fmt::print("{}{}", token_ids[i], (i + 1 == token_ids.size() ? 
"\n" : " ")); + } + + if (!tts_tokens_out.get().empty()) { + std::ofstream ofs(tts_tokens_out.get()); + if (!ofs.is_open()) { + MLLM_ERROR_EXIT(mllm::ExitCode::kIOError, "Failed to open output file: {}", tts_tokens_out.get()); + } + for (auto id : token_ids) { ofs << std::to_string(id) << '\n'; } + fmt::print("Saved TTS token ids to {}\n", tts_tokens_out.get()); + debug_log(fmt::format("Saved token ids to {}", tts_tokens_out.get())); + } + + if (!tts_wav_out.get().empty()) { + debug_log("Loading token2wav model and prompt cache..."); + auto token2wav_param = mllm::load(token2wav_model_path, token2wav_file_version); + auto prompt_cache = mllm::models::minicpm_o45::loadMiniCPMO45Token2WavPromptCache(tts_prompt_cache.get()); + + mllm::models::minicpm_o45::MiniCPMO45Token2WavModel token2wav("token2wav", {}); + token2wav.loadFromParameter(token2wav_param); + debug_log("Native token2wav model loaded."); + + debug_log("Running native flow + HiFT..."); + auto wav = token2wav.infer(token_ids, prompt_cache, std::max(1, tts_token2wav_n_timesteps.get())); + auto wav_i16 = wav * 32767.0f; + wenet::WavWriter wav_writer(wav_i16.ptr(), wav_i16.shape().back(), 1, 24000, 16); + wav_writer.Write(tts_wav_out.get()); + fmt::print("Saved TTS waveform to {}\n", tts_wav_out.get()); + debug_log("Native token2wav finished."); + } + } + } + + model.perfSummary(); + mllm::memoryReport(); + + ::mllm::shutdownContext(); + return 0; +} + +//}) diff --git a/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp b/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp index 0a0e00ca0..293026745 100644 --- a/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp +++ b/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp @@ -149,8 +149,8 @@ struct MiniCPMO45Message { if (!system_prompt.empty()) { result += "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"; } result += "<|im_start|>user\n"; - if (!img_file_path.empty()) { result += "(./)"; } - if (!audio_file_path.empty()) { result += "()"; } + if (!img_file_path.empty()) { result += "./"; } + if (!audio_file_path.empty()) { result += ""; } if (!prompt.empty()) { if (!img_file_path.empty() || !audio_file_path.empty()) { result += "\n"; } @@ -160,7 +160,7 @@ struct MiniCPMO45Message { result += "<|im_end|>\n"; result += "<|im_start|>assistant\n"; - if (generate_audio) { result += "<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>"; } + if (generate_audio) { result += "\n\n\n\n<|tts_bos|>"; } return result; } }; @@ -326,7 +326,7 @@ class MiniCPMO45Tokenizer final : public mllm::preprocessor::AutoTokenizer { } if (has_image) { - std::regex img_pattern(R"(\(\./\))"); + std::regex img_pattern(R"(\./)"); std::vector image_tags; std::sregex_iterator iter(applied_string.begin(), applied_string.end(), img_pattern); std::sregex_iterator end; @@ -355,9 +355,9 @@ class MiniCPMO45Tokenizer final : public mllm::preprocessor::AutoTokenizer { if (has_audio) { auto audio_placeholder = getAudioPlaceholder(audio_length, false); - size_t audio_placeholder_pos = applied_string.find("()"); + size_t audio_placeholder_pos = applied_string.find(""); if (audio_placeholder_pos != std::string::npos) { - applied_string.replace(audio_placeholder_pos, std::string("()").size(), audio_placeholder); + applied_string.replace(audio_placeholder_pos, std::string("").size(), audio_placeholder); } } From 571b93d11275f87fcca4a1cd96ebd2374978217a Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Thu, 12 Mar 2026 22:06:30 +0800 Subject: [PATCH 19/42] add minicpm-o4.5 system ref audio prompt path --- 
examples/minicpm_o45/main.cpp | 13 ++++ .../minicpm_o45/tokenization_minicpm_o45.hpp | 65 ++++++++++++++++--- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/examples/minicpm_o45/main.cpp b/examples/minicpm_o45/main.cpp index 482428038..c78c6622b 100644 --- a/examples/minicpm_o45/main.cpp +++ b/examples/minicpm_o45/main.cpp @@ -32,6 +32,15 @@ MLLM_MAIN({ auto& prompt = Argparse::add("-p|--prompt").help("Prompt text").def("Describe the input."); auto& image_path = Argparse::add("-i|--image").help("Optional image path").def(""); auto& audio_path = Argparse::add("-a|--audio").help("Optional audio path (wav)").def(""); + auto& ref_audio_path = Argparse::add("--ref_audio") + .help("Optional reference audio path for system voice-cloning prompt (wav).") + .def(""); + auto& ref_audio_prompt_prefix = Argparse::add("--ref_audio_prompt_prefix") + .help("System prompt prefix placed before reference audio.") + .def("Clone the voice in the provided audio prompt."); + auto& ref_audio_prompt_suffix = Argparse::add("--ref_audio_prompt_suffix") + .help("System prompt suffix placed after reference audio.") + .def("As an assistant, you will speak using this voice style."); auto& generate_tts_tokens = Argparse::add("-gt|--generate_tts_tokens") .help("Generate TTS tokens (text->tts-token stage, no waveform)") .def(false); @@ -165,6 +174,9 @@ MLLM_MAIN({ message.prompt = prompt.get(); message.img_file_path = image_path.get(); message.audio_file_path = audio_path.get(); + message.ref_audio_file_path = ref_audio_path.get(); + message.ref_audio_prompt_prefix = ref_audio_prompt_prefix.get(); + message.ref_audio_prompt_suffix = ref_audio_prompt_suffix.get(); auto inputs = tokenizer.convertMessage(message, generate_tts_tokens.get()); debug_log("Tokenizer convertMessage finished."); @@ -173,6 +185,7 @@ MLLM_MAIN({ fmt::print("Prompt: {}\n", message.prompt); if (!message.img_file_path.empty()) { fmt::print("Image : {}\n", message.img_file_path); } if (!message.audio_file_path.empty()) { fmt::print("Audio : {}\n", message.audio_file_path); } + if (!message.ref_audio_file_path.empty()) { fmt::print("RefAudio : {}\n", message.ref_audio_file_path); } if (!generate_tts_tokens.get()) { fmt::print("\nResponse: "); diff --git a/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp b/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp index 293026745..5ccd1c838 100644 --- a/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp +++ b/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp @@ -141,12 +141,23 @@ struct MiniCPMO45Message { std::string prompt; std::string img_file_path; std::string audio_file_path; + std::string ref_audio_file_path; std::string system_prompt = "You are a helpful assistant. 
You can accept video, audio and text input and output voice and text."; + std::string ref_audio_prompt_prefix = "Clone the voice in the provided audio prompt."; + std::string ref_audio_prompt_suffix = "As an assistant, you will speak using this voice style."; [[nodiscard]] std::string buildChatMessage(bool generate_audio = false) const { std::string result; - if (!system_prompt.empty()) { result += "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"; } + if (!ref_audio_file_path.empty()) { + result += "<|im_start|>system\n"; + if (!ref_audio_prompt_prefix.empty()) { result += ref_audio_prompt_prefix + "\n"; } + result += ""; + if (!ref_audio_prompt_suffix.empty()) { result += "\n" + ref_audio_prompt_suffix; } + result += "<|im_end|>\n"; + } else if (!system_prompt.empty()) { + result += "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"; + } result += "<|im_start|>user\n"; if (!img_file_path.empty()) { result += "./"; } @@ -299,7 +310,9 @@ class MiniCPMO45Tokenizer final : public mllm::preprocessor::AutoTokenizer { ARGenerationOutputPast convertMessage(const MiniCPMO45Message& message, bool generate_audio_prompt = false) { bool has_image = !message.img_file_path.empty(); - bool has_audio = !message.audio_file_path.empty(); + bool has_ref_audio = !message.ref_audio_file_path.empty(); + bool has_user_audio = !message.audio_file_path.empty(); + bool has_audio = has_ref_audio || has_user_audio; auto applied_string = message.buildChatMessage(generate_audio_prompt); @@ -309,7 +322,8 @@ class MiniCPMO45Tokenizer final : public mllm::preprocessor::AutoTokenizer { std::vector grid; Tensor audio_features = Tensor::nil(); - int32_t audio_length = 0; + std::vector audio_lengths; + std::vector audio_feature_list; if (has_image) { auto [tensors, orig_size, target_sizes, img_grid] = image_preprocessor_.process(message.img_file_path); @@ -319,10 +333,40 @@ class MiniCPMO45Tokenizer final : public mllm::preprocessor::AutoTokenizer { grid = std::move(img_grid); } - if (has_audio) { + if (has_ref_audio) { + auto audio_data = mllm::audio::readWAV(message.ref_audio_file_path, 16000); + auto audio_length = static_cast(audio_data.size()); + if (audio_length > 0) { + audio_lengths.push_back(audio_length); + auto ref_audio_features = audio_preprocessor_.processAudioData(audio_data.data(), audio_length); + if (!ref_audio_features.isNil()) { audio_feature_list.push_back(ref_audio_features); } + } + } + + if (has_user_audio) { auto audio_data = mllm::audio::readWAV(message.audio_file_path, 16000); - audio_length = static_cast(audio_data.size()); - audio_features = audio_preprocessor_.processAudioData(audio_data.data(), audio_length); + auto audio_length = static_cast(audio_data.size()); + if (audio_length > 0) { + audio_lengths.push_back(audio_length); + auto user_audio_features = audio_preprocessor_.processAudioData(audio_data.data(), audio_length); + if (!user_audio_features.isNil()) { audio_feature_list.push_back(user_audio_features); } + } + } + + if (!audio_feature_list.empty()) { + int32_t batch = static_cast(audio_feature_list.size()); + auto channels = audio_feature_list[0].shape()[1]; + auto frames = audio_feature_list[0].shape()[2]; + audio_features = Tensor::empty({batch, channels, frames}, kFloat32, kCPU) + .setMemType(kExtraInput) + .setName("audio_features") + .alloc(); + auto* dst = audio_features.ptr(); + auto single_size = static_cast(channels) * static_cast(frames); + for (int32_t i = 0; i < batch; ++i) { + std::memcpy(dst + static_cast(i) * single_size, audio_feature_list[i].ptr(), + 
single_size * sizeof(float)); + } } if (has_image) { @@ -354,10 +398,13 @@ class MiniCPMO45Tokenizer final : public mllm::preprocessor::AutoTokenizer { } if (has_audio) { - auto audio_placeholder = getAudioPlaceholder(audio_length, false); - size_t audio_placeholder_pos = applied_string.find(""); - if (audio_placeholder_pos != std::string::npos) { + size_t search_pos = 0; + for (auto audio_length : audio_lengths) { + auto audio_placeholder = getAudioPlaceholder(audio_length, false); + auto audio_placeholder_pos = applied_string.find("", search_pos); + if (audio_placeholder_pos == std::string::npos) { break; } applied_string.replace(audio_placeholder_pos, std::string("").size(), audio_placeholder); + search_pos = audio_placeholder_pos + audio_placeholder.size(); } } From d7c1b3061fc0666be8fc49991134728527f48c28 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Wed, 25 Mar 2026 13:40:11 +0800 Subject: [PATCH 20/42] fix --- examples/minicpm_o45/main.cpp | 5 +- examples/minicpm_o45/main_dbg.cpp | 18 +- .../quant_cfg_gguf_q4_0_aggressive.json | 359 ++++++++++++++++++ examples/minicpm_o45/quant_cfg_gguf_q4_k.json | 101 +++++ .../minicpm_o2_6/modeling_resampler.hpp | 106 +++--- mllm/models/minicpm_o2_6/modeling_siglip.hpp | 71 ++-- .../minicpm_o45/modeling_minicpm_o45.hpp | 30 +- .../minicpm_o45/tokenization_minicpm_o45.hpp | 2 +- 8 files changed, 602 insertions(+), 90 deletions(-) create mode 100644 examples/minicpm_o45/quant_cfg_gguf_q4_0_aggressive.json create mode 100644 examples/minicpm_o45/quant_cfg_gguf_q4_k.json diff --git a/examples/minicpm_o45/main.cpp b/examples/minicpm_o45/main.cpp index c78c6622b..3c7cdf6ab 100644 --- a/examples/minicpm_o45/main.cpp +++ b/examples/minicpm_o45/main.cpp @@ -271,7 +271,10 @@ MLLM_MAIN({ std::vector token_ids; token_ids.reserve(token_count); - for (int32_t i = 0; i < token_count; ++i) { token_ids.push_back(tts_out.new_ids.at({0, i, 0})); } + auto tts_ids = tts_out.new_ids.contiguous(); + const auto* tts_ids_ptr = tts_ids.ptr(); + auto num_vq = tts_ids.shape()[2]; + for (int32_t i = 0; i < token_count; ++i) { token_ids.push_back(tts_ids_ptr[static_cast(i) * num_vq]); } fmt::print("TTS token IDs:\n"); for (size_t i = 0; i < token_ids.size(); ++i) { diff --git a/examples/minicpm_o45/main_dbg.cpp b/examples/minicpm_o45/main_dbg.cpp index 899826c33..a3d4a78af 100644 --- a/examples/minicpm_o45/main_dbg.cpp +++ b/examples/minicpm_o45/main_dbg.cpp @@ -36,6 +36,15 @@ int main(int argc, char** argv) { auto& prompt = Argparse::add("-p|--prompt").help("Prompt text").def("Describe the input."); auto& image_path = Argparse::add("-i|--image").help("Optional image path").def(""); auto& audio_path = Argparse::add("-a|--audio").help("Optional audio path (wav)").def(""); + auto& ref_audio_path = Argparse::add("--ref_audio") + .help("Optional reference audio path for system voice-cloning prompt (wav).") + .def(""); + auto& ref_audio_prompt_prefix = Argparse::add("--ref_audio_prompt_prefix") + .help("System prompt prefix placed before reference audio.") + .def("Clone the voice in the provided audio prompt."); + auto& ref_audio_prompt_suffix = Argparse::add("--ref_audio_prompt_suffix") + .help("System prompt suffix placed after reference audio.") + .def("As an assistant, you will speak using this voice style."); auto& generate_tts_tokens = Argparse::add("-gt|--generate_tts_tokens") .help("Generate TTS tokens (text->tts-token stage, no waveform)") .def(false); @@ -169,6 +178,9 @@ int main(int argc, char** argv) { message.prompt = prompt.get(); 
message.img_file_path = image_path.get(); message.audio_file_path = audio_path.get(); + message.ref_audio_file_path = ref_audio_path.get(); + message.ref_audio_prompt_prefix = ref_audio_prompt_prefix.get(); + message.ref_audio_prompt_suffix = ref_audio_prompt_suffix.get(); auto inputs = tokenizer.convertMessage(message, generate_tts_tokens.get()); debug_log("Tokenizer convertMessage finished."); @@ -177,6 +189,7 @@ int main(int argc, char** argv) { fmt::print("Prompt: {}\n", message.prompt); if (!message.img_file_path.empty()) { fmt::print("Image : {}\n", message.img_file_path); } if (!message.audio_file_path.empty()) { fmt::print("Audio : {}\n", message.audio_file_path); } + if (!message.ref_audio_file_path.empty()) { fmt::print("RefAudio : {}\n", message.ref_audio_file_path); } if (!generate_tts_tokens.get()) { fmt::print("\nResponse: "); @@ -262,7 +275,10 @@ int main(int argc, char** argv) { std::vector token_ids; token_ids.reserve(token_count); - for (int32_t i = 0; i < token_count; ++i) { token_ids.push_back(tts_out.new_ids.at({0, i, 0})); } + auto tts_ids = tts_out.new_ids.contiguous(); + const auto* tts_ids_ptr = tts_ids.ptr(); + auto num_vq = tts_ids.shape()[2]; + for (int32_t i = 0; i < token_count; ++i) { token_ids.push_back(tts_ids_ptr[static_cast(i) * num_vq]); } fmt::print("TTS token IDs:\n"); for (size_t i = 0; i < token_ids.size(); ++i) { diff --git a/examples/minicpm_o45/quant_cfg_gguf_q4_0_aggressive.json b/examples/minicpm_o45/quant_cfg_gguf_q4_0_aggressive.json new file mode 100644 index 000000000..35c587d48 --- /dev/null +++ b/examples/minicpm_o45/quant_cfg_gguf_q4_0_aggressive.json @@ -0,0 +1,359 @@ +{ + "^llm\\.model\\.embed_tokens\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 151748, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.q_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4096, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.k_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 1024, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.v_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 1024, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.o_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4096, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.mlp\\.gate_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 12288, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.mlp\\.up_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 12288, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.mlp\\.down_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4096, + 12288 + ], + "replace": true + } + }, + "^llm\\.lm_head\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 151748, + 4096 + ], + "replace": true + } + }, + + "^vpm\\.embeddings\\.position_embedding\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4900, + 1152 + ], + "replace": true + } + }, + "^vpm\\.encoder\\.layers\\.\\d+\\.self_attn\\.q_proj\\.weight$": { + "hints": { + 
"quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 1152, + 1152 + ], + "replace": true + } + }, + "^vpm\\.encoder\\.layers\\.\\d+\\.self_attn\\.k_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 1152, + 1152 + ], + "replace": true + } + }, + "^vpm\\.encoder\\.layers\\.\\d+\\.self_attn\\.v_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 1152, + 1152 + ], + "replace": true + } + }, + "^vpm\\.encoder\\.layers\\.\\d+\\.self_attn\\.out_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 1152, + 1152 + ], + "replace": true + } + }, + "^vpm\\.encoder\\.layers\\.\\d+\\.mlp\\.fc1\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4304, + 1152 + ], + "replace": true + } + }, + "^vpm\\.encoder\\.layers\\.\\d+\\.mlp\\.fc2\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 1152, + 4304 + ], + "replace": true + } + }, + + "^resampler\\.kv_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4096, + 1152 + ], + "replace": true + } + }, + "^resampler\\.attn\\.out_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4096, + 4096 + ], + "replace": true + } + }, + + "^audio_projection_layer\\.linear1\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4096, + 1024 + ], + "replace": true + } + }, + "^audio_projection_layer\\.linear2\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 4096, + 4096 + ], + "replace": true + } + }, + + "^tts\\.projector_spk\\.linear1\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 4096 + ], + "replace": true + } + }, + "^tts\\.projector_spk\\.linear2\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 768 + ], + "replace": true + } + }, + "^tts\\.projector_semantic\\.linear1\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 4096 + ], + "replace": true + } + }, + "^tts\\.projector_semantic\\.linear2\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 768 + ], + "replace": true + } + }, + + "^tts\\.model\\.layers\\.\\d+\\.self_attn\\.q_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 768 + ], + "replace": true + } + }, + "^tts\\.model\\.layers\\.\\d+\\.self_attn\\.k_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 768 + ], + "replace": true + } + }, + "^tts\\.model\\.layers\\.\\d+\\.self_attn\\.v_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 768 + ], + "replace": true + } + }, + "^tts\\.model\\.layers\\.\\d+\\.self_attn\\.o_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 768 + ], + "replace": true + } + }, + "^tts\\.model\\.layers\\.\\d+\\.mlp\\.gate_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 3072, + 768 + ], + "replace": true + } + }, + "^tts\\.model\\.layers\\.\\d+\\.mlp\\.up_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 3072, + 768 + ], + "replace": true + } + }, + 
"^tts\\.model\\.layers\\.\\d+\\.mlp\\.down_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 768, + 3072 + ], + "replace": true + } + }, + "^tts\\.emb_text\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_0", + "shape": [ + 152064, + 768 + ], + "replace": true + } + } +} diff --git a/examples/minicpm_o45/quant_cfg_gguf_q4_k.json b/examples/minicpm_o45/quant_cfg_gguf_q4_k.json new file mode 100644 index 000000000..59aecafb0 --- /dev/null +++ b/examples/minicpm_o45/quant_cfg_gguf_q4_k.json @@ -0,0 +1,101 @@ +{ + "^llm\\.model\\.embed_tokens\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_K", + "shape": [ + 151748, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.q_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_K", + "shape": [ + 4096, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.k_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_K", + "shape": [ + 1024, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.v_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q5_K", + "shape": [ + 1024, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.self_attn\\.o_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_K", + "shape": [ + 4096, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.mlp\\.gate_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_K", + "shape": [ + 12288, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.mlp\\.up_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q4_K", + "shape": [ + 12288, + 4096 + ], + "replace": true + } + }, + "^llm\\.model\\.layers\\.\\d+\\.mlp\\.down_proj\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q5_K", + "shape": [ + 4096, + 12288 + ], + "replace": true + } + }, + "^llm\\.lm_head\\.weight$": { + "hints": { + "quant_method": "gguf", + "gguf_type": "Q6_K", + "shape": [ + 151748, + 4096 + ], + "replace": true + } + } +} diff --git a/mllm/models/minicpm_o2_6/modeling_resampler.hpp b/mllm/models/minicpm_o2_6/modeling_resampler.hpp index f447521bd..adc80a39f 100644 --- a/mllm/models/minicpm_o2_6/modeling_resampler.hpp +++ b/mllm/models/minicpm_o2_6/modeling_resampler.hpp @@ -99,34 +99,43 @@ class ResamplerAttention : public nn::Module { q = q + q_bias; k = k + k_bias; v = v + v_bias; + q = q.contiguous(); + k = k.contiguous(); + v = v.contiguous(); auto q_reshaped = Tensor::empty({num_heads_, num_queries, head_dim_}, kFloat32).alloc(); + const auto* q_ptr = q.ptr(); + auto* q_reshaped_ptr = q_reshaped.ptr(); for (int nq = 0; nq < num_queries; nq++) { + auto q_row_ptr = q_ptr + static_cast(nq) * embed_dim_; for (int h = 0; h < num_heads_; h++) { - for (int d = 0; d < head_dim_; d++) { - float val = q.at({nq, h * head_dim_ + d}); - *q_reshaped.offsettedPtr({h, nq, d}) = val; - } + auto src_ptr = q_row_ptr + h * head_dim_; + auto dst_ptr = q_reshaped_ptr + (static_cast(h) * num_queries + nq) * head_dim_; + std::memcpy(dst_ptr, src_ptr, static_cast(head_dim_) * sizeof(float)); } } q = q_reshaped; // [num_heads, num_queries, head_dim] auto k_reshaped = Tensor::empty({num_heads_, seq_len, head_dim_}, kFloat32).alloc(); + const auto* k_ptr = k.ptr(); + auto* k_reshaped_ptr = k_reshaped.ptr(); for (int s = 0; s < seq_len; s++) { + 
auto k_row_ptr = k_ptr + static_cast(s) * embed_dim_; for (int h = 0; h < num_heads_; h++) { - for (int d = 0; d < head_dim_; d++) { - float val = k.at({s, h * head_dim_ + d}); - *k_reshaped.offsettedPtr({h, s, d}) = val; - } + auto src_ptr = k_row_ptr + h * head_dim_; + auto dst_ptr = k_reshaped_ptr + (static_cast(h) * seq_len + s) * head_dim_; + std::memcpy(dst_ptr, src_ptr, static_cast(head_dim_) * sizeof(float)); } } k = k_reshaped; auto v_reshaped = Tensor::empty({num_heads_, seq_len, head_dim_}, kFloat32).alloc(); + const auto* v_ptr = v.ptr(); + auto* v_reshaped_ptr = v_reshaped.ptr(); for (int s = 0; s < seq_len; s++) { + auto v_row_ptr = v_ptr + static_cast(s) * embed_dim_; for (int h = 0; h < num_heads_; h++) { - for (int d = 0; d < head_dim_; d++) { - float val = v.at({s, h * head_dim_ + d}); - *v_reshaped.offsettedPtr({h, s, d}) = val; - } + auto src_ptr = v_row_ptr + h * head_dim_; + auto dst_ptr = v_reshaped_ptr + (static_cast(h) * seq_len + s) * head_dim_; + std::memcpy(dst_ptr, src_ptr, static_cast(head_dim_) * sizeof(float)); } } v = v_reshaped; @@ -140,10 +149,12 @@ class ResamplerAttention : public nn::Module { if (has_key_padding_mask && key_padding_mask.numel() > 0) { auto mask_value = -std::numeric_limits::infinity(); + auto key_padding_mask_contiguous = key_padding_mask.isContiguous() ? key_padding_mask : key_padding_mask.contiguous(); + const auto* key_padding_mask_ptr = key_padding_mask_contiguous.ptr(); for (int32_t h = 0; h < num_heads_; ++h) { for (int32_t q_idx = 0; q_idx < num_queries; ++q_idx) { for (int32_t s = 0; s < seq_len; ++s) { - if (key_padding_mask.at({s}) == 1) { *attn_weights.offsettedPtr({h, q_idx, s}) = mask_value; } + if (key_padding_mask_ptr[s] == 1) { *attn_weights.offsettedPtr({h, q_idx, s}) = mask_value; } } } } @@ -152,14 +163,16 @@ class ResamplerAttention : public nn::Module { attn_weights = nn::functional::softmax(attn_weights.unsqueeze(0), -1).squeeze(0); auto attn_output = nn::functional::matmul(attn_weights, v); // [num_heads, num_queries, head_dim] + attn_output = attn_output.contiguous(); auto attn_output_reshaped = Tensor::empty({num_queries, embed_dim_}, kFloat32).alloc(); + const auto* attn_output_ptr = attn_output.ptr(); + auto* attn_output_reshaped_ptr = attn_output_reshaped.ptr(); for (int h = 0; h < num_heads_; h++) { for (int nq = 0; nq < num_queries; nq++) { - for (int d = 0; d < head_dim_; d++) { - float val = attn_output.at({h, nq, d}); - *attn_output_reshaped.offsettedPtr({nq, h * head_dim_ + d}) = val; - } + auto src_ptr = attn_output_ptr + (static_cast(h) * num_queries + nq) * head_dim_; + auto dst_ptr = attn_output_reshaped_ptr + static_cast(nq) * embed_dim_ + h * head_dim_; + std::memcpy(dst_ptr, src_ptr, static_cast(head_dim_) * sizeof(float)); } } attn_output = attn_output_reshaped; @@ -224,11 +237,15 @@ class Resampler : public nn::Module { std::vector patch_len(batch_size); int max_h = 0, max_w = 0, max_patch_len = 0; + auto tgt_sizes_contiguous = tgt_sizes.isContiguous() ? 
tgt_sizes : tgt_sizes.contiguous(); + const auto* tgt_sizes_ptr = tgt_sizes_contiguous.ptr(); for (int i = 0; i < batch_size; i++) { - patch_len[i] = tgt_sizes.at({i, 0}) * tgt_sizes.at({i, 1}); + auto tgt_h = tgt_sizes_ptr[i * 2]; + auto tgt_w = tgt_sizes_ptr[i * 2 + 1]; + patch_len[i] = tgt_h * tgt_w; if (patch_len[i] > max_patch_len) max_patch_len = patch_len[i]; - if (tgt_sizes.at({i, 0}) > max_h) max_h = tgt_sizes.at({i, 0}); - if (tgt_sizes.at({i, 1}) > max_w) max_w = tgt_sizes.at({i, 1}); + if (tgt_h > max_h) max_h = tgt_h; + if (tgt_w > max_w) max_w = tgt_w; } if (max_h > max_size_[0] || max_w > max_size_[1]) { @@ -238,30 +255,35 @@ class Resampler : public nn::Module { registerBuffer("pos_embed", new_pos_embed); } - auto pos_embed = getBuffer("pos_embed"); // [max_h, max_w, embed_dim] + auto pos_embed = getBuffer("pos_embed").contiguous(); // [max_h, max_w, embed_dim] + const auto* pos_embed_ptr = pos_embed.ptr(); auto key_padding_mask = Tensor::empty({batch_size, max_patch_len}, kUInt8).alloc(); + auto* key_padding_mask_ptr = key_padding_mask.ptr(); for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < max_patch_len; j++) { key_padding_mask.at({i, j}) = 1; } - for (int j = 0; j < patch_len[i] && j < max_patch_len; j++) { key_padding_mask.at({i, j}) = 0; } + auto* key_padding_mask_row_ptr = key_padding_mask_ptr + static_cast(i) * max_patch_len; + std::memset(key_padding_mask_row_ptr, 1, static_cast(max_patch_len)); + if (patch_len[i] > 0) { + std::memset(key_padding_mask_row_ptr, 0, static_cast(std::min(patch_len[i], max_patch_len))); + } } std::vector pos_embed_list; for (int i = 0; i < batch_size; i++) { - int32_t tgt_h = tgt_sizes.at({i, 0}); - int32_t tgt_w = tgt_sizes.at({i, 1}); + int32_t tgt_h = tgt_sizes_ptr[i * 2]; + int32_t tgt_w = tgt_sizes_ptr[i * 2 + 1]; int32_t patch_count = tgt_h * tgt_w; Tensor pos_embed_i = Tensor::empty({patch_count, embed_dim_}, kFloat32).alloc(); + auto* pos_embed_i_ptr = pos_embed_i.ptr(); int patch_idx = 0; for (int h = 0; h < tgt_h; h++) { for (int w = 0; w < tgt_w; w++) { - for (int d = 0; d < embed_dim_; d++) { - float value = pos_embed.at({h, w, d}); - *pos_embed_i.offsettedPtr({patch_idx, d}) = value; - } + auto src_ptr = pos_embed_ptr + (static_cast(h) * max_w + w) * embed_dim_; + auto dst_ptr = pos_embed_i_ptr + static_cast(patch_idx) * embed_dim_; + std::memcpy(dst_ptr, src_ptr, static_cast(embed_dim_) * sizeof(float)); patch_idx++; } } @@ -270,18 +292,22 @@ class Resampler : public nn::Module { } Tensor pos_embed_padded = Tensor::empty({batch_size, max_patch_len, embed_dim_}, kFloat32).alloc(); + auto* pos_embed_padded_ptr = pos_embed_padded.ptr(); for (int i = 0; i < batch_size; i++) { - auto& pos_embed_i = pos_embed_list[i]; + auto pos_embed_i = pos_embed_list[i].contiguous(); int actual_len = pos_embed_i.shape()[0]; + const auto* pos_embed_i_ptr = pos_embed_i.ptr(); + auto* pos_embed_padded_batch_ptr = pos_embed_padded_ptr + static_cast(i) * max_patch_len * embed_dim_; - for (int j = 0; j < actual_len && j < max_patch_len; j++) { - for (int k = 0; k < embed_dim_; k++) { - *pos_embed_padded.offsettedPtr({i, j, k}) = pos_embed_i.at({j, k}); - } + auto rows_to_copy = std::min(actual_len, max_patch_len); + if (rows_to_copy > 0) { + std::memcpy(pos_embed_padded_batch_ptr, pos_embed_i_ptr, + static_cast(rows_to_copy) * embed_dim_ * sizeof(float)); } - for (int j = actual_len; j < max_patch_len; j++) { - for (int k = 0; k < embed_dim_; k++) { *pos_embed_padded.offsettedPtr({i, j, k}) = 0.0f; } + if (rows_to_copy < max_patch_len) { 
+ std::memset(pos_embed_padded_batch_ptr + static_cast(rows_to_copy) * embed_dim_, 0, + static_cast(max_patch_len - rows_to_copy) * embed_dim_ * sizeof(float)); } } @@ -315,13 +341,7 @@ class Resampler : public nn::Module { // key_padding_mask for this batch Tensor key_padding_mask_b = key_padding_mask[{b, kAll}].view({max_patch_len}); - bool has_padding = false; - for (int i = 0; i < seq_len; i++) { - if (key_padding_mask_b.at({i}) == 1) { - has_padding = true; - break; - } - } + bool has_padding = patch_len[b] < seq_len; auto attn_output = has_padding ? attn_(q, kv_input, x_b, key_padding_mask_b)[0] : attn_(q, kv_input, x_b)[0]; diff --git a/mllm/models/minicpm_o2_6/modeling_siglip.hpp b/mllm/models/minicpm_o2_6/modeling_siglip.hpp index 30750201b..deb5deb3b 100644 --- a/mllm/models/minicpm_o2_6/modeling_siglip.hpp +++ b/mllm/models/minicpm_o2_6/modeling_siglip.hpp @@ -50,10 +50,14 @@ class SiglipVisionEmbeddings final : public nn::Module { // Create position embeddings if (!tgt_sizes.isNil() && !patch_attention_mask.isNil()) { + if (!tgt_sizes.isContiguous()) { tgt_sizes = tgt_sizes.contiguous(); } + if (!patch_attention_mask.isContiguous()) { patch_attention_mask = patch_attention_mask.contiguous(); } auto max_im_h = pixel_values.shape()[2]; auto max_im_w = pixel_values.shape()[3]; auto max_nb_patches_h = max_im_h / patch_size_; auto max_nb_patches_w = max_im_w / patch_size_; + const auto* tgt_sizes_ptr = tgt_sizes.ptr(); + const auto* patch_mask_ptr = patch_attention_mask.ptr(); // Create boundaries like torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side) std::vector boundaries; @@ -63,10 +67,8 @@ class SiglipVisionEmbeddings final : public nn::Module { // Create position_ids tensor - using the max_patches from patch_attention_mask shape auto max_patches = patch_attention_mask.shape()[2]; auto position_ids = Tensor::empty({batch_size, max_patches}, kInt64).alloc(); - // Initialize to zeros - for (int b = 0; b < batch_size; b++) { - for (int p = 0; p < max_patches; p++) { position_ids.at({b, p}) = 0; } - } + std::memset(position_ids.ptr(), 0, static_cast(batch_size) * max_patches * sizeof(int64_t)); + auto* position_ids_ptr = position_ids.ptr(); // Fill position ids based on patch grid and attention mask for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) { @@ -74,8 +76,8 @@ class SiglipVisionEmbeddings final : public nn::Module { int nb_patches_w = max_nb_patches_w; if (tgt_sizes.shape().size() == 2 && batch_idx < tgt_sizes.shape()[0]) { - nb_patches_h = tgt_sizes.at({batch_idx, 0}); - nb_patches_w = tgt_sizes.at({batch_idx, 1}); + nb_patches_h = tgt_sizes_ptr[batch_idx * 2]; + nb_patches_w = tgt_sizes_ptr[batch_idx * 2 + 1]; } // Create fractional coordinates like torch.arange(0, 1 - 1e-6, 1 / nb_patches_h/w) @@ -132,10 +134,11 @@ class SiglipVisionEmbeddings final : public nn::Module { // Apply pos_ids only where patch_attention_mask is True (now it's 1D) // position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids int pos_ids_idx = 0; + auto patch_mask_batch_ptr = patch_mask_ptr + static_cast(batch_idx) * max_patches; + auto position_ids_batch_ptr = position_ids_ptr + static_cast(batch_idx) * max_patches; for (int flat_idx = 0; flat_idx < max_patches; ++flat_idx) { - uint8_t mask_val = patch_attention_mask.at({batch_idx, 0, flat_idx}); - if (mask_val && pos_ids_idx < pos_ids.size()) { - position_ids.at({batch_idx, flat_idx}) = pos_ids[pos_ids_idx]; + if (patch_mask_batch_ptr[flat_idx] && pos_ids_idx < pos_ids.size()) { + 
position_ids_batch_ptr[flat_idx] = pos_ids[pos_ids_idx]; pos_ids_idx++; } } @@ -350,20 +353,28 @@ class SiglipVisionModel final : public nn::Module { auto batch_size = pixel_values.shape()[0]; int max_patches = 0; // Calculate max_patches based on tgt_sizes + if (!tgt_sizes.isContiguous()) { tgt_sizes = tgt_sizes.contiguous(); } + const auto* tgt_sizes_ptr = tgt_sizes.ptr(); for (int i = 0; i < tgt_sizes.shape()[0]; i++) { - if (tgt_sizes.at({i, 0}) > 0 && tgt_sizes.at({i, 1}) > 0) { - int patches = (tgt_sizes.at({i, 0})) * (tgt_sizes.at({i, 1})); + auto tgt_h = tgt_sizes_ptr[i * 2]; + auto tgt_w = tgt_sizes_ptr[i * 2 + 1]; + if (tgt_h > 0 && tgt_w > 0) { + int patches = tgt_h * tgt_w; if (patches > max_patches) max_patches = patches; } } auto patch_attention_mask = Tensor::empty({batch_size, 1, max_patches}, kUInt8).alloc(); + auto* patch_attention_mask_ptr = patch_attention_mask.ptr(); for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < max_patches; j++) { patch_attention_mask.at({i, 0, j}) = 0; } + auto* patch_attention_mask_batch_ptr = patch_attention_mask_ptr + static_cast(i) * max_patches; + std::memset(patch_attention_mask_batch_ptr, 0, static_cast(max_patches)); if (!tgt_sizes.isNil() && i < tgt_sizes.shape()[0]) { - int nb_patches_h = tgt_sizes.at({i, 0}); - int nb_patches_w = tgt_sizes.at({i, 1}); + int nb_patches_h = tgt_sizes_ptr[i * 2]; + int nb_patches_w = tgt_sizes_ptr[i * 2 + 1]; int valid_patches = nb_patches_h * nb_patches_w; - for (int j = 0; j < valid_patches && j < max_patches; j++) { patch_attention_mask.at({i, 0, j}) = 1; } + if (valid_patches > 0) { + std::memset(patch_attention_mask_batch_ptr, 1, static_cast(std::min(valid_patches, max_patches))); + } } } std::vector hidden_states_result; @@ -374,7 +385,7 @@ class SiglipVisionModel final : public nn::Module { } auto hidden_states = hidden_states_result[0]; // [B, num_patches, embed_dim] - patch_attention_mask = patch_attention_mask.squeeze(1); // [B, max_patches] + patch_attention_mask = patch_attention_mask.squeeze(1).contiguous(); // [B, max_patches] // Create attention mask for encoder (4D mask for multi-head attention) // TODO: this will take about 100ms, optimize it @@ -382,42 +393,38 @@ class SiglipVisionModel final : public nn::Module { if (!patch_attention_mask.isNil()) { auto batch_size = patch_attention_mask.shape()[0]; auto max_patches = patch_attention_mask.shape()[1]; + const auto* patch_mask_ptr = patch_attention_mask.ptr(); bool all_valid = true; for (int i = 0; i < batch_size && all_valid; i++) { + auto patch_mask_batch_ptr = patch_mask_ptr + static_cast(i) * max_patches; for (int j = 0; j < max_patches && all_valid; j++) { - uint8_t mask_val = patch_attention_mask.at({i, j}); - if (mask_val == 0) { all_valid = false; } + if (patch_mask_batch_ptr[j] == 0) { all_valid = false; } } } if (!all_valid) { - // Convert patch_attention_mask to float and create 4D attention mask - auto patch_mask_float = Tensor::empty({batch_size, max_patches}, kFloat32).alloc(); - for (int i = 0; i < batch_size; i++) { - for (int j = 0; j < max_patches; j++) { - uint8_t mask_val = patch_attention_mask.at({i, j}); - patch_mask_float.at({i, j}) = mask_val ? 
1.0f : 0.0f; - } - } - // Create 4D attention mask: [B, 1, max_patches, max_patches] attention_mask = Tensor::empty({batch_size, 1, max_patches, max_patches}, kFloat32).alloc(); + auto* attention_mask_ptr = attention_mask.ptr(); // Optimize with cache-friendly access patterns and reduced redundant accesses for (int b = 0; b < batch_size; b++) { // Pre-fetch mask values for this batch to improve cache locality - std::vector batch_mask(max_patches); - for (int p = 0; p < max_patches; p++) { batch_mask[p] = patch_mask_float.at({b, p}); } + std::vector batch_mask(max_patches); + std::memcpy(batch_mask.data(), patch_mask_ptr + static_cast(b) * max_patches, static_cast(max_patches)); // Compute attention mask for this batch with optimized memory access + auto* attention_mask_batch_ptr = + attention_mask_ptr + static_cast(b) * max_patches * max_patches; for (int i = 0; i < max_patches; i++) { - float mask_i = batch_mask[i]; + uint8_t mask_i = batch_mask[i]; + auto* attention_mask_row_ptr = attention_mask_batch_ptr + static_cast(i) * max_patches; // Process row in chunks for better cache utilization for (int j = 0; j < max_patches; j++) { - float mask_j = batch_mask[j]; + uint8_t mask_j = batch_mask[j]; // Both positions must be valid (branchless computation) float final_mask = (mask_i > 0.0f && mask_j > 0.0f) ? 0.0f : -1e9f; - attention_mask.at({b, 0, i, j}) = final_mask; + attention_mask_row_ptr[j] = final_mask; } } } diff --git a/mllm/models/minicpm_o45/modeling_minicpm_o45.hpp b/mllm/models/minicpm_o45/modeling_minicpm_o45.hpp index 1cb92d4d8..156896847 100644 --- a/mllm/models/minicpm_o45/modeling_minicpm_o45.hpp +++ b/mllm/models/minicpm_o45/modeling_minicpm_o45.hpp @@ -195,6 +195,7 @@ class MiniCPMO45TTS final : public nn::Module { } Tensor token_ids = Tensor::empty({1, static_cast(text_token_ids.size())}, kInt64, kCPU).alloc(); + auto* token_ids_ptr = token_ids.ptr(); for (size_t i = 0; i < text_token_ids.size(); ++i) { auto token_id = text_token_ids[i]; if (token_id < 0 || token_id >= cfg_.tts_num_text_tokens) { @@ -202,7 +203,7 @@ class MiniCPMO45TTS final : public nn::Module { token_id, cfg_.tts_num_text_tokens); return Tensor::nil(); } - token_ids.at({0, static_cast(i)}) = token_id; + token_ids_ptr[i] = token_id; } auto llm_embeds = emb_text_(token_ids); @@ -214,9 +215,9 @@ class MiniCPMO45TTS final : public nn::Module { auto tts_embeds = llm_embeds + projected_hidden; Tensor text_eos = Tensor::empty({1, 1}, kInt64, kCPU).alloc(); - text_eos.at({0, 0}) = cfg_.tts_text_eos_token_id; + text_eos.ptr()[0] = cfg_.tts_text_eos_token_id; Tensor audio_bos = Tensor::empty({1, 1}, kInt64, kCPU).alloc(); - audio_bos.at({0, 0}) = cfg_.tts_audio_bos_token_id; + audio_bos.ptr()[0] = cfg_.tts_audio_bos_token_id; if (cfg_.tts_text_eos_token_id < 0 || cfg_.tts_text_eos_token_id >= cfg_.tts_num_text_tokens) { MLLM_ERROR("MiniCPM-o-4_5 TTS text_eos_token_id out of range: {} (vocab={}).", cfg_.tts_text_eos_token_id, cfg_.tts_num_text_tokens); @@ -269,7 +270,8 @@ class MiniCPMO45TTS final : public nn::Module { if (t == 0) { inputs_embeds = condition_embeds; position_ids = Tensor::empty({1, condition_length}, kInt64, kCPU).alloc(); - for (int32_t i = 0; i < condition_length; ++i) { position_ids.at({0, i}) = i; } + auto* position_ids_ptr = position_ids.ptr(); + for (int32_t i = 0; i < condition_length; ++i) { position_ids_ptr[i] = i; } } else { for (int32_t q = 0; q < cfg_.tts_num_vq; ++q) { auto code_ids = generated[{kAll, {t - 1, t}, {q, q + 1}}].contiguous().view({1, 1}); @@ -281,7 +283,7 @@ class 
MiniCPMO45TTS final : public nn::Module { } } position_ids = Tensor::empty({1, 1}, kInt64, kCPU).alloc(); - position_ids.at({0, 0}) = condition_length + t - 1; + position_ids.ptr()[0] = condition_length + t - 1; } auto [llm_embedding_sin, llm_embedding_cos] = llama::makeRotaryPosEmbedding(position_ids, model_.getBuffer("inv_freq"), 1.0f); @@ -319,7 +321,7 @@ class MiniCPMO45TTS final : public nn::Module { bool use_sampling = generation_cfg.do_sample || generation_cfg.top_k > 0 || generation_cfg.top_p > 0.0f || std::abs(temp - 1.0f) > 1e-6f; auto token_id = sampleFromLogits(logits, use_sampling); - generated.at({0, t, q}) = token_id; + *generated.offsettedPtr({0, t, q}) = token_id; generated_history[q].push_back(token_id); step_tokens.push_back(token_id); has_eos = has_eos || token_id == eos_token; @@ -793,10 +795,11 @@ class MiniCPMO45ForCausalLM : public models::ARGeneration { if (vision_embeddings.dtype() != text_embeddings.dtype()) { vision_embeddings = vision_embeddings.to(text_embeddings.dtype()); } for (int32_t b = 0; b < batch_size; ++b) { + auto image_bounds_ptr = image_bounds.ptr(); for (int32_t bound_idx = 0; bound_idx < num_bounds; ++bound_idx) { int32_t vision_idx = 0; - auto start_pos = image_bounds.constAt({bound_idx, 0}) + 1; - auto end_pos = image_bounds.constAt({bound_idx, 1}) - 1; + auto start_pos = image_bounds_ptr[bound_idx * 2] + 1; + auto end_pos = image_bounds_ptr[bound_idx * 2 + 1] - 1; for (int32_t pos = start_pos; pos <= end_pos && vision_idx < vision_seq_len; ++pos, ++vision_idx) { if (text_embeddings.dtype() == kFloat32) { @@ -826,10 +829,11 @@ class MiniCPMO45ForCausalLM : public models::ARGeneration { if (audio_embeddings.dtype() != text_embeddings.dtype()) { audio_embeddings = audio_embeddings.to(text_embeddings.dtype()); } for (int32_t b = 0; b < batch_size; ++b) { + auto audio_bounds_ptr = audio_bounds.ptr(); for (int32_t bound_idx = 0; bound_idx < num_bounds; ++bound_idx) { int32_t audio_idx = 0; - auto start_pos = audio_bounds.constAt({bound_idx, 0}); - auto end_pos = audio_bounds.constAt({bound_idx, 1}) - 1; + auto start_pos = audio_bounds_ptr[bound_idx * 2]; + auto end_pos = audio_bounds_ptr[bound_idx * 2 + 1] - 1; for (int32_t pos = start_pos; pos <= end_pos && audio_idx < audio_seq_len; ++pos, ++audio_idx) { if (text_embeddings.dtype() == kFloat32) { @@ -849,12 +853,14 @@ class MiniCPMO45ForCausalLM : public models::ARGeneration { Tensor position_ids = Tensor::empty({1, seq_len}, kInt64).alloc(); if (!prev_position_ids.isNil()) { auto last_pos = *prev_position_ids.coffsettedPtr({0, prev_position_ids.shape()[1] - 1}); - for (int32_t i = 0; i < seq_len; ++i) { position_ids.at({0, i}) = last_pos + i + 1; } + auto* position_ids_ptr = position_ids.ptr(); + for (int32_t i = 0; i < seq_len; ++i) { position_ids_ptr[i] = last_pos + i + 1; } return position_ids; } auto last_seen_tokens = kv_cache_.getCurrentSeqCnt(0); - for (int32_t i = 0; i < seq_len; ++i) { position_ids.at({0, i}) = last_seen_tokens + i; } + auto* position_ids_ptr = position_ids.ptr(); + for (int32_t i = 0; i < seq_len; ++i) { position_ids_ptr[i] = last_seen_tokens + i; } return position_ids; } diff --git a/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp b/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp index 5ccd1c838..6e68c4bbc 100644 --- a/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp +++ b/mllm/models/minicpm_o45/tokenization_minicpm_o45.hpp @@ -413,7 +413,7 @@ class MiniCPMO45Tokenizer final : public mllm::preprocessor::AutoTokenizer { 
input_ids_vec.reserve(sequence_str.size()); for (const auto& str : sequence_str) { input_ids_vec.emplace_back(bpe_._lookup_vocab(str)); } - std::vector> image_bounds; + std::vector>image_bounds; std::vector> audio_bounds; if (has_image) { From f185440099307d5aeabaf086e2dfaf9a5d5df50e Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Tue, 17 Feb 2026 09:23:05 +0000 Subject: [PATCH 21/42] feat(mllm_kernel): simplify JIT usage in README and update kernel example - Replaced the previous JIT utility functions with a streamlined `jit` decorator for kernel registration. - Updated the README.md to reflect the new recommended pattern for CPU kernel implementation. - Simplified the example for using the JIT compilation with a focus on clarity and ease of use. --- mllm-kernel/README.md | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mllm-kernel/README.md b/mllm-kernel/README.md index 14c8118f0..0a4580495 100644 --- a/mllm-kernel/README.md +++ b/mllm-kernel/README.md @@ -80,31 +80,30 @@ y = add_constant(x, 8) Use the helpers in `mllm_kernel.jit_utils`: -- `load_cpu_jit` -- `load_cuda_jit` +- `jit` - `make_cpp_args` -- `cache_once` -Example pattern: +Recommended pattern (CPU example): ```python import torch -from mllm_kernel.jit_utils import cache_once, load_cpu_jit, make_cpp_args - -@cache_once -def _jit_my_kernel_module(param: int): - args = make_cpp_args(param) - return load_cpu_jit( - "my_kernel", - *args, - cpp_files=["my_kernel.cpp"], - cpp_wrappers=[("my_kernel", f"my_namespace::my_kernel<{args}>")], - ) +import mllm_kernel + +@mllm_kernel.jit( + args=16, + device="cpu", + cpp_files=["my_kernel.cpp"], + cpp_wrappers=[("my_kernel", "my_namespace::my_kernel<16>")], + func_name="my_kernel", +) +def _my_kernel_16(compiled_module, dst: torch.Tensor, src: torch.Tensor) -> None: + compiled_module.my_kernel(dst, src) def my_kernel(src: torch.Tensor, param: int) -> torch.Tensor: + if param != 16: + raise ValueError("This demo only supports param=16.") dst = torch.empty_like(src) - module = _jit_my_kernel_module(param) - module.my_kernel(dst, src) + _my_kernel_16(dst, src) return dst ``` From 289b74b356f99d5121fe0422e709cae2ba16db66 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Feb 2026 03:39:52 +0000 Subject: [PATCH 22/42] feat: update dependencies and refactor mobile module structure - Updated `apache-tvm-ffi` version to `0.1.8` in `pyproject.toml` and `mllm-kernel/pyproject.toml`. - Refactored mobile module imports and structure, moving scripts to `pymllm.mobile` and removing unused backends. - Introduced new classes and methods for quantization and model deployment in the Qualcomm backend. - Added new README files for mobile and Qualcomm transformer components. 
--- mllm-kernel/pyproject.toml | 2 +- mllm/ffi/Extension.cc | 5 +- mllm/ffi/vendors/tvm-ffi | 2 +- pymllm/__init__.py | 72 ++++++++----------- .../cuda/__init__.py => __main__.py} | 0 pymllm/backends/__init__.py | 4 -- pymllm/backends/cuda/tilelang_compile_test.py | 41 ----------- .../transformers/core => layers}/__init__.py | 0 pymllm/mobile/README.md | 3 +- pymllm/mobile/__init__.py | 45 ++++++++++++ .../spinquant => mobile/backends}/__init__.py | 2 + .../{ => mobile}/backends/qualcomm/README.md | 0 .../backends/qualcomm/__init__.py | 0 pymllm/{ => mobile}/backends/qualcomm/nn.py | 2 +- .../backends/qualcomm/qnn_aot_env.py | 4 +- .../backends/qualcomm/transformers/.gitignore | 0 .../backends/qualcomm/transformers/README.md | 0 .../qualcomm/transformers/__init__.py | 0 .../qualcomm/transformers/core}/__init__.py | 0 .../qualcomm/transformers/core/embedding.py | 0 .../qualcomm/transformers/core/observer.py | 0 .../qualcomm/transformers/core/qdq.py | 0 .../qualcomm/transformers/core/qlinear.py | 2 +- .../qualcomm/transformers/core/rms_norm.py | 0 .../transformers/llama/modeling_llama.py | 10 +-- .../qualcomm/transformers/llama/runner.py | 12 ++-- .../qualcomm/transformers/llama/train.py | 2 +- .../transformers/qwen2/modeling_qwen2.py | 10 +-- .../qualcomm/transformers/qwen2/runner.py | 12 ++-- .../qualcomm/transformers/qwen2/train.py | 2 +- .../transformers/qwen3/modeling_qwen3.py | 10 +-- .../qualcomm/transformers/qwen3/runner.py | 12 ++-- .../qualcomm/transformers/qwen3/train.py | 2 +- pymllm/{ => mobile}/convertor/__init__.py | 0 .../convertor/mllm_type_mapping.py | 0 .../{ => mobile}/convertor/model_file_v1.py | 0 .../{ => mobile}/convertor/model_file_v2.py | 0 pymllm/{ => mobile}/ffi/__init__.py | 0 pymllm/{ => mobile}/ffi/_ffi_api.py | 0 pymllm/{ => mobile}/ffi/base.py | 2 +- pymllm/{ => mobile}/nn/__init__.py | 0 pymllm/{ => mobile}/nn/_layers.py | 0 pymllm/{ => mobile}/nn/_module.py | 0 pymllm/{ => mobile}/nn/functional.py | 0 pymllm/{ => mobile}/quantize/__init__.py | 0 .../{ => mobile}/quantize/cast2fp32_pass.py | 0 .../quantize/gguf}/__init__.py | 0 pymllm/{ => mobile}/quantize/kai/__init__.py | 0 pymllm/{ => mobile}/quantize/kai/w4a32.py | 0 pymllm/{ => mobile}/quantize/pipeline.py | 0 pymllm/{ => mobile}/quantize/quantize_pass.py | 0 pymllm/{ => mobile}/quantize/solver.py | 0 .../quantize/spinquant}/__init__.py | 0 pymllm/{ => mobile}/service/__init__.py | 0 pymllm/{ => mobile}/service/models_hub.py | 0 pymllm/{ => mobile}/service/network.py | 0 pymllm/{ => mobile}/service/rr_process.py | 0 pymllm/{ => mobile}/service/tools.py | 0 .../tests/qualcomm/test_context_create.py | 4 +- pymllm/{ => mobile}/tests/test_nn.py | 4 +- pymllm/{ => mobile}/tests/test_tensor.py | 2 +- pymllm/{ => mobile}/utils/__init__.py | 0 pymllm/{ => mobile}/utils/adb.py | 0 pymllm/{ => mobile}/utils/error_handler.py | 0 pymllm/{ => mobile}/utils/mllm_convertor.py | 0 .../mllm_ir/trace.py => models/__init__.py} | 0 pymllm/utils/mllm_convertor_server/service.py | 2 - pyproject.toml | 8 +-- 68 files changed, 132 insertions(+), 146 deletions(-) rename pymllm/{backends/cuda/__init__.py => __main__.py} (100%) delete mode 100644 pymllm/backends/__init__.py delete mode 100644 pymllm/backends/cuda/tilelang_compile_test.py rename pymllm/{backends/qualcomm/transformers/core => layers}/__init__.py (100%) create mode 100644 pymllm/mobile/__init__.py rename pymllm/{quantize/spinquant => mobile/backends}/__init__.py (71%) rename pymllm/{ => mobile}/backends/qualcomm/README.md (100%) rename pymllm/{ => 
mobile}/backends/qualcomm/__init__.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/nn.py (75%) rename pymllm/{ => mobile}/backends/qualcomm/qnn_aot_env.py (83%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/.gitignore (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/README.md (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/__init__.py (100%) rename pymllm/{compile/mlir => mobile/backends/qualcomm/transformers/core}/__init__.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/embedding.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/observer.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/qdq.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/qlinear.py (99%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/core/rms_norm.py (100%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/llama/modeling_llama.py (98%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/llama/runner.py (96%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/llama/train.py (94%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen2/modeling_qwen2.py (98%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen2/runner.py (96%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen2/train.py (94%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen3/modeling_qwen3.py (98%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen3/runner.py (96%) rename pymllm/{ => mobile}/backends/qualcomm/transformers/qwen3/train.py (94%) rename pymllm/{ => mobile}/convertor/__init__.py (100%) rename pymllm/{ => mobile}/convertor/mllm_type_mapping.py (100%) rename pymllm/{ => mobile}/convertor/model_file_v1.py (100%) rename pymllm/{ => mobile}/convertor/model_file_v2.py (100%) rename pymllm/{ => mobile}/ffi/__init__.py (100%) rename pymllm/{ => mobile}/ffi/_ffi_api.py (100%) rename pymllm/{ => mobile}/ffi/base.py (90%) rename pymllm/{ => mobile}/nn/__init__.py (100%) rename pymllm/{ => mobile}/nn/_layers.py (100%) rename pymllm/{ => mobile}/nn/_module.py (100%) rename pymllm/{ => mobile}/nn/functional.py (100%) rename pymllm/{ => mobile}/quantize/__init__.py (100%) rename pymllm/{ => mobile}/quantize/cast2fp32_pass.py (100%) rename pymllm/{compile => mobile/quantize/gguf}/__init__.py (100%) rename pymllm/{ => mobile}/quantize/kai/__init__.py (100%) rename pymllm/{ => mobile}/quantize/kai/w4a32.py (100%) rename pymllm/{ => mobile}/quantize/pipeline.py (100%) rename pymllm/{ => mobile}/quantize/quantize_pass.py (100%) rename pymllm/{ => mobile}/quantize/solver.py (100%) rename pymllm/{quantize/gguf => mobile/quantize/spinquant}/__init__.py (100%) rename pymllm/{ => mobile}/service/__init__.py (100%) rename pymllm/{ => mobile}/service/models_hub.py (100%) rename pymllm/{ => mobile}/service/network.py (100%) rename pymllm/{ => mobile}/service/rr_process.py (100%) rename pymllm/{ => mobile}/service/tools.py (100%) rename pymllm/{ => mobile}/tests/qualcomm/test_context_create.py (89%) rename pymllm/{ => mobile}/tests/test_nn.py (83%) rename pymllm/{ => mobile}/tests/test_tensor.py (89%) rename pymllm/{ => mobile}/utils/__init__.py (100%) rename pymllm/{ => mobile}/utils/adb.py (100%) rename pymllm/{ => mobile}/utils/error_handler.py (100%) rename pymllm/{ => mobile}/utils/mllm_convertor.py (100%) rename pymllm/{compile/mllm_ir/trace.py => models/__init__.py} (100%) delete mode 100644 
pymllm/utils/mllm_convertor_server/service.py diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml index f64e1306e..5fe07eeab 100644 --- a/mllm-kernel/pyproject.toml +++ b/mllm-kernel/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "packaging", "torch", "torch-c-dlpack-ext", - "apache-tvm-ffi", + "apache-tvm-ffi == 0.1.8", ] [project.optional-dependencies] diff --git a/mllm/ffi/Extension.cc b/mllm/ffi/Extension.cc index cb999191d..f3f2d2488 100644 --- a/mllm/ffi/Extension.cc +++ b/mllm/ffi/Extension.cc @@ -83,12 +83,12 @@ TVM_FFI_STATIC_INIT_BLOCK() { // Tensor related refl::GlobalDef().def("mllm.empty", mllm::ffi::empty); refl::GlobalDef().def("mllm.from_torch", [](const tvm::ffi::Tensor& t) -> mllm::ffi::Tensor { - auto dl_pack = t.get()->ToDLPack(); + auto dl_pack = t.ToDLPack(); return ::mllm::ffi::Tensor(mllm::ffi::__from_dlpack(dl_pack)); }); refl::GlobalDef().def("mllm.from_numpy", [](const tvm::ffi::Tensor& t) -> mllm::ffi::Tensor { - auto dl_pack = t.get()->ToDLPack(); + auto dl_pack = t.ToDLPack(); return ::mllm::ffi::Tensor(mllm::ffi::__from_dlpack(dl_pack)); }); @@ -345,6 +345,7 @@ TVM_FFI_STATIC_INIT_BLOCK() { namespace refl = tvm::ffi::reflection; refl::ObjectDef<::mllm::ffi::BaseOpObj>(); + refl::ObjectDef<::mllm::ffi::ParameterFileObj>(); refl::GlobalDef().def("mllm.BaseOp.load", [](const mllm::ffi::BaseOp& self, const mllm::ffi::ParameterFile& obj) -> void { self.get()->op_ptr_->load(obj.get()->pf_ptr_); }); diff --git a/mllm/ffi/vendors/tvm-ffi b/mllm/ffi/vendors/tvm-ffi index 46f735807..dcd07cfe2 160000 --- a/mllm/ffi/vendors/tvm-ffi +++ b/mllm/ffi/vendors/tvm-ffi @@ -1 +1 @@ -Subproject commit 46f73580780f2973e6ea3afb6d3a9d6f6ffd02cc +Subproject commit dcd07cfe27465287ee5b203b742e85dcfb99606a diff --git a/pymllm/__init__.py b/pymllm/__init__.py index 1bd31cd6c..3f2488d27 100644 --- a/pymllm/__init__.py +++ b/pymllm/__init__.py @@ -2,48 +2,32 @@ # Licensed under the MIT License. from __future__ import annotations +import os +import sys -from . import ffi -from . import convertor -from . import utils -from . import quantize -from . import nn -from . import compile -from . import service -from . import backends -from .ffi import ( - # Floating point types - float32, - float16, - bfloat16, - # Signed integer types - int8, - int16, - int32, - int64, - # Unsigned integer types - uint8, - uint16, - uint32, - uint64, - # Bool type - boolean, - # Devices - cpu, - cuda, - qnn, - # Tensor and utilities - Tensor, - empty, - echo, - device, - is_torch_available, - is_numpy_available, - from_torch, - from_numpy, - zeros, - ones, - arange, - random, -) -from .nn.functional import matmul +__all__ = [] + + +def _has_mobile_libs() -> bool: + parent_dir = os.path.dirname(os.path.realpath(__file__)) + + # Platform-specific library names + if sys.platform.startswith("win32"): + lib_name = "MllmFFIExtension.dll" + elif sys.platform.startswith("darwin"): + lib_name = "MllmFFIExtension.dylib" + else: + lib_name = "MllmFFIExtension.so" + + lib_path = os.path.join(parent_dir, "lib", lib_name) + return os.path.exists(lib_path) + + +def is_mobile_available() -> bool: + return _has_mobile_libs() + + +if _has_mobile_libs(): + from . 
import mobile + + __all__.append("mobile") diff --git a/pymllm/backends/cuda/__init__.py b/pymllm/__main__.py similarity index 100% rename from pymllm/backends/cuda/__init__.py rename to pymllm/__main__.py diff --git a/pymllm/backends/__init__.py b/pymllm/backends/__init__.py deleted file mode 100644 index 5e926d580..000000000 --- a/pymllm/backends/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) MLLM Team. -# Licensed under the MIT License. - -from . import cuda, qualcomm diff --git a/pymllm/backends/cuda/tilelang_compile_test.py b/pymllm/backends/cuda/tilelang_compile_test.py deleted file mode 100644 index 65a2e0071..000000000 --- a/pymllm/backends/cuda/tilelang_compile_test.py +++ /dev/null @@ -1,41 +0,0 @@ -import tilelang -import tilelang.language as T - - -@tilelang.jit( - out_idx=[-1], compile_flags=["-O3", "--use_fast_math", "--expt-relaxed-constexpr"] -) -def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): - @T.prim_func - def elem_add( - A: T.Tensor((M, N), in_dtype), - B: T.Tensor((M, N), in_dtype), - C: T.Tensor((M, N), out_dtype), - ): - with T.Kernel( - T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads - ) as (bx, by): - A_shared = T.alloc_shared((block_M, block_N), in_dtype) - B_shared = T.alloc_shared((block_M, block_N), in_dtype) - C_local = T.alloc_fragment((block_M, block_N), out_dtype) - C_shared = T.alloc_shared((block_M, block_N), out_dtype) - - T.copy(A[by * block_M, bx * block_N], A_shared) - T.copy(B[by * block_M, bx * block_N], B_shared) - for local_y, local_x in T.Parallel(block_M, block_N): - C_local[local_y, local_x] = ( - A_shared[local_y, local_x] + B_shared[local_y, local_x] - ) - T.copy(C_local, C_shared) - T.copy(C_shared, C[by * block_M, bx * block_N]) - - return elem_add - - -def compile_test(): - M = 1024 - N = 1024 - config = {"block_M": 128, "block_N": 128, "threads": 128} - kernel = elementwise_add(M, N, **config, in_dtype="float16", out_dtype="float16") - source = kernel.get_kernel_source() - print(source) diff --git a/pymllm/backends/qualcomm/transformers/core/__init__.py b/pymllm/layers/__init__.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/__init__.py rename to pymllm/layers/__init__.py diff --git a/pymllm/mobile/README.md b/pymllm/mobile/README.md index 29877ea00..ceb71a5d3 100644 --- a/pymllm/mobile/README.md +++ b/pymllm/mobile/README.md @@ -1 +1,2 @@ -We should refactor current pymllm's src to mobile directory. And provide more functionalities for torch based VLA. +# Pymllm mobile + diff --git a/pymllm/mobile/__init__.py b/pymllm/mobile/__init__.py new file mode 100644 index 000000000..8796bbeaf --- /dev/null +++ b/pymllm/mobile/__init__.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from . import ffi +from . import convertor +from . import utils +from . import quantize +from . import nn +from . import service +from . 
import backends +from .ffi import ( + # Floating point types + float32, + float16, + bfloat16, + # Signed integer types + int8, + int16, + int32, + int64, + # Unsigned integer types + uint8, + uint16, + uint32, + uint64, + # Bool type + boolean, + # Devices + cpu, + cuda, + qnn, + # Tensor and utilities + Tensor, + empty, + echo, + device, + is_torch_available, + is_numpy_available, + from_torch, + from_numpy, + zeros, + ones, + arange, + random, +) +from .nn.functional import matmul diff --git a/pymllm/quantize/spinquant/__init__.py b/pymllm/mobile/backends/__init__.py similarity index 71% rename from pymllm/quantize/spinquant/__init__.py rename to pymllm/mobile/backends/__init__.py index ea8e2bec7..1578a0d87 100644 --- a/pymllm/quantize/spinquant/__init__.py +++ b/pymllm/mobile/backends/__init__.py @@ -1,2 +1,4 @@ # Copyright (c) MLLM Team. # Licensed under the MIT License. + +from . import qualcomm diff --git a/pymllm/backends/qualcomm/README.md b/pymllm/mobile/backends/qualcomm/README.md similarity index 100% rename from pymllm/backends/qualcomm/README.md rename to pymllm/mobile/backends/qualcomm/README.md diff --git a/pymllm/backends/qualcomm/__init__.py b/pymllm/mobile/backends/qualcomm/__init__.py similarity index 100% rename from pymllm/backends/qualcomm/__init__.py rename to pymllm/mobile/backends/qualcomm/__init__.py diff --git a/pymllm/backends/qualcomm/nn.py b/pymllm/mobile/backends/qualcomm/nn.py similarity index 75% rename from pymllm/backends/qualcomm/nn.py rename to pymllm/mobile/backends/qualcomm/nn.py index 0ba9aef55..e4bc91ace 100644 --- a/pymllm/backends/qualcomm/nn.py +++ b/pymllm/mobile/backends/qualcomm/nn.py @@ -1,4 +1,4 @@ -from pymllm.nn._layers import Softmax, RoPE +from pymllm.mobile.nn._layers import Softmax, RoPE class QnnSoftmax(Softmax): diff --git a/pymllm/backends/qualcomm/qnn_aot_env.py b/pymllm/mobile/backends/qualcomm/qnn_aot_env.py similarity index 83% rename from pymllm/backends/qualcomm/qnn_aot_env.py rename to pymllm/mobile/backends/qualcomm/qnn_aot_env.py index 8b0c0d2e1..bc48c7c97 100644 --- a/pymllm/backends/qualcomm/qnn_aot_env.py +++ b/pymllm/mobile/backends/qualcomm/qnn_aot_env.py @@ -1,7 +1,7 @@ -from pymllm.ffi import is_qnn_aot_on_x86_enabled +from pymllm.mobile.ffi import is_qnn_aot_on_x86_enabled if is_qnn_aot_on_x86_enabled(): - from pymllm.ffi import ( + from pymllm.mobile.ffi import ( QnnDeviceAndContext, QnnAOTEnv, QcomChipset, diff --git a/pymllm/backends/qualcomm/transformers/.gitignore b/pymllm/mobile/backends/qualcomm/transformers/.gitignore similarity index 100% rename from pymllm/backends/qualcomm/transformers/.gitignore rename to pymllm/mobile/backends/qualcomm/transformers/.gitignore diff --git a/pymllm/backends/qualcomm/transformers/README.md b/pymllm/mobile/backends/qualcomm/transformers/README.md similarity index 100% rename from pymllm/backends/qualcomm/transformers/README.md rename to pymllm/mobile/backends/qualcomm/transformers/README.md diff --git a/pymllm/backends/qualcomm/transformers/__init__.py b/pymllm/mobile/backends/qualcomm/transformers/__init__.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/__init__.py rename to pymllm/mobile/backends/qualcomm/transformers/__init__.py diff --git a/pymllm/compile/mlir/__init__.py b/pymllm/mobile/backends/qualcomm/transformers/core/__init__.py similarity index 100% rename from pymllm/compile/mlir/__init__.py rename to pymllm/mobile/backends/qualcomm/transformers/core/__init__.py diff --git a/pymllm/backends/qualcomm/transformers/core/embedding.py 
b/pymllm/mobile/backends/qualcomm/transformers/core/embedding.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/embedding.py rename to pymllm/mobile/backends/qualcomm/transformers/core/embedding.py diff --git a/pymllm/backends/qualcomm/transformers/core/observer.py b/pymllm/mobile/backends/qualcomm/transformers/core/observer.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/observer.py rename to pymllm/mobile/backends/qualcomm/transformers/core/observer.py diff --git a/pymllm/backends/qualcomm/transformers/core/qdq.py b/pymllm/mobile/backends/qualcomm/transformers/core/qdq.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/qdq.py rename to pymllm/mobile/backends/qualcomm/transformers/core/qdq.py diff --git a/pymllm/backends/qualcomm/transformers/core/qlinear.py b/pymllm/mobile/backends/qualcomm/transformers/core/qlinear.py similarity index 99% rename from pymllm/backends/qualcomm/transformers/core/qlinear.py rename to pymllm/mobile/backends/qualcomm/transformers/core/qlinear.py index 9e90ba8a5..35439180c 100644 --- a/pymllm/backends/qualcomm/transformers/core/qlinear.py +++ b/pymllm/mobile/backends/qualcomm/transformers/core/qlinear.py @@ -2,7 +2,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.ao.quantization import FakeQuantize, PerChannelMinMaxObserver -from pymllm.backends.qualcomm.transformers.core.observer import ( +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ( PerBlockParamFakeQuantize, ) from torchao.quantization.quant_primitives import ( diff --git a/pymllm/backends/qualcomm/transformers/core/rms_norm.py b/pymllm/mobile/backends/qualcomm/transformers/core/rms_norm.py similarity index 100% rename from pymllm/backends/qualcomm/transformers/core/rms_norm.py rename to pymllm/mobile/backends/qualcomm/transformers/core/rms_norm.py diff --git a/pymllm/backends/qualcomm/transformers/llama/modeling_llama.py b/pymllm/mobile/backends/qualcomm/transformers/llama/modeling_llama.py similarity index 98% rename from pymllm/backends/qualcomm/transformers/llama/modeling_llama.py rename to pymllm/mobile/backends/qualcomm/transformers/llama/modeling_llama.py index 119ec04bc..6b65f34b9 100644 --- a/pymllm/backends/qualcomm/transformers/llama/modeling_llama.py +++ b/pymllm/mobile/backends/qualcomm/transformers/llama/modeling_llama.py @@ -52,16 +52,16 @@ from transformers.models.llama.configuration_llama import LlamaConfig # Replace linear, rms_norm with: -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, ) -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver logger = logging.get_logger(__name__) diff --git a/pymllm/backends/qualcomm/transformers/llama/runner.py b/pymllm/mobile/backends/qualcomm/transformers/llama/runner.py similarity index 96% rename from 
pymllm/backends/qualcomm/transformers/llama/runner.py rename to pymllm/mobile/backends/qualcomm/transformers/llama/runner.py index 8aa4627bf..730147d0f 100644 --- a/pymllm/backends/qualcomm/transformers/llama/runner.py +++ b/pymllm/mobile/backends/qualcomm/transformers/llama/runner.py @@ -2,18 +2,18 @@ from tqdm import tqdm from modelscope.msdatasets import MsDataset from transformers import AutoTokenizer -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, QLinearW8A16_PerChannelSym, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.llama.modeling_llama import LlamaForCausalLM -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.llama.modeling_llama import LlamaForCausalLM +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver def recompute_scale_zp(module): diff --git a/pymllm/backends/qualcomm/transformers/llama/train.py b/pymllm/mobile/backends/qualcomm/transformers/llama/train.py similarity index 94% rename from pymllm/backends/qualcomm/transformers/llama/train.py rename to pymllm/mobile/backends/qualcomm/transformers/llama/train.py index cd10befba..41ffc0e27 100644 --- a/pymllm/backends/qualcomm/transformers/llama/train.py +++ b/pymllm/mobile/backends/qualcomm/transformers/llama/train.py @@ -2,7 +2,7 @@ import torch import argparse from safetensors.torch import save_model -from pymllm.backends.qualcomm.transformers.llama.runner import LlamaQuantizer +from pymllm.mobile.backends.qualcomm.transformers.llama.runner import LlamaQuantizer def main(): diff --git a/pymllm/backends/qualcomm/transformers/qwen2/modeling_qwen2.py b/pymllm/mobile/backends/qualcomm/transformers/qwen2/modeling_qwen2.py similarity index 98% rename from pymllm/backends/qualcomm/transformers/qwen2/modeling_qwen2.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen2/modeling_qwen2.py index 56b19c421..a43d8b7ea 100644 --- a/pymllm/backends/qualcomm/transformers/qwen2/modeling_qwen2.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen2/modeling_qwen2.py @@ -31,16 +31,16 @@ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config # Replace linear, rms_norm with: -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, ) -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from 
pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver class Qwen2MLP(nn.Module): diff --git a/pymllm/backends/qualcomm/transformers/qwen2/runner.py b/pymllm/mobile/backends/qualcomm/transformers/qwen2/runner.py similarity index 96% rename from pymllm/backends/qualcomm/transformers/qwen2/runner.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen2/runner.py index d2f5be05b..ce55fd06d 100644 --- a/pymllm/backends/qualcomm/transformers/qwen2/runner.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen2/runner.py @@ -2,18 +2,18 @@ from tqdm import tqdm from modelscope.msdatasets import MsDataset from transformers import AutoTokenizer -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, QLinearW8A16_PerChannelSym, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.qwen2.modeling_qwen2 import Qwen2ForCausalLM -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.qwen2.modeling_qwen2 import Qwen2ForCausalLM +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver def recompute_scale_zp(module): diff --git a/pymllm/backends/qualcomm/transformers/qwen2/train.py b/pymllm/mobile/backends/qualcomm/transformers/qwen2/train.py similarity index 94% rename from pymllm/backends/qualcomm/transformers/qwen2/train.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen2/train.py index fec5fdfca..1a8f25ce9 100644 --- a/pymllm/backends/qualcomm/transformers/qwen2/train.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen2/train.py @@ -2,7 +2,7 @@ import torch import argparse from safetensors.torch import save_model -from pymllm.backends.qualcomm.transformers.qwen2.runner import Qwen2Quantizer +from pymllm.mobile.backends.qualcomm.transformers.qwen2.runner import Qwen2Quantizer def main(): diff --git a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py b/pymllm/mobile/backends/qualcomm/transformers/qwen3/modeling_qwen3.py similarity index 98% rename from pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen3/modeling_qwen3.py index 2dabf5c9c..6a8788bad 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/modeling_qwen3.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen3/modeling_qwen3.py @@ -46,16 +46,16 @@ from transformers.models.qwen3.configuration_qwen3 import Qwen3Config # Replace linear, rms_norm with: -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, ) -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from 
pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver class Qwen3MLP(nn.Module): diff --git a/pymllm/backends/qualcomm/transformers/qwen3/runner.py b/pymllm/mobile/backends/qualcomm/transformers/qwen3/runner.py similarity index 96% rename from pymllm/backends/qualcomm/transformers/qwen3/runner.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen3/runner.py index 02ea6a5f0..0d7499c96 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/runner.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen3/runner.py @@ -2,18 +2,18 @@ from tqdm import tqdm from modelscope.msdatasets import MsDataset from transformers import AutoTokenizer -from pymllm.backends.qualcomm.transformers.core.qdq import ( +from pymllm.mobile.backends.qualcomm.transformers.core.qdq import ( ActivationQDQ, FixedActivationQDQ, ) -from pymllm.backends.qualcomm.transformers.core.rms_norm import QRMSNorm -from pymllm.backends.qualcomm.transformers.core.qlinear import ( +from pymllm.mobile.backends.qualcomm.transformers.core.rms_norm import QRMSNorm +from pymllm.mobile.backends.qualcomm.transformers.core.qlinear import ( QLinearLPBQ, QLinearW8A16_PerChannelSym, ) -from pymllm.backends.qualcomm.transformers.core.embedding import QEmbedding -from pymllm.backends.qualcomm.transformers.qwen3.modeling_qwen3 import Qwen3ForCausalLM -from pymllm.backends.qualcomm.transformers.core.observer import ConcatObserver +from pymllm.mobile.backends.qualcomm.transformers.core.embedding import QEmbedding +from pymllm.mobile.backends.qualcomm.transformers.qwen3.modeling_qwen3 import Qwen3ForCausalLM +from pymllm.mobile.backends.qualcomm.transformers.core.observer import ConcatObserver def recompute_scale_zp(module): diff --git a/pymllm/backends/qualcomm/transformers/qwen3/train.py b/pymllm/mobile/backends/qualcomm/transformers/qwen3/train.py similarity index 94% rename from pymllm/backends/qualcomm/transformers/qwen3/train.py rename to pymllm/mobile/backends/qualcomm/transformers/qwen3/train.py index 63c6d0e86..f44fa67b5 100644 --- a/pymllm/backends/qualcomm/transformers/qwen3/train.py +++ b/pymllm/mobile/backends/qualcomm/transformers/qwen3/train.py @@ -2,7 +2,7 @@ import torch import argparse from safetensors.torch import save_model -from pymllm.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer +from pymllm.mobile.backends.qualcomm.transformers.qwen3.runner import Qwen3Quantizer def main(): diff --git a/pymllm/convertor/__init__.py b/pymllm/mobile/convertor/__init__.py similarity index 100% rename from pymllm/convertor/__init__.py rename to pymllm/mobile/convertor/__init__.py diff --git a/pymllm/convertor/mllm_type_mapping.py b/pymllm/mobile/convertor/mllm_type_mapping.py similarity index 100% rename from pymllm/convertor/mllm_type_mapping.py rename to pymllm/mobile/convertor/mllm_type_mapping.py diff --git a/pymllm/convertor/model_file_v1.py b/pymllm/mobile/convertor/model_file_v1.py similarity index 100% rename from pymllm/convertor/model_file_v1.py rename to pymllm/mobile/convertor/model_file_v1.py diff --git a/pymllm/convertor/model_file_v2.py b/pymllm/mobile/convertor/model_file_v2.py similarity index 100% rename from pymllm/convertor/model_file_v2.py rename to pymllm/mobile/convertor/model_file_v2.py diff --git a/pymllm/ffi/__init__.py 
b/pymllm/mobile/ffi/__init__.py similarity index 100% rename from pymllm/ffi/__init__.py rename to pymllm/mobile/ffi/__init__.py diff --git a/pymllm/ffi/_ffi_api.py b/pymllm/mobile/ffi/_ffi_api.py similarity index 100% rename from pymllm/ffi/_ffi_api.py rename to pymllm/mobile/ffi/_ffi_api.py diff --git a/pymllm/ffi/base.py b/pymllm/mobile/ffi/base.py similarity index 90% rename from pymllm/ffi/base.py rename to pymllm/mobile/ffi/base.py index 07a01c49e..96aed2425 100644 --- a/pymllm/ffi/base.py +++ b/pymllm/mobile/ffi/base.py @@ -8,7 +8,7 @@ def _load_lib(): file_dir = os.path.dirname(os.path.realpath(__file__)) - parent_dir = os.path.dirname(file_dir) + parent_dir = os.path.dirname(os.path.dirname(file_dir)) # Platform-specific library names if sys.platform.startswith("win32"): diff --git a/pymllm/nn/__init__.py b/pymllm/mobile/nn/__init__.py similarity index 100% rename from pymllm/nn/__init__.py rename to pymllm/mobile/nn/__init__.py diff --git a/pymllm/nn/_layers.py b/pymllm/mobile/nn/_layers.py similarity index 100% rename from pymllm/nn/_layers.py rename to pymllm/mobile/nn/_layers.py diff --git a/pymllm/nn/_module.py b/pymllm/mobile/nn/_module.py similarity index 100% rename from pymllm/nn/_module.py rename to pymllm/mobile/nn/_module.py diff --git a/pymllm/nn/functional.py b/pymllm/mobile/nn/functional.py similarity index 100% rename from pymllm/nn/functional.py rename to pymllm/mobile/nn/functional.py diff --git a/pymllm/quantize/__init__.py b/pymllm/mobile/quantize/__init__.py similarity index 100% rename from pymllm/quantize/__init__.py rename to pymllm/mobile/quantize/__init__.py diff --git a/pymllm/quantize/cast2fp32_pass.py b/pymllm/mobile/quantize/cast2fp32_pass.py similarity index 100% rename from pymllm/quantize/cast2fp32_pass.py rename to pymllm/mobile/quantize/cast2fp32_pass.py diff --git a/pymllm/compile/__init__.py b/pymllm/mobile/quantize/gguf/__init__.py similarity index 100% rename from pymllm/compile/__init__.py rename to pymllm/mobile/quantize/gguf/__init__.py diff --git a/pymllm/quantize/kai/__init__.py b/pymllm/mobile/quantize/kai/__init__.py similarity index 100% rename from pymllm/quantize/kai/__init__.py rename to pymllm/mobile/quantize/kai/__init__.py diff --git a/pymllm/quantize/kai/w4a32.py b/pymllm/mobile/quantize/kai/w4a32.py similarity index 100% rename from pymllm/quantize/kai/w4a32.py rename to pymllm/mobile/quantize/kai/w4a32.py diff --git a/pymllm/quantize/pipeline.py b/pymllm/mobile/quantize/pipeline.py similarity index 100% rename from pymllm/quantize/pipeline.py rename to pymllm/mobile/quantize/pipeline.py diff --git a/pymllm/quantize/quantize_pass.py b/pymllm/mobile/quantize/quantize_pass.py similarity index 100% rename from pymllm/quantize/quantize_pass.py rename to pymllm/mobile/quantize/quantize_pass.py diff --git a/pymllm/quantize/solver.py b/pymllm/mobile/quantize/solver.py similarity index 100% rename from pymllm/quantize/solver.py rename to pymllm/mobile/quantize/solver.py diff --git a/pymllm/quantize/gguf/__init__.py b/pymllm/mobile/quantize/spinquant/__init__.py similarity index 100% rename from pymllm/quantize/gguf/__init__.py rename to pymllm/mobile/quantize/spinquant/__init__.py diff --git a/pymllm/service/__init__.py b/pymllm/mobile/service/__init__.py similarity index 100% rename from pymllm/service/__init__.py rename to pymllm/mobile/service/__init__.py diff --git a/pymllm/service/models_hub.py b/pymllm/mobile/service/models_hub.py similarity index 100% rename from pymllm/service/models_hub.py rename to 
pymllm/mobile/service/models_hub.py diff --git a/pymllm/service/network.py b/pymllm/mobile/service/network.py similarity index 100% rename from pymllm/service/network.py rename to pymllm/mobile/service/network.py diff --git a/pymllm/service/rr_process.py b/pymllm/mobile/service/rr_process.py similarity index 100% rename from pymllm/service/rr_process.py rename to pymllm/mobile/service/rr_process.py diff --git a/pymllm/service/tools.py b/pymllm/mobile/service/tools.py similarity index 100% rename from pymllm/service/tools.py rename to pymllm/mobile/service/tools.py diff --git a/pymllm/tests/qualcomm/test_context_create.py b/pymllm/mobile/tests/qualcomm/test_context_create.py similarity index 89% rename from pymllm/tests/qualcomm/test_context_create.py rename to pymllm/mobile/tests/qualcomm/test_context_create.py index 18983daa7..94f42b513 100644 --- a/pymllm/tests/qualcomm/test_context_create.py +++ b/pymllm/mobile/tests/qualcomm/test_context_create.py @@ -1,5 +1,5 @@ -import pymllm as mllm -from pymllm.backends.qualcomm.qnn_aot_env import ( +import pymllm.mobile as mllm +from pymllm.mobile.backends.qualcomm.qnn_aot_env import ( QnnAOTEnv, QnnDeviceAndContext, QcomTryBestPerformance, diff --git a/pymllm/tests/test_nn.py b/pymllm/mobile/tests/test_nn.py similarity index 83% rename from pymllm/tests/test_nn.py rename to pymllm/mobile/tests/test_nn.py index d9a3db2d8..403060e99 100644 --- a/pymllm/tests/test_nn.py +++ b/pymllm/mobile/tests/test_nn.py @@ -1,5 +1,5 @@ -import pymllm as mllm -from pymllm import nn +import pymllm.mobile as mllm +from pymllm.mobile import nn class FooModule(nn.Module): diff --git a/pymllm/tests/test_tensor.py b/pymllm/mobile/tests/test_tensor.py similarity index 89% rename from pymllm/tests/test_tensor.py rename to pymllm/mobile/tests/test_tensor.py index e935f10b4..474e10922 100644 --- a/pymllm/tests/test_tensor.py +++ b/pymllm/mobile/tests/test_tensor.py @@ -1,7 +1,7 @@ # Copyright (c) MLLM Team. # Licensed under the MIT License. -import pymllm as torch +import pymllm.mobile as torch def test_empty_tensor_create() -> bool: diff --git a/pymllm/utils/__init__.py b/pymllm/mobile/utils/__init__.py similarity index 100% rename from pymllm/utils/__init__.py rename to pymllm/mobile/utils/__init__.py diff --git a/pymllm/utils/adb.py b/pymllm/mobile/utils/adb.py similarity index 100% rename from pymllm/utils/adb.py rename to pymllm/mobile/utils/adb.py diff --git a/pymllm/utils/error_handler.py b/pymllm/mobile/utils/error_handler.py similarity index 100% rename from pymllm/utils/error_handler.py rename to pymllm/mobile/utils/error_handler.py diff --git a/pymllm/utils/mllm_convertor.py b/pymllm/mobile/utils/mllm_convertor.py similarity index 100% rename from pymllm/utils/mllm_convertor.py rename to pymllm/mobile/utils/mllm_convertor.py diff --git a/pymllm/compile/mllm_ir/trace.py b/pymllm/models/__init__.py similarity index 100% rename from pymllm/compile/mllm_ir/trace.py rename to pymllm/models/__init__.py diff --git a/pymllm/utils/mllm_convertor_server/service.py b/pymllm/utils/mllm_convertor_server/service.py deleted file mode 100644 index ea8e2bec7..000000000 --- a/pymllm/utils/mllm_convertor_server/service.py +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright (c) MLLM Team. -# Licensed under the MIT License. 
diff --git a/pyproject.toml b/pyproject.toml index 703d4456a..efe4a14d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [build-system] requires = [ - "scikit-build-core>=0.11.0", "apache-tvm-ffi" + "scikit-build-core>=0.11.0", "apache-tvm-ffi == 0.1.8" ] build-backend = "scikit_build_core.build" @@ -21,7 +21,7 @@ dependencies=[ "packaging", "pytest", "pytest-html", - "apache-tvm-ffi == 0.1.0b4", + "apache-tvm-ffi == 0.1.8", "pyyaml >= 6.0.2", "openai", "modelscope", @@ -36,8 +36,8 @@ dependencies=[ cuda = ["tilelang"] [project.scripts] -mllm-convertor = "pymllm.utils.mllm_convertor:main" -mllm-service = "pymllm.service.tools:cli_app" +mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" +mllm-service = "pymllm.mobile.service.tools:cli_app" [tool.setuptools.exclude-package-data] "*" = ["*.pyc"] From 45c2fb71bf1a29a54f89de79c9f81aa202ab1251 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Feb 2026 04:46:41 +0000 Subject: [PATCH 23/42] feat: enhance configuration management and update dependencies - Added `flashinfer-python` to the optional `cuda` dependencies in `pyproject.toml`. - Introduced new configuration files for server, model, and layers to centralize runtime settings. - Created initial structure for various layers and components to support future development. --- pymllm/configs/__init__.py | 0 pymllm/configs/model_config.py | 0 pymllm/configs/server_config.py | 267 ++++++++++++++++++++++++++++ pymllm/layers/_layer.py | 0 pymllm/layers/attention/__init__.py | 0 pymllm/layers/attention/gdn.py | 0 pymllm/layers/attention/normal.py | 0 pymllm/layers/embedding.py | 0 pymllm/layers/mlp.py | 0 pymllm/layers/rms_norm.py | 0 pymllm/mem_cache/__init__.py | 0 pymllm/models/qwen3_moe.py | 0 pymllm/orchestrator/__init__.py | 0 pymllm/server/__init__.py | 0 pyproject.toml | 2 +- 15 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 pymllm/configs/__init__.py create mode 100644 pymllm/configs/model_config.py create mode 100644 pymllm/configs/server_config.py create mode 100644 pymllm/layers/_layer.py create mode 100644 pymllm/layers/attention/__init__.py create mode 100644 pymllm/layers/attention/gdn.py create mode 100644 pymllm/layers/attention/normal.py create mode 100644 pymllm/layers/embedding.py create mode 100644 pymllm/layers/mlp.py create mode 100644 pymllm/layers/rms_norm.py create mode 100644 pymllm/mem_cache/__init__.py create mode 100644 pymllm/models/qwen3_moe.py create mode 100644 pymllm/orchestrator/__init__.py create mode 100644 pymllm/server/__init__.py diff --git a/pymllm/configs/__init__.py b/pymllm/configs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/configs/model_config.py b/pymllm/configs/model_config.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py new file mode 100644 index 000000000..56be4fc4f --- /dev/null +++ b/pymllm/configs/server_config.py @@ -0,0 +1,267 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, Literal, Optional +from dataclasses import asdict, dataclass, field + + +@dataclass +class ServerConfig: + """ + Centralized runtime configuration for the MLLM server. + + The fields are grouped by operational concern so that: + - CLI args can map directly to this dataclass. + - YAML/JSON config files can be loaded and validated in one place. + - future extensions can follow a predictable structure. 
+ """ + + # ------------------------------------------------------------------------- + # Model and tokenizer settings + # ------------------------------------------------------------------------- + # Required path to the model checkpoint directory or model identifier. + model_path: Path + # Optional tokenizer path; when omitted we fall back to `model_path`. + tokenizer_path: Optional[Path] = None + # Tokenizer bootstrap strategy: + # - "auto": infer tokenizer mode from model type. + # - "slow"/"fast": force a specific tokenizer implementation. + tokenizer_mode: Literal["auto", "slow", "fast"] = "auto" + # Number of worker threads/processes used by tokenizer service. + tokenizer_worker_num: int = 1 + # Skip tokenizer initialization at startup to reduce cold-start latency. + skip_tokenizer_init: bool = False + # Model loading format hint for loader backends. + load_format: Literal["auto", "pt", "safetensors", "gguf"] = "auto" + # Allow loading custom model code from remote repositories. + trust_remote_code: bool = False + # Explicit context length; `None` means infer from model config. + context_length: Optional[int] = None + # Model precision policy for weights and activations. + dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto" + # Quantization algorithm to apply at load time. + quantization: Optional[str] = None + # KV cache dtype; can differ from model dtype for better memory trade-offs. + kv_cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = ( + "auto" + ) + # HuggingFace revision/commit/tag for deterministic model resolution. + revision: Optional[str] = None + # Optional custom directory used to cache downloaded model artifacts. + download_dir: Optional[Path] = None + + # ------------------------------------------------------------------------- + # HTTP / API server settings + # ------------------------------------------------------------------------- + # Host address the HTTP server binds to. + host: str = "127.0.0.1" + # TCP port exposed by the HTTP server. + port: int = 30000 + # Optional FastAPI root path when running behind a reverse proxy. + fastapi_root_path: str = "" + # API key required by client-facing endpoints. + api_key: Optional[str] = None + # Admin API key for privileged management endpoints. + admin_api_key: Optional[str] = None + # Public model name returned in OpenAI-compatible API responses. + served_model_name: Optional[str] = None + # Path used for server-side file uploads or temporary user artifacts. + file_storage_path: Path = Path("mllm_storage") + + # ------------------------------------------------------------------------- + # Runtime and scheduling behavior + # ------------------------------------------------------------------------- + # Fraction of total GPU memory reserved for static allocations + # (primarily model weights + KV cache). + mem_fraction_static: Optional[float] = None + # Maximum number of requests concurrently executing in scheduler. + max_running_requests: Optional[int] = None + # Maximum queued requests waiting for execution. + max_queued_requests: Optional[int] = None + # Hard cap of total active tokens across all in-flight requests. + max_total_tokens: Optional[int] = None + # Prefill chunk size used to trade throughput vs memory pressure. + chunked_prefill_size: Optional[int] = None + # Upper bound for tokens accepted in a single prefill pass. + max_prefill_tokens: int = 16384 + # Scheduling policy: + # - "fcfs": first-come-first-served fairness. 
+ # - "lpm": longest-prefix-match style cache locality optimization. + schedule_policy: Literal["fcfs", "lpm"] = "fcfs" + # Conservative multiplier for scheduler admission decisions. + # Values > 1.0 are safer for OOM avoidance but may reduce utilization. + schedule_conservativeness: float = 1.0 + # Enable low-power sleep while idle to reduce background GPU usage. + sleep_on_idle: bool = False + # Stream partial output every N decode steps when streaming is enabled. + stream_interval: int = 1 + # Enable token streaming in generation responses. + stream_output: bool = True + + # ------------------------------------------------------------------------- + # Parallelism and distributed deployment + # ------------------------------------------------------------------------- + # Tensor parallel size (intra-layer sharding). + tp_size: int = 1 + # Data parallel size (replicated model workers). + dp_size: int = 1 + # Expert parallel size for MoE-style models. + ep_size: int = 1 + # Pipeline parallel size (inter-layer partitioning). + pp_size: int = 1 + # Number of nodes participating in distributed serving. + nnodes: int = 1 + # Rank of current node in multi-node topology. + node_rank: int = 0 + # Torch distributed init address, e.g. "host:port". + dist_init_addr: Optional[str] = None + # Optional NCCL communication port override. + nccl_port: Optional[int] = None + # Timeout in seconds for distributed collectives. + dist_timeout: Optional[int] = None + # Base GPU index used for process-to-device mapping. + base_gpu_id: int = 0 + # Step size between logical workers when assigning GPU IDs. + gpu_id_step: int = 1 + + # ------------------------------------------------------------------------- + # Backend and acceleration toggles + # ------------------------------------------------------------------------- + # Attention kernel backend selection. + attention_backend: Optional[str] = None + # Sampling backend selection. + sampling_backend: Optional[str] = None + # Grammar-constrained decoding backend. + grammar_backend: Optional[str] = None + # Disable CUDA graph capture for debugging/compatibility. + disable_cuda_graph: bool = False + # Enable `torch.compile` acceleration path. + enable_torch_compile: bool = False + # Maximum batch size considered by `torch.compile` profiles. + torch_compile_max_bs: int = 32 + # Enable deterministic inference behavior where possible. + enable_deterministic_inference: bool = False + # Random seed for reproducible sampling and initialization. + random_seed: Optional[int] = None + + # ------------------------------------------------------------------------- + # Logging, metrics, and observability + # ------------------------------------------------------------------------- + # Global log level for server components. + log_level: Literal["debug", "info", "warning", "error", "critical"] = "info" + # HTTP access log level; if None, inherits global log level. + log_level_http: Optional[str] = None + # Log each request payload/metadata for debugging. + log_requests: bool = False + # Verbosity level for request logging, larger means more detail. + log_requests_level: int = 2 + # Toggle built-in Prometheus/metrics endpoint. + enable_metrics: bool = False + # Include latency/time-cost summaries in logs. + show_time_cost: bool = False + # Optional OpenTelemetry traces endpoint ("host:port"). + otlp_traces_endpoint: str = "localhost:4317" + # Enable tracing export to OTLP collector. 
+ enable_trace: bool = False + + # ------------------------------------------------------------------------- + # Feature switches and advanced decoding options + # ------------------------------------------------------------------------- + # Enable LoRA adapter serving support. + enable_lora: bool = False + # Maximum number of LoRA adapters loaded simultaneously. + max_loaded_loras: Optional[int] = None + # Maximum LoRA adapters that can be mixed in one batch. + max_loras_per_batch: int = 8 + # LoRA backend implementation. + lora_backend: Literal["triton", "csgmv", "torch_native"] = "csgmv" + # Enable multimodal processing pipeline. + enable_multimodal: bool = False + # Max concurrent multimodal tool calls. + mm_max_concurrent_calls: int = 32 + # Timeout (seconds) for each multimodal call. + mm_per_request_timeout: float = 10.0 + # Speculative decoding algorithm name (e.g. "eagle", "ngram"). + speculative_algorithm: Optional[str] = None + # Draft model path used in speculative decoding. + speculative_draft_model_path: Optional[Path] = None + # Number of speculative steps per target decode iteration. + speculative_num_steps: Optional[int] = None + # Number of proposed draft tokens per speculation step. + speculative_num_draft_tokens: Optional[int] = None + + # ------------------------------------------------------------------------- + # Internal bookkeeping (not usually set by users directly) + # ------------------------------------------------------------------------- + # Additional arbitrary key-value options for forward compatibility. + extra_options: dict[str, Any] = field(default_factory=dict) + + def __post_init__(self) -> None: + """Normalize defaults and validate constraints after dataclass initialization.""" + if self.tokenizer_path is None: + self.tokenizer_path = self.model_path + if self.served_model_name is None: + self.served_model_name = str(self.model_path) + + self._validate_basic_constraints() + self._validate_parallelism_constraints() + self._validate_scheduler_constraints() + + def _validate_basic_constraints(self) -> None: + """Validate scalar ranges and common invariants.""" + if self.port <= 0 or self.port > 65535: + raise ValueError("`port` must be in range [1, 65535].") + if self.max_prefill_tokens <= 0: + raise ValueError("`max_prefill_tokens` must be greater than 0.") + if self.stream_interval <= 0: + raise ValueError("`stream_interval` must be greater than 0.") + if self.mem_fraction_static is not None and not ( + 0.0 < self.mem_fraction_static < 1.0 + ): + raise ValueError("`mem_fraction_static` must be in range (0.0, 1.0).") + + def _validate_parallelism_constraints(self) -> None: + """Validate distributed and parallel topology settings.""" + for key, value in { + "tp_size": self.tp_size, + "dp_size": self.dp_size, + "ep_size": self.ep_size, + "pp_size": self.pp_size, + "nnodes": self.nnodes, + }.items(): + if value <= 0: + raise ValueError(f"`{key}` must be greater than 0.") + + if self.node_rank < 0 or self.node_rank >= self.nnodes: + raise ValueError("`node_rank` must satisfy 0 <= node_rank < nnodes.") + + def _validate_scheduler_constraints(self) -> None: + """Validate scheduler-related soft limits.""" + if self.max_running_requests is not None and self.max_running_requests <= 0: + raise ValueError("`max_running_requests` must be greater than 0 when set.") + if self.max_queued_requests is not None and self.max_queued_requests < 0: + raise ValueError("`max_queued_requests` must be >= 0 when set.") + if self.max_total_tokens is not None and 
self.max_total_tokens <= 0: + raise ValueError("`max_total_tokens` must be greater than 0 when set.") + if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0: + raise ValueError("`chunked_prefill_size` must be greater than 0 when set.") + if self.schedule_conservativeness <= 0: + raise ValueError("`schedule_conservativeness` must be greater than 0.") + + def to_dict(self) -> dict[str, Any]: + """ + Serialize config to a plain dictionary. + + Path values are converted to string for easier JSON/YAML serialization. + """ + data = asdict(self) + for key in [ + "model_path", + "tokenizer_path", + "download_dir", + "file_storage_path", + "speculative_draft_model_path", + ]: + if data.get(key) is not None: + data[key] = str(data[key]) + return data diff --git a/pymllm/layers/_layer.py b/pymllm/layers/_layer.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/attention/__init__.py b/pymllm/layers/attention/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/attention/gdn.py b/pymllm/layers/attention/gdn.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/attention/normal.py b/pymllm/layers/attention/normal.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/embedding.py b/pymllm/layers/embedding.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/mlp.py b/pymllm/layers/mlp.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/rms_norm.py b/pymllm/layers/rms_norm.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/mem_cache/__init__.py b/pymllm/mem_cache/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/models/qwen3_moe.py b/pymllm/models/qwen3_moe.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/orchestrator/__init__.py b/pymllm/orchestrator/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/server/__init__.py b/pymllm/server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pyproject.toml b/pyproject.toml index efe4a14d2..89d69947d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ dependencies=[ ] [project.optional-dependencies] -cuda = ["tilelang"] +cuda = ["tilelang", "flashinfer-python"] [project.scripts] mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" From 14ce9cd35c9a730f7ba2202ce83c676a473ce12f Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Feb 2026 11:40:56 +0000 Subject: [PATCH 24/42] feat: add main entry points and configuration for pymllm and mllm-kernel - Added main entry points for `pymllm` and `mllm-kernel` in their respective `pyproject.toml` files. - Implemented a configuration module for `pymllm` to manage global settings, including server, model, runtime, and cache configurations. - Introduced the `VocabParallelEmbedding` layer and utility functions for weight management in the layers module. - Created initial tests for the `VocabParallelEmbedding` layer to validate functionality with tensor parallelism. 
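
For reviewers, a minimal standalone sketch of the vocabulary-parallel lookup that the new `VocabParallelEmbedding` layer performs (shard the vocab per rank, mask foreign ids, local lookup, reduce). It uses plain PyTorch on a single process and simulates the TP all-reduce by summing per-rank partial outputs; the helper `local_lookup` and the fixed `tp_size`/sizes are illustrative only and are not part of this patch.

```python
# Illustrative only: mirrors the shard -> mask -> lookup -> reduce flow of
# VocabParallelEmbedding on one process. In the real layer the final sum is
# tensor_model_parallel_all_reduce across the TP group.
import torch
import torch.nn.functional as F

vocab_size, embed_dim, tp_size = 8, 4, 2
torch.manual_seed(0)
full_weight = torch.randn(vocab_size, embed_dim)     # full checkpoint weight
token_ids = torch.tensor([[0, 3, 5, 7]])             # input token ids

def local_lookup(rank: int) -> torch.Tensor:
    per_rank = vocab_size // tp_size                  # divide(num_embeddings, tp_size)
    start, end = rank * per_rank, (rank + 1) * per_rank
    shard = full_weight[start:end]                    # what weight_loader would slice
    mask = (token_ids >= start) & (token_ids < end)   # ids owned by this rank
    local_ids = torch.where(mask, token_ids - start, torch.zeros_like(token_ids))
    out = F.embedding(local_ids, shard)
    return out.masked_fill(~mask.unsqueeze(-1), 0.0)  # zero rows for foreign ids

# Summing the partial outputs stands in for the TP all-reduce.
combined = sum(local_lookup(r) for r in range(tp_size))
reference = F.embedding(token_ids, full_weight)
assert torch.allclose(combined, reference)
```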
--- mllm-kernel/mllm_kernel/__main__.py | 2 +- mllm-kernel/pyproject.toml | 3 + pymllm/__main__.py | 39 ++ pymllm/configs/__init__.py | 21 ++ pymllm/configs/global_config.py | 349 ++++++++++++++++++ .../configs/quantization_config.py | 0 pymllm/layers/__init__.py | 11 + pymllm/layers/base.py | 27 ++ pymllm/layers/{_layer.py => custom_event.py} | 0 pymllm/layers/embedding.py | 152 ++++++++ pymllm/layers/utils.py | 45 +++ pymllm/orchestrator/__init__.py | 48 +++ pymllm/orchestrator/group_coordinator.py | 98 +++++ pymllm/orchestrator/parallel_state.py | 207 +++++++++++ pymllm/quantization/__init__.py | 0 pymllm/quantization/methods/__init__.py | 0 pymllm/quantization/methods/awq_w4a16.py | 0 pymllm/quantization/quant_recipe.py | 3 + pymllm/tests/README.md | 0 pymllm/tests/test_vocab_parallel_embedding.py | 310 ++++++++++++++++ pyproject.toml | 1 + 21 files changed, 1315 insertions(+), 1 deletion(-) create mode 100644 pymllm/configs/global_config.py rename mllm-kernel/requirements.txt => pymllm/configs/quantization_config.py (100%) create mode 100644 pymllm/layers/base.py rename pymllm/layers/{_layer.py => custom_event.py} (100%) create mode 100644 pymllm/layers/utils.py create mode 100644 pymllm/orchestrator/group_coordinator.py create mode 100644 pymllm/orchestrator/parallel_state.py create mode 100644 pymllm/quantization/__init__.py create mode 100644 pymllm/quantization/methods/__init__.py create mode 100644 pymllm/quantization/methods/awq_w4a16.py create mode 100644 pymllm/quantization/quant_recipe.py create mode 100644 pymllm/tests/README.md create mode 100644 pymllm/tests/test_vocab_parallel_embedding.py diff --git a/mllm-kernel/mllm_kernel/__main__.py b/mllm-kernel/mllm_kernel/__main__.py index d4888b86c..e5f0779d6 100644 --- a/mllm-kernel/mllm_kernel/__main__.py +++ b/mllm-kernel/mllm_kernel/__main__.py @@ -388,7 +388,7 @@ def main() -> None: logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") parser = argparse.ArgumentParser( - prog="python -m mllm_kernel", + prog="mllm_kernel", description="mllm-kernel helper commands.", ) parser.add_argument( diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml index 5fe07eeab..a8dbd98ea 100644 --- a/mllm-kernel/pyproject.toml +++ b/mllm-kernel/pyproject.toml @@ -27,6 +27,9 @@ dev = [ "pytest-html", ] +[project.scripts] +mllm-kernel = "mllm_kernel.__main__:main" + [tool.scikit-build] # Build configuration wheel.py-api = "py3" diff --git a/pymllm/__main__.py b/pymllm/__main__.py index e69de29bb..0b427fcee 100644 --- a/pymllm/__main__.py +++ b/pymllm/__main__.py @@ -0,0 +1,39 @@ +def show_config() -> None: + from . import is_mobile_available + + mobile_enabled = str(is_mobile_available()).lower() + print(f"mllm mobile: {mobile_enabled}") + + # try import mllm_kernel, if true, print mllm_kernel config + try: + import mllm_kernel + + print(f"mllm_kernel: {mllm_kernel.__version__}") + except ImportError: + print("mllm_kernel: not found") + + +def main() -> None: + import argparse + + parser = argparse.ArgumentParser( + prog="pymllm", + description="pymllm helper commands.", + ) + parser.add_argument( + "command", + nargs="?", + choices=["show-config"], + help="Run helper command. 
Use 'show-config' to print config details.", + ) + args = parser.parse_args() + + if args.command == "show-config": + show_config() + return + + parser.print_help() + + +if __name__ == "__main__": + main() diff --git a/pymllm/configs/__init__.py b/pymllm/configs/__init__.py index e69de29bb..86af57beb 100644 --- a/pymllm/configs/__init__.py +++ b/pymllm/configs/__init__.py @@ -0,0 +1,21 @@ +"""Configuration module for pymllm.""" + +from pymllm.configs.global_config import ( + CacheConfig, + GlobalConfig, + ModelConfig, + RuntimeConfig, + get_global_config, +) +from pymllm.configs.server_config import ServerConfig + +__all__ = [ + # Main singleton + "GlobalConfig", + "get_global_config", + # Sub configs + "ServerConfig", + "ModelConfig", + "RuntimeConfig", + "CacheConfig", +] diff --git a/pymllm/configs/global_config.py b/pymllm/configs/global_config.py new file mode 100644 index 000000000..43783e946 --- /dev/null +++ b/pymllm/configs/global_config.py @@ -0,0 +1,349 @@ +"""Global configuration singleton with all server, model and runtime configs.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Literal, Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from transformers import PretrainedConfig + + +@dataclass +class ModelConfig: + """Model-specific configuration parsed from HF config. + + This is a lightweight wrapper around HuggingFace config with + additional derived fields for efficiency. + """ + # Original HF config (populated after loading) + hf_config: Optional[Any] = field(default=None, repr=False) + hf_text_config: Optional[Any] = field(default=None, repr=False) + + # Model architecture + model_type: str = "unknown" + architectures: list[str] = field(default_factory=list) + + # Dimensions + hidden_size: int = 0 + num_hidden_layers: int = 0 + num_attention_heads: int = 0 + num_key_value_heads: Optional[int] = None + intermediate_size: int = 0 + vocab_size: int = 0 + + # Context length + max_position_embeddings: int = 0 + context_length: int = 0 # effective context length + + # Normalization + rms_norm_eps: float = 1e-6 + tie_word_embeddings: bool = False + + # RoPE + rope_theta: float = 10000.0 + rope_scaling: Optional[Dict[str, Any]] = None + + # Quantization + quantization: Optional[str] = None + + def __post_init__(self): + """Set default kv heads if not specified.""" + if self.num_key_value_heads is None: + self.num_key_value_heads = self.num_attention_heads + + +@dataclass +class RuntimeConfig: + """Runtime state that changes during execution.""" + + # Distributed state + tp_rank: int = 0 + tp_size: int = 1 + dp_rank: int = 0 + dp_size: int = 1 + pp_rank: int = 0 + pp_size: int = 1 + world_rank: int = 0 + world_size: int = 1 + local_rank: int = 0 + + # Device + device: str = "cuda" + + # Memory pools + max_num_seqs: int = 0 + max_model_len: int = 0 + + # Scheduler state (mutable during runtime) + num_running_reqs: int = 0 + num_waiting_reqs: int = 0 + num_swapped_reqs: int = 0 + + +@dataclass +class CacheConfig: + """KV cache configuration.""" + + block_size: int = 16 + num_gpu_blocks: int = 0 + num_cpu_blocks: int = 0 + + # Cache dtype + cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = "auto" + + # Sliding window + sliding_window: Optional[int] = None + + # Prefix caching + enable_prefix_caching: bool = False + + +@dataclass +class GlobalConfig: + """Global configuration singleton containing all configs. 
+ + This is the single source of truth for all configuration in pymllm. + It aggregates ServerConfig, ModelConfig, RuntimeConfig, and CacheConfig. + + Usage: + >>> from pymllm.configs import get_global_config + >>> config = get_global_config() + >>> + >>> # Access server config + >>> config.server.model_path + >>> config.server.tp_size + >>> + >>> # Access model config + >>> config.model.hidden_size + >>> config.model.vocab_size + >>> + >>> # Access runtime config (mutable) + >>> config.runtime.tp_rank + >>> config.runtime.device + >>> + >>> # Access cache config + >>> config.cache.block_size + >>> + >>> # Update with new server config + >>> config.load_server_config(server_config) + >>> + >>> # Update with HF model config + >>> config.load_hf_config(hf_config) + """ + + # Sub-configs + server: "ServerConfig" = field(default=None, repr=False) + model: ModelConfig = field(default_factory=ModelConfig) + runtime: RuntimeConfig = field(default_factory=RuntimeConfig) + cache: CacheConfig = field(default_factory=CacheConfig) + + # Additional metadata + _initialized: bool = field(default=False, repr=False) + + def __new__(cls): + if not hasattr(cls, '_instance') or cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __post_init__(self): + # Lazy import to avoid circular dependency + if self.server is None: + from pymllm.configs.server_config import ServerConfig + self.server = ServerConfig( + model_path=Path("."), # placeholder + ) + + @classmethod + def get_instance(cls) -> "GlobalConfig": + """Get the singleton instance.""" + if not hasattr(cls, '_instance') or cls._instance is None: + cls._instance = cls() + return cls._instance + + def load_server_config(self, server_config: "ServerConfig") -> None: + """Load server configuration and sync related fields.""" + self.server = server_config + + # Sync tp/dp/pp sizes to runtime + self.runtime.tp_size = server_config.tp_size + self.runtime.dp_size = server_config.dp_size + self.runtime.pp_size = server_config.pp_size + self.runtime.device = "cuda" if server_config.base_gpu_id >= 0 else "cpu" + + self._initialized = True + + def load_hf_config(self, hf_config: "PretrainedConfig") -> None: + """Load HuggingFace model configuration.""" + from transformers import PretrainedConfig + + # Store original + self.model.hf_config = hf_config + + # Get text config (for multimodal models) + if hasattr(hf_config, "text_config"): + self.model.hf_text_config = hf_config.text_config + text_config = hf_config.text_config + else: + text_config = hf_config + self.model.hf_text_config = hf_config + + # Extract fields + self.model.model_type = getattr(text_config, "model_type", "unknown") + self.model.architectures = getattr(text_config, "architectures", []) + + self.model.hidden_size = getattr(text_config, "hidden_size", 0) + self.model.num_hidden_layers = getattr(text_config, "num_hidden_layers", 0) + self.model.num_attention_heads = getattr(text_config, "num_attention_heads", 0) + self.model.num_key_value_heads = getattr(text_config, "num_key_value_heads", None) + self.model.intermediate_size = getattr(text_config, "intermediate_size", 0) + self.model.vocab_size = getattr(text_config, "vocab_size", 0) + + # Context length + self.model.max_position_embeddings = getattr( + text_config, "max_position_embeddings", 0 + ) + self.model.context_length = self._get_context_length(text_config) + + # Normalization + self.model.rms_norm_eps = getattr(text_config, "rms_norm_eps", 1e-6) + self.model.tie_word_embeddings = getattr( + 
text_config, "tie_word_embeddings", False + ) + + # RoPE + self.model.rope_theta = getattr(text_config, "rope_theta", 10000.0) + self.model.rope_scaling = getattr(text_config, "rope_scaling", None) + + # Sync to cache config + self.cache.sliding_window = getattr(text_config, "sliding_window", None) + + def _get_context_length(self, config: "PretrainedConfig") -> int: + """Extract effective context length from config.""" + # Try various fields + for key in ["max_position_embeddings", "n_positions", "seq_length"]: + if hasattr(config, key): + value = getattr(config, key) + if isinstance(value, int) and value > 0: + return value + return 2048 # default + + def update_runtime(self, **kwargs) -> None: + """Update runtime configuration.""" + for key, value in kwargs.items(): + if hasattr(self.runtime, key): + setattr(self.runtime, key, value) + else: + raise AttributeError(f"RuntimeConfig has no attribute '{key}'") + + def update_cache(self, **kwargs) -> None: + """Update cache configuration.""" + for key, value in kwargs.items(): + if hasattr(self.cache, key): + setattr(self.cache, key, value) + else: + raise AttributeError(f"CacheConfig has no attribute '{key}'") + + def temp(self, **kwargs): + """Context manager for temporary config changes. + + Usage: + # Modify runtime config temporarily + with config.temp(runtime=config.runtime): + config.runtime.tp_size = 2 + # ... do something with tp_size=2 + # runtime restored to original values + """ + return _TempGlobalConfig(self, **kwargs) + + def to_dict(self) -> Dict[str, Any]: + """Serialize all configs to dictionary.""" + return { + "server": self.server.to_dict() if self.server else {}, + "model": self._model_to_dict(), + "runtime": self._runtime_to_dict(), + "cache": self._cache_to_dict(), + } + + def _model_to_dict(self) -> Dict[str, Any]: + """Convert model config to dict.""" + return { + "model_type": self.model.model_type, + "architectures": self.model.architectures, + "hidden_size": self.model.hidden_size, + "num_hidden_layers": self.model.num_hidden_layers, + "num_attention_heads": self.model.num_attention_heads, + "num_key_value_heads": self.model.num_key_value_heads, + "intermediate_size": self.model.intermediate_size, + "vocab_size": self.model.vocab_size, + "context_length": self.model.context_length, + } + + def _runtime_to_dict(self) -> Dict[str, Any]: + """Convert runtime config to dict.""" + return { + "tp_rank": self.runtime.tp_rank, + "tp_size": self.runtime.tp_size, + "world_rank": self.runtime.world_rank, + "world_size": self.runtime.world_size, + "device": self.runtime.device, + } + + def _cache_to_dict(self) -> Dict[str, Any]: + """Convert cache config to dict.""" + return { + "block_size": self.cache.block_size, + "num_gpu_blocks": self.cache.num_gpu_blocks, + "cache_dtype": self.cache.cache_dtype, + } + + +class _TempGlobalConfig: + """Context manager for temporary global config changes. + + Supports nested keys like "runtime.tp_size" to modify sub-configs. + """ + + def __init__(self, config: GlobalConfig, **kwargs): + self.config = config + self.temp_values = kwargs + self.old_values = {} + + def _get_nested_attr(self, key: str): + """Get attribute, supporting dot notation for nested access.""" + if "." in key: + parts = key.split(".") + obj = self.config + for part in parts[:-1]: + obj = getattr(obj, part) + return getattr(obj, parts[-1]) + return getattr(self.config, key) + + def _set_nested_attr(self, key: str, value): + """Set attribute, supporting dot notation for nested access.""" + if "." 
in key: + parts = key.split(".") + obj = self.config + for part in parts[:-1]: + obj = getattr(obj, part) + setattr(obj, parts[-1], value) + else: + setattr(self.config, key, value) + + def __enter__(self): + for key, value in self.temp_values.items(): + self.old_values[key] = self._get_nested_attr(key) + self._set_nested_attr(key, value) + return self.config + + def __exit__(self, exc_type, exc_val, exc_tb): + for key, value in self.old_values.items(): + self._set_nested_attr(key, value) + return False + + +# Convenience function +def get_global_config() -> GlobalConfig: + """Get the global config singleton instance.""" + return GlobalConfig.get_instance() diff --git a/mllm-kernel/requirements.txt b/pymllm/configs/quantization_config.py similarity index 100% rename from mllm-kernel/requirements.txt rename to pymllm/configs/quantization_config.py diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index e69de29bb..6f70a4d1d 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -0,0 +1,11 @@ +"""Layers module for pymllm.""" + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.embedding import VocabParallelEmbedding +from pymllm.layers.utils import set_weight_attrs + +__all__ = [ + "MllmBaseLayer", + "set_weight_attrs", + "VocabParallelEmbedding", +] diff --git a/pymllm/layers/base.py b/pymllm/layers/base.py new file mode 100644 index 000000000..5dc519f41 --- /dev/null +++ b/pymllm/layers/base.py @@ -0,0 +1,27 @@ +import torch +from torch import nn +from torch.nn import Parameter +from pymllm.layers.utils import set_weight_attrs +from pymllm.quantization.quant_recipe import QuantRecipe + + +class MllmBaseLayer(nn.Module): + def __init__(self): + super().__init__() + self.quant_recipe: QuantRecipe = None + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load weights into a parameter. + + This is the default implementation that directly copies the loaded weight + into the parameter. Subclasses should override this method to implement + custom loading logic (e.g., tensor parallelism sharding). + + Args: + param: The parameter to load weights into. + loaded_weight: The weight tensor loaded from checkpoint. + """ + param.data.copy_(loaded_weight) + + def forward(self, *args, **kwargs): + raise NotImplementedError("Subclasses must implement forward method") diff --git a/pymllm/layers/_layer.py b/pymllm/layers/custom_event.py similarity index 100% rename from pymllm/layers/_layer.py rename to pymllm/layers/custom_event.py diff --git a/pymllm/layers/embedding.py b/pymllm/layers/embedding.py index e69de29bb..0442caa41 100644 --- a/pymllm/layers/embedding.py +++ b/pymllm/layers/embedding.py @@ -0,0 +1,152 @@ +import torch +import torch.nn.functional as F +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs +from pymllm.orchestrator import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_reduce, +) + + +class VocabParallelEmbedding(MllmBaseLayer): + """Embedding layer with vocabulary parallelism. + + This layer shards the embedding table along the vocabulary dimension + for tensor parallelism. + + Args: + num_embeddings: Size of the vocabulary. + embedding_dim: Size of the embedding vector. + padding_idx: Index for padding token (optional). 
+ """ + + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + ): + super().__init__() + + # Get TP info from global state + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + + # Calculate sharded size + if self.num_embeddings % self.tp_size != 0: + raise ValueError( + f"num_embeddings ({num_embeddings}) must be divisible by " + f"tp_size ({self.tp_size})" + ) + + self.num_embeddings_per_partition = divide(num_embeddings, self.tp_size) + + # Create sharded weight + self.weight = Parameter( + torch.empty(self.num_embeddings_per_partition, embedding_dim) + ) + + # Calculate shard range + self.vocab_start_index = self.tp_rank * self.num_embeddings_per_partition + self.vocab_end_index = ( + self.vocab_start_index + self.num_embeddings_per_partition + ) + + # Set weight attributes for loading + set_weight_attrs( + self.weight, + { + "output_dim": 0, # Shard along vocab dimension + "input_dim": 1, # Embedding dimension + "weight_loader": self.weight_loader, + }, + ) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load sharded weights into the parameter. + + Args: + param: The parameter to load weights into. + loaded_weight: The weight tensor loaded from checkpoint (full size). + """ + output_dim = getattr(param, "output_dim", None) + + if output_dim is None or self.tp_size == 1: + # No sharding, direct copy + assert param.data.shape == loaded_weight.shape, ( + f"Shape mismatch: param {param.data.shape} vs " + f"loaded {loaded_weight.shape}" + ) + param.data.copy_(loaded_weight) + else: + # Sharded loading: slice the loaded weight + assert loaded_weight.shape[output_dim] == self.num_embeddings, ( + f"Loaded weight vocab size {loaded_weight.shape[output_dim]} " + f"does not match expected {self.num_embeddings}" + ) + + # Slice along vocab dimension + if output_dim == 0: + shard_weight = loaded_weight[ + self.vocab_start_index : self.vocab_end_index, : + ] + else: + shard_weight = loaded_weight.narrow( + output_dim, + self.vocab_start_index, + self.num_embeddings_per_partition, + ) + + assert param.data.shape == shard_weight.shape, ( + f"Shard shape mismatch: param {param.data.shape} vs " + f"shard {shard_weight.shape}" + ) + param.data.copy_(shard_weight) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward pass of the embedding layer with TP support. + + Args: + x: Input tensor of token ids. + + Returns: + Embedded representation (all-reduced across TP group if needed). 
+ """ + if self.tp_size > 1: + # Create mask for valid vocab range + vocab_mask = (x >= self.vocab_start_index) & (x < self.vocab_end_index) + + # Adjust indices to local vocab space + masked_input = torch.where( + vocab_mask, + x - self.vocab_start_index, + torch.zeros_like(x), # Invalid indices become 0 (will be masked) + ) + else: + masked_input = x + vocab_mask = None + + # Lookup embeddings + output = F.embedding( + masked_input.long(), + self.weight, + padding_idx=self.padding_idx if self.padding_idx is not None else None, + ) + + # Mask invalid positions (for TP) + if vocab_mask is not None: + output.masked_fill_(~vocab_mask.unsqueeze(-1), 0) + + # All-reduce across TP group + if self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output) + + return output diff --git a/pymllm/layers/utils.py b/pymllm/layers/utils.py new file mode 100644 index 000000000..0dcbd1ac0 --- /dev/null +++ b/pymllm/layers/utils.py @@ -0,0 +1,45 @@ +"""Utility functions for layers.""" + +from typing import Any, Dict + +import torch + + +def set_weight_attrs( + weight: torch.Tensor, + weight_attrs: Dict[str, Any] | None, +) -> None: + """Set attributes on a weight tensor. + + This method is used to set attributes on a weight tensor. This method + will not overwrite existing attributes. + + Args: + weight: The weight tensor or parameter. + weight_attrs: A dictionary of attributes to set on the weight tensor. + Common attributes include: + - output_dim: The dimension along which to shard the weight (typically 0 for output dim) + - input_dim: The input dimension (typically 1 for input dim) + - weight_loader: A callable to load weights into this parameter + - packed_dim: The dimension along which the weight is packed (for quantization) + - packed_factor: The packing factor (for quantization) + + Example: + >>> weight = nn.Parameter(torch.empty(100, 64)) + >>> set_weight_attrs(weight, { + ... "output_dim": 0, + ... "input_dim": 1, + ... "weight_loader": my_loader_func, + ... }) + """ + if weight_attrs is None: + return + + for key, value in weight_attrs.items(): + if hasattr(weight, key): + raise AttributeError( + f"Overwriting existing tensor attribute: {key}. 
" + f"Existing value: {getattr(weight, key)}, " + f"New value: {value}" + ) + setattr(weight, key, value) diff --git a/pymllm/orchestrator/__init__.py b/pymllm/orchestrator/__init__.py index e69de29bb..f1716d794 100644 --- a/pymllm/orchestrator/__init__.py +++ b/pymllm/orchestrator/__init__.py @@ -0,0 +1,48 @@ +"""Orchestrator module for distributed computation.""" + +from pymllm.orchestrator.group_coordinator import ( + GroupCoordinator, + divide, + split_tensor_along_dim, +) +from pymllm.orchestrator.parallel_state import ( + data_parallel_all_reduce, + get_data_parallel_rank, + get_data_parallel_world_size, + get_dp_group, + get_pipeline_model_parallel_rank, + get_pipeline_model_parallel_world_size, + get_pp_group, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + get_tp_group, + initialize_model_parallel, + model_parallel_is_initialized, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) + +__all__ = [ + # GroupCoordinator + "GroupCoordinator", + "divide", + "split_tensor_along_dim", + # TP + "get_tp_group", + "get_tensor_model_parallel_rank", + "get_tensor_model_parallel_world_size", + "tensor_model_parallel_all_reduce", + "tensor_model_parallel_all_gather", + # DP + "get_dp_group", + "get_data_parallel_rank", + "get_data_parallel_world_size", + "data_parallel_all_reduce", + # PP + "get_pp_group", + "get_pipeline_model_parallel_rank", + "get_pipeline_model_parallel_world_size", + # State + "initialize_model_parallel", + "model_parallel_is_initialized", +] diff --git a/pymllm/orchestrator/group_coordinator.py b/pymllm/orchestrator/group_coordinator.py new file mode 100644 index 000000000..d06244734 --- /dev/null +++ b/pymllm/orchestrator/group_coordinator.py @@ -0,0 +1,98 @@ +"""GroupCoordinator for distributed communication.""" + +from typing import List, Optional +import torch +import torch.distributed as dist + + +class GroupCoordinator: + """Manages a group of processes for distributed communication. + + Lightweight wrapper around torch.distributed.ProcessGroup. + + Args: + ranks: List of global ranks in this group + local_rank: Local rank for device assignment + backend: Backend to use (nccl, gloo, etc.) 
+ """ + + def __init__( + self, + ranks: List[int], + local_rank: int, + backend: str = "nccl", + ): + self.ranks = ranks + self.local_rank = local_rank + self.backend = backend + self.world_size = len(ranks) + + # Get rank in this specific group + self.rank_in_group = ranks.index(dist.get_rank()) if dist.is_initialized() else 0 + + # Create process group + if dist.is_initialized() and self.world_size > 1: + self.device_group = dist.new_group(ranks, backend=backend) + else: + self.device_group = None + + def all_reduce(self, tensor: torch.Tensor) -> torch.Tensor: + """All-reduce across the group.""" + if self.device_group is not None: + dist.all_reduce(tensor, group=self.device_group) + return tensor + + def all_gather(self, tensor: torch.Tensor, dim: int = 0) -> torch.Tensor: + """All-gather across the group.""" + if self.device_group is None: + return tensor + + world_size = self.world_size + if dim == 0: + shape = list(tensor.shape) + shape[0] = shape[0] * world_size + output = torch.empty(shape, dtype=tensor.dtype, device=tensor.device) + dist.all_gather_into_tensor(output, tensor, group=self.device_group) + return output + else: + # For non-dim-0 gathers, use tensor list + tensor_list = [ + torch.empty_like(tensor) for _ in range(world_size) + ] + dist.all_gather(tensor_list, tensor, group=self.device_group) + return torch.cat(tensor_list, dim=dim) + + def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor: + """Broadcast from source rank to all.""" + if self.device_group is not None: + dist.broadcast(tensor, src=src, group=self.device_group) + return tensor + + +def divide(numerator: int, denominator: int) -> int: + """Divide and ensure divisibility.""" + assert numerator % denominator == 0, ( + f"{numerator} is not divisible by {denominator}" + ) + return numerator // denominator + + +def split_tensor_along_dim( + tensor: torch.Tensor, + dim: int, + world_size: int, + rank: int, +) -> torch.Tensor: + """Split tensor along a dimension for tensor parallelism.""" + dim_size = tensor.size(dim) + assert dim_size % world_size == 0, ( + f"Dimension {dim} ({dim_size}) not divisible by world_size {world_size}" + ) + + chunk_size = dim_size // world_size + start = rank * chunk_size + end = start + chunk_size + + slices = [slice(None)] * tensor.ndim + slices[dim] = slice(start, end) + return tensor[tuple(slices)] diff --git a/pymllm/orchestrator/parallel_state.py b/pymllm/orchestrator/parallel_state.py new file mode 100644 index 000000000..545c74a87 --- /dev/null +++ b/pymllm/orchestrator/parallel_state.py @@ -0,0 +1,207 @@ +"""Parallel state management for tensor and pipeline parallelism.""" + +import logging +import torch +import torch.distributed as dist +from typing import Optional + +from pymllm.configs.global_config import get_global_config +from pymllm.orchestrator.group_coordinator import GroupCoordinator + +logger = logging.getLogger(__name__) + + +# Global groups +_TP_GROUP: Optional[GroupCoordinator] = None +_DP_GROUP: Optional[GroupCoordinator] = None +_PP_GROUP: Optional[GroupCoordinator] = None + + +def initialize_model_parallel( + tensor_model_parallel_size: int = 1, + data_parallel_size: int = 1, + pipeline_model_parallel_size: int = 1, + backend: str = "nccl", +) -> None: + """Initialize model parallel groups. 
+ + Args: + tensor_model_parallel_size: Number of GPUs for tensor parallelism + data_parallel_size: Number of GPUs for data parallelism + pipeline_model_parallel_size: Number of stages for pipeline parallelism + backend: Communication backend (nccl for GPU, gloo for CPU) + """ + global _TP_GROUP, _DP_GROUP, _PP_GROUP + + if not dist.is_initialized(): + return + + world_size = dist.get_world_size() + world_rank = dist.get_rank() + local_rank = int(torch.cuda.current_device()) if torch.cuda.is_available() else 0 + + config = get_global_config() + + # Update runtime config + config.runtime.world_size = world_size + config.runtime.world_rank = world_rank + config.runtime.local_rank = local_rank + config.runtime.tp_size = tensor_model_parallel_size + config.runtime.dp_size = data_parallel_size + config.runtime.pp_size = pipeline_model_parallel_size + + # Logging + logger.info( + "Model parallel runtime config set: world_size=%s, world_rank=%s, " + "local_rank=%s, tp_size=%s, dp_size=%s, pp_size=%s", + config.runtime.world_size, + config.runtime.world_rank, + config.runtime.local_rank, + config.runtime.tp_size, + config.runtime.dp_size, + config.runtime.pp_size, + ) + + # Validate parallelism setup + assert ( + tensor_model_parallel_size * data_parallel_size * pipeline_model_parallel_size + == world_size + ), ( + f"TP({tensor_model_parallel_size}) * DP({data_parallel_size}) * " + f"PP({pipeline_model_parallel_size}) != World({world_size})" + ) + + # Create TP groups (intra-layer sharding) + if tensor_model_parallel_size > 1: + num_tp_groups = world_size // tensor_model_parallel_size + for i in range(num_tp_groups): + ranks = list( + range( + i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size + ) + ) + if world_rank in ranks: + _TP_GROUP = GroupCoordinator( + ranks=ranks, + local_rank=local_rank, + backend=backend, + ) + config.runtime.tp_rank = _TP_GROUP.rank_in_group + break + else: + _TP_GROUP = None + config.runtime.tp_rank = 0 + + # Create DP groups (data replication) + if data_parallel_size > 1: + num_dp_groups = world_size // data_parallel_size + for i in range(num_dp_groups): + ranks = list(range(i, world_size, num_dp_groups)) + if world_rank in ranks: + _DP_GROUP = GroupCoordinator( + ranks=ranks, + local_rank=local_rank, + backend=backend, + ) + config.runtime.dp_rank = _DP_GROUP.rank_in_group + break + else: + _DP_GROUP = None + config.runtime.dp_rank = 0 + + # Create PP groups (inter-layer partitioning) + if pipeline_model_parallel_size > 1: + num_pp_groups = world_size // pipeline_model_parallel_size + for i in range(num_pp_groups): + start = i * pipeline_model_parallel_size + ranks = list(range(start, start + pipeline_model_parallel_size)) + if world_rank in ranks: + _PP_GROUP = GroupCoordinator( + ranks=ranks, + local_rank=local_rank, + backend=backend, + ) + config.runtime.pp_rank = _PP_GROUP.rank_in_group + break + else: + _PP_GROUP = None + config.runtime.pp_rank = 0 + + +def get_tp_group() -> Optional[GroupCoordinator]: + """Get the tensor model parallel group.""" + return _TP_GROUP + + +def get_dp_group() -> Optional[GroupCoordinator]: + """Get the data parallel group.""" + return _DP_GROUP + + +def get_pp_group() -> Optional[GroupCoordinator]: + """Get the pipeline parallel group.""" + return _PP_GROUP + + +# Convenience functions for tensor parallelism +def get_tensor_model_parallel_rank() -> int: + """Get current tensor model parallel rank.""" + return get_global_config().runtime.tp_rank + + +def get_tensor_model_parallel_world_size() -> int: + """Get 
tensor model parallel world size.""" + return get_global_config().runtime.tp_size + + +def get_data_parallel_rank() -> int: + """Get current data parallel rank.""" + return get_global_config().runtime.dp_rank + + +def get_data_parallel_world_size() -> int: + """Get data parallel world size.""" + return get_global_config().runtime.dp_size + + +def get_pipeline_model_parallel_rank() -> int: + """Get current pipeline parallel rank.""" + return get_global_config().runtime.pp_rank + + +def get_pipeline_model_parallel_world_size() -> int: + """Get pipeline parallel world size.""" + return get_global_config().runtime.pp_size + + +def model_parallel_is_initialized() -> bool: + """Check if model parallel is initialized.""" + return _TP_GROUP is not None or _DP_GROUP is not None or _PP_GROUP is not None + + +# Communication helpers +def tensor_model_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: + """All-reduce across TP group.""" + group = get_tp_group() + if group is None: + return tensor + return group.all_reduce(tensor) + + +def tensor_model_parallel_all_gather( + tensor: torch.Tensor, + dim: int = 0, +) -> torch.Tensor: + """All-gather across TP group.""" + group = get_tp_group() + if group is None: + return tensor + return group.all_gather(tensor, dim=dim) + + +def data_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: + """All-reduce across DP group.""" + group = get_dp_group() + if group is None: + return tensor + return group.all_reduce(tensor) diff --git a/pymllm/quantization/__init__.py b/pymllm/quantization/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/quantization/methods/__init__.py b/pymllm/quantization/methods/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/quantization/methods/awq_w4a16.py b/pymllm/quantization/methods/awq_w4a16.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/quantization/quant_recipe.py b/pymllm/quantization/quant_recipe.py new file mode 100644 index 000000000..a5b493bec --- /dev/null +++ b/pymllm/quantization/quant_recipe.py @@ -0,0 +1,3 @@ +class QuantRecipe: + def __init__(self): + pass diff --git a/pymllm/tests/README.md b/pymllm/tests/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/tests/test_vocab_parallel_embedding.py b/pymllm/tests/test_vocab_parallel_embedding.py new file mode 100644 index 000000000..e22b52a57 --- /dev/null +++ b/pymllm/tests/test_vocab_parallel_embedding.py @@ -0,0 +1,310 @@ +"""Tests for VocabParallelEmbedding layer. + +This module tests the VocabParallelEmbedding layer with and without +tensor parallelism. +""" + +import os +import logging +import pytest +import torch +import torch.nn as nn +import torch.multiprocessing as mp +from typing import Callable + +from pymllm.configs import get_global_config +from pymllm.layers import VocabParallelEmbedding +from pymllm.orchestrator import ( + initialize_model_parallel, +) + +# Show runtime init logs during test execution. 
+logging.basicConfig(level=logging.INFO, force=True) +logging.getLogger().setLevel(logging.INFO) + + +# ============================================================================= +# Helper: weight loading +# ============================================================================= +def load_weight(param: nn.Parameter, loaded_weight: torch.Tensor) -> None: + """Load weight using the weight_loader attached to param attribute.""" + weight_loader = getattr(param, "weight_loader", None) + if weight_loader is None: + # Fallback: direct copy + param.data.copy_(loaded_weight) + else: + # Call the loader attached to param + weight_loader(param, loaded_weight) + + +# ============================================================================= +# Real distributed tests with world_size=8 on CUDA +# ============================================================================= +def run_worker_tp8_cuda( + rank: int, + local_rank: int, + world_size: int, + local_world_size: int, + test_func: Callable, + return_dict: dict, +): + """Worker function for multi-process testing with TP=8 on CUDA. + + Args: + rank: Global rank across all nodes + local_rank: Local rank within this node (used for GPU binding) + world_size: Total number of processes across all nodes + local_world_size: Number of processes on this node + test_func: Test function to run + return_dict: Shared dict for returning results + """ + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + + # Set device using local_rank (binds to GPU 0,1,2,3 on this node) + torch.cuda.set_device(local_rank) + + torch.distributed.init_process_group( + backend="nccl", + rank=rank, + world_size=world_size, + ) + + initialize_model_parallel(tensor_model_parallel_size=8) + + try: + result = test_func(rank, local_rank, world_size) + return_dict[rank] = result + except Exception as e: + import traceback + + return_dict[rank] = f"ERROR: {e}\n{traceback.format_exc()}" + finally: + torch.distributed.destroy_process_group() + + +def embedding_forward_tp8_worker_cuda(rank: int, local_rank: int, world_size: int): + """Test forward pass with real TP=8 on CUDA. + + Args: + rank: Global rank + local_rank: Local rank within this node (for logging/debugging) + world_size: Total world size + """ + config = get_global_config() + + assert config.runtime.tp_size == 8, f"Rank {rank}: tp_size should be 8" + assert config.runtime.tp_rank == rank, f"Rank {rank}: tp_rank mismatch" + + vocab_size = 1024 + embed_dim = 64 + # .cuda() uses the device set by torch.cuda.set_device(local_rank) + layer = VocabParallelEmbedding(vocab_size, embed_dim).cuda() + + # Verify the layer is on the correct GPU + assert layer.weight.device.index == local_rank, ( + f"Rank {rank}: weight should be on GPU {local_rank}, got {layer.weight.device}" + ) + + expected_shard_size = vocab_size // 8 + assert layer.num_embeddings_per_partition == expected_shard_size + assert layer.weight.shape == (expected_shard_size, embed_dim) + + # Each rank initializes its own shard with known pattern + with torch.no_grad(): + layer.weight.fill_(float(rank + 1)) # Rank 0: 1.0, Rank 1: 2.0, ... 
+ + # Create input on the correct GPU + input_ids = torch.tensor([[0, 128, 256, 384], [512, 640, 768, 896]], device="cuda") + + output = layer(input_ids) + assert output.shape == (2, 4, embed_dim) + + # Verify output is on correct GPU + assert output.device.index == local_rank, ( + f"Rank {rank}: output should be on GPU {local_rank}, got {output.device}" + ) + + if rank == 0: + # Each token is owned by exactly one TP rank. Since each rank fills its + # local shard with (rank + 1), post-all-reduce output must match below. + expected_token_values = torch.tensor( + [[1, 2, 3, 4], [5, 6, 7, 8]], + device=output.device, + dtype=output.dtype, + ) + expected_output = expected_token_values.unsqueeze(-1).expand(-1, -1, embed_dim) + + if torch.equal(output, expected_output): + return "PASSED" + return "FAILED: embedding output does not match expected TP aggregation" + + return "OK" + + +def weight_loading_tp8_worker_cuda(rank: int, local_rank: int, world_size: int): + """Test weight loading with real TP=8 on CUDA. + + Args: + rank: Global rank + local_rank: Local rank within this node (for GPU binding verification) + world_size: Total world size + """ + vocab_size = 1024 + embed_dim = 64 + layer = VocabParallelEmbedding(vocab_size, embed_dim).cuda() + + # Verify the layer is on the correct GPU + assert layer.weight.device.index == local_rank, ( + f"Rank {rank}: weight should be on GPU {local_rank}, got {layer.weight.device}" + ) + + full_weight = torch.randn(vocab_size, embed_dim) + load_weight(layer.weight, full_weight.cuda()) + + shard_size = vocab_size // 8 + start_idx = rank * shard_size + end_idx = start_idx + shard_size + expected_shard = full_weight[start_idx:end_idx] + + if not torch.allclose(layer.weight.cpu(), expected_shard): + return f"FAILED: shard mismatch at rank {rank}" + + if rank == 0: + gathered_shards = [layer.weight.cpu().clone()] + for other_rank in range(1, 8): + other_shard = full_weight[ + other_rank * shard_size : (other_rank + 1) * shard_size + ] + gathered_shards.append(other_shard) + + reconstructed = torch.cat(gathered_shards, dim=0) + if torch.allclose(reconstructed, full_weight): + return "PASSED" + else: + return "FAILED: reconstruction mismatch" + + return "OK" + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +@pytest.mark.skipif(torch.cuda.device_count() < 8, reason="Requires at least 8 GPUs") +class TestVocabParallelEmbeddingRealTP8: + """Real distributed tests with world_size=8 and TP=8 on CUDA.""" + + def test_forward_pass_tp8_real(self): + """Test forward pass with real TP=8 using 8 processes on CUDA.""" + world_size = 8 + local_world_size = 8 # Single node with 8 GPUs + + mp.set_start_method("spawn", force=True) + + manager = mp.Manager() + return_dict = manager.dict() + + processes = [] + for rank in range(world_size): + # In single-node setup, local_rank == rank + local_rank = rank + p = mp.Process( + target=run_worker_tp8_cuda, + args=( + rank, + local_rank, + world_size, + local_world_size, + embedding_forward_tp8_worker_cuda, + return_dict, + ), + ) + p.start() + processes.append(p) + + for p in processes: + p.join(timeout=120) + if p.is_alive(): + p.terminate() + p.join() + + for rank in range(world_size): + result = return_dict.get(rank, "TIMEOUT") + if rank == 0: + assert result == "PASSED", f"Rank {rank} failed: {result}" + else: + assert "ERROR" not in str(result), f"Rank {rank} error: {result}" + + def test_weight_loading_tp8_real(self): + """Test weight loading with real TP=8 using 8 processes on CUDA.""" + 
world_size = 8 + local_world_size = 8 # Single node with 8 GPUs + + mp.set_start_method("spawn", force=True) + + manager = mp.Manager() + return_dict = manager.dict() + + processes = [] + for rank in range(world_size): + # In single-node setup, local_rank == rank + local_rank = rank + p = mp.Process( + target=run_worker_tp8_cuda, + args=( + rank, + local_rank, + world_size, + local_world_size, + weight_loading_tp8_worker_cuda, + return_dict, + ), + ) + p.start() + processes.append(p) + + for p in processes: + p.join(timeout=120) + if p.is_alive(): + p.terminate() + p.join() + + for rank in range(world_size): + result = return_dict.get(rank, "TIMEOUT") + if rank == 0: + assert result == "PASSED", f"Rank {rank} failed: {result}" + else: + assert "ERROR" not in str(result), f"Rank {rank} error: {result}" + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +class TestVocabParallelEmbeddingCUDA: + """Tests for non-parallel TP=1 mode on CUDA.""" + + @pytest.fixture(autouse=True) + def setup_config(self): + config = get_global_config() + config.runtime.tp_size = 1 + config.runtime.tp_rank = 0 + yield + config.runtime.tp_size = 1 + config.runtime.tp_rank = 0 + + def test_cuda_forward(self): + layer = VocabParallelEmbedding(1000, 512).cuda() + input_ids = torch.randint(0, 1000, (4, 32), device="cuda") + + output = layer(input_ids) + + assert output.device.type == "cuda" + assert output.shape == (4, 32, 512) + + def test_cuda_weight_loader(self): + layer = VocabParallelEmbedding(100, 64).cuda() + + cpu_weight = torch.randn(100, 64) + load_weight(layer.weight, cpu_weight.cuda()) + + assert torch.allclose(layer.weight.cpu(), cpu_weight) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/pyproject.toml b/pyproject.toml index 89d69947d..160341bad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies=[ cuda = ["tilelang", "flashinfer-python"] [project.scripts] +pymllm = "pymllm.__main__:main" mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" mllm-service = "pymllm.mobile.service.tools:cli_app" From 027b0df67cf8e57ca62c61c6bdd43dc000a4a8e3 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Thu, 19 Feb 2026 08:53:02 +0000 Subject: [PATCH 25/42] feat: enhance layer implementations and add new components - Updated `.codespellrc` to include 'flashinfer' in the ignore words list. - Introduced new files for `launch_server`, `prepare`, and various layer implementations including `LayerNorm`, `RMSNorm`, and `MLP`. - Added `ColumnParallelLinear` and `RowParallelLinear` classes for efficient linear operations in tensor parallelism. - Implemented rotary embedding functions in `rope.py` for enhanced model performance. - Created caching mechanisms in `param_disk_cache.py` and `radix_cache.py` for improved memory management. - Refactored `GroupCoordinator` to enhance broadcasting functionality in distributed settings. 
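
A rough usage sketch of the new layers (illustrative only; assumes
`initialize_model_parallel()` has been called with the desired TP size and
that real weights are loaded separately via the `weight_loader` hooks):

    import torch
    from pymllm.layers import ParallelMLP, RMSNorm

    norm = RMSNorm(hidden_size=1024).cuda().to(torch.bfloat16)
    mlp = ParallelMLP(hidden_size=1024, intermediate_size=4096)
    mlp = mlp.cuda().to(torch.bfloat16)

    x = torch.randn(8, 1024, device="cuda", dtype=torch.bfloat16)
    y = mlp(norm(x))  # one all-reduce inside down_proj; output replicated on all ranks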
--- .codespellrc | 2 +- pymllm/executor/__init__.py | 0 pymllm/executor/cuda_graph_runner.py | 0 pymllm/launch_server.py | 0 pymllm/layers/__init__.py | 24 ++ pymllm/layers/base.py | 3 +- pymllm/layers/embedding.py | 10 +- pymllm/layers/layer_norm.py | 43 ++++ pymllm/layers/linear.py | 263 +++++++++++++++++++++ pymllm/layers/mlp.py | 199 ++++++++++++++++ pymllm/layers/rms_norm.py | 64 ++++++ pymllm/layers/rope.py | 276 +++++++++++++++++++++++ pymllm/mem_cache/param_disk_cache.py | 0 pymllm/mem_cache/radix_cache.py | 0 pymllm/orchestrator/group_coordinator.py | 12 +- pymllm/prepare.py | 0 16 files changed, 890 insertions(+), 6 deletions(-) create mode 100644 pymllm/executor/__init__.py create mode 100644 pymllm/executor/cuda_graph_runner.py create mode 100644 pymllm/launch_server.py create mode 100644 pymllm/layers/layer_norm.py create mode 100644 pymllm/layers/linear.py create mode 100644 pymllm/layers/rope.py create mode 100644 pymllm/mem_cache/param_disk_cache.py create mode 100644 pymllm/mem_cache/radix_cache.py create mode 100644 pymllm/prepare.py diff --git a/.codespellrc b/.codespellrc index 9ddb9d851..bbf02bd17 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,3 +1,3 @@ [codespell] -ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS, bfloat, constexpr, cuda, dlpack, expt, forceinline, ifndef, linalg, LPBQ, mllm, pymllm, Quantizaton, Qwen, ROCM, silu, torchao +ignore-words-list = ans, als, hel, boostrap, childs, te, vas, hsa, ment, cann, thi, makro, wil, rouge, PRIS, bfloat, constexpr, cuda, dlpack, expt, forceinline, ifndef, linalg, LPBQ, mllm, pymllm, Quantizaton, Qwen, ROCM, silu, torchao, flashinfer skip = *.json,*.jsonl,*.patch,*.txt diff --git a/pymllm/executor/__init__.py b/pymllm/executor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/executor/cuda_graph_runner.py b/pymllm/executor/cuda_graph_runner.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/launch_server.py b/pymllm/launch_server.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index 6f70a4d1d..fd9a070ea 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -2,10 +2,34 @@ from pymllm.layers.base import MllmBaseLayer from pymllm.layers.embedding import VocabParallelEmbedding +from pymllm.layers.layer_norm import LayerNorm +from pymllm.layers.linear import ColumnParallelLinear, Linear, RowParallelLinear +from pymllm.layers.mlp import MLP, ParallelMLP +from pymllm.layers.rms_norm import GemmaRMSNorm, RMSNorm +from pymllm.layers.rope import ( + apply_llama31_rope, + apply_llama31_rope_pos_ids, + apply_rope, + apply_rope_pos_ids, + apply_rope_with_cos_sin_cache, +) from pymllm.layers.utils import set_weight_attrs __all__ = [ "MllmBaseLayer", "set_weight_attrs", "VocabParallelEmbedding", + "ColumnParallelLinear", + "Linear", + "RowParallelLinear", + "MLP", + "ParallelMLP", + "LayerNorm", + "RMSNorm", + "GemmaRMSNorm", + "apply_rope", + "apply_llama31_rope", + "apply_rope_pos_ids", + "apply_llama31_rope_pos_ids", + "apply_rope_with_cos_sin_cache", ] diff --git a/pymllm/layers/base.py b/pymllm/layers/base.py index 5dc519f41..3044e2064 100644 --- a/pymllm/layers/base.py +++ b/pymllm/layers/base.py @@ -3,12 +3,13 @@ from torch.nn import Parameter from pymllm.layers.utils import set_weight_attrs from pymllm.quantization.quant_recipe import QuantRecipe +from typing import Optional class MllmBaseLayer(nn.Module): def __init__(self): 
super().__init__() - self.quant_recipe: QuantRecipe = None + self.quant_recipe: Optional[QuantRecipe] = None def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): """Load weights into a parameter. diff --git a/pymllm/layers/embedding.py b/pymllm/layers/embedding.py index 0442caa41..ec99c5b2d 100644 --- a/pymllm/layers/embedding.py +++ b/pymllm/layers/embedding.py @@ -120,6 +120,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: Returns: Embedded representation (all-reduced across TP group if needed). """ + local_padding_idx = self.padding_idx if self.tp_size > 1: # Create mask for valid vocab range vocab_mask = (x >= self.vocab_start_index) & (x < self.vocab_end_index) @@ -130,6 +131,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: x - self.vocab_start_index, torch.zeros_like(x), # Invalid indices become 0 (will be masked) ) + # F.embedding expects indices in local weight-table space. + # Only pass padding_idx on the owning rank, remapped to local offset. + if self.padding_idx is not None: + if self.vocab_start_index <= self.padding_idx < self.vocab_end_index: + local_padding_idx = self.padding_idx - self.vocab_start_index + else: + local_padding_idx = None else: masked_input = x vocab_mask = None @@ -138,7 +146,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: output = F.embedding( masked_input.long(), self.weight, - padding_idx=self.padding_idx if self.padding_idx is not None else None, + padding_idx=local_padding_idx, ) # Mask invalid positions (for TP) diff --git a/pymllm/layers/layer_norm.py b/pymllm/layers/layer_norm.py new file mode 100644 index 000000000..54d94c19e --- /dev/null +++ b/pymllm/layers/layer_norm.py @@ -0,0 +1,43 @@ +from __future__ import annotations + +import torch +import flashinfer +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs + + +class LayerNorm(MllmBaseLayer): + """LayerNorm layer implemented with FlashInfer kernel.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + + # flashinfer.norm.layernorm expects gamma/beta in fp32. 
+ self.weight = Parameter(torch.ones(hidden_size, dtype=torch.float32)) + self.bias = Parameter(torch.zeros(hidden_size, dtype=torch.float32)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + set_weight_attrs(self.bias, {"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + if x.dtype != torch.bfloat16: + raise TypeError( + "flashinfer.norm.layernorm requires bfloat16 input, " + f"but got {x.dtype}" + ) + + if x.dim() == 2: + return flashinfer.norm.layernorm(x, self.weight, self.bias, self.eps) + + original_shape = x.shape + x_2d = x.reshape(-1, self.hidden_size) + out = flashinfer.norm.layernorm(x_2d, self.weight, self.bias, self.eps) + return out.reshape(original_shape) diff --git a/pymllm/layers/linear.py b/pymllm/layers/linear.py new file mode 100644 index 000000000..dc583e931 --- /dev/null +++ b/pymllm/layers/linear.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import torch +import torch.nn.functional as F +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs +from pymllm.orchestrator import ( + divide, + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, + tensor_model_parallel_all_gather, + tensor_model_parallel_all_reduce, +) + + +class ColumnParallelLinear(MllmBaseLayer): + """Linear layer with column parallelism (output-dimension sharding). + + The weight matrix is split along the output dimension across TP ranks. + Each rank holds ``out_features / tp_size`` rows of the weight. + + Args: + in_features: Size of each input sample. + out_features: Size of each output sample (before sharding). + bias: If ``True``, adds a learnable bias. + gather_output: If ``True``, all-gather the output across TP ranks + so every rank gets the full ``out_features``. Set to ``False`` + when the next layer is a :class:`RowParallelLinear` that expects + a split input. + """ + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + gather_output: bool = True, + ): + super().__init__() + + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + + self.in_features = in_features + self.out_features = out_features + self.gather_output = gather_output + + if out_features % self.tp_size != 0: + raise ValueError( + f"out_features ({out_features}) must be divisible by " + f"tp_size ({self.tp_size})" + ) + self.out_features_per_partition = divide(out_features, self.tp_size) + + self.output_start_index = self.tp_rank * self.out_features_per_partition + self.output_end_index = self.output_start_index + self.out_features_per_partition + + self.weight = Parameter( + torch.empty(self.out_features_per_partition, in_features) + ) + set_weight_attrs( + self.weight, + { + "output_dim": 0, + "input_dim": 1, + "weight_loader": self.weight_loader, + }, + ) + + if bias: + self.bias_flag = True + self.bias = Parameter(torch.empty(self.out_features_per_partition)) + set_weight_attrs( + self.bias, + { + "output_dim": 0, + "weight_loader": self.weight_loader, + }, + ) + else: + self.bias_flag = False + self.register_parameter("bias", None) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load sharded weights into the parameter. + + Args: + param: The parameter to load weights into. 
+ loaded_weight: The weight tensor loaded from checkpoint (full size). + """ + output_dim = getattr(param, "output_dim", None) + + if output_dim is None or self.tp_size == 1: + assert param.data.shape == loaded_weight.shape, ( + f"Shape mismatch: param {param.data.shape} vs " + f"loaded {loaded_weight.shape}" + ) + param.data.copy_(loaded_weight) + else: + shard_weight = loaded_weight.narrow( + output_dim, + self.output_start_index, + self.out_features_per_partition, + ) + assert param.data.shape == shard_weight.shape, ( + f"Shard shape mismatch: param {param.data.shape} vs " + f"shard {shard_weight.shape}" + ) + param.data.copy_(shard_weight) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = F.linear(x, self.weight, self.bias) + + if self.gather_output and self.tp_size > 1: + output = tensor_model_parallel_all_gather(output, dim=-1) + + return output + + +class RowParallelLinear(MllmBaseLayer): + """Linear layer with row parallelism (input-dimension sharding). + + The weight matrix is split along the input dimension across TP ranks. + Each rank holds all ``out_features`` rows but only + ``in_features / tp_size`` columns. + + Typically placed after a :class:`ColumnParallelLinear` whose + ``gather_output=False``, so the input is already split. + + Args: + in_features: Size of each input sample (before sharding). + out_features: Size of each output sample. + bias: If ``True``, adds a learnable bias (applied after all-reduce). + reduce_output: If ``True``, all-reduce the output across TP ranks. + """ + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + reduce_output: bool = True, + ): + super().__init__() + + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + + self.in_features = in_features + self.out_features = out_features + self.reduce_output = reduce_output + + if in_features % self.tp_size != 0: + raise ValueError( + f"in_features ({in_features}) must be divisible by " + f"tp_size ({self.tp_size})" + ) + self.in_features_per_partition = divide(in_features, self.tp_size) + + self.input_start_index = self.tp_rank * self.in_features_per_partition + self.input_end_index = self.input_start_index + self.in_features_per_partition + + self.weight = Parameter( + torch.empty(out_features, self.in_features_per_partition) + ) + set_weight_attrs( + self.weight, + { + "output_dim": 0, + "input_dim": 1, + "weight_loader": self.weight_loader, + }, + ) + + if bias: + self.bias_flag = True + self.bias = Parameter(torch.empty(out_features)) + set_weight_attrs(self.bias, {"weight_loader": self.weight_loader}) + else: + self.bias_flag = False + self.register_parameter("bias", None) + + def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): + """Load sharded weights into the parameter. + + Args: + param: The parameter to load weights into. + loaded_weight: The weight tensor loaded from checkpoint (full size). 
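+
+        Example (sketch; ``layer`` is a hypothetical ``RowParallelLinear`` with
+        in_features=1024, out_features=256 under tp_size=4, so each rank keeps
+        a 256-column slice):
+            >>> full = torch.randn(256, 1024)  # full-size checkpoint weight
+            >>> layer.weight_loader(layer.weight, full)
+            >>> layer.weight.shape
+            torch.Size([256, 256])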
+ """ + input_dim = getattr(param, "input_dim", None) + + if input_dim is None or self.tp_size == 1: + assert param.data.shape == loaded_weight.shape, ( + f"Shape mismatch: param {param.data.shape} vs " + f"loaded {loaded_weight.shape}" + ) + param.data.copy_(loaded_weight) + else: + shard_weight = loaded_weight.narrow( + input_dim, + self.input_start_index, + self.in_features_per_partition, + ) + assert param.data.shape == shard_weight.shape, ( + f"Shard shape mismatch: param {param.data.shape} vs " + f"shard {shard_weight.shape}" + ) + param.data.copy_(shard_weight) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + output = F.linear(x, self.weight) + + if self.reduce_output and self.tp_size > 1: + output = tensor_model_parallel_all_reduce(output) + + if self.bias is not None: + output = output + self.bias + + return output + + +class Linear(MllmBaseLayer): + """Linear layer with simple quant dispatch.""" + + def __init__( + self, + in_features: int, + out_features: int, + bias: bool = True, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + + self.weight = Parameter(torch.empty(out_features, in_features)) + set_weight_attrs( + self.weight, + { + "output_dim": 0, + "input_dim": 1, + "weight_loader": self.weight_loader, + }, + ) + + if bias: + self.bias = Parameter(torch.empty(out_features)) + set_weight_attrs(self.bias, {"weight_loader": self.weight_loader}) + else: + self.register_parameter("bias", None) + + def _forward_torch_linear(self, x: torch.Tensor) -> torch.Tensor: + return F.linear(x, self.weight, self.bias) + + def _forward_quant_linear(self, x: torch.Tensor) -> torch.Tensor: + # TODO(wch): Implement quantized linear path. + raise NotImplementedError("quant_linear is not implemented yet.") + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.quant_recipe is None: + return self._forward_torch_linear(x) + return self._forward_quant_linear(x) diff --git a/pymllm/layers/mlp.py b/pymllm/layers/mlp.py index e69de29bb..1a40db92e 100644 --- a/pymllm/layers/mlp.py +++ b/pymllm/layers/mlp.py @@ -0,0 +1,199 @@ +from __future__ import annotations + +import logging +from typing import Callable, Literal, Optional + +import flashinfer +import torch + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.linear import ColumnParallelLinear, Linear, RowParallelLinear + +logger = logging.getLogger(__name__) + +MLPActivation = Literal["silu", "gelu", "gelu_tanh"] + +_ACTIVATION_MAP: dict[MLPActivation, Callable[..., torch.Tensor]] = { + "silu": flashinfer.activation.silu_and_mul, + "gelu": flashinfer.activation.gelu_and_mul, + "gelu_tanh": flashinfer.activation.gelu_tanh_and_mul, +} + + +def _validate_mlp_args( + hidden_size: int, intermediate_size: int, activation: str +) -> None: + if hidden_size <= 0: + raise ValueError(f"hidden_size must be > 0, but got {hidden_size}") + if intermediate_size <= 0: + raise ValueError( + f"intermediate_size must be > 0, but got {intermediate_size}" + ) + if activation not in _ACTIVATION_MAP: + raise ValueError( + f"Unsupported activation '{activation}'. 
" + f"Expected one of: {list(_ACTIVATION_MAP)}" + ) + + +def _run_gated_activation( + gate_up: torch.Tensor, + intermediate_size: int, + activation: MLPActivation, + enable_pdl: Optional[bool], +) -> torch.Tensor: + if gate_up.shape[-1] != 2 * intermediate_size: + raise ValueError( + "Expected last dim of gate_up tensor to be " + f"{2 * intermediate_size}, but got {gate_up.shape[-1]}" + ) + return _ACTIVATION_MAP[activation](gate_up, enable_pdl=enable_pdl) + + +class MLP(MllmBaseLayer): + """Feed-forward MLP block with FlashInfer fused gated activations. + + Non-parallel version (TP=1). Uses :class:`Linear` for all projections. + + Supported activations: ``silu``, ``gelu``, ``gelu_tanh``. + """ + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + activation: MLPActivation = "silu", + use_fused_gate_up_proj: bool = True, + use_bias_gate_up: bool = False, + use_bias_down: bool = False, + enable_pdl: Optional[bool] = None, + ): + super().__init__() + _validate_mlp_args(hidden_size, intermediate_size, activation) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.activation = activation + self.use_fused_gate_up_proj = use_fused_gate_up_proj + self.enable_pdl = enable_pdl + + if not use_fused_gate_up_proj: + logger.warning( + "MLP with use_fused_gate_up_proj=False uses a lower-efficiency path. " + "Use use_fused_gate_up_proj=True for better performance.", + ) + + if use_fused_gate_up_proj: + self.gate_up_proj = Linear( + hidden_size, 2 * intermediate_size, bias=use_bias_gate_up, + ) + self.gate_proj = None + self.up_proj = None + else: + self.gate_up_proj = None + self.gate_proj = Linear( + hidden_size, intermediate_size, bias=use_bias_gate_up, + ) + self.up_proj = Linear( + hidden_size, intermediate_size, bias=use_bias_gate_up, + ) + + self.down_proj = Linear( + intermediate_size, hidden_size, bias=use_bias_down, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + if self.use_fused_gate_up_proj: + assert self.gate_up_proj is not None + gate_up = self.gate_up_proj(x) + else: + assert self.gate_proj is not None and self.up_proj is not None + gate_up = torch.cat([self.gate_proj(x), self.up_proj(x)], dim=-1) + + hidden = _run_gated_activation( + gate_up, self.intermediate_size, self.activation, self.enable_pdl, + ) + return self.down_proj(hidden) + + +class ParallelMLP(MllmBaseLayer): + """Tensor-parallel MLP with column-sharded intermediate dimension. + + Projection layout (Megatron-style): + + - ``gate_proj``: :class:`ColumnParallelLinear` + ``(hidden_size → intermediate_size, gather_output=False)`` + - ``up_proj``: :class:`ColumnParallelLinear` + ``(hidden_size → intermediate_size, gather_output=False)`` + - ``down_proj``: :class:`RowParallelLinear` + ``(intermediate_size → hidden_size, reduce_output=True)`` + + Gate and up projections are kept separate so that each TP rank holds a + correctly paired ``[gate_shard, up_shard]`` for the gated activation. + + Cost: **1 all-reduce** (inside ``down_proj``). + + Input shape : ``(*, hidden_size)`` — full / replicated. + Output shape: ``(*, hidden_size)`` — full / replicated. + + Args: + hidden_size: Model hidden dimension. + intermediate_size: Intermediate (expanded) dimension **before** TP + sharding. + activation: Gated activation type. + use_bias_gate_up: Add bias to the gate/up projections. 
+ use_bias_down: Add bias to the down projection. + enable_pdl: FlashInfer PDL flag. + """ + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + activation: MLPActivation = "silu", + use_bias_gate_up: bool = False, + use_bias_down: bool = False, + enable_pdl: Optional[bool] = None, + ): + super().__init__() + _validate_mlp_args(hidden_size, intermediate_size, activation) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.activation = activation + self.enable_pdl = enable_pdl + + self.gate_proj = ColumnParallelLinear( + hidden_size, intermediate_size, + bias=use_bias_gate_up, gather_output=False, + ) + self.up_proj = ColumnParallelLinear( + hidden_size, intermediate_size, + bias=use_bias_gate_up, gather_output=False, + ) + + self.down_proj = RowParallelLinear( + intermediate_size, hidden_size, + bias=use_bias_down, reduce_output=True, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + gate_up = torch.cat([self.gate_proj(x), self.up_proj(x)], dim=-1) + + shard_inter = self.down_proj.in_features_per_partition + hidden = _run_gated_activation( + gate_up, shard_inter, self.activation, self.enable_pdl, + ) + return self.down_proj(hidden) diff --git a/pymllm/layers/rms_norm.py b/pymllm/layers/rms_norm.py index e69de29bb..b55a0ea6c 100644 --- a/pymllm/layers/rms_norm.py +++ b/pymllm/layers/rms_norm.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import torch +import flashinfer +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs + + +class RMSNorm(MllmBaseLayer): + """RMSNorm layer implemented with FlashInfer kernel.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + + self.weight = Parameter(torch.empty(hidden_size)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + # FlashInfer rmsnorm accepts 2D/3D input; flatten higher-rank tensors to 2D. + if x.dim() in (2, 3): + return flashinfer.norm.rmsnorm(x, self.weight, self.eps) + + original_shape = x.shape + x_2d = x.reshape(-1, self.hidden_size) + out = flashinfer.norm.rmsnorm(x_2d, self.weight, self.eps) + return out.reshape(original_shape) + + +class GemmaRMSNorm(MllmBaseLayer): + """Gemma-style RMSNorm layer implemented with FlashInfer kernel.""" + + def __init__(self, hidden_size: int, eps: float = 1e-6): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + + self.weight = Parameter(torch.empty(hidden_size)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if x.shape[-1] != self.hidden_size: + raise ValueError( + f"Expected last dim == hidden_size ({self.hidden_size}), " + f"but got input shape {tuple(x.shape)}" + ) + + # gemma_rmsnorm is defined on 2D input; flatten other ranks to 2D. 
+ if x.dim() == 2: + return flashinfer.norm.gemma_rmsnorm(x, self.weight, self.eps) + + original_shape = x.shape + x_2d = x.reshape(-1, self.hidden_size) + out = flashinfer.norm.gemma_rmsnorm(x_2d, self.weight, self.eps) + return out.reshape(original_shape) diff --git a/pymllm/layers/rope.py b/pymllm/layers/rope.py new file mode 100644 index 000000000..045774e93 --- /dev/null +++ b/pymllm/layers/rope.py @@ -0,0 +1,276 @@ +from __future__ import annotations + +from typing import Optional, Tuple + +import torch +import flashinfer + + +def apply_rope( + q: torch.Tensor, + k: torch.Tensor, + indptr: torch.Tensor, + offsets: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 1.0, + rope_theta: float = 1e4, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply rotary embedding to a batch of queries/keys (stored as RaggedTensor). + + cos/sin values are computed on the fly inside the kernel. Position offsets + are provided per-segment via ``indptr`` and ``offsets``. + + Args: + q: Query ragged tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key ragged tensor, shape ``(nnz, num_k_heads, head_dim)``. + indptr: Indptr tensor, shape ``(batch_size + 1,)``. The i-th segment + spans ``q[indptr[i]:indptr[i+1]]``. + offsets: Relative position offsets per segment, shape ``(batch_size,)``. + inplace: If ``True``, apply RoPE in-place and return ``None``. + If ``False``, return new ``(q_rope, k_rope)`` tensors. + rotary_dim: Number of dimensions to apply RoPE to. ``None`` means + the entire ``head_dim``. + interleave: If ``True``, rotate even/odd dims (``[..., ::2]`` / + ``[..., 1::2]``). If ``False``, rotate first/second half dims. + rope_scale: Scaling factor for position indices. + rope_theta: Base frequency theta. + + Returns: + ``None`` when *inplace* is ``True``, otherwise a tuple + ``(q_rope, k_rope)`` of rotated tensors with the same shapes as + the inputs. + """ + if inplace: + flashinfer.rope.apply_rope_inplace( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + return None + + return flashinfer.rope.apply_rope( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + + +def apply_llama31_rope( + q: torch.Tensor, + k: torch.Tensor, + indptr: torch.Tensor, + offsets: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 8.0, + rope_theta: float = 5e5, + low_freq_factor: float = 1.0, + high_freq_factor: float = 4.0, + old_context_len: int = 8192, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply Llama 3.1 style rotary embedding to a batch of queries/keys. + + This variant adjusts frequencies with ``low_freq_factor``, + ``high_freq_factor``, and ``old_context_len`` following the Llama 3.1 + RoPE recipe. cos/sin values are computed on the fly. + + Args: + q: Query ragged tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key ragged tensor, shape ``(nnz, num_k_heads, head_dim)``. + indptr: Indptr tensor, shape ``(batch_size + 1,)``. + offsets: Relative position offsets per segment, shape ``(batch_size,)``. + inplace: If ``True``, apply in-place and return ``None``. + rotary_dim: Number of dimensions to apply RoPE to. ``None`` means + the entire ``head_dim``. + interleave: If ``True``, rotate even/odd dims; otherwise first/second + half dims. 
+ rope_scale: Scaling factor for position indices (default ``8``). + rope_theta: Base frequency theta (default ``5e5``). + low_freq_factor: Low frequency factor for Llama 3.1 RoPE. + high_freq_factor: High frequency factor for Llama 3.1 RoPE. + old_context_len: Original context length for Llama 3.1 RoPE. + + Returns: + ``None`` when *inplace* is ``True``, otherwise ``(q_rope, k_rope)``. + """ + if inplace: + flashinfer.rope.apply_llama31_rope_inplace( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + return None + + return flashinfer.rope.apply_llama31_rope( + q, k, indptr, offsets, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + + +def apply_rope_pos_ids( + q: torch.Tensor, + k: torch.Tensor, + pos_ids: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 1.0, + rope_theta: float = 1e4, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply rotary embedding using explicit per-token position IDs. + + Unlike :func:`apply_rope` which derives positions from ``indptr`` / + ``offsets``, this function takes a flat ``pos_ids`` tensor that supplies + an explicit position for every token. + + Args: + q: Query tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key tensor, shape ``(nnz, num_k_heads, head_dim)``. + pos_ids: Position indices, shape ``(nnz,)``. + inplace: If ``True``, apply in-place and return ``None``. + rotary_dim: Number of dimensions to apply RoPE to. + interleave: Interleaved layout flag. + rope_scale: Scaling factor for position indices. + rope_theta: Base frequency theta. + + Returns: + ``None`` when *inplace* is ``True``, otherwise ``(q_rope, k_rope)``. + """ + if inplace: + flashinfer.rope.apply_rope_pos_ids_inplace( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + return None + + return flashinfer.rope.apply_rope_pos_ids( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + ) + + +def apply_llama31_rope_pos_ids( + q: torch.Tensor, + k: torch.Tensor, + pos_ids: torch.Tensor, + inplace: bool = False, + rotary_dim: Optional[int] = None, + interleave: bool = False, + rope_scale: float = 8.0, + rope_theta: float = 5e5, + low_freq_factor: float = 1.0, + high_freq_factor: float = 4.0, + old_context_len: int = 8192, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply Llama 3.1 style RoPE using explicit per-token position IDs. + + Combines Llama 3.1 frequency adjustments with explicit ``pos_ids``. + + Args: + q: Query tensor, shape ``(nnz, num_q_heads, head_dim)``. + k: Key tensor, shape ``(nnz, num_k_heads, head_dim)``. + pos_ids: Position indices, shape ``(nnz,)``. + inplace: If ``True``, apply in-place and return ``None``. + rotary_dim: Number of dimensions to apply RoPE to. + interleave: Interleaved layout flag. + rope_scale: Scaling factor (default ``8``). + rope_theta: Base frequency theta (default ``5e5``). + low_freq_factor: Low frequency factor for Llama 3.1 RoPE. + high_freq_factor: High frequency factor for Llama 3.1 RoPE. + old_context_len: Original context length for Llama 3.1 RoPE. 
+ + Returns: + ``None`` when *inplace* is ``True``, otherwise ``(q_rope, k_rope)``. + """ + if inplace: + flashinfer.rope.apply_llama31_rope_pos_ids_inplace( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + return None + + return flashinfer.rope.apply_llama31_rope_pos_ids( + q, k, pos_ids, + rotary_dim=rotary_dim, + interleave=interleave, + rope_scale=rope_scale, + rope_theta=rope_theta, + low_freq_factor=low_freq_factor, + high_freq_factor=high_freq_factor, + old_context_len=old_context_len, + ) + + +def apply_rope_with_cos_sin_cache( + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + head_size: int, + cos_sin_cache: torch.Tensor, + inplace: bool = False, + is_neox: bool = True, +) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: + """Apply rotary embedding with precomputed cos/sin cache. + + Compatible with SGL/vLLM implementations. Note that ``query`` and ``key`` + use a **flattened** head layout ``(nnz, num_heads * head_size)`` instead + of the 3-D layout used by the other ``apply_rope*`` functions. + + Args: + positions: Position indices, shape ``(nnz,)``. + query: Query tensor, shape ``(nnz, num_q_heads * head_size)``. + key: Key tensor, shape ``(nnz, num_k_heads * head_size)``. + head_size: Size of each attention head. + cos_sin_cache: Precomputed cos/sin tensor, shape + ``(max_seq_len, rotary_dim)``. The first half of ``rotary_dim`` + stores cosine values, the second half stores sine values. + inplace: If ``True``, apply in-place and return ``None``. + is_neox: If ``True`` (default), use GPT-NeoX style (rotate + first/second half dims). If ``False``, use interleaved style + (rotate even/odd dims). + + Returns: + ``None`` when *inplace* is ``True``, otherwise + ``(query_out, key_out)`` with the same shapes as the inputs. + """ + if inplace: + flashinfer.rope.apply_rope_with_cos_sin_cache_inplace( + positions, query, key, head_size, cos_sin_cache, + is_neox=is_neox, + ) + return None + + return flashinfer.rope.apply_rope_with_cos_sin_cache( + positions, query, key, head_size, cos_sin_cache, + is_neox=is_neox, + ) diff --git a/pymllm/mem_cache/param_disk_cache.py b/pymllm/mem_cache/param_disk_cache.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/orchestrator/group_coordinator.py b/pymllm/orchestrator/group_coordinator.py index d06244734..2fec30784 100644 --- a/pymllm/orchestrator/group_coordinator.py +++ b/pymllm/orchestrator/group_coordinator.py @@ -1,6 +1,6 @@ """GroupCoordinator for distributed communication.""" -from typing import List, Optional +from typing import List import torch import torch.distributed as dist @@ -63,9 +63,15 @@ def all_gather(self, tensor: torch.Tensor, dim: int = 0) -> torch.Tensor: return torch.cat(tensor_list, dim=dim) def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor: - """Broadcast from source rank to all.""" + """Broadcast from source rank to all. + + Args: + tensor: Tensor to broadcast. + src: Source rank relative to this group (0 <= src < world_size). 
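+
+        Example (illustrative): for a group built from global ranks
+        [4, 5, 6, 7], ``src=1`` is mapped to global rank 5 via
+        ``self.ranks[src]`` before ``dist.broadcast`` is called.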
+ """ if self.device_group is not None: - dist.broadcast(tensor, src=src, group=self.device_group) + global_src = self.ranks[src] + dist.broadcast(tensor, src=global_src, group=self.device_group) return tensor diff --git a/pymllm/prepare.py b/pymllm/prepare.py new file mode 100644 index 000000000..e69de29bb From f6aee675407ec0c160370fdd338148cd4fddc9f1 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Thu, 19 Feb 2026 09:28:47 +0000 Subject: [PATCH 26/42] feat: add initial files for pymllm architecture and launch functionality - Introduced a new architecture diagram image `pymllm-arch.png` in the assets directory. - Updated `README.md` to include the architecture diagram. - Created initial `launch.py` files in both the engine and server directories for future functionality. - Added an empty `scheduler.py` file in the orchestrator directory to support scheduling features. --- assets/pymllm-arch.png | Bin 0 -> 388499 bytes pymllm/README.md | 3 +++ pymllm/engine/launch.py | 1 + .../scheduler.py} | 0 pymllm/{prepare.py => server/launch.py} | 0 5 files changed, 4 insertions(+) create mode 100644 assets/pymllm-arch.png create mode 100644 pymllm/engine/launch.py rename pymllm/{launch_server.py => orchestrator/scheduler.py} (100%) rename pymllm/{prepare.py => server/launch.py} (100%) diff --git a/assets/pymllm-arch.png b/assets/pymllm-arch.png new file mode 100644 index 0000000000000000000000000000000000000000..37c48b2a087b35d0693646566dc9870c50786b6f GIT binary patch literal 388499 zcma%EcOaGT7mw&mv?#P_Tx5h2vTuv7-9WaocXqCM+e_-&WR$&SW^2lpy;b(k-u#~D za=H9k|9rpK?Y{5(JZGQJInQ|?$;pUs+q!S-rcIl+p)Q`kylE48`KC=|Dr8&0H!rJS z^=#U-a}(|g{y5c7vvrSnz;1Insz*QO zE?f$|Lgjhnv8uqXdygL<@UCJ>pYGdz7Daou)8g`ri=laS(F>_li)AH&4feXjzC5x! z`UR6~XJgA}vxAJ{l#FV#r}Tt}%odL#T{aOBlaTE^wEUruXv(O|~4WiH=(f%N=Eg5_fbC@u!ZjMeT@VJ0r3TGy!)%gY9dJx z<=ej8uaqL#_v{s941Sd>7hz^89zMq}AUKd1+ufI;cHx4{CQRYhYtQiiu=3FVr}-v$ zed0E_twStl*R5@VJ=o+|D_i+dPcGQjkbm?G+%u%4^DF=GV?2dR-XhE0yLE-mZo_|r zmjE<}Ch2c)(8qMhF5Xn-s?hQ;kbET7f2AW=2cX6M8~^P@{F;~R_KIh@)fMEZDj!b` zpx^RU6~aV3yEd#9|6}~pb>)zw&6Gb>CT@~H^!hi7KL>Ot0}@!Cwwru)c^FfQNVh*; zPZW&yWrZY>_3y9ex(gX~Glaqy?)|;#NEhS= z<%QdjLU2tNT2f!o7a`mfSOnKJ@sSNnCF#Rk53W-1sPMhbeRw6>#f4`g{+o@=B^yr% zG+%5Lxl<`HxMJJzzg@fW8YfnWm@h|uF>rr$>G5lLB4Y|jvj5M7;up4Z{VCe#LUqlb z4-nsB3{L$ADl8ihDgXQw=JL`Y(LCeiQeK^?Ft3wbi97258`wNaH0R=&FQUm1If*&y zCS=jb;Sv{_N0M^?k5V=slb^y=-YNbSQO1ag*Eo~%1pNHU&ctIl!e9RO@aVJ1#rzxC zCn`@@?~`w}^&0I4RzRb<{SmyEn)#F9qndj-^F?1)LiSvA3bS7{1A ztdm521Xo<4r4yw|3b_;{ASQlmz77%nrPeawI_H9 znBIvt-mgb+%i`)GZj{B_gB3>oztz_KMml@MZpcf;lx0Cs-`_-wfq(_UO>_AJ4axs4 zo>kricPht6H1b=)qhqU}Ewt3~*IwPEW)U&#Z)F-UdOlbnrz|5%w=zj$|KgG0z3_fx z0Rc>FP}=>p)%8@PknHQ(u~7r_Hv#Xt|C?(>FG;v!e&agUt=*aM9?Sfp+XX!Jk?)?X zQUg{&{_x@2pExe9=cUwTU|bR3*~P$(eRn3x{7F()Xav;#{o+obNeoSA*a#OVr$}V} z2V<^z2*e-TI&`b_;X{1*iHv?PBdhWNL^TW$<9-~w*t(_BnG${dz0Iq51DN%!kbQ-D zB`GcwzyC3w!#%0sB6SK`)jTRf+JasG%0l8rluHxXye-Ce3^-PFAXtWD{H{h9NtSE`1k&M(E>PHp1N~2*WhC4bT*MjL3MV}G@CIQuYpdcoWM|q~7`K{GtRoh9K3))3 zzYlJMIolk~zPpnKNB@!F;Ai)xfc8E&x%qvS#`9|Vd4FUsSi1rcB;{Toam||~DIUKi z*y<{lB^hG?>E~LnK%lJo`b5Kj6no^-Rv;5?CLuZ`WR!j`ldH@8wF!d@$3;7?z(T)j zs3s;bN-p{DiNjZ&>o2ZHY60zaRg;3(VBYPnF#Iih*67p6$u(Osd<~BU9N#4o=ue}| zc6dz!Acsmxq<$|e!3>ad?FwI+aL?oVLfj~`%GTu%|8Jp!w*VF})j!nzySKvU%wSU^ z;G)k@=!S0YBFGt}jT!B;%1 zw6fl{^S@P*SpKlV5^z<3^=(0q83J$niGnha^>Bmh{}CsQM;0XR=Xea-c!SQ(l3k z_;|W1k8pd|P^QsLb%cmK$J=kWS$)G8Zd7a`b|t~4Sa*6VdhKwY{-6VX$7V7KOH~$)u)+)!BKfsu%o)d6(R!34_ zT%6dp#aeNuM_g1{LCv~mG~PBR24T<{?r&A}`t{ftN#W3?lJlfH|KQ5hv`Q^>CGv9YOUz^abjXc;}I569vz{wnV>xNxZQ>%i4 
[... base85-encoded binary data for assets/pymllm-arch.png (388499 bytes) omitted ...]
zF9P9~c2{8sV%^h%+Pz%2gYJ!%ONTxkP!_#{@M8=;iR~`nSL=qJTsCwfE(N@e=iVZ5 zfesvUv*2{)0;i3^e6E=j-k4pKPH|mOO%lsh?*bEJ+~I;y*3FTh%#6|B{~!<(#4!BL zfq_@tq+kOh{T!i(Q3p^sW57Ibxt6-aNK!iCW!0PG%tP2l#AO08p(r>>UqHGZAf}&r z2n#!bp~t15w*aM59WYT|u=@c2Ws7(vA(vC9B(dx?-kR)Z2(aglMCrXqZyuzc?2@*B z*RG9R0AkoglD8lQ+2!0%4K9fHY}X;j&TB|@SH3Ko>K?s?hB%>6(*!LrsTQ}|h7>Z9l9_X1&mC4&X0H>J{w=wlM?feLL=5T;F zw6`+nY*3C?FwcYf==w5%a7Oc(IS}3OgK{$di)21j`c-_UVLN^oLpR&?yxKX!8nzT(L2!6G zc5UZ+&F(=FS>d1M7+KRFNn;jRiC>M%tkrnvlq=bvew9|tRP**+dvpM%A1FER_flfx zz7o7CaA(5oZ)&={jG9sp=cD%Dc#xcccFB=j z{N!ptX}Rt93Z@ENDUq_V54*Xzk92HWDyAql3d%A_v`h4cFGLQp!@Pc)aO>(wV%! z_`p7vXq7*2edEilA$k(7!;qzEj27GETWFq0crqNkWu!n9SeKmXJ4G&$`B0OuGgvnZ zC6I_*Sz~Fhe?Le|QZ`+hZ&o=2Hw0cBgfDba8OaiCVG7X*RNM5wi3jY5O`iw+B>xrdW-1$hOW=)C_<2lvi|=aO@B9p#ju zgGv+p_rAJG_t=Doil z_SYL&zq?JW$)C6S$QVZF7m0g|_T)pGBWL52AcWcI&9w0&xcDY_C}}%TRVd!9gVNwP zFvln@xBMS&wgg(GMZ_?a>}P*W_WzI6?IQz!dMN#^r1_s)*!Lii?9`^#WaH=mCi6bh zg&Q>ycWXm<82_SqtiAnz4ycU*vG&N=55bg+^+g0`8$X8uXirgqEZ%?p2>$V(k2oK} zGwTaMJOs~;+Q?h%4`*Qg)&6gfnDGR_<4&L0=4-t<27bSSck13+o0DbZt^9oz>Qoqi z?_GP+)}usCkcR#pSp2Uy>l3_>{l`E_@VO6sfOh`JefdxwN?6?ZFl2nKEmePwzw5A` z_{KchY!UF!4hEYG+kdtxaD_*Edivc2V+h%AG4tP_`(IuOEM@;<%p&*oz?aPVpYH=M z_wSLgZ;df~Mxv5$FS!>*(q?9NY~w*(l=hDP*IRKSz)1fdujY@=oC5 z@3ayIsvLngRD|?BgxqJ;22rKo{VK+up#L?H$SGUVdHazcG9IK@ zgBL!UDN~=C{_s4rWrObsn7pyyv^LC+?IV2;QqK1=a9Dot0aK*h08i`2S+3@icj-TF zAH%V8Wa|x$?xyA5H2SD7S%(V$O`5HgPY7C&M$?LF*Q8;%dnj@({@FVec^?~tI`j_N z!cF-vJb0NuQ(=Qs{!RrUPM?`e+yuO0cA+f)FT8emlhO=-;BwhHg9W35E;K%(J zR2n$n7>)%W_8x|ZAVp~J?YPnefd?bG%YTN6e*@b#$L-p~!ek-(<@V1>n{nsbb9_q1 z@?X~}m?9j2<5>P<6z=1%`$-;GJTKF-#wlz%-}WL5an z*Myuh0|E7B1Y1dTYCb&w0e)E@L_r{8&+QDyGLR=r9YMC<>E^GY^S^Bbm{BA`Y!&J#K=PoQUlQ(1(;M+C|8*0<*O?3lQX*Ry%0|9R7Uv;Tp^cY|}Mk~UADo1gL^pUnC1z;^9Qe|`589Pn0{ z_wEc^-|2k5&?J2OKMVxMeTd%d%XRA6Ph0E$y_EictXKQe5D_p7mDZm@Lj9VIxPFG& zyx0HXDva9jp$;_65)_MvF7N%ne9g_Nw(ku5=?NE1+MbOF#anUZ%7=|y0L;j>gVleZ z0nl6m22@XVU>7V(&nW-uRJyV1{c{+t-{9}jUzap^qBURJmKQ-@_E4U*?K3Z@jiS z3foPyDhx7c=Y?d!peJP?B3K(HB07ag?_@LuPMM9nw|EKa^%LM0aGCmS6<4GwgEZdN zh$0jD^CoMI63l#u=X`V+CZHH>36AJ!I;Uny=-vGK6V%acp}}e~q+*(&*S9%1TW=P{ zIn;5?H(%KM2?ScoDxDcCM36P>gQFI4Q|CkDO#B(^j;ttt`<~7pnTD^6CPCli0-aBD zv)=CxHERi!I??yq!w(3^>Lnpr&^{hY2@77Tp8zrEZW3+Cts* zdQJ;Yi>skhlQ0D(kN#ItQB8?z(dL;I{gCuWa#kYCFTj-50o_k6)cO%d=Ngp@dQ}g9 zxSk`&fc^OlI!_xaMR5|El1NDBvw6_Gbb>I`7R=T{+aeF*dP{ImB+9(8mEWOQ&w z36Lw~1NiP+k96zwlkC4n;2rMofXG1jdD-CBcLDAFC#0l;NRvS!*?_w_J#`Z4sMt7< zbgBkJvKg=v8q!aY$oruWw&DW&oUY=6fcyN!=!Ei8*-RZ*HmGzOhhIrEc($d-uk@{e zzs;eGJ8CBv(v}d~k1l1wbsNQ|pe%+|`)$FndD94L@b1Inp;OM|E0v(eGX-bUXeD~P11wJ~9iV)IUc)g}bmC2Do zX5n@@kUnBLbF0p-HMoU_tXWVoOL0%K>n?JhdNL-0G((Q*V`$%N(eyM9UtVKynA|XT zE-sK1k)umeO5t(8y566UlInrNm(FcXR?LYYjp~A8wjJqmYRu`}iWMBMLv&FI-46c@ z0jLEd&FLo65(Cbn?h!%9wn!7LPg2uKmAJ?2c^Kc20!`I_nA>qUZSOkF*_R-#z1!7W51%q@>-Br!v&tQ{pT zNL6bkMI{Q$)~;-P%<#s08pOJ!4{Eg7qXgW;snyhif zLYB>p_?5Z0q41b`7SDtgA*eT{<++8TDP{H`jD(r&>4?RV?Uok{kY;|1`{8C8)=bL> zCI+1Y^%tHw1bP)Qviu=+I=)Vy6zId!Typis z-pWAiTKZDjG5780t;34ewUg*7PJnTk*1n_V*v{fI4g-cW& zlj>V5?N4itP-l?>|AGuLZVY?|mTq1e5^YO2=9m-J5;L;zPf3hd6VSIbykD04sjjy2 z%lFrof@wihUvjwaw_xAIAh|q2BDesLA4VK;G*s((apY$RQv@H08_wW`5185K>_U_Q&}AP@w&F z@t*^?-{aZ;YB&OkLeg__eKM5jDE78*@~`k`+~NFazy3M{%2j?523ObP;>|=lM)Z)DUyRlAsVxP^d0J1hsp-x|n3IK|XBoI@sGpJd-We z5<^Mng%ZW_FT^@FzCawFb_doKg6yt7I$N8pMs%0Qq$}RY-rT}8(xRrL>Wdrv2XBg_e8SOIq?k11z$98|(0jNwQD2Sl@-fs6s5|3x zNuw_$NhTOU2jYTU`TT{F7T_|tqLi;@_*0C}J-DYZ+KJHcLw(8Y)}r?ZMVOp;(@+T+ zvhz7Ujyn|nFGoZU7mlG;e$6Ud4Bqk|ORg-=v*ttGF(WEh!lZ@Xvn5Ecl1l7Hr0Yg0 z)TswEE4w{*jO;+6zu=e2;`T0f?H#(u7SvQFJeM{L#*|N?9O`c}c~~xKVU$%``Y%Lr 
zrukT$P3D-NZ2Mi*^0B2fIV8FOqqP&H^j2;kEd>~{GnQr%V3O^cyHprImxQFWo^vJ` z3tQh2k|5%@J7HT{Qd?@ANy+LnxV&%9U(aUcvv2b`ao~Gajufp;%k#USk5+za)d~Jl z0LbPc#h-H88QAcC%1NL2D=y8D!&EU}nOK52V?pyVz1;}^3BO#5V5%YBk>gf^A4&Uu zg0Mz0Ft@EnL_Bz#(wF(RdzH{b3ls2Q`^dwK*o=HwXY@gGC>D|t?`FU2##_WT4Qfjq z;`KfrE~omQJ%QmeKp-C?GC>+sV!9(4x&NDSwvs7^r5*XR>Ic^glMg~LSx_4}+lsV@ z7It#!+@@H?3a<15Pz~NObb1&j?bu5>m&w!Xfww#(@$36-D=dHk>}7R}^)P~Z#BCKH zsrm9f7H8V^Z~dSOio)Iq^o*x&=k6~{Px3A`0Q>vHcBjuTr7|vhN=yxl9P?D>F{=rZX^*RW zr|K<$9HHN{?TZ;GK~SxWEQ-(eN>&HZEfSTUwV2{0mMKP%j-r75?u7XA8UhoCexseB zcl`X30Di$w2$dAS`&r~!O6zcYvI4vA4v3rtyRY1*i`9;r1dn_XKopSFP&TkXkk}5z zS*$@duDS)c_%*c*0iVX34Oz`S=m1^SOAjFiUZ_nqO22r)5i-jpaQ|%PJxl7l(8<^$ zEJ8sXm2plN%u^oi7{PZxonLTsQvFE)XgtA0E|eHSeeQ8~vzXuGhaRj0)iL7FW@0B0 zf%A>8ABLQ{afC<$oysjU6lr04I@6x?Z2H-K`Y|rMXH=)a^@*(PT=y7sW|n1E+}{fg z$G+Wra<<99++Ag$2h@~bY*rygl<4Or1)tcD8~`vJJKmkRrR4QJlnbg_;-c}O~QrRyTO0O`o+Y0M4ghVz%9 zRd7eY#dY*vzT1B2&95^)xpBep70{yyL<+#4v1lh3mQ??JSthKFBS54KbZCAi%x}(h9eS<^;DkJgzkhbMLmV-{MfdRc zp&ZPW&-XsX=Pu3N_tIqGY4>Rv&LFYpc~L&nO~`XzSjs%%Z=3XF_*aLLU??lJt`KzX z7zKvwRq)p<@k>;*vf(It2^L$e*DiL(ie6A-_C$Q%8z%)!If`9E^D=mO z(=#3Pkk-SZvlZX~c-o!+DZw58X^=_z90jfU>wOXf@O~4sUr4ohg;DKsdl>db@Kq96 z=~9-`RKI~NL;AUa*q9b0`En=H|A4!>uTxQT`_JVU*-~GYLD!%8{LuuGDq%lP9<8T9 zacTA8G%S5Phy>qH^refl!`+|^jUS?+6{f`!9gFvjRam;u`*xp0i&OTA$ zcf?icizY9!_soNC>zn5=`oYXn3DQRV5=d=9i&C|$U z!)dq2De(jEmAb9HqOI4vprb&IkcsIGr`_b|F)-054C zg&rLB9Qy8e-Oe{?j19QXykYW;Jq&-vjdA4PW zyD_LstRl)*C7jf#@YgW$J>%}gF>^qUm!pIZsdt~sY;(S=*Wo@3r`6HYL!-+ha5i_$ ziMldjJT$h#*pxf zbS$muBiP{L6>>?X`L}+U3_nenI$|LQ{Lckb=iCV-vCtbDVV=uan|E{VguV5sZIJql zF*ee#^rrXGZ+_WMq?$ZzN){F;*`_VMlqB7Y_he30{j>@%QNfevvAbQZd1_;3MZK0C z-jjb)gkQ0!o+SA^z7(ydn)fKaKr@atVpggH$DKN!Ih8)*mTa0uylbzmv#>>TA+4Y` z_X-@ZCcqn-r>-p4j*!JFK5-_&rlxmy==|wHyn*@nvNGWgBY|QF52uB=v9fbD%QLnU z1rqIN7zn4@hZ$CX-e25Jnis*P)S3Uz+&~HVSI27@#}F5qqc8vH?rd{{;Qs~adFwv} z(Am&7B2c?x$XxLAM@^fD&BB7IAt!Nt0E0mBvS4fMfRsok;KSDPFH^6lwd68sNN-1x zLCuA|k+nn*eooG1vx##`TJ9tupGt$7w@K6f|^=DjK~#09_nDd$6l^yY$?U#@}dJ znagk%HWF?$#5tWoxdhOQbvez}XHUQ}zsnwxSc5kyb~L-V&q4khZVGPEWrlfX3~4bk zWJSDR+;#dUoegpP_8#FHFkOE=GEfX>%4)f4E6~I`BUAAfBJ@G;Rs?YJZ)c(`y5dd@(Y3| zjpD(uj?sI;jkOnzyUimEV+Ar4k#Ksprj$0b(S|q;d5UP?8cv$vt9QqaA24KSi!j5& zxxW+gLSl}iEor!bkuCWKdx4*h|Ix9SRh~NA&-Jq3_&~aC*s!M3nA@`38E3qCTMelW zy+Z|tV@r~=l#1L7FEUr2s^!I#vPph@+vfVfTJ1SJeW;9K5{&agplqZ1nxI;sGz=URD|nsi6nQ)KrgK$!pg+HWJmtT70Ppc zNsi!+DPr9srlm=6F-+Ce{$$w=R5SA}+x93dt&CM)@;pDe5E5SflcaiLH^n7@tp3iN zvs?Rkc*st=D6=ql?HzxqBJh)R#cyhQ^yL%r@iFXuao&-ax2Lv^7&RTSh`oX{&Tg4; znr>*q(%%{&#}CK8v$z_T-E`h4aTlCt>gR-%mhi(gD6&y(ggD!%<2`!q3^!bFf2^2$ zk-CJZefw}MGTs*s5N}JL%3L9d=3=%=IPZ;?3bfrF%_dYf#WaGApK2M0?3{!s$iW*u zmo5sUQb%5znh4=DNpB$wNQd|E5>V1UwA`#9PMV1QRiaU?RjpSo9j{3=Ed!!mbXRW7DPWL)Z5^kpA#MAeVM<-945RI~y z=EfEflmDogif4PaCzf#6qj#pT+cZGvgEmezQ7Vd})RTpEJ%&1Sh<$9J1oE6B<8Kqm zw)s=1j<_%HPmb9#*jweminJdj2K>^~s#<#3MCP}AO+uX{F zO`0WJWw@zD_3+=j$O@a$4uygl|rD|FmxIHTsmod+e z9&ju0COu1GWH_l=KEtf+p#42XoIMsu3n@ixV@cY}87hKVPFpTGw+Sl>ci^5R?0uad z9Mi$3jHLRo4WXr{&v*eoP9)JA@x5yV_}aEEvd2>hIpw4`;`T2LSQ`eU20a+@vs{d`^-sBjxS zW6;xl|MfH9fkjoLpy8m#o1b*Fcx7)+SM24Sa>LzB-T9`YC4}S7{ZvAMCT8iamDXm# zA3CDYcbSsf%KT#7v#dO;4#fWDvK0LBj|E;d%4Zhgd}iqp5wdP2Qv||z zr@mkn4~-ydtWQ^X)mPYsRTgK}G>t4itwq`K)Nzm_F^^xTWH{{_j$EF)K19=3)F1Lg z7@8A=q^T?E2M--di%ai=z%-H?!@p3~9JdXODmT#%zRjJ@xn;c1dMbhUHyF&r{Dke; z5qU=+puX(0g0;{Zqc*Uzk%m3kHQh%8P&eSfOO9eKgXD}^zjJ-$N|Y*a7bKFrKCJ%rhcPZwb6q19d-W7QatKG*_zR#Dn{Pl)~VRX z%O)eok^HkMCRkIlQ`535C4~G84;>*ox30Rk#g(*Q`-=3RqTXo7lrQ-vEt5!d+X+M@ zM~zX*cHd@Aqyj>cfROBJmos;NwXQ|r9Cy?(-i_E;W^#H_xtP|IcakRlG8`UvJO96n`6c+j$p>3n~m+#HAk*9I#M_FImo**k|t#`d3n0=)A>V 
zZm5X6DBPX3l)dRyDt6Cw zP@@Lks3jUoeP(lC1$K_LZ^1-7OanYj#hR)N`t5I>$|qOL4=ODavl+kV@cs~&Nu@K$P-T!$-9 zf~i&*^!F*LcomSa6`ZMyW${h@2usBom}=?XMzf>F=wB<7An3QAA%PI~Vu224ep*FN z#|{GLrN<|*B`ml_e(?pwh5dHYbgjPIC%;%`8@d-p;zFD+PSOeHBF}f3@Grp7_bJ8Z zw=8p)z@V_FU1qdQxo>(2abGOQ2z@3)dR=UNcabOLO$#?m${EL^KIOK^UdJMYY#Xpy za~W2E*l_zk5HtfJkSgC0QN-?AG+0Ea547YS-w614i!gLItqD z25uRPS-K&EMgACQ)kO_@p!b|~5OH2@8+fJ>j;=Q|EX+J3vgLtD^7lvWcpCTpDczZ7 zIx=^LyJF%G*-`_dQ~KC$>rd}oYnjD_AhKkuW!~sN0OQ}-gg(bWtaO1dZ#}xN`LQB@ znT8?quo>9^R{Y>tq@Kq*0{dalrR0dFnN!ORC0g~V&#zx|Dx9>_pVV_ zGTt78Zyj9txJB}ETBxC@5LkON;WRJ5@G|U$h%RIO_386j}xFu z!A%nrH49DkK>h&maf$X0=T_ zoKdw@w+!NpkMRv@NFa%_$PrOZTY)vbg0HJ}74af|`&MLyEt2&r>>!-(6lYNwx1vOz z%P+5MK2mP#9FPm8*vHZ{XnZ+Z7vCKb6sc<)&)?I9rM@Apl;7fo^XVuS9!=FTis#o* z_vH9bDT})`lM(ilxgSavvUv2xa$a-|B< z;ad7ElG8QkZTl>)*rQqWl~5CEF#l%9SWO{ENWNKTp6u7(< zcrKYuqIMK6xQXozPNS5uA*QMe$>z??Azx;md7#wALSIrMqoF49%uXfhdqFraQ#74E z@lCq--8LAecw4xe9i=uDDtMbv!p-wVXU@vewQ}p|j$HODl@_;x_ZB`lFRWIqL?F(! zI36R+NXLH9S+PlN#SRv?_vmreV$?uv`9hn%NXmspVwuKUIUhjH}U*%FQGTK7U@3^G_ZIzDx4D*h_%7w=P$@(@S* zW;qMf)=D)5BSI%br+zvudT{!$i6=XCprjPPl0z8wk<$ZFqY_Q@QkCEQ9+@wKS>sc+m5 zSosu*id%Lm3jGm0ww9#zO_P~}W>nZ~>h@Psn0Juc-pMjY+ww|UgZP4A?d%5g zF(Ry~!>D1ZMHfX%wg3EB8(_m$nC87NQA; z{8w01%<*WaJw2aNkT8L$O_MO2!z_Zn*WC*xmoKj)AABD()3^JyoRnQGcvIg}55OJSUU)`TV4!jr09n zbzUTj<0ofuag3`-UQ+he)u)&KbC9zi*aCxAgA!j@6!)z?y%JS&PwbGKCG$Wz%{&;IBI+5&u& z%XN>KKQqNfLGC$P3w}-bW*#@_PAn{D=?r^ksq{4B#BgV*f@Uu zksa=9RJdWTXT%fiUGna>MqR4iJ??V-p9OsyQ=;j%Pfv1pe>N#hq5^zs5@WOuL)&ig zr9->*=bEfLu^X|}ZkNe8jHpFn@u+Pj$TuFgs(g{h zd6;kMkg!R+`g~aC41>8An}cxXo*2)&ZIaR?TZ6S_0w|bLH`9*oJ4gG2E}l6aZGBO? zRvTZwu1M_?i2posDSgabH}Za4X7X&a%f^{z#wkMk$Aj|7o@*dh)`Mt@fS>{o0u?loxrfO+NfSd+}7~YJ5xk z;E$9G@NQ z|9~>hYHbZ&v~2v1d?bg$rBP4~!B(w9j#-ngYT0p*br_eZ{d|XV`eh5+fL;}|dNcS$ z$pwZ;dpe`_9vU$$)SGB~@u~(t0!q2wfia=O;-4P-Vb`;1dw*M-AYR!Tut)*L5trbh zd)i3nAJXD-{6l1O_e_xG!etLz#eu`XeB3Euj)_+tkOi!P9!PZ*P&7T2P0T)a0ZW$} zo^wxzHefz3r{d3@t4HMy7M4YX`uoX38#@9ECJz>fY%6V>X{_pV zw~wBBE=sE4Po*=&PXL@9d$kULV0pwxIf)zYc_7~DIBEWMEXwvrk)@t~hK7Qb)1FQk zpTkY(6E+tILG)YRcOEA|oZWv9H)jLAwl#H;7Z8)Akw8#{Evx1v6%neu#$&e#uv^c> zsJN|slj4+_&sY0@rWC|AG732|jj8;xh54x%{6?w#zVM;(@ne2`1qqJGcfodnc)p`Di<}IK!@zO*6&rU?FabAycyXg$%E|L0n$xSPLv!~aj@xd;v(H7YXzH6x$s#JajdV!kc+}K za5n(9`aDU9vX=ePt!8MQDzJqpbIDxoH4b#zHZ!s2QM!NY=N%lJd^S8&iPv&ab*V@# zs@`EY@^)|8ZX1WS@W=rNK9k!MwI@JF(wSSlzzA*g2*(Nv0xpLFiDY~mMt|&MrGve2 z2CsLO)-@GkbN=mzyquSn&M1ZyPUgLrGS&e4t?EON@59%yJ%=OCdAeYlLPZ_|>|d&u z!4fbcPc5pnuf(%RHN9PZcEV)tu+G8w3W5H^)CE~S#8KB@*KY_@9enCjJPwOO)I1MY z0r|!Bs-B~`wUgz-^0l@^gp9DQ&%9RXVo26!| z*RL!lfjQz|f4Bptq+%atd;ZizfD+PRpI~$H`V_}!@z{Pn!8kLujg8W>3T6et6b}dn zFe#!bt%M%S_$&tGZ-jk7S$X+m?sdw#e}Z%SUIQNpf@O`7_pH3o{id&l0rRx4u{RY5 z$z2wc{S5;3o)z8@S5mF3di$n|GT7q-J>Sc3+<%;0n)J-J~JrBjVO=zc(o+ z>yVt?w*YD^U<34s3klR64*&LNJ3fghX~B@n0XO&BFA82w4Y{o?U<(7*?i!$iOsB69 z`x&gRe7?U&KalpF&yXkwuT+#(jnS5CkxV73fzh#0aVSd7s=MHG{jhznpk23zn1>UY znx-8JGWiW77UY-kp5MZID-C{4$u|w~vkNsz{^tOdQ~dkY0tt0QV+q&8=qOHm5wVso z5-!|W9C7Aq?Clmt36twe+xqB1ysDp;_Wt7c-~ws$XOJ023sklJJZ6Bb*7uab*YPBJ z&6hsq{uB)e|Cxr#0D}o2&SZqr)q(V}k059GB&eI5GK{};tfl2HsA2LsCN^+99=dow zW-Sp7EW*wYa}jr)bc=k&$P=gUo2F`$8bOz!Mm+w8FOIm@SsU5@~vR|68Ki0n6V zeTeDJKt`Hz!K@kpN>M%+a1xX!bReLShe&i8R1U6qVf_uXUovXY#kYq})iL6y zaYidHQy@xn2^7!-E!z&soKavZ(LHS}Fx!-7@=Gytur$dIE9f~+D=_;84PO44F==4a zv+%B9E_+rvJ~iSWm4jvuH}k;!ILE-34u8)gb(X1CZa3c4I^3wnr}KeLmwj%q4fW?` z-ezYfz8biF2N>YxY28rp*WZukmN);%VC|*czEojUWds#w0S6S-@uuQ2T)w2dQ9)m< zl?Ls7!O+n{ly8bzx6NMth1l_PjSnF0AKRMcDMDein@#JTi%l;a`tu2?0ghB2ZA&%# zI``Yq24cSk5PYxDJHc3T7fjqOWEJyWRmxE{@Rajv+-r8gOzB-?$TEmi&ue{Lsox-K 
z>uXP7p6tPwK#lDffqf|fQ&)|-Q{-&+?`GiyQxx`BBI@ytdE+|29FtFgeg7hrsqWMx z^TU4+NvC#1o;L-{h+%fv6eYzs2GRFWJTQr2h((0@a*OUq=INyZHg<^k*4E;kdX3K) zejNH23s|C8s62-IxA@JpCuE9@iUznQaFkXqhkQL1602@~IrKechd;9>0t7Yk40B}; zM1?`_lb4G?@Ra;8UvWo`0XM5buruQvVhzrm4^SrP1lR#3kXjuD`e>lLFp_!4#{J#% zC=dgPWz;saN+YU7OIC@;G$=QS4<=KM=|f|uCmBh+;%TwVlPC6$*$59$_~Ct^-5bsw z533{7jhxf@+2xQ4oA{nl5FGz`bFlhH`Tak7>RQr9;-}xw5NT2_8f{E>Q=VPGE=mU? zZEiisFjN^*UhB%-i{bD3O|~mdkJcCrJlR103jf@%LJl4=`}IB9p?tTv-?2oPGX9LG z-h3uCBgm(+UEr|?f&L&L`4TK73S8XNIc)f?r!5;mWYN(y061;#Eug0&@XtMK6A> zW&lm@SlS_rS2VrTeR=L1WQe+%ifb-L@N z=A^pA)ND!2{>?*+S%3NdawIkzWUZN?F^gGRiXSOiTFAo{K5LlioIw1wQ)}A^=)neq#eFjh|06kXUXoLq?#`%xwz+&Dv$xMm#ijfhIDSnrLs^9XKa*_v43( z26`~!0V|JOS6+opafHEcYxPx3`=dBDT2z_zEFj)r4k=cy&5-Tnb}DBHPtz~4Q3OoW zamStG??@7*GcZ<`8XkK$LbfKh)+jM+MP;--kRbd)3TN9!yUvD`{=-dG30s}-F7z@( zX_zFV@kYOPfdG`bPr;CS73;-~a>h^dT0;%5!}yO+)S3G*i>o|xjlP#lb&tMxF2Szg zGXlG0rpL#zy^>m5zEb|TcGR+@2`|3hYFaLk@o z{VG=129=4V&N*eN@BDgiB<$F8ts@2OOD}}bPRvTHY@TuwyY3qcuxUhOCI1+hQ67~9 z<|KcmI`;VA#O4Uc78k0#|9*krvk_`W9Lv$Xt`=m)Cno-;O55ctC1emQ%jnIv$jP}N zU3^jn{gd()xkm*SmYB~15m8=8b@qY^`lxFq$VFTm)IvTo3(85t8f%aEq!@Rg!$u4q zon$OHB`13fdL5k^lcYxkIb3_&p~HqJq5ssalN^}6P?%zPxZHr`<>cd zWxs8$XtLm5?hl8`# zz5G!&9&)XWk&2gf<<+&5GLWdJB>o18{cyoPnX(xNAoB{n){xet@GFJ zQ_;k7D@qI2R?Rm4U*FQjf%A}e6Il@lp{hf8gQ1m#j{*yFVGg3mzS|4wK!Z4m=a7;u zID?MtFvNx8x`bHE5CTaWf$xqBr_rCW&wiH{->3*o29nUh7UfNla({)k!ZGpDxiY7K zVITm%a+R1CALWifa5EP`!+r!h@(sVNcADbpBV;M4j3-X`uo-qmD_tP?I>I}5l;$X zPG%-z%)>>d>R>SutHWKQ!Y4Z=i)~+3?Z}U3sZUU-D+9TY$wJmN7{zwb;9zY9$rFPs zG!ln#PFTK3Vfq67^gvcyl;obNpYFA;1Ir}L;FU(+uLs*iNF(^HO;UJNL#j%bc9<`W zWLSs86piP?#Y=zA(kR7p_D(&)(G7F|7!5V+GHWxR>dlG<*n{7lwBcXopUmDHzC6w+&EP>5DU*_olXn1@qW7}A3}bn% zK)Pu*(n$w?!W&GYg(o@uw5B!a4@B-8@cl)vt~_utR-u?Jsx@C;gI6B@z_N3QTNz~F zD!#GS)6P0@flru~H&rHg)CbcT=Ft#Po%JuJBa5z~ko!J5xb~ zIJ~+*l(%3D{_r@{b^9y-7 zs3b_EkteidrsU?(vj?Y(y$*`~`}Rg3`#^qg@d)Q(jqN9&H(R^eO5B$x7+y`03ozQd z>o&xH&dsH2HtCEvJ;I+JOKem@CME9M{6f={cTPZqelD7DB5g2FH=;0C`Wm1bn~gEs zP84M$<%8i-6x-Oqxu)G8TUrUStzZx@H^Hbo5#T2#!7!R2Fc=#BVu$VLjJY|hSk7is zCCFwJ9u~s+o}IkdS6vUhB1aW{(ySl)zBhanUG;X1soTPiqfx^$<=10YdAHan=ipEPt2a}xj)%Wfmb!;4RHwlBmzv>j88IUt<>1|+6?hQVEL<%>b zfYA+>&4v%&WXQqOp33z*JD067%v_}z(1Y?syzcUG^+=+}9FHNpaGGxLWl4Z)7}@9Q zH7VGgtG?J*wQ`Sy!sWubM%#=XTET<9q&;sU6?MYCw6D{;3IH=j#pa>5#|Wz{=*h@lW}{EL6Lun%_OPP6i(msVtPgWWZ51#wFAH;;!+zdC zC@WwO;6brhv6azl@hV5W(R$(#l=_Bn*@zhF`r1y&(`r6oDYYwQ|4ng}%G`blv)}(9 zJ|Rq79d6U_nD#~P8;dQgg% zl10S+=uz{WqFyh|gjkCB?BN0$zVw$3JSah~@N~4Awimo1DBH?|skp)m$#IK2D63e* zQD7JkqFy(EpwA&I^Rr`*6m9aVu|@(VS6FMY_A1cySzy|;t#e3Hl|cU$C1i2hXz@jB z)J@v&AFjM!4ax0Wx8A;T9tCu}Cr^l$iP&oD&~`XDx%$+Vfm%m`k|r%YGuuG(t?<(5 z*l02ydADh83Bv<*Y_V1BYw=>-SEhUaECKFM8JE1x>}BNuMM{O*lq6gFMZYNBTfGkR za@62iCBe8G?B{PwU5akrTm0SV3>0y@-2=f>cCUIn!&PnFb&pHF3|}o_yW0lOd!wm2 z;wcg$rBTi#r@xzf{Nqwy2gj}A79UuTc?L-B zwDxnwJ8s%kT}+)-hP-ok>75PgEHpTt-2!-9@YOq4L&19?F!+`eAT`0`-t&vkvg@4M^wUZPau-<+%?d%`9K(yXf_-YWLGMAQeR&`-D zw1>aM1~Jb#nTgd0to|kgx*lp~oyiz-)TnX=;kyLyOAL@9Z0iXmB}ANg2b=IIvTSZ+ zjH0Tw09L-bqFYX#g-MHe32WW*a_RaN*n$fbV^pfvQ)a&aSXF~HJ%!JHcy*I4>_<{j z$^3)K{(hj%yhs1J_>``+JB0gP1$5iFM!lo}W31^%*0kf2Z(c_6AdF}D{cF2coOwYh zlhvK;HBB@LVI+8*anCN)mvx-(=?3waU&dNF+Z*#X`hnnyx4!giKq_iSm)Fwmsrf(S z+6^lPE%e<{+(R1I|6560u^g*S@Zsq?*YyG=e`xwM+R4SS`-MS6W}VI zxa@^Em*xMF+vs&FPBXYIC4kGegp`7VN!h4|n7{YMgfBN2aGE7ebZ$xLAWEAK7i`7G zTV&&nM#H6S9z>A*xUBYf;Ql6mKqzq%CW^2F5c%cgK0&kdBO=cLER)(W{F>Uy0Gch&x1K!1SA<9O3f9P8_)qb6j>DiH15VI^o346w|d zOeUIxs;|)2!ZCl}^vh3uVBKmPPTC4Y*m28MBXCDsv2HF?Nw0>k`z>h zWjz#yL`N-{XBxl`sr8h%WuLPUIh9(pwgW<`#gla?)NIfXlh|_%g<3}kTos-yNC3BcXSBml2V*%qsz44^s?Q?SB9|&1D~d( 
z9Pd;cg~yvaLu>k7wpzG-0qXh!2YS&Pl-Hd7QpqX1rUHXe&eGamMWp)Y{td-zms!9T zpgdNSfC%DW%4tCfn`CF*$jl_53=hB2l{H(a_p-`UcQeEERy}ab@F=Y6MBoki&zjwOrcti4pS zJG^bBF!NwdDf4L|1Ek=6ohrMU<@?UJ>lXcq?&ZwA*YthR)}uIeAS+Bu-u5l1r?)aM zXSjA^!ds&_@>MCMNu^S0!8fq_syAmb{8JA`LFrTE7ew>#&+;M#x3mN+h!)o>T335U z9mF-2lvcAMH5q=Z(JkWY5j$@S(PP+A^In#6PjN?2UOkkH3TL3b=1UzZ6j?V5nB?m7@t-iG9+WhX8 z^=%T{^Jadv$+U5wgG#T|*Qt7RG>0J?p85blTVTlrZmKpc&>0vb23a zo^3JjwdeeKgb>-RDWkH}CaZrC4E3DjPn3drbYsIfGyLDNz#hv?N9^y~3Q}EEhmSn^ zz|dATt6FREDKkHeqpW)wZ_B*3*-WBK1^HYjqK5##kl@8xqTWYrK6_j+c?!6-?uALb z0~*ZKu9?z1Rx#UCIXet-x(eVnBtzV4Uyo4AY}*F+)Ic=9gWYIxQzwNK5cqucSWu!@ zHrM}rSVNr8!w)(C`q2#Z-YKx%TNjumqBC!!G@=X!j3#AQ3Donzw;!|>llcYQa=)N{ zrllngSSUF;KS(r(w% zACG5F;jWk3KS%6Zqn{k0JuUXhdtC|8rC6qtkMaEcDRS^;dQhD5k8>*Dp9^S@jJ$g% z(I6TJB(AU}aCa(BxnzD=d?V$m_@6J%2HYn>qu+&kzwf|2YGhdcx#YT@Yr>B9mxJn8 z!KEW;;^O5k@M_fEdidHn1I)%uj4~#J9vfc7&rvf!j?m7f=b?Z;7B{cVXa3Ef`pOd3 z@qoPx!SG*0c;LXm?-C5^gpSmT05@iujG&Lrcim_|Mi_%b)|u?}KbljBI)?GFz?i7- z0nf@2hn#u}>4`lj^R9WZ2?JYr$#*sgD5w8QNgpRC|H53Sn2OO38+;;yaE7}K>1!LX z*Kj_Vj9#8*EQLS6$4GC){RLrWBrX)Ti0_NyzLhI4eR`_vcI#4~{~Ek&PK5nLC7xQr z?jOfCqAH$XY)r|qG~EQj_#`z21hLpG2#6^r0JG^FEf(Txm?7649=xT87~bQ)a_b?Z zNu(tH#e#X!0RjyR05Rrcg zx@ub8DpuVNF3%EDV4nYJg%14u7XqITxDb%P0?`V<|(Dw1B8pGLB~Sy1n8_9 z6}chny?r@Vo@-5{7M$Wel{294jZs1uu=ibPjm~H3XCmb_c#QMn0LTIb`Cj>#O-DKc zaR!B!ToA9t)+ZymC5ivu#zpQS3=yQ>UWBfz)IH70&dXKmcQhP`^_Ows#IFcw^e{HA{y zMmOI66p2^=srDu2+-*`yt>LFSCF^iH*nwbZ}PV z&`PO9P;35gz6usu>9+MmH0a2+rrbRXf7$f&{0RP#RA@o!)2#od*<>U=q^$H}Q#o3X4HY-Nh@DEi16vS3Fc`AN0Ak5) z9ZLJE;GH#ky@uPh+GtCg2f^@T6>2Kel4byyJDpSqUowagUnLZ&&7A)n!#&)yX^m1F zD*Cy~>&4J$%p`~Y!^UP5Fis?LUclMlG4kOK($VYH0xkg0Epz|W2_jD1ab@|oZIM(& zBnPKv5x*I5d?W zTU9uxuNA<6$N)2mWW_p4w*UDKXpo4N>nas(J;b}hbJIp;OP-{Hxy!~d_R+J#ZI})4 z@>hW-y+LBMJjl_A@b_F{9)6M33>Aladt7I9R*t}b>iF{4X!H616(e4Ad3DOwBzukDA+x`-b(fQgHRn2!XO8;fp;6jq7b<#VuwIhBo!_4cRl$T&L;Ur@To~_7{V&kqcA;-V59O4aKqj_qSC3oRxM(zq)*55PENPYym~bE0a!A=?edu( z{S8VSmYS?w6Uj<1EE4xyK18fcwKRI<--Wdf_`mtO)u*~<=-{3k_1$ISa=Gp$@0u$v zTFqQ(+BGi!bzMF@9FlT5$IR@z@454##uJ`T_}WreK@CYPkB@avS0HFG48*G5OS}Da z6xp!16*-LvjO`1S|DGOs34V1M6dJU?O_!L?As*}Z7fS6-&o-P*Yju~UT`TGTOYB>; zaJG8~s?QU{`$Q#Rqq{$=xrsdV}_@H^Fi!*rIkGTuy6<^gNW?Y&jcFN#Ok?le@)aG}&akbYZR z+bb0&Tmjs%g#M)9zvUtkheLYv884$peDPJSmHJ=fU||t+iUmyw!q(}`x@M-Mj5KAS zYi#N^Ie3SMPnS;?SGJttRS})yW*}lYkzu$Q%39+VJ+62;pJEv5CX}<3+iNwh5Al#4Wa5lJq&Aahp9LBe> z>;SxOf;agLcsfktuiy7cM-fl)L5VJJ`csyj0;U?0;S{*lf>xG~JUxk0n1rgA?`t5b z+FvqOHuHczPWcW$Tq=R=lS?cr6#qTwrfh2*I1%5`FtP@pTnR%;d(GSK?!I7C62{u2 z9@J56PDw-7nupiGqpM;>2N_hH3|&(3`}mFDrwo6H5!pjtX?`!nVNe5%|H`QaDk!Un z{l@5NTJYUeSdh%_Q!t94$=_3@h>O1)klP!GK9w}ncs84dsQY%Kb$i3)A2C%3iE9^~ zAO7dd2YbL-Ch(;@{JgWOmu;be_N|G_Oh8pq6$9?8NHUnT1@J5~0XSgdXIu%c=N2Ks59@s~GsgQ4yKn+otg8`D0PF~mz#qUb4S--DS=A>LBQr_jY-+>(1ny}uUY)Teh z6**wKdx<%jV1Rv2uElCzg}9O@hy0x7XE@%Dmskidh=EuY?AHaoC9?xChPi8ru=tyv zZKhGr)a3YhYP~s7x!noii@J|e`Zy}JwP~(LV`2#-l}qsVEfVtO!h(xaq=G$W+V4dn zIA6Gb1zXcRzhP3R0%Rg}7_Pi^$JbhX#$jvQA9$CyVzd{iq~qqfDQ-_fR21#tyAl z!J!1qux)zfM+p0W=Hd$heyJJ^y!Ij3ljJi;dB#rHsN#l#wik4+)FMqep9StETes1e<4Wa2`G1uaiPJIr( zqP2Q+L}_EddhkGfAI=L8a?uMPB6lSRI**L_JSMLz?f{~WRjQN%2k{K(-qc6Xg(AcB z!Kzr4mq?oP1+d(=05Ush8XGBBQ?A>E1l%#tAkNtRvSn%=2wjKY)cs#MxR{V>&kfxz zFn?&d0I!b?8UQgBi5k#msfV9>T*AIr%_^f0899^I@w)pXi$ol`YerKs4+X|sM)59z z?KFIDjZq22BWS2)YNiO;o=|)i3{fC|B~T|I*n5S-E&b9oYa+%J7|31@>N}gO0e8FD5pNYT%gcW$l8l zjKt1Ht(Hb1n6C^YVsF9L;b)O?3`CELi;h!+I5E1nq7peBi>T%hw&ZHr7$336*h`sr z_7nppANcLUlJNUOUF{T$-SCQEc+jsZlJ&6uALx!)sQJA<QZTKoIH2<`{EPZ;Pa(x+E`~Y>^&>>Hw(|leMVr>2kcv3 z^yCq)c7Qq+K8WmHEkaZ~2;`AnR3c#2TVbarZ9Pj0UdUH>haq~{ulm%OO-C{?UcZ~G 
zwLrHKxSIZSzF2eORVN2*KJ1c>{#z`Qpd#3QU)1SgoDrb!m$AEzE{A0gV~;ROUx}GH zapyDW% zWXYIjs4)~vAlsMWCzql0tZer=^HXXUh6Q2r1V?b9OfTAYK<5K|4@P$QxTN!ka67KZ zssY+R)kUzNn^ZGWn0QgF$xP~5b5rmBwEf4#Khx&0^}%3t#TO6UfPrFzcvFKZ&+tE) zAp?$wWx*`XCZHBjXRopZq)1m;=U}m`@-BGj@kFU`pq0PrW@nC~{PCP=N51hvFcqB? zY~va?rLLKH6IA^zVDQz7%`jxiAu|DAE8Jd6Qzj!XPH{>;_1*-#JEOL{$nJ^7G_x2& z`7iUgF_AiG*`>qu+^EpDeQ0^~(}0Y1`KcQBFLxt9>h1Vs&$U zm`UVHf)UrA+{*u;)5J(kb0Fc{86MK1TzTAt+lG*Ahe3Kdn_;`-(B3p!cN2rA9`YiB@szJkEgte(XRLO?@hl#lq9 z(qVjC*ax%*`o}gPvs3PE1Ti>AT?zgA)6YH!HfJ%KD^7}j3D0_D@lG=jbmELiwXF}n zO!LZ%j7boY zvn+r z_(+kZ`vEqe_tby?>+aJ`>BqAPzN&yYVB|9ny68AD4-;O^a)`ds{0sbyT1Gq_#QcM^ zl>u6sTG3KLuDCWrxtCil&~%5q*R!e7jgf4y z)p_9%H0HYWrnXPNKw_%IHumlN;*U+)>{H6|eKBIl9tF}*?CljzKQu=( z;@<9GVyIoR{+ph|5Jc-UKS7Uhi8~{6Z+n33#S0Be@G-y_@ZcO-IL;LWkEAlWy&GgW z!wT2FHVKSHkvJL{P}c33Oj38O+QlO*nZC~7iV8;k%5@N~H3 z4j7?)o4!4@EY0Ixe6RcRgt&KEc>}YEm<*zKLu~h3XZ3b>-F%t;-MB~T>#;=Ghfg+m0T8wU!lcH0Ji_1EV$tgDxp!hlC zDc62uolD{B%9SM$U?R)CK*}=bF_3Hscjk2Pxf7UT*3g8)elsb znj>z4RZb4U`El%NT@*wPH?PA5vs5yB=QWg`bpN%hThYuzVAq$6&9V);7s|c3_Nq|? zZXMiflBFaAhwZ%sGCz%oGi(33+@lYGAmv9AEFcV2x;nE+eYPqVh?Lu^I&*sgpz(Ns zWWPuccl4l%!5@VIa;OoAz+`?0!crJLtCZLg_(3#=wlpyUEU+KQf#h9=fI`v534CEZ zQD!C)H^em87=eH8&XR`i^B;k7^8wcfuTC3CwzP5dL1Yk_o^7G#lAH})F-uaQJ5G@CO zj#T@fJA~5{H|L9T&|q^{Oj*GOP>N3GfwYaUqAbV_eLmv|pwzDh-AI>HX3s(dFMuBV zT%Rsd#wXMXp9g%>j)0?RRk@Qy-<|@nfdT!D5aM&D`w-aW0vVYeJv{{4a!86*8CTSL z_|2K$!2ev(5_PQy>?;!h+4MS%z2IDcgSzVfZ=Rf($m#Rlz4f*b{ip(~oQwC5gruNHyob>5t78`ZivnfB2_MwUMz(?ThFu z`&{^p83%?s6olmL&GfLR4bC2gSl9j&S^wys@T=xo<%m<1OS!0vSz(lt=qy|ECp?oU z5USnA1-_Lo8lIMCPBMNClg2$Aaz;!0Gc=(Bv964_0Z4ml0$5vLls;R|AB1^T2{*^gEP0ZWr(sfq7#J<8ZVj0mEB#-4qui*c&BFF zWaj-5@LL!2WQi!Vfdf5fObe{D`<;o$XG*~DFnv~%75m&Wsh+Xf%E%=2+#3tu~ zC}+pLMVTtxnLAM&*2knyqK8);5znJ4K0A!oy7jjTe*XW$g5QU4!~(^=|F$9fIrCzv zsg_{rx^ai1x7>c0zGIqpLl|+Z*Nyk3dAu$N?6bnifdBi9eY> z2VW{nzNoY|jH&p8PB#D<*?E z30lI#W*%#)W#{>Vp<~1-_=rj?G5R&ehS=Kk9M`MR6l;)ii{3=Q8Q7o^;0(#PcvY&a zmyExFE$&Zbav=WBofCNDz2pqZi7F@508ZEmU>Qy5>K@DBX;m80f*}!1)BP}5b4Qv_ z9LRg{f9nP({XZ$sd~+l08-zy~N#}jtXVyj_Ut*Z}ii|Rc-M5vldQJe;(3>u0b>DN+ z_!8e5-LVwAnquQr%@8y?t{kphMR^kLq1B1U@=6*@>$I0Q?dLb~zX zS}qs?bJ#g0>ISr@pWD+>rl-+w9AA8$QQZXz(*b)k)m2Mw>8_kBwJ?c@@{zrlYifMf zPGD0vim-UM(SmVuC9jl6v~Kq%Jus=6qED3f*Y~>F`&bMt1HGmtZ-dc;AM(pu9 zRzf*@RMLXJMRA?%vsa!_=n;9uJw!^o8z%$fGS$eZpPO?LKQz@McUGXUw`o*2U=PBPv((a@FubXI97yX`z4D}7^~T#6>Md?D zJYak}baDv{nu%c!Bwpn6>oE0lJ8Ihh2nPVjw<#)zeXkdziV{I!@4zzL)jBdeNDpkcA2?fi1 z(I&F+ttc(#CsjUD;_K{Wm+Pgt(dN_5=4I_{2j~Wt;NUyj}lD_*-Dbu4LQm@G1c3Gf&FcW@B z|E2*}5rPf^5=}OHg0n42IuYw$*vB%3$Y(nVQ76ySL5i--R7ui@MQMhdn*1rI2fPXs zM#A3EgLuH28LpHjL1Wlo<{uEi?-jI*o036LaRZ=$zNl2IQs2}XY8f;Cgo3Hkh)=5Y zaU>f^9iu{Cke>zH>OEW@Pmb2jN&E&d!#4igQ0_3=kc|j!H=#K2 z4w%t@q$0pz9ZuT%^a;ebyYV1QJ~GGEjNB@W@0MpHrK05&;(=(fc_`(BHQ&h^ zf{3C%TjQJ-gj)O8p`~lEKQ^en-v!2EOGi^w0PSHhwzbCt)BE&nT=3fd)sp@vPRV!{G;_)vQ00^NG z;yp+JdEOf1GoMSxjQMye_s#VN0mw9m2q{6*B}(4g6#}@$N~_IYeAihs9xq(#7BFBh1z9`Wc_SGnEeq4cAIEijh#z&mcjV^QJW8^$EJK2&#e1O`5lRQuDScX6W)>n!!c5+s}t70~bK4tPg zY5TkEy3C@F;~~mr-jf*%$#K=@cU{B#M6`{sUbl69lC?fkRWkt>C>#Um4)TE!6)`FU+4pCM@PZsLm_ zqnt<~T`R7c9ZxnEW8->SCQ!xw@qK1O_qbT>bvUzyS2r}D?b|I@_H}THQ2+I}BhF{% zMG-r4>jvTtyW|A1LEV${O{8m02WXo|;+zKGo$gbm+E_QOo0d*{FZ!xcm^EqFSr-Zc z#XU|J?^iZCKCC%qUqt^_f!v1Vt*ApM8nBfK zNmt#Q+eLReswf}w+ZT4OgfYS03}OLlR|qxrM)(DNFyi4)WWwU@feS%oLS`VaVLEx{ zNwamNt8ADr>AKopr_@^AvX?-=w`b-P?SwTKyG&W(Wd2OFJ$HYM;fXSW57=>9Q-(xu zQbcXiVs6VNDVK|tXI*$ZUh}KD%+{u+lFu%L-xk7fLxtlAi)Y`j)&^zt6sCsQ8V&G4|!Ug*Nqo>`@PKXm62J;o@ zgIoJFJD(SDP87yu=Qo`Nj+if3R;Z>u%qbNw3#|)=iKeaUZ0BI0I-TR3?5THy7h~iF z?9wB5aN~Gm_tJy%w}|^Xx5@&n4L#@=4J(``=3ueTNTLs_r52 
z*N2@Go36W`E@s#yir>u8e1gHxew&t=cOh*iAz!3vTg=|OVmp=fnr%KZCgSa8JW}?i zlWr(JdtJ$ssn2$|K8N|U7ZD!jKzfQ5~n-;#*!cB+2sVv>>(Yrm+A|aEKBo|$x;YsvF`!LBu1`L zm(kG)gnXwb!wpiTht3vrFY0w)o!Ru{j3{==Ao5TqP?##D~5_R{YXPA;l z)Lq=lgp_uUHo2-k%hmmBK2~ssCu&4>ezNgW5`3EW>`NYx7(AOf=H_$$czFPL2rlxn{3!FHg znIJ)`hc#7#>DkLhENWCu$Np5_40F>Kd_awXV2VqWi{G1UB}djYU6)S+YS_CJ2kpjE zYHIDC5w=msQW(g7XdHc8Vd|LCYq~aD)=q@+i-7|DpLpvZ~GG9`= zAq^kkgV`Noo>Dc5(r+H@G_m%zeLk^ZB3HH8V}{f`g|GFII0b$8l!4C*6XB#I8d^$@ zue6dSF1LR1J)Sb;)0`F3C5Ed>Zpww@JyabQ9Hs83aRxvI}Weeg?c#Q`Z7! z+OxKxiCazum-!g|y-Gz335-^wRSu4=-{Nb7NO83RGpu^qGGaJnG8P@1r|!=P$Z(vs z8aOsTqr&nq>b&^G6lWWK7SD4DH!(O8E}kryiyVb%YW(FqSvw6>1>Gn0wbI5jt*ppJ4AVdw>3O0kTe9JS7?5%>RUT&I9k#|+`8qMb% zgKI1nL~HQ|#RF^uHN-i=w@b%KRUhk7%gCek5MMWyS(Gwn5mCE_9qu`-d(r3;Lp*b8-AU?pgwjBKiacqrwyP1z&n(qc z{5b1?+jeAD=UM?D00&}BUM|bS`qSf&EuRuppUm4<-fNo;la3uvbuOC$^==vz=W{NT zbe!~iq-!@uPH+En&j*DgX*+* z^aaeyRxUBSuwq^Q`(2;Frt@-0jY6~Z6Q+DgVsh#6R{u!n!&3kAZ}DJ6iprALHZ`w( zR*xB4dkC0G(dfHwy^tg|nl1=}pcdfn^mjlC zlEL%E+Bz!QGADZcm(>^lcpi5Z1U@aw*QkfprVb3*M?fP zg4?t|Kf&t9qhr8hf9%leUajs4KwG+CTN0+k&&{taAz)qpL8bT~-nLp6RQ61=UoKwe zUHt!8o#jJ*06wXxljTn}K}At=>9w1*^rU}3PL~~ z3%eXFQp3BpU)55HK4dZfv1HgYpeLBH@$l`X?LSSKy41Z~J@QMB@Y`elc$SH_lb(Gf zn-rx>zwWz-IhCUD@8@YFI z3Tuxnc5cs!Hhd&l46H#rIXTp01brlLCXSn>WC6BpvqZiw1Enw|uKu`V4O4~IBuh>`@e5L&c&x!2{VUlkxmrk@8>cK9&k8 zU1Lc`vu+jUtw5isflKC$)PL$Z$5xzy@Y?&|-S(~sqbb9m9_@pJWc#0~$kk}HR5 zbypST;HB|#3hh6)v0BPK`@i3{=`pO2eP#xclOl@+YxVzQkxYw_HS2a!SYgs!S|3wa zip)QM+=bP(`)$RpfX;QzHZLP|R(Yrt+5S%(LWXSk3lg|QjosIzf8F%p1hV0unw2XX z^^48N^CBzg{r)t|l6NxKa1Z|V^rC0^XGg4rxB03|uX8TsZ;OLZBfI)@!B_sa_IjTI zm&hf3nquVx679)Jt8v`V*ZtG=KF9;Nvmvs#@9gHEKll%C@bjBLev1$RwDwJi$H~oq3Bi|^ z=*Q|T{Yz^bc&7*AqUKHPO#J7`_%F+ztQf3-91VK8cz2T>2*7an~PK9hu)Kn7pSxJ*N1v zUd-MUnV`6{ydLLHg74o>yJ6Fx=NavjZ^^ZLD6aqr?!=N<*P28cHqg!BtMybI)5{$%m*k0kq;z>lDXLXXAlmqSzAVG{*}yb z9)ubgS+!#q!X;cDFs%P|1LJo6F<0{;hqa9CmoMzM z&0@;TUmv=={e>mBE0El|{?DfRLnYtDygvGW^#5(+@NPg{^vZ`nd;aCCeC|-mwA;wKDVx5Uu#XO{kBfq+`d_&7W(qcb zZnRvp##@Qc$uRpB^C|cMub!+GQ@L`Ze*A0Y8ZKQv4`pP; z)O?12xtku9)Qa$SD^IlgY(IlBbl_CM-RDxBP}zij4jV{T^~-NSj{t9wRsRAIjBNl; zDW(3yhx{MEJY@I$0kSg5qm4(w$v2?G| z-9EB@*F%q*=#QFzt?Umm#SqV{&|-maSXbLzB9Et9{v@g@?&<%W0RC;kf~kynt|yyO zYk97I_#bPbwF58y`^v0|`miLhvoZ3r-Kfk$R_^@wa6RW;&COH25)@6$5|3jX{%np`a@uu0j^aI%cuY?85d;?-7v?mt)fTns2*#=MXw{Qe3*&XVPl%i3?O-KQ+%HhG3H zq-FbzeTp$<`j1Ps{4o^o!qdKfXLj&6`w!KDe+i+NOD9{g5P|#rIL!aUyZ&kmE>b}y zV#k9|@W8wO=@u`CzN_Kh(od{-dJhu3R71~A`sb6H@=|0XQMzB>;`hh^pNvjH!uvDE z+laf#&HsMQe)Z`qOT4tY*sJuAp&W2t1Od2AKt!}4rMFqY5Do*3OPHtX+*q*QVNk^~ zfk|~-&T)VRa)pT5o4@~)n2b0F5dw4)aE6W}!L5h1L~umx85lR&V!am;EOxn0;$TWQ zh00HU+wyw~>X0+8sRXPXm*1Z52zX3zQAa6(YW8qGAZZgpataA=0wrd1B7k|aD>)O| zIxJGG0P2tjBtEuX2s_gS`m9V#Gjj;7#-M+_6vVkiV&BLH@|S`DrZpgp31frMG)+bP zXqR()>78Ld*%Ib%2w47DiUZr_LXXt%0jx11)EEXKoNN%{>_n8-FpQmsC=Yd5A0j42 zT6fyVaxX7puA!km2bkt`azqkX4uE@bvN{GMETaX0nZo(FdFYF5umGf1j{NMHpNjmq2WKDdu>$D^XXz}jfFz$C z&3g8FhBzThkbcjp7VdoO+*;sMK_lE&a@65tZ)j4}jILE~?^9Hmbeeq*V0X0c9obfc z!+kIeRRgHo^;~lX6Ro7Hv9uTaKp2_C zu~qiO1***|sT!f%zR|K1g!f)4A$J(9U<>i=q$Nh-ivXe8R&#vuhIP;}U>@RAA$#K4 z71=QXKw0O~+c2sTvKxkH2n7iDhq#DaB{6+*&njnu!C=DEgZAAx{N;?X)~GFxe-g%J zI{+!&k2%6=I)dO{!x==`EZ87`M2|kW)^=e7pQ*AiqdmWQzBISSR)3NPF%cM`5ts;xF)~If$p@{L$VcFhi6Nv+mS{gh^?aseagp!pQ_rIr~iy8HFmx8|GkIx6dVe ze0~ggD%JBV{Z`|{|284S9VoEBah9WQsCaQc#R?GC`d`!~&Oe=F>!xQ_Ce(w^)sW=y)W%nc zLJ%n%s%l4Wi$X*-td2;rPwpv87WRO-{YcmQ8W_Uxj7nrt(@*L>3<^C!XIwo5T5NI! 
z=4fX5gX5Ex`MBk?Y9#NzeoZ#D;vm*!s7D|Z?sHeg9Ry@*F1nqNHW#`aoJ`A=@ zn#Iky1Nsn!-+l`={v#O{LM3e6a3oMXy+-81G8RVZbDwJfgxr#DTs8NaE^M#Fjpqo9 zPREnMc>Dd8mo!fuutxa~!}0ZkmCNaF3G=Tm20YH+-6KR85CgXD8E#E6Ad#z=vRcD= zD{YUL+X9O6K;x^OH9AHme`69{M>a4WbA7vUb0=?l`TlI+K3>mo=(ji&_NksC5w&K1 zac`?{i^oHjh;ii1A4NtKC$CscM1I8Q`oMf{AkW19!iNkRB%iJ)r9xjY>!4c5)LA zwL;Tum>oB*7wlzi15tyj>d5TuSmD(z|IeTYe|uw7Dts8B^s9Ha?7h7A#@7>PEUuQ2 zZ)gL8c;$KlYC6`N-END8icj+NrT|zNt38DP;?QKuRjp#a`eS)KK2Sve$1Zscp`y=$ z*-_BkB$~?6YZ-0p1jn z?VAKefHtsQ*0Il=;*mL#8x@1Q8p6$G3_KN$LttY0hwn=HvH}v%#RQFkb7+ZL9|oOA zbZ=xwYMA^3zrqxBYK1RbCrs&tIOVIp1_{M#AgNbT`5N7!dABseT(Y|0Ntas(o7f(^ zclpA~QMWkjw94uL24~?mKM^oUi2j+EEq`FVI)=EGcxC0{Lm6EXJ}?8hO2uo0!Yl+@ zL}R+#k)qiSz`L3qkbJ<85$7m|=ja5RqT5I`nht_n(kl!@51;vX9ra7E1DHH#_tm$D z96;xx1N0)SwCP1&!qgt;7gnM0q9%6^+`-uMh~LFAs*=N0zSNE?62`OohCsqK9oFD| zrYpE*zKRp%7RZ5yfl7Neh_!$={JG!Z`bqwx?I-Z?927mX`TIFT+>OzJSWU6HI9@U&gC;IT+qHt_MXVPsuh zYKLQ74Un{iqnM#3%R(cCJ4?#gqR4sb%d^-?J=aIAup)gJL-Sb_JXD<;m{4}rCfLJo znxNRcBksyu*@g}`G~RlxVDNe4H~dHaVON-)*>7H&)vAIeU_8-&Q&M^*XZ8y! zuIt^79aE2)GFA&VoZMKO|9y21qDGZ~k8$rPu(~`gD5O|YygZ6Vol5rxTh~Y4=1DD~ z(BIkwv#=KcED^%NZUmnwM?Op}s39ef6weYckbVJ_UWTK6V5D2i}IDl>9;%&210&M?K88`S_+CA;m$Rhg<8D07h9HDBQClV9`XT8RFZllOEnF zg_FUEx)$bAuXO@3%P`Pnc)iVUqr$x+xF@nbyZ!<>NXd#Ud5-||MtU8~3qbB^Tk0g# zafE*i2qxO$xC~V0^7P{p5j z+8$!|_rV6Iy+^L#2W-l$3%=c%YP;kgCp@4R6(t8zEwxc|QPuIMwC58XEM|dTH*i$Z zsSpr3+9F2fTStz_B+y)#Cgd)DN(U&Hh-h-t7o>X%-TH9JC}uSLM!?o0T@esk4N=u7 z`-v}i+v0#_+- zHQv)f6K^m*SjFKYMOWE*F6wD752skj>mFEXP)$|K{N_k0D9{GbOLKt8x^@M-U7}-; zYb}=XQsCg1nw}&zCrFVgeM_oVO+k?_KV&nlF@ynO|cYp5{CG{EjNWH-tm z4WbYiGU$(SV$iN#!GtWakk#q`k2PSX23M%&$Y(CdE>4xbUOcHXKzaQ5u3<>hZJ#0hhu^$kBhtpl91~d z4(ml?Sti3Ne7x00AZASknweS3JRLc?4a);`-t5~fF zg(%LEaWG7NwEaYvvL~qGO0prt(}7}Q z!Q*b}3QJN`9HB(C^*nQcfaJqH!%q^21gS!OPeeR;<}#^u6fPAt%l*R3;VzShFb!v{ zi1ovu3c5WjxcAUH&a6cU3jc`JHhYzepw=8o+tDp|S|NyCdUh-gr6S$GZwk8xJi9HX z>fHix_;PUEh2x$WJ8ZK{lE-3{vT5rdU_*I^#xmdPQ9A0O$4@N7DN8K=RmNLyULsdUgqT0zyxQZB=!5(UYN@R05M z=%m(0Nx;4LuczK)1}X)~m9r^+z(2{I1w15`LJxTtko?M$gvdq(^VUdjUdAw+-6=}> z4j`FuTIVY`)Syv2N-ri5-H0@v4MNTHxeMvhOY_rPGUWIKoH zH{|Cs9|Se5P^hR0K28q*m@G=@N<$+!@%}u0kr@VqplsSvZrn(V%5$Mu$7N&300p`+X6PTF3 zm&N^Co=r?9i}{yoXAC+yd3yVGz}bhVjVaYV8I(k7(}@4fp+)ObJ5)q21F2CC@O0=A z%3NVEEjJr#DcW#Vbu+jEyOa@%?LE zL00W44Zqwg5MFa0hQJumR#Q0PhB>HUbed|V1tCT=&`r?g){2@yQSHk)6e334-D4hj zDbmive5e@s^ODA?&RQZ>U&+J^hn6dL92+08{73ChRdyOH+q+r|fP3wwbeBQ3vRn^U zon9QK))KS%@ajHEn#s33Gb^qU5f`s!NqJe}0dJijBS_r-s#*MDrHk4mTUJCv9$^ za(+cCycG1BIvFhz>lLkB^igPd$B}jIm0R+o|Augx=m1nnd_mu5m{@5OLV)~A++ zyjWVIHhITLIlI8@eIsuq>nK>%cDaXhvLEXo8@NpPhrxg)t|RVoA**a)9qiOj+^6g= zr@?j@u_pCi16>{Ab|c{BK_dc*6}ABw&|xgUTg8(Tnq}TxN&EIwdKe5d%Z)4y!YP_$ z^Das#s21ri*cbMh$;3;R9m@$Q%eXfWc780(U&uiAADA^|dpXY8`N|2~ycZlBh)-Mm z9Cl6Ldb}F^NE^_EUEB%LQ-}W6PmtLgqLCK>0@g9T&M>Zenl`#`TmZaDre64B+9l4T%D67`)c(Fz_C%G z)GWq`@OV%+#{x#OuJTgIyrRETA%Vyp$b^5uj%B4atDnWB0s)k7H}V|Mk|X} zhO7#1USqSJE18L?V;Zpdtvo3p$n{6dQkxBdC zKA3!+xx!M&;|o$_?`Tic=P<%G3&$~uB=?v=c6_OHVxcm~B)B46_QQLQhv+r~``bkj zOz>2h;XXJ6=64uIGp!r^=y|l;AbYn)0uOpHfvx_fUmZS(!>OGYdW8}<1qA|E*r}&2 zUr?QfWukImEElehh~F5L9)x8XLNtvYpYdWbYZ**PTdX9n?X~T-tjS(oa0q{lTeVSZ*JqvzfY8_8~Orh`|hqEQvtdS7(j2LyyuNA}(ES(l1eI15T(ecrw9E(%0JKj32Mt#i5 zP)M9PijGpbZ^@Q4P^* z=Tg1JXv^mK=Hn`tt&#J2x4q>YXV?*fmVZ)qM`iYVhfc{eZ{a}VK2ulE!E;Z-X<(3U zNNiOzTokgn!nj(@UE0d_9r!b=dkWF)4IsyGcu6DD28rp;L3@xyb%gP9ko7>7*EV#l z5QiO2R-%Afz0hs2@nt|D{gLKP>gQk!r$4UP0f)2mlSb%I3?y}4|A-rgd?D*(D>?KP zET9RT9rIMXQeDfKPGt|{TSNGulcP;}%^C*pt8!qX0a*t%!eb8qNDG3^LlDg~fP9GH z;;*byjzZUs4Cgt3wUZN4@%VzvK`5=hOnFKfNeE_Fu!gi{)Jj+&ezquD&?d2nnkMeV 
zU8m1V_Ui?Ht#FR77L9TuLO36h8{e-m37Pxt2Au^Y(p6Sz3m+2auYX@?R2yc`3-->q zZheoW?`n^YJwVkH>Fu*Z05LD?dee=a?S=4l>Qdnp6w+V9wCPb5>|byyhIpzL*^>Q> z=b@52sL%m|Q}6kQ1tNT@1MIS%NS+8XYwoQ(wIvI(uoS6D8@a5M5F_Q=n5^sjHNco* zoV=DQp7-`)l~)hdj}jzNfgEEC(X0tV=c9^%&R73p*Pap&g2x6_Sw*J_LiEeoFeR!D zG$2;M$%4|knbY^REO48rpn*X5-W<3tQCo<%z?)$ z6Z<^+u=L1SfUfQRFt9Q9eW!1)l+&5D!5Eo_cqM5;vrCrzQVjif*s*q0=@{nM5g4=1D&~=GZ4KDq23h=(ee{s$L-&DUz+lnitx)j zG>5~Y2R=zh74U$eN7JnSH`vN4sJZc$GEN*<6ld8jKO(Aa#O^%URhUl$C0)=Dcg=^R zHU!5y=kbBF+8m3Rvmw;$8*)F{LtN=^`RR#UgPAl9Key_Z_sucPFLuZuDMhggtCXSN zfR!^u6xT=0=5|~%dSi~u(+4W zUoT|kDu%lrP|WS7?;!_)WKNWiSS3@^2s}fr!iQri`<~Cg0#S#=;{)Toe3*+QpXi?T zsA!4XBi_N3t8lH!RN{G&cB9Q%D^>nWUz?%3mfzz^EgL0By39)>lYAkMLEAKozkV)1U(xQU~H0nSx|M0 zB3sO!^y&R;LK41u< z7~Uv0X-U>QYSEq^!*E^fd&g`U$32?-Gewh_wn2CY< zGNkY#KK;oT*HQ)w36OlqM(J`gPe5^7Fsx0eGO^^rmoNDsHzHShhiV=7Bf*w8S$ps0- zTwuaQ5{}*!9PNhLoy#Ed7iN`Ox}Km!c6Zaxix-AfOFE!w(fa5t3marX&m>>JrXl%x z=;H+Li$-l5x_EhePyzhD$AhC9#4f3!;<+0^Kkx_UO73F6b=FQ(_fyK|NlA!dEP{PY zpkAor|9JtDCF1~@)z+~h6KWH^as6r@@i>Nd?o+ku*KqC4&l(RH3yEoBM1yIwKKdLL z*7}^+11p!(|6W1ddD2o@2)C=%=t2-RQV$Tgdd}o3jusJ!i(Q(8QROyhUiINhP`%n( z@QAorCX}lYVRwk+dukI0m@L9vVj?8a^~bxvxg#wb&TdRwFc9KDu?Q~Ipk=ZYmyosS zX@5R%KLzc-f+MhJFyop2(AiMSl%-0L^FS#DCmkvQsa3`(uDFVG`e2?l2?MN`o8$fU zKtE!De)|(=o(;8ed8jt8W)YnNtbQ$(l!`MDY&75UDm3yp2$W>&gh`<>h5(^ONJTW0 zLTPtF=Q^!3^6|K?SmP)31VRQci_hy=+i7T)f?U z(|FBg7NkG_6GRWr%UrJqDlhR9oXW*i8{a~1M8_Uuwha=QOO3a=<4%#sFQYu&cR~-! zfi|&dhJ+~LR5R7H_8e=CJVT8v6sLLQ3lm^Wg6oySD&txWgV=<#wNFD&JfMNK-aUP< z6NaGoOUhrCLXFa#TO6_4Q16wqQb-7d>bX#bHrwg5dtK~&d;RtCBi_;v)r0IDo7>#{ zyA=HHV&yg}QvS`^E)|sVL6pZJRrzcib>pz1nL{DTKbXTEw{1x!p(v0p9RSp@f5 zxK;ORGmjLUC{F!rG$ntLbiP$8F;m#HWBtMTdzhn&&an7aWmllo!T4_4b%fu@YSh~# zFhXMU&6=>Ioq%+?ICCPn;N-_$YCe(^<^`FVxj+DQteKf>{>WZI(?=YS;uUESw~0!L zMuebEiwl?^ng4ztpc=p4P2BMYWisnG{i*o)V|`iUrIPT;Zs*CNouVJ`;j9%? z5XBy@7%C`$^u54y8ez$ZiDWUq8GvQ*_l4V-KOK2@>9+1bVMhqiS%7kE1=JXLLijIu z;`Scza(08jys%gMxb3HNjm(?=5^Y{+03j>Yf+vi15pDLlUK(B$D?tz5$YbVSFlVmg zMB!hE{}7dDVzzCi^;6oac%|rREQporF&(;i%c|hpn>baK1J6+uO0a#xc^*Qw3^a)N zRM)m5I^D877W!0#3l(}J@uJY-K-v^mZAIgh8KQlm8KKoXy49~Ky@%2-fw}bHVFQT$ z4`lc*!rrQ+k>x?yN08EkilZ^q&mDbxZ5W=fk0HPm{hI}Ei_Cf&HKKcEQiS)&V$%P8wIVGfFz43&ey-q?xfR^O32*KN-X*>hJGw*#)3@SXLa6ek@#k9uty+Q&`b}J4 zgbuZz?u|JiF=t^!MXxgZRL?=H6L=wITkY*F&9v$S+tvrcwPIoQ_E*=g5Pol`(%A2q zH%YJ+I(eSor75!#@gpaJw5z_p*yW=T-}S~?hyowy;o-x%`bZTmQ_1nQ)&dmygGnid z8pgzGCacWHQ`1VjGY#zD_G^bF8^%Ag8D(23D>b7zI7ZT;s-^=p6EAJPfBnk=YPTV- zNBo@_d?;Kp=b?mj;o}?JVGMqtOw-jLWav!c25Xd-aEHC>|3Za(gt}p$EMD^L~^!QJ@|(THY(dXj3)odAjy+FNa@%ok6xZJkaS z|ER}Cyq(=D7IylRb;$9QvaN${NSj@bO-AhVSmWvH!K*Ru!k9>!odU0pdU#7S=_dQ$ z1iG3=J7tDRM=C;d=A6B4%no|YGm;N6Avupz0I$Smj}(BiUGZ|i_ zGcPv!d(jS?M5V~LY{=4i@veVCOS^;mgIA|I?uMFa2vo4^sJWG|JY26-mnAGtnt;{_ zXJ*iV*uCW5{+ZjJb(K4J%15~`byFCcqM2D+dx)NN-08|5P9pEF+!Vo*D!Bfpu0{mD+C5yaa6UDwPzHz4AgTPh?#`nxh{h^FaRv9jTQ!}Pe35q zM2{Nfi#HRNN}UwFi0RSyOq74)rA!4xYr*AEz_C@n)JZyFQ0s!DTy0I~C0n&RLIdaZ zai|ox#<%eeb6FKbeLRqqqO{|sK)q}8p{r2NMQpc)_>;{d+&s$|eVFvn5ck*_uOY}% zMyL1)C;7Bt%c1#us$X%gTTu=zFWO71!tCSOCb+--TiIS~>DcLz%Xt;~A?JZ}nNVMw zsp}!{BzsYHw;+YzmyL>|Gl!zF`KWLOEwaLG@#8P08{==fUa|l3PLwmDDS+$xSu3D< z{$m!e*?m3kC`f#JwL|V&NqMD@{d+1kv)^_uBsLJGKUl<5V^xO*$2Bs z8wB$ru_Zpmb8bA@m&59~ zaV71@^S9mlZ0b~=O_S15$LwyHG0|KV2QSE1E>q^PkgYUE)uyat_jd3|3uUOhT`!xC zf1Kf|@O?2#;`Zsp+Q$_2k)t^6WGpu(V&GO!TG^g4pA(o5aooF|sc!gZNF3!ne7sG? 
zBUC%ZNf(kO{hn}%PABd}pJAvfbH(>dR?(@(i}klDi#J`y=#qHD5rGf&yQX_iGqT|z z%h<*L*>(NQuKvSgkk&hwy@cOyK^53|Nw@SKP@Cezo1DW9eMAd-)P}i+5BSHq5FBZ2 zQXF|&3m(|V`sPHuJnrA-mfmnEzBb%*Zw(@Dy^qsSFZS$oY-B903Ar+NwpAKIeFO9ID(r3yfI>L1&9vSVBt@MX_^3 zK)!6kv-lApbTHlP`aPnG724UTijGIIxLWA3Tnf4yQG1U-U{g4-;Vy+e6uKF#HyUHS z%>;$3EuIe`vQ^qYRip=~sFd;V!HtfHQ>jFS4e;WXq0h>MyOWKGNL<^vG|wGpHpVk& zp4u$>is`9#sqM^PVW5^MXvJu+K^EdUFm9?i9J|}0trQM#sXCQWqid5umYD~J{#+E5 z{ximQ*(fH`**QwK(DPLB4ToZOG()+rQ!qG~g0t||nT2{A!aRP46sgjY7ci72U1l<{UxJO7qXi01F zXUzfqZHW#8{l$N)VsZLRz(>Cr3vQ)>D2hjs4Gh}JWX$3&1jXPvd#5K4C2gt<^nq%r zNsHV7GI^^eL7Gf_$Pg0`w?Ne3_A^hK-kQ^t_bBhA8N%Za(`f$%3JtCxdkYME9Bl~Z z>i}3vVUXi}uwGwLzj$HBqO=+SxqKsWGK?;1=TF}CXZ(7iHAG^j|G?d*gDa;Yo#L<{ zZP~gX9ii0{)30Mq==p`%zd|a^u-l)jxFhd`n46aN7;3N##7Xi_51Ps6rM+jdN{R`B zV!Z54o>xf$dwLn)v3@ai&I@vZM&)J`rN&05xkMQZLhX1n(Gp#@zIBEwIN`jAcq#ky zf_^)Xbx!6gY3>r@5u^qgKAuN11!`33s#iWkWc|_VocXeMQ$}MB#^=*lN*f+_m~uqw z@@uzo8K40uQRpw~4oQ+K_9-z9P|7O}O6z?Qm_d!bOHKlEamGzA9Gvo0-Z`0`&IANx zW2dagBb&YLj3Om(xrknifC3PyiLq!p()X%y2l$;2jnFoC!_mf`&v6Slc-S6d%nqnx z?+Om89PBQ3>(8^LH>WK_jNzf&E2t5VS5U^33C&F0)c&6n!dPZz1l{Tc=dxE*`9?Zq zK-o~)I?EagIV068GPIf`(Bhkb4Q(x`-zo z>Brx^s4<1yMRqQ502f|sbwfd3YqXH><;Q2}rDAwtoi9h-0Lig;rV;}|fK=`pJ?;!U zd(GEaDnO_5oUc@?$K3L{_zIga#&Z2^2}94not4$jK|CwoJq)#0pu^9Fz%D~hZ$YUJ zCNm#KcA|zt3*fr>!mISX6F5 zk$v)E;d|&wT~yNPwZ*+A;goOEW5`nUP)0Qg8YbblAJ)g8=?(g1i#c6!RQ}nhogC<8ID3Z_TXdPl zh5#g}VB01=&m&Q!NNi5eq}X~|04ZoNhaA@%a)GOuic(Wl?{6&kVQfxo6JD6<`L_4c z=_9z$M=rkF*m!6T!LN}f;U%r{c>tmF2H#l+{1P1Gj5q&*KnOl5c7CIx%HOu|oXU9+ zMa+gi#ar9l(5r_Kbk!_V6!Q*Ynh4uoqFMziNfw?hCVfjmQ17d*ASa<>+JRrbYyIE5*O8*iRH1o{)kh-LD!b!joS>D-6nzGH3=BMVNpb&!1j!ajRN?gL%OhFY=x#N~ zsl>h-B%IGUQmTGm1Mjud{8b^KA#x(tBMQNZq4q}4sG35EDrg4vSm{CjTe5=oMyu%w zk04;h>d;B3-!UQj&1Bbt77z+D#>_yP2Ee1>$0s5t5|>cz=T5kD9{|hp8!Z#s%R)*; zgJ1X3!;-4sCJuaZg33ow=}_*3f&&^VyCNNaIuhcued@wcH}Uo_3cl|=LGr);4dF;> zx#@G|*AGyzKD`iTEC>Y@ zx+I%-9|w>br})g9*T3JH6c_9Dd4w$xqJ%`?I_|Q3ComDD`skTHWKI-ol z*|&4~sKimK@?Yo85`Od#-JAzxbiYoVG;Dhgg)}SZ***?J_%;V^g&(A=%Sh^TmVZ#kcl*Ru~U`lot<6=k&A%&;A-F&muTZV7rxi6W4JI= z71H;3fI1Q~aqTPo_q6N07LDa26o0Y^kR;u`{*81*RTmG_^1@IUD&x(83daEeAxu4a#_E)7Ked=`u6GI+P`UMlcHK zCsV(_Joo)uL{$_$FI;;x_7^C5(p&mt-mx#|Y@I&ESafAQx(p*F_tgwb+{y3v`Zqs- z)aY4Lhy!4jH2M3lLN`jj@GSrdIyyDFkZF=hXEj42sfgizVxSRvuS02f zCcybvzRaw8h2XRhqZPnfS`+f#K`W9sL@4@kClc}nKc9y1$TU^H&%Q%EVF?FC>XHS* z)I#Hv<(aj*zys%FS(o2;OhVb`PAhO+zG%FYt>CvgfkS{|+ z)+PmO(T7AA4^(gN0DfxaGqV7{Lw*08;aykX{slnDsQM<_=Y(;aJyY@c9O%$@00d(Y zgDC_|Rpkrn%?%&<3wM^ep$b9DFo@4S&;z8fkIsD95bWmE9v3HcDHa#z7xe>Q6r;h> ztI3SUvJd<0-uJ+H%K|XNdm+aRX%RR)TCwtBp`Gu5^`kNB-h2Q>A1$^OjYq?xSOFY( zib80DcoE~q8OX~$20<{NrA>c2B**zmaQhqyyJ73Hp(7sb`4zmEk_xs9{MrZ1=#o~= zbSk3i0uq&DTiMToy|*3z4aW{-CoI1=gdT}{z%&OY!oPtZ^JqFMfConY2t7QlHw5=< z02E2?p|v=a-1I~&p49LeaKWYAUFbjSRX7jjh-cDUgC#Q?H@YR$q0XEoWQNz|E>oRW zeIF)bYC`#OUAo?tQp{w@A6wd>ox~(g~_077uMWT&%E{-|>&MlAo z&|O;hnM#4vz(1j$a^8<~5ll-4%*7x8>eBEul69fI1zJFO7yn>$0%cQ&O;L97Wfs?rbvql-QG$jbuN7iugdP>Zd{6mgSHd9-++}Wp7ja-4Rxcc?LPXO_21553T2C7 zSUZ;9Sx9Ld59peGeEv|bS-7`IwJ5v{wBwitl|mLAqzbs)dGFttJLVTo5j8h`afESN z;lT93wg{U&@8fM@q$lrUyN0U^jo)*`y9uq4z4@$En$&v&lyEDFA!>+zKW+Sh9xDoPYKpbc2?r5bR8OK_%EOrvWd%oQk`Wk`AsXcEn`hj_%Cy$IYb z!KE>^CZZ}3Vr-B}>mL|Wprjqm&fCAg0v)X8C z?@iBxSRD>w{Dic(8Ja+4z-BWeVDpoLKOF-D8{|0yiMBuMpf1BJN{LP|OtMk8b3f~` zhe_^70Yk(=xfV1=j(cjhcL)afa8GmJZh~4r0!&A_+yLmK=-vR{bBqH@vz(ksp8FwC zztC4}q$*L`JEIHpTQjLIa{=h+%h|8Tq>FDMfK(E`2bU;qYM2f+Bz zCz(RtDR)8~!gp%9gq~2umkVhvQ(jTsN!(}*z5vzl6$Pxr8YFSHBH#s`$?bsf;cLU2 z=}VB4rvjR<140dDK}~57SCCRt3>~5-6!txHMo1&hKvuUH&(zTXRJoY;_NwWI1W4Gd zAv$s(*uoReVeJ9{VLJfDtxfMi(S|B4`M{6u{kaiXpAgf*Fc@s^;A~9_7ka-|Xs&5< 
za||xu1tQ}9qU$B_ujAa0-M75E%hjQ!%>9><`lbi&k{$@x>b>46?ge9Yd>zpUc1N!N z0H@IQPDmQfV0OoA7??8XC!S0BM+A7bBA0)2J&gb+&`c{LfMgF9#106g25{Q3WCdt0 z&H*~8b=9=)(FLb87Ba5*h<;9m8`cd{+nWbFvIdXKM0L~NL-UhRnk&}0i(v57y4t|L zv0o$)kdT%z)?}6Gnw@eG7)BHY2ZCoKkB zlB}Rnl?{^*RodB>T6Q7NDaY9j0t~U!=9wkRp1fIBEc$@N6%zk|!kesAr7hIW7of}` zxZ40VE0NYAVcT}*)l(J6jyg80A!<}#)vH#3P`4lRWAoWQ&r zgGUoF!eCGlz{5;62f~Kr^D5V%E}iop7^=%5#mc4E zK?zKt`NRn%l^Dzr+6p+xqlh^gOhqbWgpjNP#|&nn_R!oa{M9dg{X=U5wPJ3KCMQb>6Tm-BP45Qu9a=xZWi zb8+d~CPBusQKc2?9xiNQc27_j4!n_AtAqpi-K{Q*hx&W>DKK}@%N$|kJXu8k}0MFG`adYi}x_IB@18C_gE`!<2ZK)Rs z*`^0Yig_3(zCgDn6jLqw%Y*#I;kM{s`65+}+}?(#DaLJooV?y zRc5*q*DeGN)92;44QlQx?W$&VTANB9rxt%KE@_^8fALgRhHCcWP*E1RipwRH3jJUw0jV(^pl0(||k*_Sy9YLAmnI-GdQWUbA0 zbirLu^hu=1iH2cQ&U0Gvyn z5AlX<^ilKD`l8R;E>+5&renz0BeKl15V>Ha4&{vfr3A6<|MbU&?ovm3_Ww&O_tO&> zT0*aD`VRiGp#D}5{i}c1A3NxEQ}!|>%|ASGp{J>@_^$ug#raoHT<}6PlKAEioAWnXqyn}(dY|8GU z{KJnd2};a6_u(1)zg&P*RD@T5YZUzJ3I1EO(+GLCBk4oIPWwxT>t80rUv}4@e}_J0 z02rYZCha`>1snihbc&Dgw}Q-Xv*`D){`FhYCm;_fwKKT9AEm7S8lnGkq|g~C^ktKa z|A_1@R)s-(#Xk)!f3vgfZx-vB4|pyv@#sG?ccF2O1Mj^QF8$YheSt;5)PUhPqtYK{ zrEcVS`48!TI9+adk%LUK8E_!SKThi}&!zUWfzkZ9JnA~WDZfVS(16vUEZEgpvt6?3 zxvPRUq3cPc!r(6#20Dv8@b%@fv2_9LrPI%@5H9@n96~SndB)tJb@kepC*K5~>5;d! z`~5Ug^1Dg++gT|l1C`Ii~PeuB{XueVDvWHpG~6tELe^R@f?nE$do|2gyi znADlzKBD8{1Tn(TDNrVi-v9H3zd!oNq>h0wRO@%LZ7+a5Ni&&$lH5`0)7!Sre>Wc8 zV#v+vQ_X`)dD(*K|KPumj)3m1sWbeY3vQo#o1g&f-=+Y!Bl*fmk_*{;3$zqI+oMqJ z)1RaM@#r&Yw8YONL6-v~y!OF^Y)SAO`XX%l|HC0118Ejv(glC_Ug9HM_)(Pn&-Z{c zIH5dN8W(;W8Mh$X{%>RaPY&UiX>&0v#sgQ?>;CZuT`|IAe;dhPsu0DHC%M6Q?xo+S ziQg%r*8gY}=yzzfEH|Fb-+TI?d_uXqyCFrV!;p}cK|xQ9o&RXKuMCcmj*CAtM8?dZ z<33@BVOq$iC~9Ap|bw|^^Wz9t}k6(=l158+OBP+=M|~FNu z?!xlvvxxkw|2oY7^giW`@1(hbzn%98=$fcDJ;#Oqa-h;M+5hE4{x*=jL~uF>_CDv@ zcZT{d2>j!Ef-C&rFDV#%DLag&yDWpsNSp5G{1;K*JzND-J_j(Dao3`RZSmB%5^KtNec)=dX+0 z@Arb=0tfr5@rEZdFZ3@buAhmpOznRgNsle5y4-tPdD+XxM9LYfr1N2a4)K?7{BN_% z<}rLv8!P~CWaeM|Wf;)?!An?n{(l+?544hT+qQB4INlz8BG+sG;iBq>A|XSo@h+xH2bKy#9hYi2+WTnm4@vH79K#Ix-8MxYl5@`v4*!gpi7z1EQ z!62i(Bb1VxWQrr2e-F67@tpuX;)!~|A7%^u1&R*#cSitw2!l)L=SA?J@qh_1~kV)MjMclr8rhaK~_0~hk6LGcupOb1yEhJ`*Q*H6Q}LTL?`;?Rq*m;;C@{Q zeyrn4a29}pQM~=c`R%;id(LyS3-;GR8%?N)Qx3{$SMI1>K7+Wa2(#Uk271$EprcF1 zBdbc`rE)L`u!JoAPJ`sXyt+BRiBp%D=z{b_fP4Ft%xpr|F`F!b-`1-~e!gMQ>glJ- z;p=V>^(H62uUhvU_v>mFf>uX+gO>=fkJm6m_EX1fE`7WVZni4xza^Ri# z0WcYtfGSsD!5#3ZN*J`$O`un%NAuumQ4l z!>MUdaO+`s_D)P3RNt79X8|6UyiqBb;SK{7#Okx(TDewl=Z8NvkIc^R1XVbudyWpmoBT%ZGYwx_|k1)7#58zHQ5Xl+&B& zrWqGc+AVgpCB0t~M~@cMH-H?CKy+ltOGDyWQQk>dgyuM8R}I;OQOeF|G{43d^1_5t zY5{%F91EB)m>MJ2pDr2fn)E^%aP3shU@eAK@G447IV&rqJ?N{#V3F^*QPS`@`uxx&l7n< z_e;=ntkFO2ybM)%F5_QTZ{3q%kGULH&<2K-%RtyjiUFDf;-o^eYlcK^ssv1Y3aUH>t~p-?566l{b^Z%J}&{vJJ| z-Dq^5|1aBGnFiQAhWCt(T=OB~LW6i>$I_k-Z034sM$o~goVat64bVQFIAbx;886QT0RrQC|N{18NQ>;s{j2M&_p)LzE}DF z!&Z8+RCluSc=V%x+q^+wy1@G%-uGvQ;Ri+0g?n0;%@g6C09u2b8HPbwKYq;}sEgZN zCF!_eoeygPK5Glg5N$H54Dz53?vNI7>A8`(U!FZ6z#iIz?J5P5g?Y4?T$WOdrS-dU z0g`V(Hp0~wE1)QCJNsOJ5oE&# z5fT?azPi&j1r&=yIqowEza(~1uj0exVn&kHv|o83DPwHq8J8z(v&;o3p}{1yVC=#h?|r|`fC4e!h;FZ&*YbA^IXA3 z$Bw!JU_j*99RL+1A0gpa&NU#WKHKug@*HplCn&xx_wfSLi7p4eS-L4;? zLF8V9@ufK42a@P3Nr!psYkGf=oBrG*ehH(lN`pCi)tw}FaFz=$VD+)>GSX4$qQs^JV+aPvh$CIIDtA~pnh3Ov7BRt%tm5vbMT2sZe{7l^0|_)#Bg0CRm8;GzY? 
zhxE338;#G5Np=#($Z3X z*-nw=HOcbNz=~F)K&=L_{BDEtebZnisN>!uL*acPU9^*Y5-{zsgZP|)*w6s+^6r5M zWlCPZcUi#zXLE+=kvmSsU|0J9X$B#E-$y`&35v}Iwc;*r zc;O_~%aA-VlH(7m^QJJ-A2H7TO97Zf3wi?>CNH310;G}0_>TW zAZX%3=0WVW%OZ-IUXW*u@faXJV1O@I0pPc)-UH%6%H0rx=$B+g=7BZaFwtwUEBU2wc?&neOyMd{VIG5ZE|IArV%x z!6wT<%Iq7U-#pD~0f4y(ApmVMu8jemt38ObY9MO(Z2-0p10PiLQHxWD@047@WIKf< zy!PKiVBHH~ag2TVq%F@7hT4J1KEVEe(>=L^&Hz&f`CbJ;l%DUv{)>t1349tukP3`3 zfd5lZP=bzCjMJF9*~ z_y7fUK&5&5D{{Y4Dm(iV1b@^5Py&!M&0_cW>~q+8KKZ03=K z(0kSDQV>hEOol$ZAbS$Y$xeitmBlBVvX0>}kOovTJ7B3CNw06H6*v)IJZ(b`3#r-@ zyPteu^z2q&6uvMdmt3zMgx7YEb-1l{P(|r`6@A~_Ij{|esyr=36|f^OT^L@>(GHH4 zX%;p=(JWkF1J+()Duz%gmxzr>4*p@{=o|Akz2OBu-z|9-e++MderIAbbr4rf!_GX^n*qSE zxaV9*7tvUR)gd+SVvL)ToWy2l7-5Wg0K{&)xU4G^|m$GGuu0J==Jw)xd$)gtI8UB=ELd@~Tm{3MUiuk4& ziFsXdqd83pHZhW3%Kbv)eHU|1;=f0C(0y`=x<~TYaO5fq+GoG0y$G7)ti2BmbL{K@ zGk5j0QEYLTUk`jy3Q9vEE$tw~B)hURN}qEGn=iBwjZ;#p#|DK^D$p6lp9Mbf6ellu zw5w;Zs9<>_pU)3sEFv(*{yooGW`ZnxqNwK~N0f5F9T)*jI|T2uko21Nt>yk_53z1v z0r}!ts~i4mKr|iV1dJG^U@)yHPhx<)H8zC@sztbgdOq6-;D?KFcAnP?G5{HZ0Q_YLfUr^?T%j-MY>4sRQL8sYp2aU--g-U% zrt)})2D~mY3&`WhER5dbgsWN;?m&v10AwfJoNNdXXqk*B&Ty#;BU*WDUGxFlDWXUM zSgm~_GJ&tLAt^Yo)_*d_w338IqiAfa2XKh>4IQJ)kWHyNRnQAjQ}M&5{&yf!^@? z1L^Pzh$w>TPz$+~Yg`b9-{-;Q1@2j`2B^!_?Y$6RHR7u#oK<_VTBiAsIWVWgPnk&lyqP6 z`iNR@Ne=gYsdC>Gkhe~WiMO!041w^B71s|rZ@`aG_i;oD>WdnsKz>}Sie8UPo0_>C zfv}8@yzXXAbvUuIMz|eNd7i|R2T)rB>TU$F22?nJG;Cl|55t@aF|!82PMV4BH&RWd zkR?1Ss8v)ZR?&!)u;31ejC+Ilc}#Z?mk$rRsUK^n$__v=eTeZDz^+dQ6@&1a4?q~F zlo2#f{B8~`5!R4W2{NeL4wKxO1!m?qZFxfBA`X+BTt(Avfcr5FNJ~I(=rSQ;Z2&^(^7{{QCQZQz-Y*}A?*;N6LFzTD@QHx(r;a) zIcq$v>3&IYJ8`}?zwLp)RB@q9v4;iT(VuZ%pac$D@S?i$W-Xj-xUIk>pym{5!ubXE zR=T4Iu-h%$YpIB3JRO61rswCdPz&tsvH^nQy_-HnzbIUhrO_aVG_+3%>pbj(n9@L z8{c9)l=-jl3$22yhvR6hE9N@zBpJDs=2%a98yIsiAoa3#gi71`>oyijSo;glM7CI%3~=lznCvEh;4un+#2L@rnZPmV;93%F1?Mv zg_=L|dUsgcA1i|p6nYA+C|!AIMO|A(J191E?w0C2`Flk&q2oqI$q4h+!7R+kO2fCx z4_yKpj{TsITB9xeKE5C@gF{RK;fG@9sH&~fy?DByHE;qY&QSwzaM?C&wGgf?%i|{&p4U}=u#2E>bpWb20{9F(cK=8Ezw7W1jAV=yK zNi(VT#3QOW>=jyRo9UmCTd>{D$sL;yq0WBu5zC(=~tlqJCiW;jUqBix;iqeBp}cR_w? zPT#X2InNLc3nu{)%$wyaDJ?Gsh&UjwZM{*i-{lyg@*&PoF(N46r7}!{GIy|VOH@l9 zGH$CE#Sdx|Lgk79Gc<3=7b44ELlpLF1hY zY?&Q!pJ=~Hk|{D7LuAcJx+sSl<{Bq$;TX|0mkA)&L9Q;w3?4oe29D?I)dX#mo?gli zD9hU|{M>+y(@+K>OwM+0l$x-3DJPxa-Mf=dQ)14Keh2FE79;DOZKpor@{)!yR_{Y! 
zw)Zu_$a&AK(ukT66)pa~B0jiNbt{}lr(muF>{N{JMqh~l?Vd&Wb!BdM2Gej;`D6L^ zqv=J303T437q~6)X{d&d!<9vMlI>f`H9?mP7-awrnCGMW{dcxjHQc^Wc}ZFDSCbUk z@s3j0HN4Y_V*Uhwh&K)}Rcq&|9Sv}#)O6+`ZvF`6SQtAc_-u>Ip=yefx4hh2?Zauq zM^;?&-k?S!IvWfb=i)!jwmgViZD7HTL#gF=rA2uh}k2%bzJ2^kzIcXf~x>H!WC`cL@DA%@yFN>|;4(r;#bLB^AU+A_2w2UcZUZ|vS9w>E<8h}5 zwy5MC4gtkc|2ISa&##=^1hc%gZMeirM~UGTXfVXE0T0X%d58EU>MhhQKb+2eE|7k- z;@nb$^9>BS`LO@}o0tBm1px5`-_<>`zBbcWLfd8hN#2;4A{i8Yr9Fmgg(s&8{nAX~}H69knNGx-T&Jc${ zUdz<0L80#6N;jFn<(wVTRffx(=+@b=8Cg50in2|mh=s&{5WV1yfRj4D(778)-WPbF zxTc<90wAD2l3!}SAZlHcUw;4bM6Nvq7HrOC4POo+AVZ-BIWCZj_z|ND=%qEUfHkCQ zV`)T6>jr9T0B2upen^SlqrKW^4qkX|KegUA|9QE1fG5hJO0)|#&(5G&>;ERfzlwuA zn_QmlemRg{^C7vF-;I$K8MBd@ddrStzD%SnqK$pgT;W#txXZOUS%NS(>J;g{56vdx zy3vDPo&p0iHx?C@1+Z*sV7gM|U-9(r;_Lz`fGx8vUcul@fC#4qs&7^l4(w;K_CM`PTuom!QBPDD*^!bYbj>I01`%XULGhcuiSMrHGG z1osJ*1p#EQsoFHJ23c1(US~wOrGZeqm7|>MlX4tlwx!)OM+a$i!oYPN}w&VV(KyKAmO` zyk^f)Ie)?{)xGhwZDPn7IeuoG0am)6u`$V8dX3KI1Js(0q{Y7Ak{Eqs+9Mxzof^^Y zAZGiU@STb@w%{3og_+=qW&w<}H|ZRjJ&IU=^nj4Ba4MK1E>JL&Up}RHn`H~wz+ZLE z*G?o#*re`q`fibM4-XF>P?1mZID$xK3)%&FzF0Hm*KDA4{fb{9h85Ns(?78?PV@Zs z)-cW!k$|0$(n{~{PSNyOcalG`gkE8dlnU_EaG{0CwS8+}`3QILlD9{FIKd3RcewGw zhw*-6;k8#W+*1r^ZWqkl8|+d?>EO6)!By@@esGt(`WH+yj5}lcZxgE$kX7Q7Bg}0c zf&S6FLi0t^38h1nK@hrCPkg5y!G~9m8j;7hN&rY*p8vLmb)M-P@1iq%4M+o2c(LO` zix*H7s|c|`UU8qickSMUxEIlB%4@_uSBHQ)CbFc8eShLE$5%p=A0k~qYcFYuNJyOn zNyibhaz{ugOSxBxNA8=?H0Io@H8gBjCs+*BFnESX`zP>xz6d`CdAb(x@6em~iNXjM zL6aJ$qay*B3u>Len@x~NTCop9m^6z5bK>^E$`TZl42{rScw>rY`Mfo*ULIx~jK>GW!n zfm+NEq{|hVA?HJKD3QHwN0vK0wlE1{ZRD`Zntnm=Tv%fSK*j&Fq&4zg2ZlF@Z896GfS@J_QlJIf$;bCkgIm zfjz-te32!SNm&TapuQLh0b=<2Ksq-1=4~K~DeJjPktrP@ZybE;K*76_1myQFaRX;i z{&-1)-pPE^qhCR{#y08?(IjuqWb%vGW3EqL6&_8UOS&aI#nk^U(8(K=sK6do@v%;k z7DiNF;EGKt3Ja)!8Eowz`HI)3;q93PmTI5Wr{*5K`V8-R0dD=&ipxxK)(|?;C|!Ov z+X3I1VtTLqj?(CC@B|d}qK*%|^T8J*BPDxF21MT9V7{_WNhIn^_ zL5=OnQV3JM^d4$~iyE3)RL5G5xT)MkTve*8XSrX;-&R_$?$MvR3qmoJuhdjV3fReI zxwO)TxoS^mVNA)4nqNQMJ#)WLl+p)y1{VO%HS(Y#jB`+Z-*Hx&2sL^S{O@NByR|b0 z;cMZ9@>E|z{KXTd97;x|d=BfS?r-+U>8;yJFv&)ax3}~Xx`Vm=NZp;7g!|8PCZ^*j zMn|*E$gJpRyFd-}CC847cj~QVg8jsMQbw6ULJ^s*)K05%bj?lGNkW-zfTA z0zUfhlu&dZ ztuDo{sd!#2$iD`5-sYOnanw%w#7H89zXKe!W$+w8S#N}l!PltD+Ztq3*n*OUDm}#r zCdC`gCOCPK_kf&{x%fH@M8V?NsT(H2p*6_LAEk32ZROAH@-I8lI|zjs2^M;H{?P4> zM?c8uYPfwHaHAYkZ#DZM(TIU}y!7W^;qB-4zJ^QUx$^}vbt3>oi+E1_uzZ0_z8ax2 zG5Btsh}C_ZkgUn9(7;ydyuA5{0nr06E6)JaQjC+cch@=_>Z4ihu5)uv_5oIyEawAbtAc&JE?YBK${FBnHF=9&k%E{oTc=o-NH74__{x7* zE9Nq)KoeM`WC|PZZ-Y!ovhi%V66$-unmc6ytge$Rrlc7TDeI77bvHRVnRgiW4}tW! z2!F^qq>!S0^$<<1YWlr_VGN$NFFFc0eHb9v;e?^HJUI3;ft!uQfRSuNdlG?h;f?aq z6`r|M@XBWtz}MdTU!Is?a>CAjN(XaCs=aDJtcArw1IE5p`?lS_!wM4zDVsSMD4VLY zXVzMI1h`Gq>=3muZ13xIgWhgN|8)`*RrU^fd|`V@CR52wf3*m-W@MZ?LTIBp?Mh|s9+&jB)1y9&dH&RxXdhEv^=nKYNMB6KZVYIl-OpjmqwPA2B3=q z5bVFlz*SELi@t3NwnAD%O_)V72fUo)KA$u$qfrnmrDC}_p$i#pPe`-qg6az127SQH zYA}?Vk+!#(o>(AQ7jFk}?RbJh+7dvKBXKW!@3j-ksC?F2?t zbT|U=onOfFFWH{N4Gri-C9iISa^yrfRk^c@EA71}hoGdb(m_9NZV4c=`+mTKVF-YY z={#XcX_Du}zT(dxzqkH$7Vap2A?b{<5~BtDHno|!1N*ds=h7si445$$b{YAo+D_pf zJ`5PLeHl(LY&n_M;n`d1Vg+$0#Jn30zVte)R`}1u?-csqWxw4{7qw9U@jY$>1ltH$ za8TeP`x|nedUx=3Eq8g}!*|5GQu&g38nagX0JO^OjG*3999@Aga8%v4PRXusj`h7A z+av8+{#FJnVu0DFffgSNx$d?V>loyP;XxM7#v065sP}Gjc>COsUb!yKgVzvd7SHTJ zsktgj(a38O+cMil}>dTr*kiG3AKQf_GaL)T2&Bh+n5}@=gk)7K+o{( zt7WOuFXut`b}73Lo@|=Qr%$FHo{&CS^BC(HN%7nQ6YLy6*Y?zSVR3Ip`V#9*Tox+! 
[base85-encoded git binary patch payload omitted; not human-readable]
zgw!R2^Zq*{n8GxVcBT)eX)mcK`z3}N1{t(xCdC+omw-Nmo2(cPb-D?FG?NqP?W0@_ zUvDSGnL?TW;he(6heW5v!zPWI_$X*bspOb_`$l5xpVg!;JJ}z-dtoWXrDsgD$vv`*H*T0|x(1Mgv3jwUWc+du_wGKeLG3EYjQ+wLr`o|G4f zPTu{OXuWd8dK&>~ly++L*)5$DU!?dW;;}pnaQd?j9|Px>Ef;mPkC`cdLvW7>bee5J(M45|Y=n>8V-4Wl|_Uuo&^11%odaVY{pShHx zuC!zU-BJ*7GatK|k`j>-V**bj&M+a-1G{Nz4_8bHlu>r(9FA7OhbWnHCFXfA`je`$ zJ(QJhI%`znurF9irghITU)9mF?IO~M9v6sbx%@xs*B`5qsuX%~m9Ms6 z%Bn=o9hW=xMiUvGvCTut+v<|+RShj8^51L`@ux7)P2-uQF&-sr?j&` zJ0IWA?vbuj`2=NGQT+0l4|fmPSMgCA$AG?5a?TOVIUYqz0tr)FAV0!@Mi1BnJ1vR( zp(N8rfTJUnz8~n4yYZOJ-0=slzry&wt+hmdp50s4z;F^31h?e9tcaAAIcEndAkkr& zkd%m0P2h!%hqNy=_S_+pc#a%1Q^tdCB45sdPA^K#EEsM~_Z|o)dgo>E9=dEgoRq32 zLKQqWWheJP?Js37m`w%WX+_Y8@v}oNx+LcH*UbAeVI6)6rG4L2WvgOc&qUOD?UbON zw}z5`pU0KzAa100jqw7`S}QcsM@RjpD}intESd~`pAyeQBcs%Ibz1h&8Zdf&eUC|}j`@r1v6?r(5hOCeY9ehn!7wI2itf^~y|v1Fr9Je>|_wLtrSPfoUr z1hU=Qlk=q64bXoMgzu(Qg%9gywu8pu9nXUXJ18DK@B!-57DY1I_u{n~AO(3A4{Dz?m*b6iitix)a>Zo*KDTCP8buS}Re7;Nw3pC+dE6TK zPUJbN(4#rFbRuU8LKn9ma9jNcoVa41Rr(Ez-C_)s+WspJ{fx5YYT{lN40SN3R`JS}XHj9WAZ2uL3mA+bG!^ z62iei5Gdb|O0Yd0V7TE*@SS#MVZKg3H1Z9Sg6b0p%o*TUVFKd0SY~cCP+`l+iAtP* zy^)_k{aMs_v!dhjUDEswDg^l;8&vr{`|bfymGYf2=(as%u`h>q$Dd$l(~&_Ww=}aH z;N#ENr?^ZaOyjDfQAVSDAx6!voGMx*z9Ol`=^aBu8t~K`QOc#yd7|wbG>|?!6l^|h zQ)GUCb336xTFBUV*e+=bngT8G848i8GEM>tj&Q~0kD3r(9;lwgVSOw1f{b^k?lyGb zOoe#(LAC+8YE@btyW@J?V;^ZPVgdhl#6i}%LUY&>kJ5iTUGi;TkXSX-LnPr*jzXdP!Pck2G@A+yMlsX^=FYn_`f`}{OEtYQK5VWnPYVBSF4olY7P+7Fj@QiH9V#?Fi?I{8FB zNO5ORO?4tX3#%4e+#gr$& zbm$$Id9=&h$Lf9EK4=)`OMBsNuq+sI(8Zmq>XPO<7u_@pQ09k`yEyll3BiJ;w$s6X zU_2xjo#Os64qf3uhswQpFGsqq%D~$*ga*15@adQ8(^=S&^B=G^gWQ)uYuERr7O92- zt78^Oe+0Gy9qRUD>?p6-v1hCZZ-#@&{T&qaA#604{7G21KbEMT1`!+ z{7DzY8lz31FEe=aV0RM)M}nmmeVGveuhj!)U|i?Luhys73?obMt_0yDgS_VqWJc(6 z`5Fs{1wgJm3tWKgif^qM1`6%K!#LJS_dw2E51Ocv44sD@b@hx60*pCiCB*wN^8gf{o7j(Vn%$5(?QV0^Le zS;b^ySbx;z<{}(VHz?QIjO&o=JO6lN=Ky9a1s6NPNIJAtcFa=U3XEn<+fLZG zLe~*FxtJH8!<~=GEFZpKqi;00I~EPS9x^||C^^HpF^R}gSD!I%c*GveDUDM>)WBn) z(A%KglzZ{Ld}#MzScc6EdH{*^j#=eg>cJ-d$#IGIq1Wq4yFn(D+HqnyPh#NJOnA5N z2Z`dWI-R!WAT$Vy`dGvnGWD0~;EM9WK|=sCUBr5b73KId!0jqCCA#-_ykLR4huwzp z`?bB#cQ+@gG=t@u!hlG+ZB0Axn9=cdUd!t|59yX{SKZGZV$+t*vq~wj;5FQ2) zf~h8fSR8R+k^B`X5v^cc0y-e2=rjE_p%Vm25XF}ZMrr0l!_&_dodLRlvWMt@L*}0d z@tRgVhk4&=(I9RGfQ~ZoUd7DEy9)A#q4Nba5O@2XxS$_#(~H9vHI_#jYk-y)wBe{4 zK5+cm_|V656{Y#uqu6dg$d%fa zB7kDYsq;M#xb~-{on+EJ)0ACwq`4?Mr)Y2$^h2|NgYXnTYE$qh_6OP-KWYW}SW7?j820*=MG(Y(v+H}NRCgOqz_y&5iya3gOue1aOc3T6#{LB4r?@>-)Fpclsb7 z_MP&tn|n|wpdC#mm$V)8AS%#by17A>%^1@|Ya?qQXdu+wIP>WlyDhXN$(3#?yZPNv zrd-@1bCMV1O-Cg;0^oIbCiLa%s|dlCjnEfv;Hg`NmmNcPCozmuhx>g&6aV8F&j4-mO+Yz01UtdE9CLhJ-l-E)e7LKx8@pIsX?hcQ@|Yw)G!RU2Ige(3s9vQKMH^@x6QYuLfPok@Ish%R^kaiqWp20I0Q} z57k8@ogS^8#N37MV~gLs{axnSHVJTesCbdfJB1}jlr9bD<`(ijR*U!;JrM>$y8|Z} zFH$_blnjV(dB7X~819jq&|uDy*p)`yM9@zT3oEkl`pB`E#f-x$K{H?sx>=Sb&r)rw zPR49A!_MQ6*nNG0UaMg^whvzv)kG*+n9#AEl5?v5Bf0vgsREFN+#vpxFBfAav4wfd z>MHorIe_eJeFO7n%Cgo`aF#*dv$6|0T&p#dloiQS+SNa-V%EP3szsD7$l-s(${j%09gugm1Nn+QvHO<) zk8gFBfzC-Xp(lso_j@CV6J7gdy1RO}>r?fGzu-o1c2HkHjf%eb4ZVLjoUiN8{Pw=K z6bkwW{RKDrC6jfHyS4IK!WgzPTd&>v>P@VBQ8I|#-S%Mrk4mpEb3?>`S#SOJt&?h0 z11$#egnd(2jmZ}+vuxEJR%>Vp6sv{(OkNy6*R$l&EZN0V>wvdQHNO9_v zx)ptnX|*<8D}Yv88^3-X{j~Os%OPzPjeh-0{|>Zh&B*`5r_9rUVVl=ei@Q#t#MA$Z z2nl+Vpq|@uEAx*DBKk?rKwu*oqW&wT|J4nJ?=PARZ=D&*PTjt^3nAe$}JLwXwc#*GOnczN(!RD0E)GI~%X~+zmYV0TqC%`R3`Q9y|%QuG22J91# zgbpz{L@OyICZwfpx;$|pi2hhlD6IT%O7-BuPa@$&`bTcPE=;d}4?M?}C%#Jqu4$f; zus*ae&=$r1k8M+gX!j0RBFWIJcQz!gW%|cg+V3X_S^F1*B6nn(U8d#y>eVa0!;a(I z3Ly8i-t_t+`qxfdC%d;2y)51*!9*EI#^>i|efRVqH_?~AL2Jz>iujb^hO=Qen0!EN 
zdvFicd{KAcBaGy&%@=R{ygey8sDgd5yp)JP1JNy%WH0?cTwc-gHgdi!?eabNUBoKX z2YId(4LGrTd$7Y8or}jtf|j~Za>50YfzC@8*oB2&vlQ0}RuKEl00( zBYDjsPb_cLRbL~*I%K! zE8V$%cTfC(KQqF1So#$uj@O>7{N6(nnzfa=db?{U;kw5Fv3o!3aT~_%Yys=T7D%ZB zh>?l@>c9W_({C>v00S_3=zdE7^Ir;@5mo>GxK;w=Z~54-Ti^zqD@db#zwjy+Fp+g< z$GUq)-?onFzP6axF2IEmj;q=U=Bc#fO8Mz0+b#02du9F z=9mG_G3gMBe+2opkv$KxHqmIQ{j{zX+^4wF^n2}tv#Vbc#z4xfu=)pB_}?HG8rOIb z_Hw3sdh<&tNmCK@1prD593F&6>_r2?6TEA7jOz?>AE<3yWIej9BwhQRFOi|jE9W2I z_!TRJk$BP<{PBMysRilLb$~^Ml%N`^>OIllR-N}nC$rUyq@qS2@UY!`S!i-|cq?cB zim8GF(dFMBJbX+C?kSJub>kiG(tmmT5pVnZaEC9_MJkZJRePU4aBEk^(e+DX*W_`* zzm3CLw*ZdWcnYs3OU{&1+`kbk-Q-gMLusfE!eXwui;Uf7TK$DOrl ze!}wGyH{E4QqW``38k(>^Fvm@5`J>JL2Q8}^Hfq5D42cOPE%DmvQuI!3|jHtzVWwr zlvcP2d%EQ5@82Xyo8(;ob&5K9CG(SNB6+(0MD*5xkNniUeDy^RC{6xYkpI2^!o(mb zD@5NA@c9*JAl$zD-@}@U5NT{YDHf8p; zdRpt>i#HRx(aPYfLGX)$j9BKAtLtp#Fh>7bmz1tv6vRwI9nk$0F5KX2SU;<&B7Vcv z`oBTT`WJr*r1u_hqsD~TA0$G-aV#^-|M6QHw)81Zs#09L==Zn#^422W_Fpls%n=@u z+4sYzSNDFIB~i)WF`Tue^w$?SqnT7DS!`3JbsN_&!PUgPy8P~r4W@l^whzB_4sXA{(af6|ALi2t^VgnS}w?lvVuzM`>DUBW!{p+?*DTI zZt;fqLX_(-2DRK*@-WVSd?^g}#Lzh9;Mn$0llcT&l{e}P-SjK?USDHa0tZ6Bh{3u~ zq4Fcyi!5z^JTXD6GtdwBuMJr99x!m~X1Q6T^lRwf#q3QoUv*E%XhGC-Bl@sHL|*?P z>+gQ`PpjWBi7_+32TQ!)@rvx~*Z%yS1-hc_bwTsbqm>wyeN@KBe0~H+DtqE-_dh@X z3>JsO_4Pg%HF~G|M<;|Tn;5J_u;YC!K}t$nLn^)9`v15Bs9`q~Z~LheZP3`Jcl(#Y z57l#Gv2EyzieR8jjSf)j8LvO7qJv0RwN#z>^SY^0NcOI)W+5ypI8SQqU;s;y3!Vqd z?+G1-laMx4?zedO^Tic)v+w$L<7&!xakVGIWPA!w#;uay`I`RgS_4NTY@`-F^R}O5 z)Y=I<>9`%I=}K!r<<~5@V>!mQddJWQa0Yl(G7IYk9>V$&>SA*xt!~i8*^0 zFIx0g&c5FwkDxxOiQ0c%yz^r)mq|WO$XQi~9;fKpwzd~n7xq8Y_Eg@t;gnc#zz^<>#Vf#gU z3EB{%+6*}kySWPh`1O=V#tdi{41$DPk=qn_tFZbbjhJ6dam$a}rjaIMTqho0*+Oek zx)@pL&CfmE_v3kcXBHt^xX2+(0ufmTrpTTG3E&XhU*uF-G zG4J0{J~J4u_wSof4GxsF5z&bsk)$uF2sG&4jUoX^&~U?OFBh@W?lAm+t%4tKFbE~x zF~*y33sG?qzk^Ebi$)L7^D2jrgWT^?G z*Uylo|-fN=p+zrd?f2MS{jL(71pl!Jc=*e30OED|_ zD`!4(&6dAK<8`@F-vc-exy_F4UHKMp(D`C3kLC~<0YO$N@GK%8H*p@aCyNDgqUCr; zD-1HZfpp=}t7wLK8DoY#@(aiz4b2wK@{C)LF%otYMgN`kyg=J?!iG^mw`>*Ap*rOZ zWMpn1}suTL2sT;q1^eHcJD=^OVVrR-S5Wd$| zf5@k8*lsA(x_N-ciJzzovk#+(t-qq;)+^z4FFNMV7SeoA&2!IuR0$37?josP-`W9-`W*n;* zWy%0)uOhIT6QSu}Z7(F`OjgT@BtU7}?*ky`ma1;Tn-bWaRwP04~rPVn4+O+YjCaS{)8-Es$kPxIl_|)c3Gv*pzkIR za-9`(ZZsUWaY`i<-1Or8+saaafdY?C0I%_A(`jD##8eOuwjrS~ zFcOoHa29pncmiF93BG8$JsO~Sx?V=;;(k#GcnqCmV5jt8+V8q97{en^n-*j_fi%wP zH(#0O_dOOB>qUce%g-oj0K3MFEqU`Q7GTkKzugL?$1vLQTI%u8&ac4h6*o9L`++?oBf2K+%ubkHXk;;SbxC<+ zT3vp;u`+#(k+Safm?fOr0%A@#c{e>ibGywV7$|1+`pn@juL0E@CXokQu~+_Hurxl@ z+;w62}tKgVgzq=It)W-O+ii{sESzAoHX6o{3;(w;h-kr=%$|!OFD~*B8+vfH98?H92xUrDv}%eUBK?StNI}F_+pP19Z?s_f70E z?VTV4K<*7(y-UFB3o7CQ6Ez#L-VkuV$j&9}nFG!b@-9yT9VUmv{ABjNQD84{Hsm9U zIdZ5$x)iYP=F&wYZ}}Ck0cg!23{o zH&fDXe**`s{Nn=8-kB(Fh|2W!>PCoUhPb;82(2MXLXN;Gtj__8vjDGb%+Je*TW2 z&|jz~!kn_LlX~vS!gu$!NB#>B1qJ^g+#T`>Rf|>-_g?{1mj{e27-rgRQKecOL9S2x z@onT!M5-qvC1#NU4Eq{fBDoqxm!BUo0mm0nlx<=gZ>@70PmwD-(-9>3q?}QG>AX(G z@h5{F;PEyJMg^SvPQ;U`Fxq$ChLb>vOOHWSr1M=Q!d=VioF6>Nxv0-MR$B_Z`UGB=TdaZP%1h*vI^!E9Qv&QsBf<<3b-vUjB?nDZ%Qx;oXk7S9_KFgL5pbu`$l^5N&g<>n0Lk>md{U=6F=LNA+7n8 zCHw@>L?++j?quWkHwiIf)}LLXk*k=Qgsx|(g$d%_wseI=X%)?Y1a*qYsb$-Ii(MNX;}v11|==xpazQ& zQPNJEgoPy|ZnyqcOf~H})%wez=AC3%wBTqG&%TAOx_r`c-6D*S8DVVS0mo1H}NB{3y#$dh*mu-3BBN5+9mV9D*M2({f26-F$?K|{ z)z4|Z&fpy0vri}cOXUY(BV0;IfaARzs3x6}M&yy^h`N#x(c@=vgj*#sT^#el-5d-k zIxGmmx5yKbhoC!;LapY7L+9BGA*nhl0tOrzVBaa?$=X?0Al(z>;!dZK9Heh;1&Y7i z;Rv8yCJ#rc=!0xF)8_Nl7+WOxfTsaosmw83bzDVfvgKmJEB(OEqz%l4T;GSl*qM;J*u>DR zH8xg*w6hGZr~cO?SwOSBzje@GKyW}`z=mESs$)btYT#HA!!mpxjxN%8|e__g;F2^_5gEF-WN`**EBzUI!bN)Y7Pv*AG^g_zpT4 z=CIgQ`Lj}IW|x@)VeI3kv$W=f!~80dW 
zhlyKN2|FYQWeewdk+zEeqrB20PLL3*p3QgAz4_U>%?)-~zdT#N1dc|z*erfq0oy*bmnUV(~x6vjsbCku&RzFI2pBA<>ur4&4w(;`B53$l$5 z_)Fc8WXX&)#CQn{$Z&Lofk=TO5V()tdb32OTBQeXe^ zdb|UPZTaXpi_NhX`vq?1^Ml2C^qdV5O)5!6HBRNp&}T3nqsZFI>1z9-tzSPNtbj9j4mPx7{bz>AQWM;O5EeGfqgFC_`O@g#WF z@`{RsBy~uB|AdZ8+Q0gv&R4wmm3TM$T2Uc9@-X)}CbtYVBaa-!Tq$b$r-6d>>|7w{ z4-(S|aC2cGPmElOH8t($9Toi5LvHHZQ+c5>Sp zc;^xu!b>tl(KrtJE>>Umc1qP8;xlzMc)viCg+)CspOIU5xn|RlI$O0qm0CPDUIx zpk8vugd8S|AH7yB^q7)!`J}%3D8c0wE>>>zO|%OHG9x?jZxe`sY}XgTQcwWVtepM^ z4@@B$6Hn>--U%}JqaL^ykhEjbZ#ti)wduQq0|2XzqUb;u6g%bzOk#SZbh4lt^#?41`x6y8 zgI}6V5ElE6Z}j==Al3@)Xi#Do;hmfA5*NJ)HOzxL^JokR3Kh+L^e4fx#=4C`CG(xF ztmpjW@ZA1wL;&gFAf$Og8{GG$L=#KC`;T>h|W{9#bH)B54*9}wOyK_X6PU2wNx zVll^f;AkJ1_sl->F%2j=h22d4mN!oDG^?96oy;suXMWoMEK&Cmua!H2y3N!_hf%uS zB!?yK?wshCWzdagAP(V8fJ!3$9Kp%b2pGj7PGgsjkCu-aGz|cQ`W%3x^;up~OCUXouxIa~JL%%N3mvhJ zaKcT?Yc~|UgzVjoWW@JPF86S31VeC5Kj?Q8fx$Zqr7wlQ$b6Bwph>pzbRJ7>`A5JZ zJs7X8BVDA?3;sm()}SmRu#;zW;$%;!t^3iub$(np(@C|81?)EQairr347l*=Bn?BS zq3WWaSLHF4`$xDM?DxDLsY8%8W5E%*d={%zO8WNakTlj7TL+pnha38FT$W zmbB|DAIUSV1a3y*jO-ZSF^2JnduU{>vSs~D$@KQ7?9}0NQ%GikKm@>9L_9?+s1FqT zxVnV0Z*Yc6>rlHli{z(0R^(JZhXyy~SLV@yb5|f3#b+vZLW$ee*c;@=4pVqbNi{$k zISbTcL!OTa3!*wi)V6k;t7rBa8$yr{@>UsA)5AXlX4=u`FbpYX)Ed%1;g#WA=mlhx zetg)>DD6*7*tGNT!MSM=9riwnQ9)jIl(smX++~7A9vH>7Axb)M1H|q0F=;UA^RFwx z?tHz|@f#B1cx0{i?_heYA1_@}`R+-)c>I*{8Nbgi866HBuUm&7>zHrzx&c~_9TU+G zOmUVGVKpdI|75dAE*ro#L-3Cimbg*4&<`4pu%qB4BYV*_>>JKIAbfw!x~7tI&lctt z-qqRxP^1fY=&&h@m*nfNFhsKmW7r}KgL^-*X|c+_JzP2x22j*zS5`XwYz{j46_2F) ze|D9dgg&^GCK&Ta-KWUG&mfzBB>Nt6nl7t^`6ntgxXX$);@67rc=QIyw{mr(4IjUp@k2?J*sJdwb-+@*7)s0OZ+E=!DHH+7Vwu_`*e|{pzcslky7qicy#HZRR_r@;VjDos!@W8gH~pr2KJqwULcPj;6iFx%-qE@M=L|6sn;RAeK%#GO{oBHo8 zaf73X7?I(J-t!Znfs8z*o}XPd1izA1W*UAQoca`PkZc*T`#Cg=Ho&y#7X=IbfwM2a zFV|G~R=)hM6ulH7B7HKzIy>KdpgPlT<~qK}*CmmVGXn02>1YHG(uY7rAB?vI_hi`E z7a_`e+0c=-pdlb|EPJ8RQYdXe+Ht73j3q)B=EI%pKvxxYa1ja!|1=36a-=OwYXxnH zG1mAHBq+gvG1HFF*S>(cLlOlhyMoQV`=S6}0({T z83G*5j;Hb`HVeP){Bg#|KnD_oOoS^nK8L|_<0N@VG)&4RGZ{u4p&jnTiP4NZ*69XS zpK{)%EaY$ht=u2TG;|sFUrIZ$GQ`TR+8EIQ-ZgODWtA3bMEWTr=ejM zcIHrTi%m|eiA$RGF+#W|?UY=$1k|<+a=W~Vop7U3A}pSu6eAQ%_@{$g#(;L3Oqd_B z8BP|4qvV=2b1x^q)7$^DzrR$Fe3D{(06?&v6DJ)Ms2X7K!XQa4?UFz7AdGUnzzgTF znDYZj1>KRo*^?VY4_K0EdK2%&qXjbzTAn}a z3OZ%wc>N|e91WOB1UxQeJjG=!NWyjvyZR#Qbmfm8)=FF0%<|qRGfPePtv^o)ms8JC z^$0iY5UdT;4AuB&S!cuTf+Eu#-##hjg$mp{k_(6R4=jDc2s4YPTSPE6`qdRd?11N8 z6ljwUhDM%B9{0-?-+!2r(_OLvQA~J|2VgyM=PQp~NO;MpB`gjv#gFRRMWzswy;jP9 zB~I{X%s=vC4mA(Z$e=_qB&J9M>H9)c*0`?W6WFA$UYtp&yX|_D9b4UJ z4?x-bLV<%9L&CFpUm?w;f)WGk=)L}L*xH|9#p($UmH2PHWkEsc_iZin(EiD9*A}75 z13A4c><7o32Wdr+S<#IZ!n~2*_#~H1f+d~m0?0@?U$P8>_WV?bk;KazVD?>Dr9`&< z?pe^;W9}exnd+O-OjwCUSoo}CCkNW-_ll1&s%!+oVGuZ5b!LtGs+q!(&RDMj$@8g( zrNpHJ<7|N=7ySQv-CRBmEa#%3qdd2x@ZQChFuOzZS?#URXWMeae`z5E{V6oR{B}bt zX$(Q!IgOl8fn(xy`j4uDDh7j7>@D2@n{(Ek#S{&p?B|%5@Y>!9Z93M`Q<*KVV50?? 
zQk=3xlUfQsz3p$RsM4kbx?!X;n3IWn>70^z+FuN<=6XZ_ys9*ROGdsCK;K4gjD;sd?|mZOv0bExPl8h4W89acJu(ET`&T&W4HNQE;oyi16g0 z^I;6i1PA%lsgPkE7%V*tveNT__{mp=z!VDije{ukUkt6|&B?w66X)kS5?gw>Egg^z z8_a5r>U7Tp@!PS8*_@_qC7#10F&R%T107f!cwK!|hoq zj?k>Cp8wbgG2yeT8_2urW?X}-uRE(+5Ean3cRIUE-) zsBP-1bX9Ki3`P2g&509*0e&hg{YZRDzNZI_m65e@vdjIMoP5Y3#-J8qBOY)y`6F|j zKQ74m`yvH4Ds-0&tb-PgQV}8PDb+b53=F@z2BuNa2V=7b!>2KrUX=#>vK=-W{}LR6 zMli9-szIF7Huo{HCC)pFif1Nn!#>Pe+WAP(`btQ_&MoLCsiNK33=?YzFK%J_^ck=c zcoR>SD|hEKU}gpN-e^clg%|ZYvQx=y*wN4<9?%?`_D2fx&qReWRH-bb3K0$ z2+t1$J5MyJ$@Kp*_TBMRzy1HnsT_SuWi-(=lU-)C?9H)P$fm4}%sy=^+4E%YmA#@u zk&zuGkv*~{>-V}y^SpdFM@HoSx*efk#LJ&4)8-d(-;t zL+V-t)x#zN+Da!91h=T2n|{0ElPkly)gYmHC9oMrSVM0Z_|IUhS+n%=N3l3&0YEO^9jk(KEAbo>+DrK`5V5&6#ty>nanOtZYD!A?}x}A3Nf3QjR zLuC@Qo94T$6MYvEwnsLQLm*Txy-ABUqm zAJI!+v)IC{pi_9;Vf`A~{=BFm!Q2cH1;&(apmB$=?o|cnk#GWbFIDRfXj(=rT0m49 zIXaQoic?p;jir#RFc{HB%Yb>?>f3OeHCRj&DmAl+N|{YNG8J_orAv3DJJD<6c3Tw^ zP@j!Sz2fM9HJaJYVo4?;vLttcf<9p>p3QMw#8iAiUE1;AO!kV~gYg7s1L)MZB&8?V z9*KcMD>yq#6Y5jx#;)BWhjfQM&vegmx;W3zg|r)LN6|QYgRtH?iNOOd>0y)h6q5`e z5#WSVI;|358KE`gISHnBMvI)44?6qQ$UDDYK*%zW(Hxw?m%P(|sb%^$5P5EjilPI{ z51Kyd$8!rQp=&=GMd0*nTv)F+e?ut!XriFw+kqJlHtLW3cYZhg@lPdC{(D;TD;gU= zhDaZF6I>kY1gv`$J%8X>p~m{JSj2k3B65B*`V-wHWW4>_|D=9wip~{czt1OLQsK#e zbVz=0C2am}<2ZXIhPo%Er)>WG%wMvY#iOhML_}(xaK_yJjSPUmuRf-3YyZ4n^S=5| z=GJM~e`ddcdzWSBbpNwH3B7Ot*xuo7{~_FECFYi;uc-q#L1{-X5Y#|zmJLJ99~0Le zDIr%Ld^r!M(ZN*AoS2BQJm=Sw7ypK2cC@`AXX<0seM^A~>-*+sA14s)_(33g#YMOx z@)}6=<{)qfaE_>VfC?BXX^7x!2Qy_OgdVD}F0^ajhYU^Kgz8CIh0vi8;I?|hKAAWC zSLmtxdrBP}url-^Sn~?+c+^w)k~{pDgsA+CyDYsvXa}I;z8>X@C6=HCln2$u@B!oz z{o|JY(^?Zs+V1PX!p66L2T)HNlQ#Y#3;jP9${R9w9fTAdTdE!V_2HDK5&Y^VzR`#O z6QN5FZ#c>6#BK6JT!~m^>>3;&Ie%UC$Jb<6X z5RfDRz%-|<^-6g{8skF2Hy zuGheYgy82Ql%*sN{{M52-Oj-3C~=U3{bY5)P44F#-F#%=ly~GmU%PBF+-LyfIjZ0G zQ1-EM$^Ukh-(NUzRADJ36j!RiKXljFqr<%Uq<%dA{}!m}UjV5veaFX}@qff=F2Vtj zR{i7m{H)6W*sl&Z`d(aa*ZQh*f4^7f;vfI|Pjb)Zqg37V>Ityk>wGQ#zOTE)`!+-H z&qw!jLHW|4wNUMR#sB+ffjs|@M#azN{~yi=A|%&KYiauJarFO%?*hO6=LoH|Jy14h666}@y zPuv*-Q(Ah{^4ch>fZU=tRNq6#ACFk{Bs^61!}Y%p@)wVli~fIZwp;`r>CKn18-U$& zn4!&P(;pwO@whjl-}l{F$aevVQw!Z+xwFAkw0S`N?gs5{b9KS0({@A7L$p4`UuUBE zi&S^~WE#=t6Yu-y`>G~Jc=+=7HJ|qmLya(v4gXH~_9nRW>p~`zBwS6U4_*jRJ@S`9 zDSF5$Lx>+qcOejZ7tUc?u-( z)ZZyJWod*fG|V%DGTHl3yGS;WIQS11JLE)=XB`+ZSPXshOV0QGMgNB#^dmNs!HD{I zayk5{m}vI*2!ginukiu}6WMuGOFAut@Bi!Mfgk_e=0D%%<}TC+X$6ftZk)e`MnAHi z@9nyuvEt{svGGfUUL!xf>~0Mto%U74?N@f~#$x+#ghJrcAZmGrCC#m0KNa{i(r!$@ zulQf1#rk~)qWhG%5w=0Py-1q*`|19lp3VB@lHGLAK`v#THh*ii#$pPn{|v~#J~OaM zg*%mR(%7UOJ|LC#C|`SegKkp3ZjGk;9M!oczZKRUI!+AJv47{m*Q8*a>Wih>B zCL)Q?dBExCcHR8SeTbBGlhU}cd-0+Op4xQqLcX|yo7P`1UT7Ax@9zWgqPE-aqi#YD zXpK@YgvkApl=w=!*?f<1Kf}YlGJ<12l4Nu(NMb@mRuslu@|5^LhuIJ#oCz=g;mZgw z;4PKTzuP|}S&QX^&A%GF0bWnlE#tUFb75O^J#-)Z2lkC;{ruJ-s&>5Wgi_kf`9kB?mc{eQd? 
zbSbYc;W;6Y?muGiSFFZ!5)b`-NN&86_2qpL-RRpoJk$3lLUxMq^5#fuJ8d`Jn}y9@ zz~)X~f3m;qQuuxx6Kv_>?&9+c;q!k!Cn4HzM4o@A^U`Aun;+B zA4r(|{t4TEUh?L3RKUh09k|RuoGjvo`|HO4*Qes?(Vl+LB^xX)8Oi?+I;_=q2(8_C z$`9%6WZf3L$d(cJ@7Tb`j6-%d06#d`98l%c4_lj$=VU^fQojSZW6}j zoiN%ooEdD$KOqtLIpfK_7Ume_C<0AvJut(tndb$($j4*s=kKidzsZpFyRHf4buz4i#)ofQR~+v)z$BP~N8ELn{8PO?*${?l2Ryc@e<4EEg#s-E+ z9ugUTy<;i-ya$P{Mete__ES26j;fRc#m&b_g6RQyZd%nb$|aPxUm5`9Z|Ds(|88%d zfqi_?+64X6`ZjP&hFOoV05nB0hf}rAm^=kMXdVnpyP&1056%6EaTqkWLBqTY&`sgb znw4=l-W3?CCeAD@fp%gHeucTJYS!iW>HS<5?>j=zSXv4{2Cjr!O9NT zZ!n!`Jt`EleIL{V>?E@lMj<-_esfz9q9EL}v3^A>csNk`K4fVSUl zq>w2MV*;rPzAQW2o4xnxwxZ{20fw{1(3Xz43b5`m;!gb&@ds%0plNB~`m-3M)6+b7yqD%qmw)boTK@re9 z+DRhi@b6*$SYo5)4`<8EMofb($*U?K8k*SAR+T&PrC54d`HV{+mJrG!MDOuQDRxXS zbVXC*d1j!Gwr<5o7^pQ`cj6L9;K}uxV9=kZagzEw2CK}JIvEK5b=YW`8({snPwUB+^$qN?=+^zQLr2esI zc}VotLrx^rFn9{75VfG7LjJ^!dT{|-xp$j5Fk>$nL(g8nGR8W)f|{xD@_kC@+SH%4 zN2Hxkiwrm=Q#1+_D~THn=0#N@j@Mx*K>7LM7Hs)q@59sbDSiA$i7c$+itfzAT#;8e zU2QkqRpIP~3OBVU(!%r10#g$U;E{0K>VQXF^Q-k~yToq_VM`eGm`C@?KGnr#~mX`Xoo;SYQ{@W=L z7vkmZMkgv^4j+blKBK^*u9c==Xfxbo_O4(vp<;r_qwZe5_We=ysJs0Y={Aod1|qay z#l_DifC%G`glNuOdN&qrXPIVw{zZU2+^12lcBd}hANv6TFkldPFqYl`)HT-oQvl}} zo*PbbYmi*rQ8V|BB75VkmTiG(&y%0P?`i{Pi0g-^%5~eqRFoaIxt}uh_e4_Gvgkc3z>1q`ih@4wsnc^+j7lr8W*$aR z+N=Q{ohqmFWfhFUOaRtnEgdT`gJ*QE~|H^q_=;yrXyaD)d?aw>ia zvHEsavdN>9Fiq)%ht_k#RN7&-2wbm~q-|y49F1NVL@9p=uRzQH)z2gZ4*W_cp@u&b z3wqyjnADn#cjfOY&4sb=i)jjdq~FcAf^f+Z&PQBxVBeOAh3lC$z#{xJ7JO-0}eNm#Jm_d;C-}79dVeC*=KDY#P~`mmiZ5L`9K;XMnp zR!PVH1l*FsVGD=nlE}H}@HVg4mfA1I?tUx+Gieh*$4#ogx#5x)Zr>zEx5yHtB4jN& zV*sec_@!ire4Pp+1;;C8HxH*A&Ru{=*a?H-F`(Jc2%KJ-m&#Bst;$(YZ?jyLk9gry zE#?A+S`>juB^r%SeLUv)d!9~SOtA9%1fk}#PJj|F3u$MoBr*To>DT{0)o4! zELmw02H@>i)C~*_29Y%zk! z@Jy*`iCVZc}{He5_1ml!rdDCP@E4YdYbY0jGnaqj96wsgy)z4 zGqgGsDG38g602ocVP@caUmiqCyX-HGtv>^8^vOxcP&)x=-{+))?NkN!!_6QC!Zbfyz+8tC-9#=k$bP-Inj6zDhhjYrc|XK@*OR}K?N}X6=t3? 
zWg3ub*BJp=dEPW=AuAGgVVHC8q7)sgHYdCq8Wft?{#t|lu-+!f7I*Ts>5(}ZVe9EBwUdD|E5+ay716`ex>q7&Vg*rq6G_>>Kt(rgV#UD4 z5G>7<>+lajz}v#X$grvI1kO*6$pNe68u0Q^sx} zMFlZJp_nqw@U_*{E7VCY?1fgfG^*$E1(e`y}dlJsLMZ>Nz+@HLXC!A9VwISTMqmEl}m^i96pXMER7O6Y&&%SvXhZ zYQ}FX{pMv06%e)j zJ?341asC-X;L><=mkS88*a2mwzl1$h{l$n6wPBJem~$Cwb;_c*13-Aj5 zdBH@F>mgguX}2!C2KL1L^8AC@@B(wc@FKJKZHKP8eX-KQnn8V|mAzQgXFFF2sF^m^ z+_%d8s_fNT9IDy3tsG{D-p)hTTHHp_&ojoWvj-))+fk$cN!V-I?Csa~|40ZoJ5j$T zr0~N@+G_|q3R*V;pC#*~dqP0I{v_QUn|P56-H;MVn`eon@X{Sw>p(*IuvbV5+s0ar zbCDi=;Z>v{&hYis+!gQpMcR%z=H9KuEY!yfOd(X{Jb3V+-25tILDd0MWKhXrFy$=e zQ@sMmKYL)ap2ys`*TD?sm9Xo!(Nm<}8yG?sbEF~;JBxP~?AtSpc+_NmkkuU#mf!5y zqlL3bhL`@2`6hFT5HQG`&wK+UXa{5{Uy8${hybsX^SBiF3-y9D=g2t=KrrBQ%t)#& ztdaz!>6^f{n+I_RMX_BRpWx`+^^{lJ_LN3Jqq2*^YJ=-)jAnP9f;7*G*`lWyeXu=Z zp5<23g%dT0sbe0(q_Q#7h;>1Yn;7c29Yv#%mlpIdgz>L1Lw`4Fe``fuI&}>$K;5?| z7L2IaEq&y@+>LFy(-iZ-HB}Aj@GGyMjH47m9mHDF8VNFoK0_T9L$((m3jEDjJ0(6lO|8Q0^C_9x-#b+@S5vsSBc;~80RF0rWH=(iOfRT(W55q0MHg?jXZaluu1(IpI-(iJw4%_GvRefm7;jfjUWfYqg#)2Sd*z+u_2 zRBj1q4VPk0oJnl>^m57y-M~dTsFvuvm8uZ(RDVy;fz6ooBRc)^wuIE&k!lTC6y)Z# ze^^h2<7J{I>|S$8rF0=iM?F>x`e5nCx0%~LS?*^A)YS9Oya~reLAPrh(Kx0kTkgVs z=X7zYq)7zY)EffAjCrD9^fgcXrxe9l9rKa-Zkez5XIpjT7wb>3bMTa?dI-kBFHye`5>GZvtMSatt9P(9D@Vlz z>;d5=|Cl|}G=0SO8=Rwv0v^b1Sq+wOTD5kcpv7Y|;H%Z{GNQZF@Nqvd_{lZgwpyc^ zhv9j~Kf1`T@(JetENRH%g@RZ#$L`X&OltH)!)cHwgB>CI<6BE@N`RE41~J8U$XKDr zm)&G83;tOLkO)mtQsl>P32Cz}(JcU#$u-wp0X%r=2RQiipbV-UO$4Kr;i8c*V5q<^ z&{g3^xCKde*?oQ_(g}E71MGAy?xdZjuvwPgI8u%o?^Tb#1wFR6I|aY?*i%MS%VZyV zl@lF(R3d#@BV8`$&X)%qnCQiHW+_69mvL?(fEGJYi;!6kG7aQ=wJep1Mm5Y7IrrS>BPKTzp`Rh$DA=BvQd+7n4ZG%jDCB;3rB z1Y<_svBr065N7JTYkf}Tzi*Pr^K_7gI?jGF5_d;IND)wv?158`Y`Ly3BP)<6`i067 zD9fL}^B6)Zoo1`Ju^7pIX;4M@WTkc5JhNhJ*=WO{-qqZry@1*_fwxh>aMl_C?SU+l zbvLRtu{R+iu_O&Yo;CUeo9m>3QEPI5ahx!wc-UiLrd3%x?S+s!b0!x~Mc5vj=^3%~ zF6hOf`95K-jRfaB%5Zd>|8^^%q!Y6EEnd^A5rNOnHJt1WdvcV^x7QMR_S9+Q3FhFl z5nmn!nm3gh2E4M@WKDEr;+KchDh8V%Z`SGrH1n6xr z0NkllgTssZ#DKcxwnYfzOQ(7n)=)u(ltqvd%!So!r`N;!ZAhT)>N47^3aezfV9~V&c8SlyhZd>5gXRz@ks7J2F%<1_Oh^YOLYG z_fTM5K~7|OW)6kvc`J%&(|>!dHSRFOb$9PJ+z0EkS*H)7s3C{QqYrKb{8VQDt1S3_ zeFkjLGnMZ#lIHO5m&5`<5M$D~$Ta-UB7&VacNgXpv8 zCb5SGuBqTQn#?JXYE(KJ4k?ph8~>IX{$+0GZnYQK)2G?fS7(9I+XYVDHC}P_)x{9M zRDn)SG`HTa2g}hMgB8I%F7r#qEc9nHKxo4_{qmW2Uyhw#5kKTO^n|j_#Kon{yZR3ByyWD^?XK)O5b(?X(IP|{K{Q?<;eRfSCBQ>Qe$;A?l6o_m= zW>frd$~rF>kne;w5cJ&nl?+W1z#VE(KhUbM+4E;Vkbv+EVbk9$IISGoBZH~Lb{)ER z-$8n(dg3F9VT=JAIS18`HM!B^^fC3}ki)u*bb(7BLQI=3dU1EY;u7$D3Cbj00B|;P z|x1WjZ|CaVlW5yC_=+byO-zjt)+f`twf<64? zYX^_nOZynLn%n@J#C(XI@!YDEgJEXbVncl}Dv4nrNHJ`PM0B;=TCTp>FfLQdSe24D z|B~ztSE4{fij>q;n>_QpcU5Qx1s!MabAEGhD%f;K^J`Mf6J%d6*?MsmdAQ&7VtcDt ztyyG~<-TQG=lX1lHH^nP9vO!M&C6PQZh-G8kvZfDeX^m8@R0i|tSl&MK7k0Br#7O^ z_Eu}{gofkD3H1PeV2yF6Eq8$TnG?`=d)tvsXq8XtA^K|*$U?J)Viua3G)|BTqdY6? 
z$%VV2^`$LAU-)6w%#4e!pc>B>)v6KSa*!P2MR_;jzi}Pk;R$5sgqSo2f)i0Osc?HG zj+lq^Dl#dTPgqRt5Zt+^cJwPa_98FQ#l$DGPb~n$Y6ly5D>blVE{p*?LY(o-8KKKe z6eCkXSyCzZ2(UDYuQr_kDabrX=ewXz(MOC|h7L=R;S(DVsJ$Om9Ws)YXXP1Uk|C6a zYIbzmb(y53nWg2nA-Y4@#3x|AYH@nP)+&ksEg8&c1Dtw?EKQ-s=SAbhQvil+V~{i2 zWHY{M=9!$lAnc5cvLWMcTNgul5W$!Szf>KHGut5tg=p}ha zrY|jC#Q0>+&gbtodja7C{j!9LSr2kU<;TvBl@iU&Ztu(-JvIN0yd*{^1;4@%JRfs7 z?xuRNH71ayEc16&AWCJbUqG-WkW1#_oabX|0LMVpc()Q5W`zuNK@RvRIM>Yk2B={) z=(TFJ)m()^u~*RB5k2hrmP4+G$VjksR2|2r7Ftr-+tKN zUBj5A9pWwFywmW&!FJe#FzkV&*#8T)^9Fo~){EfvW+&mtllk5SS=?(6f^aj+iw{Ck zDmq&r{me&6L+*CQ+KlG2jCyin#Z&j`eJnlNOyn6t3=obos0B8f^|bngE9ZG3;D58PA4DOHRHcRu>Ep(8-r*AjI(19uSH%~V8vh$kC1Xn?m2-_5nRab_^B{pZlxM)G@cyHsj~WhJI)KuKeN1pEH0MRwp$ zew#cvcu{%>P~B#Q@0S@T(qYJb(Rf>04-)BgScE*H;kJ2ab zgmX|*pvKT8gb>`_?H@h1Cu05Zf0a^>!YVd$y6c`tKY(LR*7Y^-F^nnTmm7MLm_? z-ccvS{#8==ZTPENB@in z$f*J|DBfLdh}2ZHfqdaUa@^&qlgQZeAni>ah=ZyCpR|OM^h6Z3OoWOYP(G2r;C(S3 z8duohJng%2m=kC2zBq7p4&DE5P}621I`ilV`pb6zvT|rQ9+ZY<5HA<{PVuPe?{i(e$!KyP+>7~0Wqb34~k3T4eCi}3@AMQ zx!5J>){VTkN(;aE}=>qp5SA9zVVZhC(99h@646624LqItVH z;7Rfz)w24PfTfCiYpy^%GEUN0#oI4uJPlCEtJL1C9MH^<^bCk)&* zP6!x1e}0?v7Ue&s8ijSA8VQpSGA!{j8o2I`ir+oSAY=zK=S(59;aS$Z9gw6BhUsTn z2HsY^VcTA*$DF*hvr+Izi${nWlenpi@j$2tCAzTs&b35_+cb&Ai?|gj6+vxZaN=$t z%uv%BM;I#~c=L|XYYU5Gxeq}{W@d2&4q=?ZD~O1wFe?{!P-4^wX#{q3fJlE~Gxk0Z z7r8Jhw;38Cv0_ajO=BsYm|^Qy**0h20mzxqS`h55Gp;O#u7cVXp~w-b{5Zi#-ar|q zGa%Z>XBXNmcwOmLI3qUc8r-aJM@`{@Dgc69Itt7C(fjR18%$^+L1Jo`ueu04G$Uu5 z+g1n>v?Om0l#>Q?T^L31CB*wq{kY}tr%fCkzO@Ea_>ECttUf}S4E1*nf%43iCixks z3;l%meiy_P@F34KwntoZn2BM6347oGj2;uDO_WkWZKRTsiHTz%BQcRR?XxY+hQqqB z>ibC(G+In_KpU77qY|&dSlkSvE?sCI3R-qigS2TjqTprCiV>cGM1DA1(WwY`u;qP= z;Ub7>k;a`veZf+=ivjM5wbg}Z@r!BvE(e?UT2^@-G(>t(ZP7I;4iPMU(7U>d1m~c+ zkx=W$rI}Hj`6iot;Mj%2I$au0eT3b9fP?56UCLzx4VMS~7{|QS-d{2oIcn1{)AQgx+0s}ZgS!|{p8NNy3 zOdOM_h_kky7nl6$>V^OjcMA)j*Dd1KR*vutrBbd8ySya5%F_Ye{V_Om@?kpqb~*K0 zybFMdUdQVk$0%QQ+F@2yUd#Mp=B8?&(c9?TPtQkv3rxqSnzST7vQbTOW$U#uvr{{F%Z=7?N46t!&Mx2%JccUvZ`2{OZ)UqE;c@WkNku$FQ^n@$ zm&+E_2QHX6BJs2@rX#K+xgCY+6t#!E=k5!DI3m5!bz&TJpYw^_*EXW>y`u5}Wi8`H zdrmXk!r`E$n|TpmzRkS5F@n?$-V=-d4hA6G&Vp|Zn_cXL4%blXfDN=;I)U?k6~?+{ zC`pdT(_xC~2`(}*AWAN@6#7%``wcJg2sD9ESs~^8fFTzLCJrGyuhxeTd(NSW8Ir{X z6H*cdvL{5=Z!42{(dUzfWHF?00$MB||j$?-Y2+K~n8OeVY!afBhJj~`hxi`ctnMsxaAT2IhyS~07K$=lJw$`$w7T- z7mNe0ToM@H#q*i0`a+6dWvXqIEkDfVA1{RSSdT$Um6jbHf|GfiGEJW-o)Vc939ztt zTF!GyNS0i(Zv}cA$SZfIg~mXhEXw1ktt;1Q5zl>}V1_7DXa~5LoGVdPY5cG?xIQ!> zTm)n^ag0qCfUIO~O==GV@^sW$T-;A&3=Im>>EDuD+0?y;Zr?av91$yJ>@^ET|L1JVd#0rS%jx?I0RAIA;%= zIkbw`W=M80H;yO1Ffn1A6O3$gO-cuB8f?}$S$IZlV^$zsOZE?@+efv2m1pwJ>TXxm za!?fM4G(^5wMtccNP7I-;ChItfN?IVg@_nRE8RzS<68-c08gqHi;J>e!l@7h=3JWA zbnU!TD0`^~;WGs3-)?LCc2WFuarWmW1h@1V%Dcdo29zDJQW_n`-I+-(6dUU z@C?s9;HW?Qf;{$#+iuJkda?_Iv}$6;DQS7^8BC=VA83{BL}gqWRpxj?aH^dkz!>hv zfEl2tPvWi4haMbn3V}%g59VOnSoUB<4gKT*wE8(eoW_JGDK4L;e`S5inB9aT55Zzo z4@(dRVu_0;pOZ>esG*VLYxcxX(0H48e;1*nhKsB40ePHswy9G{pe7#TiBp-MZ4^a$ zJe}>U;^?DKAERY7ZfcN^8W^C(WD3)ZI0ja^j3e5IMPH@rh-1Do>{E=iQX+foq0D8G zCY?VVAExHZ%_&{C_%e&})RRRI8k0i@nvo7EObx^L+Sr5GLolvY!8hF7R?1p+qR=K2 z4e<0H$nf-&)9ovO0;zuFMyNXKuv8`6T5^4)5KVQ8F1yO~IQ@JYYm%EF<}xUyHnV_& zrs3RcX!oB@;T>5Kv1Zd;d};IHoR@_4`HX|(XF83J&8TVFCV9W)k&}Dmu$m$ne=5rQ ztU~cRT2)*K-&tHBp1a_-b3LJgcOy)klj`Vu2Q?LzAOZCtuT$T@;x{&TFvNxL$u8TH zHcd}%m+E4(ckD}W4zEx?99-Z#(-PR}sqEXY1H&CV>N@-`{G3!3;g+qz+|I?a#*UtM z1jDnAev{GDD7)!^r%p;82nc#^)iQ9xlI`;phw^8#7Fj)|#hYD5{Dre}E0XV923bcR zwx&7eR@T4hG+kcg-kFk7sjy4nj}8y4%b(pYvJE$`aqqY5&Ij(uNqt(Qg={O6I3AZrZ#v|eJ}@*Pz*6T!p>V1SCl`X z-Vz!kK?5i>>lsWDSaTB!pUlrmRf8{jsyIF_aE^xCELg7@DtrZX6NTF4(Sx3ju`e^8 
zt%l~E66h87A#FTUI432a#+e}HLpRhE_i*MOxxmQLETXiW#A?bR^^d~&P|GrA)p%B` zO|ONTrn7m8Q67jX(@nRYvxW)9$qdN=RB^{UMQ4X9f8*{)&CjF?F%TUIV`&x(@NEl4^`nell#JXX;jQ|a@MDXY^e>=n?U>})Y~ zc%|*0=ve9)F&KK+(Q*vT*g0glu6k)#B`Yt8Wk{5^&E4~Y@#~4U!}H&30VpQoScJYG z#XpCF_X?n2ewA{{>CNPN3_E%p-(Pcg$NpY^k2wVWg8uy110TSbpn0I3@%;LXBjg4~ z)RX31cNw7U>|HR&kt8eh$f)Uj5z2u7*O1d2Lge~cmt>9M>oBZWZUJ^3C{uHwUPu$CRXuO_@h+CO8Ye>-jkcfjTX^m*l>{(Cm0EgM%D#RB_z~}ZC7o2QhvGhQ23>#& z{$K})5!Es3k<|}SWW`rI4RNIh)cA{7s}d$5u@mEN)0>?Dq1LI%a2a(rhNfb=lnFJx z%vg7I4I`Q5WKYEud$No-_h?u{eHu5(0 zEZT+@^*4;xWY#V!)w&UyVAUS3>|}C`?OEsJC?amIk6d3kF#nqDG-CP8Yq@6$uUEIf zbd^(@M;g2wh6OYnJ|8i~G>*(A2OQ2-Zp=wfuwZuqJ$$Jz+mO=RUX?V5WH!2iR_CTs z`HVRv#W#DFrf%2biGjX-?pHvN^5;7I{?$+5kQ)t@fXhaYHrT1|NB1-)pTim@0e*H+ zs}!pIbO$O>%f}Wqjwq$?Z#vHS%)=aD_CY5_hUu6lFS{SZ;-`2IU;>RWXVN~$>t*z_ z_bZUKL4XMkuAKV{g0884g3xB76Os~^Q(}o1lV$b%_7M!?7=_)E={^*Y>m&&s!<l@AScDO}8oC8bjH3Z8{2=}ojP@8G%$nFRC7ZJ_@4RseuR>hc6*NKm*`b0bfo^^_U?yQmeeWpio zZNq~%L7B)UJE4vb+JUH_n4;0nXk-|E{c?a!28r4eNnyRL$hfV( zmFd;<7AYsI(oal$@OZx`W3=N-c)NfyLF_yuO&9sU`}pj-nB34*;aai3e;y>5bP-JD zUm$0Z5udS+Zk>jCg_q~0*|GsTBknAAOq9uqf|og&gHov$7%2ov%W4*KN+SzalbwP^eJd08`4V)A$?!hEJk^coySc{wk!(Vob;i(l&9`(SHHx}94~b}8hPYH) z-%2*7AU04;pj~fSUb$JkPT|3zW2{m7N?YF{jkp*gYG0T{iWT%bZ`Mrxg9fv-0@JKYyFRlsPKQ)t+w@!n=7F>tV3fg zR8lXjtEpHtg8h{%_#knR5GQqv79!2>DZ$Sv-QdAI9KPr1q%>~1R zYIR4Fc`J6quSZ_MImW}IKXT~v*BAhHN1A+vG?Blb)KAP^Lv5TDhwJk^JeUo$;7A3H zl(X|N^Wv$WyNjE4f?7IRF{Ydi6M-x2;n1x;mG2q!s=7q3-_aEYMRYd%?A4*`Fowu6 zj2Qu<))9j+$H;{jb0SG=gZcSD#4YmY1Nr{dcWSLYI11K>?Je9WHt5ydm6SDG5&BDJ zMdF`3jeW-iC}x9=foVW(O?iVG(}1MRpV&@A3n5np@BdMp!qfj0!7x}uabl8>x1|vz zu@bpmu{Fuffc|VJPz4W5)Ii#p@vL5=RG6`KbmR?3gC`ZLBPCNNEB9LQ+XL4j;y|9b zic+1}_OCF?nx9m(La;9X(?%yT4=QVrt`d6p(A!aGCL0F>Dz>0jk}d6NJ)c_e+G(?A zo#w~NZL6>_ZEojz(~h2t<=gJscv_zPn*vVrIWVp`J=_G*E5Igl_+rN1{Lxa!IV*{i zf0A*4i{k%Fw*X=kh)LWIKvu=;KcWpTzZzyfP>jcE9)M2Y7po8~f(VAN1ij%((7D&^Q_iKprZ6u>mH(KodfXB1Le5#cc$ruCtav#g?t*lFaiCh-HDbkZ)jo}jbFV9 z>f+qcHf>%RMwsu+DX9K(_DY|9BEATCnw#L33nVv3aLv(@n z9&`y(VU#H+$IAdZ7f+DBOYmzs`NhZFeKn+6IvY>`&!_A>qtXsgPn$seuxu+y@)4_3 zD?f1o26f|rUd{%NMF#>o0dXb|$V83g`a^S%q8ug7VFK-E62)#H-c;Ci-{U^{#pG(O zDvyNs(+pDWb7wyuw(F?Kal|h!XFy+QY7aUx0EOUQWebbIu$J;`(ZFkP%!!yrM^Gho z;mC3OI{Z-_ryVjS+qLa}WXy)y79* z($9@KqX|Oo2xYNuNvAb}Ym68XHPfDXS9^?|A~Aus6_iVlngaXf+^P{YhNLI`aQhS+ z2T-S2YZb-R@N$u~qhMcljj#9Ku@dr!gZqIn{FMC@9AbxLByQk)8e#o37&n;mA*d># zQ+uH6JeX=9ucJW6_F`ARa%KooCZ}Z!QrnE!1EvCH^(?YLn!G8q$)xjX%Yc}#P0+n^u7w;#qcvGvNMuSrx8#{as?@a8c!H7&P1&( z(>3?Unxe)mA}%5OPnnF2*yrItqIfH_(9Gt`+&t-k=IL&B?6^Hk7+L$q=A*ZAhl~GU zy^G>FMQv*9=PT92^FXU(OACx|4WBy6PNhAQW(N9}2Tk`jt#guq3Qf^_ z?zH=Dl!!AV%C5-a@8hhRuE(Dm>sg1ICpQRh@4E{>;;G!Ye~1%BhZk3D^^#{xWRmmmF~hF8pfDY z7bpk)y3ddwid!|Q=ee)yGW6($YIcE^1$aytYmM{*SmX<_!LUkLP`i%@*IpWlQ2-}Q zl{QKooIcm%wx#8Sp@J(~HSCyXfz_{l2s_8qJ}qDg-PzvoD*n|1=u4#nSUKLgs6F(f z2nkcBVv0_URtATU<`MO;L@oc7v4P$VSvKTpM{db~7bLh%Ihp-jB~|9W&(~VYju>{e zT#1T{*zmXF65_Qvl^!Y3S~k|ACJifNSztm$t)<((3Z)&fTT|CL+^0*3oAlF;AGvVp z(xvC`)^<{q#8JFo4eE7?ReEXTN)db(xZn$tBuUsv@w5Nf6acUY!x&`x1krbJowbTS z9?@vX1C^>{fSJ}r#QXlF^5bcpiEh!16eo_O-Y$t@_b!FO$OXT`!4^gv=vBG4J@DAt zV8A}Lr-PD|(bHBoMHkIGvJ~1&*P*51rdr)H6!-*Mv4rv(8c2}tL_p`aUi@yT2-D*T ziUqXj3hNJIrsGri6pC>8W+%6<(xxa1K(|}P_olaVQkK?(JE#epLc8bjU%(<{R6Xmb z$}fMokAR7BI@xN@_c<3e24RF7a;rT=5mymbD>oc;4)y$b?D}1r$MZ@u`rqk2*8u{Yn{|*GE=K#3c zIOw{y(_uz87QH7FR8-6=nY{@iK}vKhB|EWfmO0qSAd15ggz?L=zBAwoc*TM#n34W8 z8yRbyQB^ZCn}TUZ2-_~I-S<)nw)~QgY&?V?jXQTy690i`ZkI0hx-b%F%l9vMv^Af;)x z&*FjDA)SNZ*W%YPA~>-fzGO_%tyRFj-ph9?X$4AGlf!&=oEWXEBlZBiJJmemp0vQH zpsAJ4(1F4`G<7s1L9NelycrtAkz76>cUZVzPqHm2-T9$OzSh{A$sDSSR+hd%hVs+7 
zI!6Y)x}QQ1ksFzenao7i8@>tF1qpfMfD$z58;41IPOwzK4%IyqfO`k5yJVn-#!nj@ zNXh=H{)N8Nj-QoO)f=ItO?3P5^rCrynuXz-;T_d?^dO1PL0Q71!ds4pW941KVK9wT zRoyRx(B+UpQ4d`Y7;y@=Eo$vQvHgUtZ9b3nK!H+#Ka%u35pnl=bnBq}t&lcYIIPLu z?bp}4Hx!Az0g5l&iGpV7>&#M^sjc6z*#XJbU`!VSz0BwfxdNJ<36Ha$f^7*HOtM(8JuT#M+ZY!Yzb4 zLnW4VbpOKb;4{b|pem7|2uw%10MU4Oh>H(gx?;18S}$YFMx95I=Pk%^1P3ity%a}w z5{|Btl6`-Oik3^cac=CgjYKz*1)&QhBJFn5ix5@;X@rNAA3PuGkI^Hnv4o~BgL*%V zD>UQ-BkVdEhTo50tY8FiIk!3FQHf~IXa`;5E&%?G5SuaJ&!%8)zzlq^Of$7Mpqs=984*Y% zLF5#8yp$K_7d=C>^9bkpck683p|>&XlFpcb;kRDK2U&Va9X;cy4~C>YMAGOCMnf=` z=fHR*drqWDE-ieq1VD{-^5h=HIL~ZPbG5hWFjldHh0+B)(;niZk$@gUMOj*y_~pL$#%KY*iFzT5KP&ixcAMjcWyZyvNr=$n%*u{*uyPIL)gzg-1ju#XNq z#sGY(2Ngu-%&HaGHl6*lbmslFTNwfL&!FpKCS|=YRYv=O$dlwpO|p4_N(q~_+^w;AJ|In*d2zL|Vod2}jetR) z-Y>x=$8^eJxY$GVGaD$XoPJklIw_ONK6gRbJZr|rN#H!rojq&R@sJTbK)nRxg(9D# z{7zteUxiAP6GLr$sRC?xs&d5DmgdK$Qe@)NVz-|*Iw6Qzbr=rCwLi|9cU;b_;s>`e zr0e~(DUfJX(FgawspdLweU7o@&E;hsllvC}dlV$S%W6d1lT@;5y6ykmyby9m%$&yI9WV z(b(+lrdc%wUk#xDj6>t<29d1Z;*5YLpfaSJN43X?SLw@DzSM$CpZDJ7_|c*P1Bo5> zV^BaUe1=AVEsnBvAB9Tt_0hTL_WO@3UiiFp6ws$5{oa82nX{f0dhs)7?MIr64&65{ z10b-Usr*%~A|9?w33n8H!BlZxP_Va>)3ZUB&T#GtB0Z@(q$7N0^d(Zd;LGle*wk7z zRT6^^IFK)VNh?9w=*PK8s~U#LnjvbSJspo+0FC~8Gzol-H73}va`<+jS?Oc(3gwRm zE_dabOI5IAzao#tCM7wWVSs{irA<)cs>%6di^zTL0r)q{+80mDpLQ`(Y5$kvFM6Hc!^Xygh8&BDap?f*q9o&k+}aJn;{xSaE!Bvi$tP z7R(TXjx=FDmK?QTm1D22X{V4mlGwk|+RL+9ZMzX%`u^qSBHv(fuo6RfIxiH9-H2Ha zRCOR2*-%%9bczU^S0toN8crH)PLR2Qi%=klq^3#{ag+v|3WhME3`%AdUw>Q;o!XK5 z(PIjw-ZKESoD1b*_y|Y!Rj{a%7Psb;D}P9occpzcSo5?nlVi(gG_i*wK419O)(~pj zB%u%$flj6Z=_UD&GN3}p79+VDduOc6!>Cvu*-1a@jQ7Z@O`!dlj&1tv%cBtYWO|MD zy}1TUN-~HKXA#MMzMYd+5{QsOJd!lCGglGdlSKXk%~XB^<=Bx&EErY#WHRJ$)-k)Xu# zRrD*IihYGMT$X|W>$5*tNPP&ZF`S&-C&d}qe|$@mSs$<5(f&2K=X*MuFVsO z=R8VJp{=Tlz}#(5TI--H*x!(#lsl`^W`BO?Hf_Ls?2IE-wt>nH*#O;b>j5L5U`0sR zN}1>FYhy_TWypPpo{GLTY(Ip+B{SU($e=Cfxn}x5$Ym+7KTy307bD_dq%|3wSY3GX zYF%{+2~&ac+;MN^!TZj;95qx`*TH6uz5Yz{ghjt0uGU}BPgywsFo@gzcGdeg1_yFj zvz)h=pUg}6EZzi(b~6~J)~kI%>I1s`@v=U=b)70x`jah>!EUR{8(?BG4$THgTZX}x z?St*JuR@XkrPqOiVBci|39xqPwWa)jlwEf`)o=Splr*SFl-0DN ztRmy6q->QvBgu%YvN_sPl)XqxWR;usweV*SxUdHhm_kG>h zzTWS-0$^C6P^G?fryFeC(JW|?w%CrVHN5)5Dw&k^VtzdkmUuTPbnCoV2g5r6Xz<5V zX(Y$8l$<;HyMsK_{JQ4g*auA7rfng+DqZI3Zu4?#>R4AbKHCg8X06_6pZ{hb0b1d@p3A?(YS~O*vh; zA*Z`Wc&ET>E&%oIy@=iJS9ct8+}aTYr&bTjnl4-$%VG`(>A6=Id35Uw&zjT8u@aXBz-F$oDzlI@=n1%)v{Z$#qgNfg3+4Fy#O5HTR1&FIxj zi9U*ou3E8$8-y;t0i{P{$Jh7lA{KES|g$Z=}-kz)=_hpEdBy>uWL<3Mij^)Y@I2me( zDrRLu^*Q-ljP*T(?P!8YQp1MBM_~+_BlSaU3K1u@vU#Bcc=pl-5>VNGy)%)L4c=hK zDfSYFHi!7wSf27{5RnG>PHiQz)7yJl?B3LfX==Tq3_%MF-*V@E*16Fq+$)p}UCury zi*y=VCT;0p_Gd5hGh_?2$<8{0b(M-y%&e!L%&zY5e{>||W>-Ck-w+F48k++qG7eFV z8gze$jLsF>bfBOjZ7F;n5XlJ5L|K)T`2)KJ{Q^QQ)^9BH0mA!;CV9=LAf-%#%@wn^ z=utJ;T%`t|WmLkVwtFCs)(AXx+vKw}p)_KUQWT?w?vFP_pPd%QLe;ah$?gllki-uN zx6O!-P!hkuZ;n%q{0WYCJ8{qJQ1IgepyluDCCr5ix-t0zXoNRZ9k~G|#-~w1ln29( zv*ZuM7^gW@@jG6sqPDspkd;hB`RL7Gst?eC4?P4Vf$z7SI9b33E?*=}ntr>A&sYl{0BT|raiv%xR03Zn11G)8|A|$`aWJ6PMlw}CoVSHMjb__iA z@8e9b@g=)R{GmOz6g90Ip3#+~c;fPt#~ooop_x4Ha}#F>F^>u^vc)_aN-_8Cdg#V` zq&v2!2?6zww?jC5yUPrqY}#h5s(?8h2F`qH@4L)U;$E8dDF0N_fOsr3Ss-u}D1w_d zx1o`jVlkT(r9j$*!N;Eb-^HO3IEYJAQ?_@s86ImhZze-uwZHA<_`;*yLPSes!_w}% z6}0_lW0Jt7ohhL_i@+E?9}K#8-5V@MI5mwi91%vmXQ#e(97%=nZ1S7yOoUxT(krLT zygpCBqZv6k*#pBI+$7$tcsbqbXoE(cA>PlB94Ll1qqiUStGH7!Bh-^iGYAXq+e=ER z+o*8a^$jIaZ$QMOUvc0`s(}7yiA!9}cxiX*`nR144Nv+MDMAK&A-nq{TfdKdeio3e`?$mW8A@L+_m!eP(UfPajG#=bGV%UhG-zU<-`-UDW&%M z`OgwgWDEOXKxo(1IFVgEn|L_K13nvCz*5}yPI`{K48RZgHOaavs4L;Ompig!76~*f z7290mv}SWjeX3I=ve)am24FL~1P{xzIVoJW)?zAAde5Y%h<`Bq+Pf^J(O4 
zZlZAs?*}|BayOv$DMpcP%SF@J95h2ZP9LG2x#U!kHG^?)Yu@mL$%nHiH!SR(&0=l; zIjh@%ENYn-2!wdC*-;}iai?N!b{Q!pY>IPy?R^%)#GCr`AvfP`MlnOJ(W&zc(AV>N zN~qqVPF|d!@ZBu8=-9=v(_@M=U;c2Tp;!Fr`oR_qEmol>+!LWgc;N^%*5{#?$0gxH2)i#1p$!)6z^hApI$*X{-@rqKOS?{+L}`_ zFW@uO#m}!2UH*D8W=Fz5P}-Is6VQ~Go}3aPB#PZS)3bd=ub)NK2y$dZV1v`*01A^P zEA#vPz1OlU#MYYv3@(Ud&#CpM0M+&p^kCyuf?Q4%-P(r%bLa;otPv_p6X3`M{e&)_ z>y2KMXtoE&RtKO3-5vjlW~plcLreJlKkp2qDD)B#Z{7au6JZDQm9xoW(;5vc$WS|Z z2tf8(q8xZbTgRX2TO2LP_0{2$WpNzWBzvqu%Er-9W z$j}n9uvs-(`s2&lJZ#@?oWq(-tlneIN&K}y7{*aJvnm}!$2a`7HiUOp1^(Bq*YG+h z0M24KNQBdTEeibz_-4O8xbofaKnqLPV@yEJ2Hw6UIbh>`gXw}E0mJ(Mgu!%f5 zASXpMPO|w**) z(nV`uenojpzv@7c<$?tl(SQ=gS!b?aA6)tFKWRPfy8s(8tZvYJ#C9n}TDu8l48hN} zfq&PFxa8zy<~cE>R^k1amcqqw(fbC zALFbjHMCVz;}_;Idd-L@@lH|{M-&ni0khCi3;?jf9M!yBt4n3JGv`Fu~&b^-u$5-Xz(3bFSf3d3s8`XJFBjU`_daPefeAPnN2|#mCP;pR`#sAMC)E+9nR7( zfmthLguy+TbbM45kbAw2hU@OO4J)|n%h2Xx0-NqvadsVESv@5AOl8=Mb7igz%Vtr| zgvjkT>XK-rF#6skO!@!{7En2pUwe;}vv9uUXS4>{zJ}($d~jaf=a<8izqk$)?|;1Q z(n)#f2G95`!+R_7`oF)U#-01~<9}fPGb^EW!B+q(xlaKz&3+<4_+|Vjyk5kE@M~?B zzC#Oig&Cyd-n;XM)~zOz@R^-B1j$2y9VUT_E3IS1`ocA4I}znF%L(jO^cl1ka-25< zRc)SIC9o(gB*R$d(T_GiXG0p*JMqGk%X8QYqHj{Sj5mX{N5dQe7 zvoHnp-Ndszd>Y5LUE?_3Y7L)x6%=24`?UXt4!zuCo5Hv@p`fNQh}>;@x!$|_rUy6Q zbfGad=v~bPEWh^*Vz{_Ar7mh|tPu-=#j>_5!D$Yr3N_%C^JP=9>J0{|$zwqC$^e?m&@Gz99(N%0_x~@& zjtxVo0Iy8RgW2c^ZGR7R(YpVkE0?!oMIm+`1ntM)a-(qFzhS}%ivAFRe{dTOz|)kJ zd`|+}ke6`wZ(^tad{@j1)a>QOGw{1mD3F|A=anogfVI-~(ow+fM}_?JrdwN(X8wOs zf8Yeq0SAgrv;Mi?)jZ>Wr~Xifr?n7eS}qR#f2cn)wouv)s}CLs-}u+TWxho?`!~(_ zuO$Px8SrjzOZHH#*bpxW3;q`c3sYaPM0G>jOtZ3k|NAB`ePZpl7vTIKGQ0W9+EP#v z7XE&S{!yMV9LNy~6IUT7JA#q?f8AB?Pu>rtn9fQ%SN|>>aLrW}zN)!axIRwV!X)#( z-8iAY*7%>Cv=y857uN^U1r%b0OXz*S-+66umR@U(>jSAnnA21EpYKZf5qYGpO_(Ep z>U^kT{qd_mJW>QLcrn{(d>MBW+eeAJ@ju-bx0Sd=7?{`wj{`BndRqeh&Ob;9{!N_b z0mG(S2{7VU76UV}rE0_fa??+5KyYm~M%w;YmYKCDTHPlJNqEniOs(PChqw? 
zY*-SHB_f|C^}X~36Ar3q%gkh&QA5R-W6U4Jf0_Ae z3(V#ER~pTf1+1rCSvJ(nS^RQ+d5g7+!@@6VxvrkCbDF5Mu?JC?@>9nOyIl!1-lP48 zDUt*Bmex~B^SZ|>vOljlPPpbS%xIKZ5R-={W)XvPy2+B>K-%6}Ndle+ql<-e>*CDM z(Dir6+_K?U+aRO}EP8|2@rmU?1(&fs`ScI)FCtbZ7 z5Uys4#_OMfX1(lW(n;&8~s6w2B+w$=+ z&fC{DPIi3RN5ej+!nLp5Uap2{{%O#{jy9sn{-NKVHu_6Nopq?bsv)Bd{EAGdfoy$?U- zB)q7-{c`EPPZJhGCB9JvPKU?zxz@RwaCr=#KYz%vel&|=>Tv_RMSkl(=E~N|JMIDa z4E!yxwR60mEkFNXwZ`%vnQtSP&|#xj(KOA!(J!qkRoJyvJxdV{C;5f>MVfU{%$3b@ zOBAhZlJ}3taSZ?q!@v2M;{T#GK$$M35B4K(`jtaf41HK8Qzv4gv`;L zDYt##mdQHr_|q+0d#iOX3NL|Th5P89=RN)`ot{$$=ON7Pg3dqfwcGFy0u;Y;~&^U%Qs%tkf68sq5NJ^R)ycP@Ixqc z?r$Erf4b{caGf~VI zR>n*vTB7ltYe#ui@&Dz=qEDa$>(|mOUlzL+ToJwe z?~s=BdQRZ57Mc9x4z51QavX)b3I9q}5#Y1>*`+7OUL~yf?HB8wa^?59>u6IN)y@>b zFR(ScaQ}n%ytJ^8boE6^SB~2OW`9HrZ#S>aO#Ire_513K&?V#9S6W|z=XHFIZh38W z{&|JBV%xTdY|*`2_qS*F@hE2KUtSiwA0dH~nv!X(WJa(xyKz&$o^R#5e-J0RSy)@| z1);Jj(pUea{#Q8I*57+A?q@_H6(xntFLfWrn7?QJN0;_-?elB9`bir8v3Z{P3BYw- z|4(VDwNEdv+R9a2CE#DfGU+T4aO=u?VtYuB?Ogk5_4j`WAkGWPLT8qE>kbUggDB;{ zr8w6w8Acs$dXi+*Zwrd4BVMM({uiA~jL!C}eVt2BO2&y9!aZQZJif zb(Ln_jemq<&KUnK$L#-Qh4b^^o(tv7Yl8dltT?|`4D$_krS#PCZR=qKDOK@WtGR$R zv+%bTTld~_hDeGV>0ajjdee`L7=~2|S(S-ZE%xj8`;jXAV0!HRnfyS4m8rS4W4fmG zetmxFyXE7XkE(x;De9v#03ZXsAMft}l|#i%a2uPoJncW4#&F2=bfUo?}pgzO-Bg^XC*YcB`1vhP3&g2G&$WC@xWE$pevs^ua>r?aCPY{4m4}b1E zli@73d=&jTa!A!`(m(B&dRX}g9OZ8i^aq_dV*aNj%LNgQll^`z zc#3uEa{U}1`W)jcg5}e-{`P-s98mu{2s5oC3v$9rl3z|1-uX`pS{=HfS$FWb1yRF8`Ke0A#jK)Cj;K^xLrBlB^w*e>$ye*(1y+ko|jGCD2t@|KlzbhHl-D%YXRm zD)oqzb7HnufT$CQwftVhS)%{z-2A0|iWfC<;QngYy8#pFPAydKf%MPzxE)nv8f8X!*a=m)ifT9D6PMgAY zDo(fl@#%lpMsNWzFvIHK%MEs{-Uxz>!vDbu#Z;p*>v;OZ?Yr{`g#J;1{Lb==9(LKh z0zfu69?gR%+t)dC7$QXDw6-e$Y;b6h;{Ws>YQW4?gv(uUzx~U9J_r~lWXZ$%Zv3u1 z7I)$QTd~wt8SMD}Tz*RwwJ;wgwEdsO4?9Z3^joXP>I-6`3I0nZKOCvvZCh91V7~E~Yc)IDp&yDNXB&4!HH*21pukmLxs=|3hRz3YGxqGMYcy zKgDMLe>4I*G0_0!rgZP_FN(lbq^tiT3ZxyZAh_`^UgC_4L@B>X!O}ASXHfvCL`22% ztc4U2)^6c#&gw&i&WhMXNba{`zTUb^h`W_9MO7VG8{Lj9CP$?f7du{$g|?t4-iw!U-ocqTj4s@)WZU&UAnU+o~^bsjIt)$&zR2vD7i5puuLKjK7_2) zYCrbGA~pp`7rp}LB)(2*8y>GBxM3g0N|{9=nagI6=$@zp?r+X`^Mr=G z5b-dp2Sfmy!Fo-Pk>6ak)M2xJHC$04*Z$k4B|yw2cogWaJ#|U024O5BFsP^j7>-Uy z_|Nfg+sOJGQ)AGC8Um>}C8{ex&u@MekBMj4#u zkKdS3 zNZG*;BUFeXd;o^rE3#3~tBQZcHn*i&I~euFS^eh&{SftZKg*8Wy6pS|kpU;*8_hlz zPGHWz!1fFQe#_jfMd-tTtrt?M`iVX*AL!I{Ve8cKhHt2XcnmJ4v5)klxqXAq({cN) z;d6zxclI&+#lslVt$e~welC4H6^s*0j4vyyV`X8>j{W{K#jyj{PgN+a2n&}SOOO3H z?R2m3DjCH0jL-#w#BVGB9?^?>NwEdutDz=hC!BaQx)%V+sF^c7+)ytA5GDK-IixbY zYG1qbjGt`S!tpjg#r%qlxBTiP;xd!%FCDY=-Kt_D)qp*&y0CdfRoUYU$p%OmM1V22Ww;NWJb5=mKANGUYV9M2 zfiQn`f2KE^Kkglgrs2M4X$E7Vh*uXe8&F<+vMymuq3rDPn@l66vScPOA3%vc{<2N) z_#A$yY5T);#^$^;@ry9JX6kvSpmTWrq` zh*4`?1uoS}@rgQ>LZYopEk3_3>+dgGq6SMln!D&m59w%e=V$xnZXi6*m#8@$uAE@l zey=i8WJzSftit4KWr)v`LM)NSl@-~+;O`eeE{S1AU5tt_lg!#{vKe58lj>tt zY6o^2Y+h=_tdCLRWIELR9x#_KzrG2i3jiA)&f0rV1`$b+wHzGy-MKC%3i|yQumYH3 zQsV6)Z2miH7bLJ}PjX;9#;AN_lU4FXJhwpnNL&Ll>4A4c_S%QjHxDfO7E-30I7z|dXlde6Cghz#N z3v6cIE>CWHBTXm7^7Cz8l>j%b1yK6IBTg_L9eKQ$w>`~OFB{{$;femYY@0|v%T}Vx-(wzj-5q@0SgSci zXe%&dwl1Yoo*ZS!%@~W9PSAk2pc1LFYLAJq_mT zJL~W;S7iK}5Cf1}4`71I>J}j1sll{38lC$Lyt%a{294>at>0t>dJmrZIbItFWIL0< zS7AN)9w5oDBG9Hj!!L<122yHe(qJRL!mFEt>{}?xH33S4v~EApO(Oo{$-=#+%DFDe zb!H^J>cCeXxnK;V@)fx-;~Kzs!w?zrC8`9#&_C&*kcV_1)s_@Q#R~+{J6bQNx4L$u zg0!g`H^De?%;jfB_@%j6WF}$FYZ07Nw=mnyWQjBMyU=jFJAbYIC1;<(OE4QDHKh;J zXtxIIftXH;(CqagUVY1iS(vD%bsro;QNp+qqBDmit7t$nZ=W6_p z>jq(z^9EQMWyd;#EGol{3Fx+MkyULx+y4~Bwt)X7I>>Gng*+TDW~-gVk<*jFwBYsK3 z4zphfH|^6-HLN!$BBiPt2Ou+L7@j=($qCW&{RC+gbrsN~#M=ODscN4?&Ay9ro4*@xz0?+{Igx3J2^vb$Xnk|;a`Eut@?$_!t 
z*B{KtPNGv7Z=jbAJP7rh1?!ENedwEvgjt4VbzLJe22y8L8{##jMb%ZUE?UQ(N zIH;@}fg|@49;YbdrZJY)&}WpCHaF3)510(nYWE6X;!lv&u~H~v4Khs)w!~)R1i7k& zE)TaLqNix1&dzBW1KR`}Fw0AP*A!v19k=btQoLRg=|0%Dx2tU16Yg(vC0L?O6PmB= ztN@yfe+X=;<+CXKhtp}^H8<8(b&w7EbHV|;6Vj>i#3nm6*i8s+ml^;9ofR1`<+~A; zxws+1FY=3J7XE1P21e85j(n-Gc;V$n7L)<4|58{D7GHCLde)0%r^i(vDHha}m}-tG zi3^h)0GxIvxmxhRacKG^UJnG;2&Sm5@dN^K0zIlp56+V0>usUEgbQ#9dn!13{?akE zNat5|fLrv6ssnl>GU_BpES?BCwwW_pkl6J-SEN0KIHd84!TODWofDVhTOX;YwfEF5 z<3>S$j!)7tH8U!7T#BJY-&1B)tI`1bFaJZG_Iu@@F!L-mv`tlBF2O8f=4%RVNMRJC zCW-VlwwSpXMxi*o?@`mpe$^-PSbzUf%H%kypG(A|Yd+&r?6vx3**eKNqJ%@Mkp73| z^V7jToZ$4cl=38Hc$odhN`uarX9vVL_WBt4AM3xkLG?WHaStF4ouiT7 zQ|g=t53)Y&hj@lBPA-zFcPB@LP+hx(lBQInLA=(@0+I^`qB1;r?tp+Ri?vUh1#M{p zB-BY`I6D$SLk0fyQPpb1rqoqm9Jf8fz+QD=5PYQu0}%3EeG8H0_%Rp7?^08$Y**EU zv!wq>Yh(8j1Kp*GDxpS9ZDkODZSLZN-PovJyw_p=<(whz6^V}1HwTzZPP<;czt%?n zwqFI4Gv*+;Pmeop=*f}7xG)n(_BruV2-SKdfV<6KHqi7Ni0S(DzcJmLXh6uFm4`kngO>W{6gg ztS`-O=s2nmjC{s`Q=q`re`kdC5yxg#GZRoR97@feKt5f|N==y`MYT6Z{S%WY+nA02 zUR8tm+e6>@Q@qs2VT&%vD4X#)E!gYxN{Ga4h-da0Sp)#U;=0@EQUM~gc}=P6H4s<6 zVg0z}tD<+|3Kl$0mH0aF+K;l}80q59$8nY-TC{AF+5LgJ>fuxo5J*eU!2qr_Tf!K5 z=uEa+9i0MUoT3QCVLwPgA)cs=B*!McauyoN{zFA->6)H#1dReBKSGG*18U_@v`0=r-Q z#hm)RxkI;^nxbn?ZvJKW zp6z~Bo zY&tUta%6UY3ozn#5^wGsJCUQsd|nO3*n`x(Ol?n#3+_VYZ_Gp3&wCFqD4_45gf%%z`32CH9LNRFCn}_! z8-LOG;nA))dSeb6d1W&k9Tzcq9T92~C5N&!fYK@wcRbzbqmK`5Y)6C>S;*LPP9Jdw zT;bE4!8;Xi`ldJ0Ih^;HmUI#pIaqVz^R8_ZrlL60FC%2O&|J6&DVaPrC zyO_P(7ramro9CHsP+JJBJYBJc@y>zAns6M%5*iq`Ugn2VY{k7}SjKjF%sbGZ{lqnB z1XVP8SmRs_zMpq6OZz^N>OCziZ+YN}+b(^SL7q&fd?67-OOkT&hH|;l1i!lPRRJOt6YpQaX27co8A*cIqZj1*sH6uINeTO`(Q9>%>D=9`1OVxPS{qZ z@AuqkAy_R`TSE7`9&BG-+JRdYH-81HiliNtNA#^|vz7X}XrB23p~Hx>Q$-GdtGt-h(()0W zAATRK@chXlJQmgiQVm-+KQ~-{#8-~kcT?IB434&3|AIZ#mk*$PQNam_cQ*O^`v+=( zh!JGUiUt)&Mqz&F7QMnQ*-zgwt@i_tU2%z`t$P4WShgi61C;kCt0d_FdLHOL^LAgmv65G1rgATOklB5nJ`p zLMn!XaFpfZh*aa!DCnwubtbH3Noc9Vgc?a@F@!$0ZI2cP4rQyxe6EX08rupY6d?Y8oHlL=?7TbFluG6MfOwjggUuf^jIi5d|yH!R)!K)v!Ca{l?T$% zd%5M2raez-c}rY~H%8vN)as_t^18RTA^{SVO#r4{tHDt+4yI*pu;Fl^aa)X(2mecQ zg*$>SC_nHSC%QrF=N7gxz!%(^5RFVud*XJUbX_Y06Jc<;lWMaEL&5P2pQZYOP*{#c z+_MT3&-&0OprlN8Ui=ErJCr_rmHX>*Qc%+T1#dO zAVE*30Ad0#J9r$05ryxA-VtK*P4J9)bPnUyxk?}HU`$quA>c!R9M15^m;PJ`PXK6% z)M_B&?9Cz)GLbJZYy`SR ztJcmO%Hdr>mXOHI6b`7CJXP->1qiyifm3sSEsX9_mlOSlp2o3Py^=pr%Q-zOHwg&G zs{Va9ZX12t{;D3ZPWWTOSmeqjwYR2`?y^8qk zQ3-@p9ZaAU=NCoQc9HGmMBSzK3lMN<&VMN_0Aw89ByXMubqnbjt-M@c9hs018Ie;b z^4J=Ry8%)w1@Jo?5ki-yk{76~3d{_&hRIWF+b{1Tr8-7y&&XV7w9D=18xI`8Q5gal zukVQ2YqGj3S3D$vD@&&*>~3hX%Dlc@K+JOCkVACkK?z#|OGw)iZIJ>pkr_Acnw?5`QQfx_a^tJ zLh`JYSIGARzz}cZh_CNXAq!RvE>)t?NUJcQu^`!~SE}UJuw{o}_u_lQH{1BWFdm6_ zbtOCE*Qd2O>ziN;e&0#`dI1MD4L0&wmRRu*)5W12L&Zlze~fE1P<@IZkS8JT6HH@a z=DZbjBSD}7Jay|d#Mm~zZQk1D!5zRB{b+G0K9O3s?B>lv8vTPB^>z=5W?+!7FFeLc zHR&|l#*>@b9H(!b!Cq%`{kS+aMsEGOAWKxM6i8bltcJd|s8}JpFR2>G`7O!o4vN}& zqSl>~4tAV)q7cM^{0(PFI@$3~KLE$seepDtZx!|6>G+z|%#rq@tVmzOFc+fGz?&uB3# zRI?>o8p&qX!GaBXcYBC(x-Cl%pzdILenw?;X`Fyio4lwc!fuD8X#G*iZ#9&HzUd`n z>Si8uGhiWR`V>6K&!w6NCY&4fP+@z1NSKvZZP%O8nb`?7k)tdwg7#tGewxLMh5>W1exEh(bdKkaM1cK#!gGAx zNN{*`O2y0>)TVzyZA|Bcj^=+C%Z_(aX-Z6&jI5(1#)b>fLqFr;2Gg zi!LFK2WIE-maoW><7kh-v>n)+(;@5Ijon0_xMBOre3#g0-}nmSUfb+zX@VbZHAF8v z&P9&Q&WNd&w#?mlxyPlu4sw>?R8GmNDYI3w`0=RSadhbhn4HD8ib{Yh)iAEC(N0VGE37_}Y>?$pMx@*IRpZ56?^>aE_(_iOo zenf}bUK*n}`>^Bn1({>5u$T%!X6mBoTPx!A)>{d${i4y_D0BsQbU$;x2L&eoK|X+3 zn5sTgWji;=?Bg{7a6YmEeu`U`mb zgO^eG5U40UHI`+bJLFz_++74hydnDwu~4?8m%Lma*`Qv3d5C4S3Y`7CD*GDyNzF#5 z3J8?>LxBU@J}w|?F3);IXOSw7{e|`?1b{i8yK#IWydf@RBThIjmV@&YO}}KI>Wo~> z<335RD>*(q8~JA)5`1T}ZN{XgM6EjX@N8`L^EGqcivD%AC&s(qmq^-tRVWI@+R!|7 
zm&kF9|DhPs6*l;&>)D_uk-62N&xuxpKFK{ddK`>_COmRVyxc4&z1J#g{8-=A#Ue;} zYVe3Yv3-X)ShFp&UXAtSTpl`dZ+0wN`52Gsgm7GgKH)-$3j6y=2C+YwRbOw0fSnGT z`6KP(C7i1|8RF@M@vbAjYzv+#vtA%AlCFz23lhwJv6u69q(UNjcJGgpW-$InpUG#( z3%(sqZ%$6Wo2SvIEBMoUJ7;l8fvb6x*XVN@*l50VNow}3YGZKpE@=UA# zMpTdi#hHoPD#ow#lRORo>}?xQ;5pcm)dbFDyH_h<$;Uz&=Bs7wBvcS}XUpCXFx=65 zQC|!uRUc4U-a>YrHMdt?CNsX?_F~hddA!pHd%&F?KxvUD7EoYKg51tYtS73X=)T~q zFT{!7k+R9D3*>NLDA1bdd3@HJZ6DN|)Q;dKDl#i2n!v6H&oO$UqRCRWL*!Ui{R}vj zO}q{2y3hn-mIj#D?~eu=H3FttFjrD?V}FumyuZ-l+08yD$E(i7)FBi#0eS_Fw+FB1 zB8*!OC9=@)jOIK*thEp3F?inlIJ`gh&C{W$%*-E5U$_g;ezN48n5{EHfS8d}Tgs1= z>rY5)g$)4TEs!SaM)D7(KT&pcPz*bcV00_`RkQ@^gM#o$uyB{Hu_zSNxy z0DNTdwQvHEu$56gVCpMilhuyXR#^jSAkX6o{Nhx=lx=yx6=0dF)y{cprq5OR z_<%XPkh1wedRc0gR@_sO4J~$UoC=|D`YXl1 zPH!f-W%RL>I&MPXb9VKm!ow*hxM7IlN5s4wBR(2lbc*K3oy{7d?B6mA@{`qX|wcp>|uma9#(ZCu~bQ(KYG zIaS&9b19mkGt+jFMQWmHTvW;tQbpy7OU|T(wDW>uptCUX`5W`#pL6}>@(dX4RCk)0 zc+Pp9{^y}N@PoIi3cd~!E~`GF{(R?{Wtd+1#CP$K=t{LzFv$ZAdZ%!rfsZN zj>3#vW&=J=NjxR1=&z6QZIWv@m?d!<@2QR?cW!$`NIq^`{h+tkl{z`j(t&ZUIf7K$ z*;V2qrhte{r$oPdV(#U_h%7Lqxd9WSnw_fXxrTA~QZo9Bi6sCmNzKf7j*xyfIdEZz zYQZtG!dgRKlL1!g0%y+bVV7Qp=v0XmBJpwZX8(m3T8nQJoMM4;#r1Xe!a{=6+lENp z)F?reMK;oITtb7+eYK6`YEP+il5}@+mDQ6o4&!fn8Mt}x17%IY_}mlYmXa8u)EA5- zo%McHls!HtGy)W<<+UHhDpW#KK@e3|O{ec;sf}Cr;O7ryb>S99G-A9D`c!u-(Prfm zM@u_MXNUAEPsB*IMPiCR)Yo%+-_G)xo=a+)_>f}BcGp1WNSCA#FpD@osPlb!F@fs+ z3~NlR4q|xewfd1RViI~OGUklMS&I>9dPzbpR5lJ7o%Q`vgmSYxgIsVt-MmvRqmhyT z8A)`F3KwCiY;SpUfHqRO-N(;Z1WUyOgiCk>HPePBREac!(4SC;Zpj%i9w&% zi^Z*${o++AP<{3nmaOgn0qFp)pENY%+}tF}z4YPzkBO|P@ZS(n-?6m8)Go=)d5MPk zI@enPC&Q<}RC zBvqm06|hxfBHiiJyvm&Ci0$m2h{Fb@yz`s5K9Saz@$2ums*`A!tGMh>=N{!s^KsxS_bcOn=U3C$uW%wg(CBD` z`XC+uw|$R!Zckn*v#$8bWN<}cQh3)Xj-9W>F(c|irwb)9L_0m4iSM7pj`I-A?`a0* zOB?o%`T(#1shqKSe65YJ86vs8(}cz3@nJKgH6wcZn(8s{GiFRv%VoY8d2y3Z+Pp0! 
zFP(mHN+fOOogt7sY!f_d6uvOu69TBXw8f31f>M=obCrWR&!v?1>i3;uas1u|rJca;}2-b~Yn=qT%{0 zZ@itwj_wU+k5LqI^K>RY$Vj;PJiNi4OyVx+>~z+Cv6WC*isg9-^}xd}>t2e{L3(Rz(c&#F z$yQp@p9N@f{o!y;L44`lH6clDAc*0SDVI?P3896;}&=s%2WXz(L;eRdQwcPHz z@sKewAGc4GyMK27te`_^>!?^rzXv_JE+l{~-|y#f?VkApy;$@$m7)~motw!%iO-~1 zmH8bf9HIY-9S$pn7K*(TocQYK>-2Ynt#z7#gmZ(gUbVfMK;gYJ+xZX1LU5A7aHG(` zMpHHB=a+YqRPEc|$qV9l?C$1FK9{}W4Ds=a8Vh?0f&H0hs^mExGzCx7owXBWD3{#z zz+#|poc9<<7F|DN4g`&X9F6db(z9-(UV)qT;N6982b?>O67jz9M?|#q7I4fqJvGZQ zx^*P}c*?!05`5P4x%IHyTKoNDj$AxF zpRuxu6)NWE`!LZD1dadM4b7Uzo0?jiDFCERRz3JXkg z8JDR*n|{vh9cfDf1|Xl1F3uZ%t$be!UaAOi=ZRR@1x5 zFtpXyn*EOG8!Noc&NRcE{H`exhALh)#UKVIvLbFRyGjBr#kCgN$ z%yWn_e;kq`Rq4AX;dgyB1Z6B|4uiHjal){!oZGpxol9&fA%|=}VA9y#u z&?}eldf1_^>&E^p!ldR%h;^7rKrC??{aJ=ju8-gQY20_*JLIc!;4r6WZmq#7hnzmk zxzQ3`_V#ayUmUK5c+0+{nMzXV7asHUTJXjkCvxp}HRJMLRw>$_cH(2ibDP`yTdq01 zVsbUybe{fl9FRm+dy(6h(`9wWUuGpN*b+asBl{qUpjdx(S%WU55cJ1M^s1B3H$*ce z*wd6M4ZO8*r_Kdx3$86%A%!z42-6tL$?nOgsA>>VMJ8)HZ!Oz+!_p3zEUxnyNsd*M z+ZuJcHObxpbC{t8NqUUA|RaY#FyR~+4#)tP$xMnGO`%c;_HjbrnVmMP8&t!2@HXV+UZiHwv-CaS)*{pnHdgG=)ExfE zs4L4J20_kOBJyPaYcKkqyo^lEeI~__zoGABil^R3y-c59%GCoSL}Z#haxUkfLtFpm zN2;C0{3^_~44-G??k5r?7!?bi(F<(l%)Wb7;*DY|&GEeqlzLy!#J=--pkh{j_mD^< z>nSVMXXb%67qS3^nVKam;QoT^5uh1#`34@m^CvR1X6RI)fm*>`XF1RA)ARJ*Gz1~k z{#`$SR)G}SbgsS9OHj2AJmJ^p<=X=uT(`}j8+XM+!6EM?6}F=eUi$lOT%sXndE@sf zO_AyK+pBwN+%Hz&XzElR7Ce{`XI20NC1>&N@2vB_@>ZU*_7NA6X=>vqeOGB|oi%AQ z`Q$oprDI$#<=AN5mV{2M6=OVhqOYlM`dlk@imq{`;oizaM!~xVUWs!z_{H*tzqi&b zF5~PSJ~!pLeVc0|rl;JEN33S-NBa3p%Ygklc*_u(!P~R8^~GtO-u1n6W!biSIIw~s zU8A4&a4ent2vR>~q6yOBdsUY&Nh@%woEeD^&CpK>hJeH}= zH<@IZ%J{2b@wlL+%wR*)xmUGr=~-WdwRVn?(38)!-Fu*IW?#)~(>5P#;~e6M;p+|e z+(;i!mHM^9Qc7|(&eS0$SgbCk{LIm_7J_&2Y|R0WExmZ=f7~r;*rF~ApoFv zEw3rP>M5;QUR#_TeJ_(~SrdZ)ahmkB2ZkP-H{Zz_5jCVsn<;2Fzp(|P_|dRHGJV^R z>>Ge*{oz@06|*x;TgmAKl?<%Z5})( z3r7REXsjKdZSNl!X6W%g`sOn?$t(v$oOh}Urx1f37q_?ooo4rypzWr&L}sM!=x-@+ zWb+J73xPR?3>E9cU!IQbCZr3?wk$}!ThW^m*QlRelKhdkH|#M-iK`*2@mD?H@-kVQ z;M|SZ6vNB04>zfl?DkxEd#XYN+AKZ>9#yf}H*KAMsE~~If|hRMakUFE!uY5QWo`wT zWiCZu!-h0YMhd<>svDYOQ)&8WZ%;&RLk!l$RNXOZh`Zffo@H?+azRD<#99-o+yVMF_yyh9uDd`FJNUEN#bqY5q%?`*Dl9O^oq{p<*zVPtlJ z2X*Cf>yt)G8Y5r4(pt7NNvImX8)~)8C?7;9?WQHf-#TcI=g;JC@}1|%g~}w+yRdU} z{V17po=VkcQ__vEt%V>Zrhb5E&(%+`N%&T&o3SY4>0N@IhotLA<$R=+uGvA802y#~ zY4DbU`*yiyu&z*yPwVPxF5n)u=AP%3%5COX$}AGB^XiQvhb12isQq9{lJlNn9wm30 zK9iS{H~o=w_8#xvM7+zE`@m6rytWLZzJZ&q2AdB~WS%99iUZ{VK_}gol6&rTV?V6g zXy5Zj)mNI?)XTgPr76EO0IXUa9;PQ~rAW z*KId5hrA(2elrM(?S0G?-(7|cTaKp4sj)sxj0m}Mm)BS0mJ0rHh^;~Gh<$zFcDAfM z2oAkUP74)1*{j4VD{wq4)KWQS@{REUMFC(jR8u&6<`xBYz3oViNA-5S*}>a+O)s7c zv)xaq6KOJP-M#;FxYb#g0p>YC=0CrCSBbSlhveY46V1nFbr_Nu$Q$>(QPi-i&!4da z`k3*ga_Vlup7R^IS_p4FztVeyKEDm}tZF6~#OqsRZ7=`0s$#`#lER9 zG|C`{_D{?_74;}&swcm?zYkOyQzsZ|y|5Sf-!0734>i?FaZ6viHN59R`rAl19)Fqw zBhSMUdT2Ck!?#BvpjMpqxgFaN&z;$!J8x1cc^mp!QnE!NN+s+{+D|wNNOR9UylK7h zS<15$=MLAV=+2?B3jN(?YO^Zm^Ug|dWNelUaK2!kcd3apZ{9pF?CrOSEyi2~SoOxz zJ%=;st!8P@XKYC2X|s81bY9WHUnoo-YwjcTj)HZJiLI$O;!=YjS0#U~Rauj%_sP^B z$}$cm=0=?8Q`-)0V*%cPwr-WYUl%NJm}%rcHfC zXVTMXLa)D7skGQ7O-y_eI>0ss-lNX0TRH_efBzRB0o(nLF|0>~m&lovLFIvd2*f8bq@rai@ML!+RU~K+vM8P)m zhw7Xl@i~H`OwG*Dh4Cnd=Yml6bC^`i@uVERl^m;>e#UO+r-RzK)knPZoZ$2!VePg~ zk&v)bJ{{<#Dl4zq?tZoRT@C%yWWftjW8;0o^lxf)-rY(;gv8MSH$IHfxI6?$?8rXI z+Y*w0@6|twZ0q?g@_vIUs@EU$b-{B+vMaE9=WpFCO=G}ikrlcrJm!J5`N&6Va>%aA zOn)bL@_QPeRr$uVoUe(Uy9KcX=3l-EGPG67E(Ya@B~^;f2wxhmk+o4QVJE(@Vf+z9 zb1XEJUC`JrJQv(}L1YXwZw>YElpJf~GUU0Y24M}IW@&a&%|fZGEPB}G%O=wh#Os@{ z%itj5@2?_wpPV6NYqSaZ2i`#P%8KP(*ei5s?_&JXovoN0d6PGI+xu;>oC4$Pj_ z)Vw*}VPnu=$9rGDXoQzsbDZv4COA*Kh`PIA=#kz!lp?BF*K@1>xzG;BmrYL|cGu`? 
z>NjzwR^TB3nZo6#PZoa8an46<1mAHgLPEhH-kHMjpXqs8td6Uj1spDe3 z_F~*uGpacQ(lrG%iOWv5yG=ZwcvRUaT9pdQP+zR`{>VfhqM+HU#nuL+bum;emiylA zckI!l(Q(}yx$l#!k^Ua7$O^7f_5t>=8);Aw9d_={O{4PAowf*_Q{TgDcv;5S$R%$o zXGo8>O>6_F5Z8gdTct$4blfYJ7pMC_&fYqps&)AvmJsP~knWHYr36$`Ktu$jy96Y~ z4FZC6i?kr6AP6WeB_Q1)5)w)&t(#Q9_|3A8=bn4-_rC96M?E{%Uh8@0nVC<_NLJAmzXHzo{R6oA}h`HS zVmo*IbNe$`<@3{LR!z7Ab4yb8Gk<(?1%GkH#ZUd$8z|5=en8_7 z0+>xM>ZtUInZBbNmZL9k!WrNtf2qkGD@n8&vs(rRk~TwSZ(;+j6)993_~cILDfNp!1qn%4zi8@6Wp-{;pt&7a3anD2@UA*3ea!3kjj-AxB8!d1k{6IrmGt4CW=xuF#_4Cj* z+*Q(HS3w!T4Dg>W!)N=>R#cLYN>O|5L8c}2R?#|SG5T9-xE#^v4!N%z%eo(na8|s< z2eEPHWZknS9rps`{pV_)dre8{IP}E*Oi0xInUBLfOxw#=afn1?)i_G>|9jp2>U{#aM?9k}Rv|2G6)hq<1`zs*$*nGgO91V)E3{}s*z=!?)!S|>+(yt4)F>(Pp})ANI{m> zYCXgo|Rg1#09+P4*Jip z?4#!5VqG749Okmg`hZqHT}LM%SvN(Z{H3lrQ-nVn?Gg_**i%lb$KFYqIC4Z&tdi#yVGaTl8KT z*?FWZc`uA1?4b92MUSX=EG(+m(s@-X-xC;)3-L`La5WKYpo){Btby*ynO8y*47@jB z=%*ux;k1lQ>kNkObkJc6w52WN!u%=w@m4-VU3&SMYgFHQ#!M3QWRz(n{T;E zh#j~DXj8Y%Zud6ROod^`n-`CtC%8VjQ|1Wg1&R9GxFUD%o^wg7Rlm+QB=fq<+4O-$ zfxnAp!amK~L|}7@b?-Fkf&C4ygU{3jmt@HbXR^73jCCbm-;a@em*RgLUrSkEOEIZ0 z0DnH%gU+0AIQP8x$R(|7d#8FXr4OACkn4)Q^+}`c!#?J#N2>Y!Om(^@UTyAxnje6OZvl1 zIaO!sp=Um9t^4F+0X9=g%#{tw%K9Qz>dtNX%6eOlVe07+wbA2D&^os>*5LkLcnh** z;UVGftpB0r8t+Elo_aI2$z}pItted4xr&>6J*IPgXBLA=-1=$m_w_dEC@hvm zExCN|SSu?rd=dj?%TAZhJ35O|b(ZCg!q^QdX&^GBYKR`qk(4>dcIT{r%khVRBc3@sFdFZ9#5---c29+FR@OIMBzDUxYn3W_K$6Ls z#yFX(B~e53RIP^0*avS#P6vVFdXo{Jymc)sugq7w?fUWquvEs}Lto;k{wTMuP^b{d zUU_(ox`!ssXD{*w8UO~#m)`k0y%rmnWF+QhN#4>7BxY{Hk|vO8?bRGY5t0;=zDq3i zpWK2PtVlhHXC0rvh_rZeBQ*4s;u>3q#ROhMij`Cvo7YP2Y^0xptqcc*V|Nbq~;**yJl894TOZo**Pzs9=$eouVyd(m!yoOADp1as90{5*K*d-rnR*R zP7AdBb(rTJwB7PPd?@$pmRl&VHYr}7*rp9nhE_FilOP1roq$${PRkq6cC2_lBf`FMZ{MuM z#Mpo=Gb5rbLqo>qdDsO^Oa z>JVz8#{&sjx3}{bg4ASI=k)Qln^d7EXU&>_rJ+Tj>Uuua_0}ug7uGC!_vP|h zwszk@tFk^nq=l}Ip`yHs7kz8+yaKE(zP z|Ho5aSyP=@SlFySJK?B^Q0J-6uPb_u_%f(T5E8g6GF2X8E92dt31OC5<|Qxb(@Rbm z(9`p&>7t1VqA@lL=-}C8cYJhdv$3I%qRc8$9H@jJ36#)hyuPrxVcKm0%-|E{!bA0j-g?kdVh$fiI*8U~*A)%+C7Lhh zIml<}=P|RMSyNz_hy~?JCK|ekif77mz9h`MBdPZhCKSuy2l@hj8FWeeSho5@t(Q2~ znJ`yt77!9VeT%A!qb3j9gvL-VX)yzni!xKdO2X`jmD;r65)xzsn6ggjEwtZs1lj*a;KOzmRHOZ3Ta%ctSZ6lc&(RYvf#Gd2;=2__*_JeEAR zqRu;%1BqQ7N8}aPNRo~tH1&cky3SjKNd-)i6B^z( z`RDJ6IYd2J%Na<`HqSMv(%-*OJ&3EM6{?E4%<4RSnq|}%d#;-cf(7=NOB_cVu$-=wt6>g z^oovbLL1>wcU4{D$JE0UPULvotXZhXB*fQ$;;zp$%_4a7!W6i0jY+K&_0a+!<1;^V zkO!mD(V?DwUvZxof27cqAu|C=n98A+6B*r1K+JDymVMl{XR(HWqCWPC*AqT3#&MIQ7Amc`}jAb4y5E#ZB&cGrgVtp;; zJ%`27U}n2Q#pJKe)#2F1nXGexfTfalMXFmh6^4qfR zf$nF&(~eZh7Y6Ed(qpcTD*FbJjJ`!wlPV+SvS;Q#(lu5Z=wQga+K|cD*)?!lgvUb& zE7<$MCTZh>Bt*8nkoGMMrA-68iK; zaCN;fo|n#AAL)p&RLXp4(l;R+$mb#he(CdcE*hIR9WE%NwKCQ*&4j1x<*kxK34R5^ZG!BdXkl^{WR z-XT=GO0yn{OL=Dx$1Uy6PUS>`-brjFuP?`iWCnEa&*Dy!-pJb$TgB?AjPuNODkWi; z?Xk zJUY4TU?$zoW3dI;nCbRu@3@MtD~}b`uKc(DEGzw!tvn6ndnOiB;~7b zEoDHy;=aP!6D7pH-*S2sHZ%hCT7e^4`)i@Q5^R zj&4ux$bNND#2SDOUz74huc}Wxgcx|Y!s?GJw%q(!_vWyT2G8Aw_~q{Hd)~veq9f*U zwT&P~P`p!lst-v<@p$%b7de_E{q*BKXLV&|tNZYst!Yi(E<(H2DBcIvNk|oquks=! 
z)tcq#=%X*9!`Yk9&5e$+JxA5OmU{BxD>Pm6lPl>>ZPXvztPJAJQ_fS3-SLQvUK3kw zMzEAVDM+oZ%sNe}5ZECCN;@0c_O-IMO$^uE%~wbAty}LU1TX3<)b#E?_d9v4cj986 ziBGRTx4O+VZ~PM~`^MeMC*oEiuJNx#mk9K6Xw^QI+`XpjQ!Ly+EIO|%XjmMv8nUWW zIB~K4{K+ZXq=F~*p(h}}I(zVG()(x9K!=%^_y;8Z^N(2{@hGpd-bzubUoi4|n;2j} zT84jRv8R9f`kPbfsJZYj5rXV5Kdn38WY{#>-W8jg^Y>J@mVoS`qB4v7%>> zvZJEw9n6$oBpvy(DEF;kqSc+NywwQ!pd%%s3~xTaJ6YI;T)9;Y(Fk@yG+e*f%!m^p zIfTGK@ZhVu;h70YNps?9f2BIJ2|g{`*=BAoD{5ASR*I8q-x^_pJ!VFfDM+aOxz7;J zIK7A5g%Fu0Zyg-BU~8Tfd+x1R0H-s~N%v6LC!zkyHSe-6E`G;3fUS7atL&ORQP{JbQm@PX_J=b=tdsP)p7J?A z+tEdVcT}NJ??bRwkZ+vH-9zTrr#&%gg)cD@>hf=8r{&cKeiXBjb$oB=I7^MGLmkOk18C8v(MNyLy-|S^3w@dLV~b$bINK$@SD1uBOxu-^!=K~} za8I=lWCn1f1oBcP^XSjn#Wz3CGc$A(R8GhU-H!o4tLuEmMc+_;d5#Z`Rabj z-cv1L#K45e^FoS|9A>2c35f5m9mGd}Ze(Pasd)QvDTekjm3903tLUtb%ddEz89F}` zF9I{G*;UhT-j`uVV0GZ$u%>;`(_h=KF1xp|Y*llRzRDOng-xVD_;Tk;*5s+%A5)1K zxylJ$G2;n0b|-y)>^In7!qvE!4S)abCp2+eo`!53Kxz6%M6VUDLN({-MPuk#>Stbw znSvu)0}>q3bhmEaS%^D$U}!*ql92)fuV!_zz66K7wFFC4a`TBfx+p(*97Xg7dGOjb zAG&PZLx@h72gj92yMnbM9CPLKEa!1dl2!uLvB-E3<$m6<$reA;!tntJ>by}*r=-$B zjDbmFqD8b#(qOhpj08-o0DQT zaUDuxDt(yqt zr~WQCHgTQ3>uY4mE_g5D%YB|D!I~$tkO)%8^=Uu{3vooQJngG)PPm1uAo&C#*6{Q} zVo%xt$co~vX^oT9$MN|O9!7N*=R4KQQ){Tz!k!G(so*%}tB4`~bwfWcYL{6#KjsdYe3l8O1A4U7*xDs=|; zu@8E(%}ag(;+SZzb2qb1qBhjU4B5RhdFq=GM@iOYRgbL4dRWu%18N7>+~ud@PT$D#rhJkwt=0SW2>uQ~m>iSpze9t0~Ny>5X_or=!~Eva z0%-pD*n&>dGM4cgaPsT1B2q7kjW+w$3lIxKnct5-fxpqSE9AEkIcxZ4U)NkZOu=Hb zh^(JA%Wzfui3m#LjCv=fn|fPF%CVkLpf4`O#8%+JmDM%%-9bam^?HziDb5S8Yl=BKLH_p@`6@+OBmgqRH$^? z7OwWKgM-b9LFf6y7*o((p+juq{CRAX^lk8P8PQD2Z$a-lunFq%d*L~+1-r%ZY80w#e- zD_ZUz+{JuT_WI7T{=*{2zOWt(+=#UAA-g!c7bV+OX%l;D7Z=Kmo9RVp1Tpmy7yObB z0qwrdSd&znhmME$%A%+x<0AGLR}9VE_4-IEC&pVhAMY5&=VRS^m*drw$3M*F$i6mT z+vAzKY(jQ-m`lMNH>3xhRkYBIO5Wa0idx-1@sTM#cG<1wjcVw~gfd~>;28qNk6uw5 zl`rz+;=CQ!SS(=~l}a$%8k8Tu%XmQlPAEWVv0x0$uK2d}Q7<7Oz)vukcZ zrPmv7XTa3~JecW2l7V*UMY@aO8y6GtAJs)3%DS3EkJlM{vRpznHMs`Y1;rbMX6cV5 zB`NI%K{F*DCE3IpG|>Z+H^UONyXtXO;}0sUx-^;3doSD<`nET4 zlNiZLn(9b1+DKDOT-&H-L5{G@jc${*pc?H8E}eN0e;yxtVOH{oLT)F2wr&+DJ=J<= z!}GqwN+62pLkw^Lzph-ddft0<|yStM;SVdQoZ1Y@`>9YgZm&1Q4;gwcbMw zsXr`ys`*u-59lxRWp%!N6@^9&cXkB${oXh|on~VdWzr-&&E-TjohLMtTV}Ukw%o9p z_yjvBPI1fiRPLJ@x(x^-0(Xl|Yq!TIgihZ(Bx{o&|6qN{W!|WlFYEww|AMZO3GKX{ ziO~>R`hE5NyB;i$^`-~kpMPVk8&Jm@a0ecABDRei-Zm44A;-`3;g1AR{sToxFp_Od zDEsriq^HU>hSQFf_trM~|BHi+)LRQR^)cc~Z8uL^97Q zz39HeG6@2;M`h*+DByDw9wz(Y&=aPB{RX>`4v~$s#3bw;y}lj~2d@l+N8zF;W{5sU z<;$MNJ$DDAuOUlwqI=>Pzf@=0P)rG&sN8qyD~?u`JF=ahSM=4Lk`HAiZRR(~_I^M< zw1_69IikmxI2dMzg+Zj|$nES4RrBF`&h`ylR?uZXiiRqm2{%=S#D} z(Wdd(etRb%b3+2(xs{+2Y`qI%0%7n%rOmJpB*sIdIw!VFR^R3Y4$t9^9 z&mstzmOQWT%Zgon3XCC6==2GQ3?n={FyYkr-VJJ+;l5CC3FjueW5Dl9{kFUzo}Cfq zGxbBxr-3=7d{)B;pyjg{WljQ06bi<1C#(up@{ls&%N-4oHp{URu zs@-_dqdsoNVzP*U5IF8ng%~n$ma*3PFoX*1Cn5D!AInbLC9_1VL~C>=AnS0SpP-7P zd|7p$P{zEf<*HyBI53(QFP(A4y`Q>SySAMl95trW`el*MHl1uG+Wi-9+1cf*_ zQhXGd>RQz?ayeO17ymKUwXV+G=_n2jRF#3NDE&Y3+5lqmnddW*6f&n=2`mUY1}$?J z$k&fFNHu`LP0q4id(2UWtAHZ7ywp;@R$@cV8St3JjPrNN4{Q7ar*IcXe30mHv98$p zDOxcb^+uWus4O%)H$}xnyoSGR<&8|*-H(=9p%b{;DId0I15NwtPj=CIw*gWTn7F#b z-0Fs8uZ1=a8U?xoC*Rq6EY z@QT|wBr%8~iCwapC1iIa-g&_fN5ZHbg?&Bh+MltFSa!D{`3Ju}Wk?fkoMFZAoE&2# z@5A)6Hr=$79hJq#%@zvgMyq94R8e*NXI_g77cyZTt%6V!48;_$&$60ncQO7V(qy>J z@Fpe~~*FdyS_p-i2!hk?)lgq1fN z2lXD$q`aO*czK+yllID_dx{XA$bs;bl(PJt@RX&DH!gsj->gp9U|KFhH;~z2 z`YUL(xLEEy(%F@7a}_jzGi>tWXXrsupkF3hr@Kk}LOkz~4;kWq0=&Kpnl8o#mzwdo z(+7th^+Va}B;@>Y`nq$^sS74;ub4IT?$5n$F zS+FN=E~%f}XEgfa1179{HU+s!-IUGNBaxOqz)J{zn`8)qAKwrf7cKBDTH`he1^vwZ zRQrzUgjy7qdi8H2#1|RP@QoIi=#AnqM_%;1xNz4N>q!U-Gf>UsuIMZX*u4>lx*n@# 
Date: Sat, 21 Feb 2026 03:19:58 +0000
Subject: [PATCH 27/42] feat: update dependencies and enhance configuration structure

- Updated `apache-tvm-ffi` version to `0.1.8.post2` in both `pyproject.toml` files.
- Added `pyzmq` to the optional `cuda` dependencies in `pymllm`.
- Introduced `pymllm-server` script for server launch functionality.
- Refactored configuration imports in `pymllm/configs/__init__.py` to streamline access to model and quantization configurations.
- Created new configuration files for model and quantization settings to support enhanced model management.
---
 mllm-kernel/pyproject.toml                    |   2 +-
 pymllm/configs/__init__.py                    |  15 +-
 pymllm/configs/global_config.py               | 606 +++++++++---------
 pymllm/configs/model_config.py                |  31 +
 pymllm/configs/quantization_config.py         |  18 +
 pymllm/configs/server_config.py               | 266 ++------
 pymllm/engine/launch.py                       | 116 +++-
 .../scheduler.py => executor/eager_runner.py} |   0
 pymllm/orchestrator/async_disk_io_process.py  |   3 +
 pymllm/orchestrator/detokenizer_process.py    |   3 +
 pymllm/orchestrator/model_runner_process.py   |   3 +
 pymllm/orchestrator/parallel_state.py         | 122 ++--
 .../orchestrator/request_response_process.py  |  10 +
 pymllm/orchestrator/scheduler_process.py      |   3 +
 pymllm/orchestrator/tokenizer_process.py      |   3 +
 pymllm/server/launch.py                       |  17 +
 pymllm/tests/test_vocab_parallel_embedding.py |  24 +-
 pyproject.toml                                |   5 +-
 18 files changed, 624 insertions(+), 623 deletions(-)
 rename pymllm/{orchestrator/scheduler.py => executor/eager_runner.py} (100%)
 create mode 100644 pymllm/orchestrator/async_disk_io_process.py
 create mode 100644 pymllm/orchestrator/detokenizer_process.py
 create mode 100644 pymllm/orchestrator/model_runner_process.py
 create mode 100644 pymllm/orchestrator/request_response_process.py
 create mode 100644 pymllm/orchestrator/scheduler_process.py
 create mode 100644 pymllm/orchestrator/tokenizer_process.py

diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml
index a8dbd98ea..77340b29a 100644
--- a/mllm-kernel/pyproject.toml
+++ b/mllm-kernel/pyproject.toml
@@ -18,7 +18,7 @@ dependencies = [
     "packaging",
     "torch",
     "torch-c-dlpack-ext",
-    "apache-tvm-ffi == 0.1.8",
+    "apache-tvm-ffi == 0.1.8.post2",
 ]
 [project.optional-dependencies]
diff --git a/pymllm/configs/__init__.py b/pymllm/configs/__init__.py
index 86af57beb..a23de035c 100644
--- a/pymllm/configs/__init__.py
+++ b/pymllm/configs/__init__.py
@@ -1,21 +1,14 @@
 """Configuration module for pymllm."""
-from pymllm.configs.global_config import (
-    CacheConfig,
-    GlobalConfig,
-    ModelConfig,
-    RuntimeConfig,
-    get_global_config,
-)
+from pymllm.configs.global_config import GlobalConfig, get_global_config
+from pymllm.configs.model_config import ModelConfig
+from 
pymllm.configs.quantization_config import QuantizationConfig from pymllm.configs.server_config import ServerConfig __all__ = [ - # Main singleton "GlobalConfig", "get_global_config", - # Sub configs "ServerConfig", "ModelConfig", - "RuntimeConfig", - "CacheConfig", + "QuantizationConfig", ] diff --git a/pymllm/configs/global_config.py b/pymllm/configs/global_config.py index 43783e946..1761697b1 100644 --- a/pymllm/configs/global_config.py +++ b/pymllm/configs/global_config.py @@ -1,349 +1,321 @@ -"""Global configuration singleton with all server, model and runtime configs.""" +"""Global configuration singleton aggregating all sub-configs.""" from __future__ import annotations -from dataclasses import dataclass, field +import argparse +import types +from dataclasses import MISSING, dataclass, field, fields from pathlib import Path -from typing import Any, Dict, Literal, Optional, TYPE_CHECKING +from typing import ( + Any, + Callable, + Literal, + Optional, + Sequence, + Union, + get_args, + get_origin, + get_type_hints, +) -if TYPE_CHECKING: - from transformers import PretrainedConfig +from pymllm.configs.server_config import ServerConfig +from pymllm.configs.model_config import ModelConfig +from pymllm.configs.quantization_config import QuantizationConfig @dataclass -class ModelConfig: - """Model-specific configuration parsed from HF config. - - This is a lightweight wrapper around HuggingFace config with - additional derived fields for efficiency. - """ - # Original HF config (populated after loading) - hf_config: Optional[Any] = field(default=None, repr=False) - hf_text_config: Optional[Any] = field(default=None, repr=False) - - # Model architecture - model_type: str = "unknown" - architectures: list[str] = field(default_factory=list) - - # Dimensions - hidden_size: int = 0 - num_hidden_layers: int = 0 - num_attention_heads: int = 0 - num_key_value_heads: Optional[int] = None - intermediate_size: int = 0 - vocab_size: int = 0 - - # Context length - max_position_embeddings: int = 0 - context_length: int = 0 # effective context length - - # Normalization - rms_norm_eps: float = 1e-6 - tie_word_embeddings: bool = False - - # RoPE - rope_theta: float = 10000.0 - rope_scaling: Optional[Dict[str, Any]] = None - - # Quantization - quantization: Optional[str] = None - - def __post_init__(self): - """Set default kv heads if not specified.""" - if self.num_key_value_heads is None: - self.num_key_value_heads = self.num_attention_heads - - -@dataclass -class RuntimeConfig: - """Runtime state that changes during execution.""" - - # Distributed state - tp_rank: int = 0 - tp_size: int = 1 - dp_rank: int = 0 - dp_size: int = 1 - pp_rank: int = 0 - pp_size: int = 1 - world_rank: int = 0 - world_size: int = 1 - local_rank: int = 0 - - # Device - device: str = "cuda" - - # Memory pools - max_num_seqs: int = 0 - max_model_len: int = 0 - - # Scheduler state (mutable during runtime) - num_running_reqs: int = 0 - num_waiting_reqs: int = 0 - num_swapped_reqs: int = 0 - +class GlobalConfig: + """Singleton that holds every sub-config pymllm needs. 
-@dataclass -class CacheConfig: - """KV cache configuration.""" - - block_size: int = 16 - num_gpu_blocks: int = 0 - num_cpu_blocks: int = 0 - - # Cache dtype - cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = "auto" - - # Sliding window - sliding_window: Optional[int] = None - - # Prefix caching - enable_prefix_caching: bool = False + Usage:: + from pymllm.configs import get_global_config -@dataclass -class GlobalConfig: - """Global configuration singleton containing all configs. - - This is the single source of truth for all configuration in pymllm. - It aggregates ServerConfig, ModelConfig, RuntimeConfig, and CacheConfig. - - Usage: - >>> from pymllm.configs import get_global_config - >>> config = get_global_config() - >>> - >>> # Access server config - >>> config.server.model_path - >>> config.server.tp_size - >>> - >>> # Access model config - >>> config.model.hidden_size - >>> config.model.vocab_size - >>> - >>> # Access runtime config (mutable) - >>> config.runtime.tp_rank - >>> config.runtime.device - >>> - >>> # Access cache config - >>> config.cache.block_size - >>> - >>> # Update with new server config - >>> config.load_server_config(server_config) - >>> - >>> # Update with HF model config - >>> config.load_hf_config(hf_config) + cfg = get_global_config() + cfg.model.model_path + cfg.model.hidden_size + cfg.quantization.method + cfg.server.host """ - - # Sub-configs - server: "ServerConfig" = field(default=None, repr=False) + + server: "ServerConfig" = field(default=None, repr=False) # type: ignore[assignment] model: ModelConfig = field(default_factory=ModelConfig) - runtime: RuntimeConfig = field(default_factory=RuntimeConfig) - cache: CacheConfig = field(default_factory=CacheConfig) - - # Additional metadata + quantization: QuantizationConfig = field(default_factory=QuantizationConfig) + _initialized: bool = field(default=False, repr=False) - + def __new__(cls): - if not hasattr(cls, '_instance') or cls._instance is None: + if not hasattr(cls, "_instance") or cls._instance is None: cls._instance = super().__new__(cls) return cls._instance - + def __post_init__(self): - # Lazy import to avoid circular dependency if self.server is None: - from pymllm.configs.server_config import ServerConfig - self.server = ServerConfig( - model_path=Path("."), # placeholder - ) - + self.server = ServerConfig(model_path=None) + @classmethod def get_instance(cls) -> "GlobalConfig": - """Get the singleton instance.""" - if not hasattr(cls, '_instance') or cls._instance is None: + if not hasattr(cls, "_instance") or cls._instance is None: cls._instance = cls() return cls._instance - - def load_server_config(self, server_config: "ServerConfig") -> None: - """Load server configuration and sync related fields.""" - self.server = server_config - - # Sync tp/dp/pp sizes to runtime - self.runtime.tp_size = server_config.tp_size - self.runtime.dp_size = server_config.dp_size - self.runtime.pp_size = server_config.pp_size - self.runtime.device = "cuda" if server_config.base_gpu_id >= 0 else "cpu" - - self._initialized = True - - def load_hf_config(self, hf_config: "PretrainedConfig") -> None: - """Load HuggingFace model configuration.""" - from transformers import PretrainedConfig - - # Store original - self.model.hf_config = hf_config - - # Get text config (for multimodal models) - if hasattr(hf_config, "text_config"): - self.model.hf_text_config = hf_config.text_config - text_config = hf_config.text_config - else: - text_config = hf_config - self.model.hf_text_config = 
hf_config - - # Extract fields - self.model.model_type = getattr(text_config, "model_type", "unknown") - self.model.architectures = getattr(text_config, "architectures", []) - - self.model.hidden_size = getattr(text_config, "hidden_size", 0) - self.model.num_hidden_layers = getattr(text_config, "num_hidden_layers", 0) - self.model.num_attention_heads = getattr(text_config, "num_attention_heads", 0) - self.model.num_key_value_heads = getattr(text_config, "num_key_value_heads", None) - self.model.intermediate_size = getattr(text_config, "intermediate_size", 0) - self.model.vocab_size = getattr(text_config, "vocab_size", 0) - - # Context length - self.model.max_position_embeddings = getattr( - text_config, "max_position_embeddings", 0 + + @classmethod + def reset(cls) -> None: + """Destroy the singleton (useful in tests).""" + cls._instance = None + + +def _parse_bool(value: Any) -> bool: + """Convert common CLI boolean spellings into ``bool``. + + This helper is intentionally permissive because CLI users often provide + booleans in different forms (for example ``true``, ``1``, ``yes``, + ``false``, ``0``, ``no``). The function raises ``argparse.ArgumentTypeError`` + to integrate naturally with ``argparse`` validation and error reporting. + """ + + if isinstance(value, bool): + return value + if value is None: + return True + + lowered = str(value).strip().lower() + if lowered in {"1", "true", "t", "yes", "y", "on"}: + return True + if lowered in {"0", "false", "f", "no", "n", "off"}: + return False + raise argparse.ArgumentTypeError( + f"Invalid boolean value: {value!r}. Expected one of true/false, 1/0, yes/no." + ) + + +def _unwrap_optional(annotation: Any) -> tuple[Any, bool]: + """Return ``(inner_type, is_optional)`` for Optional/Union annotations.""" + + origin = get_origin(annotation) + if origin not in (Union, types.UnionType): + return annotation, False + + args = [arg for arg in get_args(annotation) if arg is not type(None)] + if len(args) == 1 and len(get_args(annotation)) == 2: + return args[0], True + return annotation, False + + +def _converter_for_annotation(annotation: Any) -> Optional[Callable[[str], Any]]: + """Map a type annotation to an ``argparse`` converter. + + Only scalar, CLI-friendly annotations are supported. Complex runtime fields + (for example nested dict/object handles) are intentionally excluded from the + generated CLI surface to keep the interface predictable and safe. + """ + + inner, _ = _unwrap_optional(annotation) + origin = get_origin(inner) + if origin is not None: + if origin is Literal: + literal_values = get_args(inner) + if literal_values: + return type(literal_values[0]) + return str + return None + + if inner in (str, int, float): + return inner + if inner is Path: + return Path + return None + + +def _is_bool_annotation(annotation: Any) -> bool: + """Return ``True`` if annotation represents a bool/Optional[bool] field.""" + + inner, _ = _unwrap_optional(annotation) + return inner is bool + + +def _format_default_for_help(value: Any) -> str: + """Create a concise, readable default string for CLI help text.""" + + if value is MISSING: + return "" + if value is None: + return "None" + if isinstance(value, Path): + return str(value) + return repr(value) + + +def make_args( + parser: Optional[argparse.ArgumentParser] = None, +) -> argparse.ArgumentParser: + """Create an ``argparse`` parser with two-level GlobalConfig CLI options. + + The generated options follow the naming pattern ``--

.`` so + each sub-config can be configured independently: + + - ``server`` options map to :class:`ServerConfig` fields. + - ``model`` options map to :class:`ModelConfig` fields. + - ``quantization`` options map to :class:`QuantizationConfig` fields. + + Examples + -------- + - ``--server.host 0.0.0.0`` + - ``--server.port 8080`` + - ``--server.sleep_on_idle`` (implicit true) + - ``--server.sleep_on_idle false`` (explicit false) + - ``--quantization.method awq`` + + Design notes + ------------ + - Options are generated from dataclass metadata, which keeps the CLI surface + synchronized with config definitions and avoids manual drift. + - Parser defaults are suppressed (``argparse.SUPPRESS``), so ``read_args`` + can reliably detect whether a value was explicitly provided by the user. + - Only CLI-friendly scalar fields are exposed; runtime-only fields are + skipped automatically. + """ + + if parser is None: + parser = argparse.ArgumentParser( + prog="pymllm", + description="CLI options for configuring pymllm GlobalConfig.", ) - self.model.context_length = self._get_context_length(text_config) - - # Normalization - self.model.rms_norm_eps = getattr(text_config, "rms_norm_eps", 1e-6) - self.model.tie_word_embeddings = getattr( - text_config, "tie_word_embeddings", False + + cfg = GlobalConfig.get_instance() + sections: list[tuple[str, Any]] = [ + ("server", cfg.server), + ("model", cfg.model), + ("quantization", cfg.quantization), + ] + + for section_name, section_obj in sections: + section_group = parser.add_argument_group( + f"{section_name} config", + f"Options for the '{section_name}' section of GlobalConfig.", ) - - # RoPE - self.model.rope_theta = getattr(text_config, "rope_theta", 10000.0) - self.model.rope_scaling = getattr(text_config, "rope_scaling", None) - - # Sync to cache config - self.cache.sliding_window = getattr(text_config, "sliding_window", None) - - def _get_context_length(self, config: "PretrainedConfig") -> int: - """Extract effective context length from config.""" - # Try various fields - for key in ["max_position_embeddings", "n_positions", "seq_length"]: - if hasattr(config, key): - value = getattr(config, key) - if isinstance(value, int) and value > 0: - return value - return 2048 # default - - def update_runtime(self, **kwargs) -> None: - """Update runtime configuration.""" - for key, value in kwargs.items(): - if hasattr(self.runtime, key): - setattr(self.runtime, key, value) - else: - raise AttributeError(f"RuntimeConfig has no attribute '{key}'") - - def update_cache(self, **kwargs) -> None: - """Update cache configuration.""" - for key, value in kwargs.items(): - if hasattr(self.cache, key): - setattr(self.cache, key, value) - else: - raise AttributeError(f"CacheConfig has no attribute '{key}'") - - def temp(self, **kwargs): - """Context manager for temporary config changes. - - Usage: - # Modify runtime config temporarily - with config.temp(runtime=config.runtime): - config.runtime.tp_size = 2 - # ... 
do something with tp_size=2 - # runtime restored to original values - """ - return _TempGlobalConfig(self, **kwargs) - - def to_dict(self) -> Dict[str, Any]: - """Serialize all configs to dictionary.""" - return { - "server": self.server.to_dict() if self.server else {}, - "model": self._model_to_dict(), - "runtime": self._runtime_to_dict(), - "cache": self._cache_to_dict(), - } - - def _model_to_dict(self) -> Dict[str, Any]: - """Convert model config to dict.""" - return { - "model_type": self.model.model_type, - "architectures": self.model.architectures, - "hidden_size": self.model.hidden_size, - "num_hidden_layers": self.model.num_hidden_layers, - "num_attention_heads": self.model.num_attention_heads, - "num_key_value_heads": self.model.num_key_value_heads, - "intermediate_size": self.model.intermediate_size, - "vocab_size": self.model.vocab_size, - "context_length": self.model.context_length, - } - - def _runtime_to_dict(self) -> Dict[str, Any]: - """Convert runtime config to dict.""" - return { - "tp_rank": self.runtime.tp_rank, - "tp_size": self.runtime.tp_size, - "world_rank": self.runtime.world_rank, - "world_size": self.runtime.world_size, - "device": self.runtime.device, - } - - def _cache_to_dict(self) -> Dict[str, Any]: - """Convert cache config to dict.""" - return { - "block_size": self.cache.block_size, - "num_gpu_blocks": self.cache.num_gpu_blocks, - "cache_dtype": self.cache.cache_dtype, - } + type_hints = get_type_hints(type(section_obj)) + for dc_field in fields(section_obj): + if dc_field.name.startswith("_"): + continue + + annotation = type_hints.get(dc_field.name, dc_field.type) + option = f"--{section_name}.{dc_field.name}" + dest = f"{section_name}__{dc_field.name}" + default_value = getattr(section_obj, dc_field.name) + if _is_bool_annotation(annotation): + section_group.add_argument( + option, + dest=dest, + nargs="?", + const=True, + type=_parse_bool, + default=argparse.SUPPRESS, + help=( + f"{section_name}.{dc_field.name} (bool, default: " + f"{_format_default_for_help(default_value)}). " + "Can be provided as a flag for true or with an explicit value." + ), + ) + continue -class _TempGlobalConfig: - """Context manager for temporary global config changes. - - Supports nested keys like "runtime.tp_size" to modify sub-configs. + converter = _converter_for_annotation(annotation) + if converter is None: + # Skip non-scalar or runtime-only fields (e.g. arbitrary objects). + continue + + section_group.add_argument( + option, + dest=dest, + type=converter, + default=argparse.SUPPRESS, + help=( + f"{section_name}.{dc_field.name} (default: " + f"{_format_default_for_help(default_value)})." + ), + ) + + return parser + + +def read_args( + argv: Optional[Sequence[str]] = None, + parser: Optional[argparse.ArgumentParser] = None, +) -> GlobalConfig: + """Parse CLI args and apply overrides to the singleton ``GlobalConfig``. + + Parameters + ---------- + argv + Optional argument vector. If ``None``, ``argparse`` reads from + ``sys.argv`` (standard CLI behavior). + parser + Optional parser to use. When omitted, this function builds one through + :func:`make_args`. + + Returns + ------- + GlobalConfig + The singleton config instance after CLI overrides have been applied. + + Behavior + -------- + 1. Parse all generated ``--section.field`` options. + 2. Apply only explicitly provided options (no accidental overwrite by parser + defaults). + 3. Rebuild ``ServerConfig`` when server fields change so validation in + ``ServerConfig.__post_init__`` and ``_validate`` remains enforced. 
+ 4. Keep ``server.model_path`` and ``model.model_path`` aligned when only one + side is explicitly overridden (the same precedence used by runtime config + loading conventions). """ - - def __init__(self, config: GlobalConfig, **kwargs): - self.config = config - self.temp_values = kwargs - self.old_values = {} - - def _get_nested_attr(self, key: str): - """Get attribute, supporting dot notation for nested access.""" - if "." in key: - parts = key.split(".") - obj = self.config - for part in parts[:-1]: - obj = getattr(obj, part) - return getattr(obj, parts[-1]) - return getattr(self.config, key) - - def _set_nested_attr(self, key: str, value): - """Set attribute, supporting dot notation for nested access.""" - if "." in key: - parts = key.split(".") - obj = self.config - for part in parts[:-1]: - obj = getattr(obj, part) - setattr(obj, parts[-1], value) - else: - setattr(self.config, key, value) - - def __enter__(self): - for key, value in self.temp_values.items(): - self.old_values[key] = self._get_nested_attr(key) - self._set_nested_attr(key, value) - return self.config - - def __exit__(self, exc_type, exc_val, exc_tb): - for key, value in self.old_values.items(): - self._set_nested_attr(key, value) - return False + + if parser is None: + parser = make_args() + + namespace = parser.parse_args(argv) + parsed = vars(namespace) + cfg = GlobalConfig.get_instance() + + # Server: reconstruct to preserve validation behavior. + from pymllm.configs.server_config import ServerConfig + + server_updates: dict[str, Any] = {} + for dc_field in fields(cfg.server): + key = f"server__{dc_field.name}" + if key in parsed: + server_updates[dc_field.name] = parsed[key] + if server_updates: + server_values = { + dc_field.name: getattr(cfg.server, dc_field.name) + for dc_field in fields(cfg.server) + } + server_values.update(server_updates) + cfg.server = ServerConfig(**server_values) + + # Model / Quantization: in-place updates are sufficient. + for section_name, section_obj in ( + ("model", cfg.model), + ("quantization", cfg.quantization), + ): + for dc_field in fields(section_obj): + key = f"{section_name}__{dc_field.name}" + if key in parsed: + setattr(section_obj, dc_field.name, parsed[key]) + + # Keep model path synchronized when only one side is explicitly overridden. + server_model_overridden = "server__model_path" in parsed + model_model_overridden = "model__model_path" in parsed + if server_model_overridden and not model_model_overridden: + cfg.model.model_path = cfg.server.model_path + elif model_model_overridden and not server_model_overridden: + cfg.server.model_path = cfg.model.model_path + + cfg._initialized = True + return cfg -# Convenience function def get_global_config() -> GlobalConfig: - """Get the global config singleton instance.""" + """Return the global config singleton.""" return GlobalConfig.get_instance() diff --git a/pymllm/configs/model_config.py b/pymllm/configs/model_config.py index e69de29bb..c23dff1d9 100644 --- a/pymllm/configs/model_config.py +++ b/pymllm/configs/model_config.py @@ -0,0 +1,31 @@ +"""Lightweight model configuration: path + HuggingFace config handle.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Optional + + +@dataclass +class ModelConfig: + """Minimal model config wrapping a HuggingFace PretrainedConfig. 
+ + Attributes on ``hf_config`` are flattened onto this object:: + + cfg = get_global_config().model + cfg.hidden_size # -> hf_config.hidden_size + cfg.vocab_size # -> hf_config.vocab_size + cfg.text_config # -> hf_config.text_config (multimodal) + """ + + # Populated at runtime via ``transformers.AutoConfig.from_pretrained`` + hf_config: Optional[Any] = field(default=None, repr=False) + + def __getattr__(self, name: str) -> Any: + hf = object.__getattribute__(self, "hf_config") + if hf is not None and hasattr(hf, name): + return getattr(hf, name) + raise AttributeError( + f"'{type(self).__name__}' has no attribute '{name}' " + f"(also not found on hf_config)" + ) diff --git a/pymllm/configs/quantization_config.py b/pymllm/configs/quantization_config.py index e69de29bb..850ea82b8 100644 --- a/pymllm/configs/quantization_config.py +++ b/pymllm/configs/quantization_config.py @@ -0,0 +1,18 @@ +"""Quantization settings for model weights and KV cache.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal, Optional + + +@dataclass +class QuantizationConfig: + """Quantization configuration for weights and KV cache.""" + + # Weight quantization method (e.g. "awq", "gptq", "fp8", None for no quant) + method: Optional[str] = None + # KV cache data type override + kv_cache_dtype: Literal[ + "auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2" + ] = "auto" diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 56be4fc4f..7cda9c3b8 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -2,266 +2,118 @@ from pathlib import Path from typing import Any, Literal, Optional -from dataclasses import asdict, dataclass, field +from dataclasses import dataclass, field @dataclass class ServerConfig: - """ - Centralized runtime configuration for the MLLM server. + """Centralized runtime configuration for the MLLM server.""" - The fields are grouped by operational concern so that: - - CLI args can map directly to this dataclass. - - YAML/JSON config files can be loaded and validated in one place. - - future extensions can follow a predictable structure. - """ - - # ------------------------------------------------------------------------- - # Model and tokenizer settings - # ------------------------------------------------------------------------- - # Required path to the model checkpoint directory or model identifier. - model_path: Path - # Optional tokenizer path; when omitted we fall back to `model_path`. + # --------------------------------------------------------------------- # + # Model and tokenizer configuration + # --------------------------------------------------------------------- # + model_path: Optional[Path] = None tokenizer_path: Optional[Path] = None - # Tokenizer bootstrap strategy: - # - "auto": infer tokenizer mode from model type. - # - "slow"/"fast": force a specific tokenizer implementation. tokenizer_mode: Literal["auto", "slow", "fast"] = "auto" - # Number of worker threads/processes used by tokenizer service. - tokenizer_worker_num: int = 1 - # Skip tokenizer initialization at startup to reduce cold-start latency. - skip_tokenizer_init: bool = False - # Model loading format hint for loader backends. - load_format: Literal["auto", "pt", "safetensors", "gguf"] = "auto" - # Allow loading custom model code from remote repositories. + load_format: Literal["auto", "safetensors"] = "auto" trust_remote_code: bool = False - # Explicit context length; `None` means infer from model config. 
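For orientation, the attribute flattening provided by ``ModelConfig.__getattr__`` above behaves roughly as follows; this is an illustrative sketch, with ``SimpleNamespace`` standing in for a real ``transformers`` ``PretrainedConfig`` and the numbers chosen arbitrarily::

    from types import SimpleNamespace

    from pymllm.configs import ModelConfig

    cfg = ModelConfig(hf_config=SimpleNamespace(hidden_size=3584, vocab_size=152064))
    print(cfg.hidden_size)   # 3584, resolved through hf_config by __getattr__
    print(cfg.vocab_size)    # 152064, same delegation path
    print(cfg.hf_config)     # the wrapped config object itself
    # Names missing from both ModelConfig and hf_config raise AttributeError.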
+ download_dir: Optional[Path] = None context_length: Optional[int] = None - # Model precision policy for weights and activations. dtype: Literal["auto", "float16", "bfloat16", "float32"] = "auto" - # Quantization algorithm to apply at load time. - quantization: Optional[str] = None - # KV cache dtype; can differ from model dtype for better memory trade-offs. - kv_cache_dtype: Literal["auto", "float16", "bfloat16", "fp8_e4m3", "fp8_e5m2"] = ( - "auto" - ) - # HuggingFace revision/commit/tag for deterministic model resolution. - revision: Optional[str] = None - # Optional custom directory used to cache downloaded model artifacts. - download_dir: Optional[Path] = None - # ------------------------------------------------------------------------- - # HTTP / API server settings - # ------------------------------------------------------------------------- - # Host address the HTTP server binds to. + # --------------------------------------------------------------------- # + # HTTP / API server + # --------------------------------------------------------------------- # host: str = "127.0.0.1" - # TCP port exposed by the HTTP server. port: int = 30000 - # Optional FastAPI root path when running behind a reverse proxy. fastapi_root_path: str = "" - # API key required by client-facing endpoints. api_key: Optional[str] = None - # Admin API key for privileged management endpoints. admin_api_key: Optional[str] = None - # Public model name returned in OpenAI-compatible API responses. served_model_name: Optional[str] = None - # Path used for server-side file uploads or temporary user artifacts. file_storage_path: Path = Path("mllm_storage") - # ------------------------------------------------------------------------- - # Runtime and scheduling behavior - # ------------------------------------------------------------------------- - # Fraction of total GPU memory reserved for static allocations - # (primarily model weights + KV cache). + # --------------------------------------------------------------------- # + # Scheduling and memory + # --------------------------------------------------------------------- # mem_fraction_static: Optional[float] = None - # Maximum number of requests concurrently executing in scheduler. - max_running_requests: Optional[int] = None - # Maximum queued requests waiting for execution. + max_running_requests: Optional[int] = 1 max_queued_requests: Optional[int] = None - # Hard cap of total active tokens across all in-flight requests. max_total_tokens: Optional[int] = None - # Prefill chunk size used to trade throughput vs memory pressure. chunked_prefill_size: Optional[int] = None - # Upper bound for tokens accepted in a single prefill pass. - max_prefill_tokens: int = 16384 - # Scheduling policy: - # - "fcfs": first-come-first-served fairness. - # - "lpm": longest-prefix-match style cache locality optimization. - schedule_policy: Literal["fcfs", "lpm"] = "fcfs" - # Conservative multiplier for scheduler admission decisions. - # Values > 1.0 are safer for OOM avoidance but may reduce utilization. + max_prefill_tokens: int = None + schedule_policy: Literal["auto", "fcfs"] = "fcfs" schedule_conservativeness: float = 1.0 - # Enable low-power sleep while idle to reduce background GPU usage. sleep_on_idle: bool = False - # Stream partial output every N decode steps when streaming is enabled. stream_interval: int = 1 - # Enable token streaming in generation responses. 
stream_output: bool = True - # ------------------------------------------------------------------------- - # Parallelism and distributed deployment - # ------------------------------------------------------------------------- - # Tensor parallel size (intra-layer sharding). - tp_size: int = 1 - # Data parallel size (replicated model workers). - dp_size: int = 1 - # Expert parallel size for MoE-style models. - ep_size: int = 1 - # Pipeline parallel size (inter-layer partitioning). - pp_size: int = 1 - # Number of nodes participating in distributed serving. - nnodes: int = 1 - # Rank of current node in multi-node topology. - node_rank: int = 0 - # Torch distributed init address, e.g. "host:port". - dist_init_addr: Optional[str] = None - # Optional NCCL communication port override. - nccl_port: Optional[int] = None - # Timeout in seconds for distributed collectives. - dist_timeout: Optional[int] = None - # Base GPU index used for process-to-device mapping. + # --------------------------------------------------------------------- # + # Threads + # --------------------------------------------------------------------- # + enable_disk_io_async: bool = False + disk_io_async_thread_count: int = 1 + + # --------------------------------------------------------------------- # + # Device + # --------------------------------------------------------------------- # base_gpu_id: int = 0 - # Step size between logical workers when assigning GPU IDs. - gpu_id_step: int = 1 - # ------------------------------------------------------------------------- - # Backend and acceleration toggles - # ------------------------------------------------------------------------- - # Attention kernel backend selection. - attention_backend: Optional[str] = None - # Sampling backend selection. + # --------------------------------------------------------------------- # + # Backend / acceleration + # --------------------------------------------------------------------- # + attention_backend: Literal["auto", "flashinfer"] = "auto" sampling_backend: Optional[str] = None - # Grammar-constrained decoding backend. - grammar_backend: Optional[str] = None - # Disable CUDA graph capture for debugging/compatibility. disable_cuda_graph: bool = False - # Enable `torch.compile` acceleration path. - enable_torch_compile: bool = False - # Maximum batch size considered by `torch.compile` profiles. + enable_torch_compile: bool = True torch_compile_max_bs: int = 32 - # Enable deterministic inference behavior where possible. - enable_deterministic_inference: bool = False - # Random seed for reproducible sampling and initialization. - random_seed: Optional[int] = None + random_seed: Optional[int] = 42 - # ------------------------------------------------------------------------- - # Logging, metrics, and observability - # ------------------------------------------------------------------------- - # Global log level for server components. + # --------------------------------------------------------------------- # + # Logging and observability + # --------------------------------------------------------------------- # log_level: Literal["debug", "info", "warning", "error", "critical"] = "info" - # HTTP access log level; if None, inherits global log level. - log_level_http: Optional[str] = None - # Log each request payload/metadata for debugging. - log_requests: bool = False - # Verbosity level for request logging, larger means more detail. - log_requests_level: int = 2 - # Toggle built-in Prometheus/metrics endpoint. 
enable_metrics: bool = False - # Include latency/time-cost summaries in logs. show_time_cost: bool = False - # Optional OpenTelemetry traces endpoint ("host:port"). - otlp_traces_endpoint: str = "localhost:4317" - # Enable tracing export to OTLP collector. - enable_trace: bool = False - - # ------------------------------------------------------------------------- - # Feature switches and advanced decoding options - # ------------------------------------------------------------------------- - # Enable LoRA adapter serving support. - enable_lora: bool = False - # Maximum number of LoRA adapters loaded simultaneously. - max_loaded_loras: Optional[int] = None - # Maximum LoRA adapters that can be mixed in one batch. - max_loras_per_batch: int = 8 - # LoRA backend implementation. - lora_backend: Literal["triton", "csgmv", "torch_native"] = "csgmv" - # Enable multimodal processing pipeline. - enable_multimodal: bool = False - # Max concurrent multimodal tool calls. - mm_max_concurrent_calls: int = 32 - # Timeout (seconds) for each multimodal call. - mm_per_request_timeout: float = 10.0 - # Speculative decoding algorithm name (e.g. "eagle", "ngram"). - speculative_algorithm: Optional[str] = None - # Draft model path used in speculative decoding. - speculative_draft_model_path: Optional[Path] = None - # Number of speculative steps per target decode iteration. - speculative_num_steps: Optional[int] = None - # Number of proposed draft tokens per speculation step. - speculative_num_draft_tokens: Optional[int] = None - # ------------------------------------------------------------------------- - # Internal bookkeeping (not usually set by users directly) - # ------------------------------------------------------------------------- - # Additional arbitrary key-value options for forward compatibility. 
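As a quick sketch of how the slimmed-down ``ServerConfig`` above is meant to be filled in for a typical single-GPU deployment (the values and the model path are purely illustrative, assuming the patched ``pymllm.configs`` package is importable)::

    from pathlib import Path

    from pymllm.configs import ServerConfig

    server = ServerConfig(
        model_path=Path("/path/to/checkpoint"),  # placeholder checkpoint directory
        host="0.0.0.0",
        port=8080,
        dtype="bfloat16",
        max_running_requests=4,
        attention_backend="flashinfer",
        log_level="debug",
    )
    print(server.port, server.attention_backend)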
+ # --------------------------------------------------------------------- # + # Feature switches + # --------------------------------------------------------------------- # + # enable_lora: bool = False + # max_loaded_loras: Optional[int] = None + # max_loras_per_batch: int = 8 + # lora_backend: Literal["triton", "csgmv", "torch_native"] = "csgmv" + # enable_multimodal: bool = False + # speculative_algorithm: Optional[str] = None + # speculative_draft_model_path: Optional[Path] = None + # speculative_num_steps: Optional[int] = None + # speculative_num_draft_tokens: Optional[int] = None + + # --------------------------------------------------------------------- # + # Extra + # --------------------------------------------------------------------- # extra_options: dict[str, Any] = field(default_factory=dict) def __post_init__(self) -> None: - """Normalize defaults and validate constraints after dataclass initialization.""" if self.tokenizer_path is None: self.tokenizer_path = self.model_path if self.served_model_name is None: self.served_model_name = str(self.model_path) + self._validate() - self._validate_basic_constraints() - self._validate_parallelism_constraints() - self._validate_scheduler_constraints() - - def _validate_basic_constraints(self) -> None: - """Validate scalar ranges and common invariants.""" + def _validate(self) -> None: if self.port <= 0 or self.port > 65535: raise ValueError("`port` must be in range [1, 65535].") - if self.max_prefill_tokens <= 0: - raise ValueError("`max_prefill_tokens` must be greater than 0.") + if self.max_prefill_tokens is not None and self.max_prefill_tokens <= 0: + raise ValueError("`max_prefill_tokens` must be > 0.") if self.stream_interval <= 0: - raise ValueError("`stream_interval` must be greater than 0.") + raise ValueError("`stream_interval` must be > 0.") if self.mem_fraction_static is not None and not ( 0.0 < self.mem_fraction_static < 1.0 ): - raise ValueError("`mem_fraction_static` must be in range (0.0, 1.0).") - - def _validate_parallelism_constraints(self) -> None: - """Validate distributed and parallel topology settings.""" - for key, value in { - "tp_size": self.tp_size, - "dp_size": self.dp_size, - "ep_size": self.ep_size, - "pp_size": self.pp_size, - "nnodes": self.nnodes, - }.items(): - if value <= 0: - raise ValueError(f"`{key}` must be greater than 0.") - - if self.node_rank < 0 or self.node_rank >= self.nnodes: - raise ValueError("`node_rank` must satisfy 0 <= node_rank < nnodes.") - - def _validate_scheduler_constraints(self) -> None: - """Validate scheduler-related soft limits.""" + raise ValueError("`mem_fraction_static` must be in (0.0, 1.0).") if self.max_running_requests is not None and self.max_running_requests <= 0: - raise ValueError("`max_running_requests` must be greater than 0 when set.") + raise ValueError("`max_running_requests` must be > 0 when set.") if self.max_queued_requests is not None and self.max_queued_requests < 0: raise ValueError("`max_queued_requests` must be >= 0 when set.") - if self.max_total_tokens is not None and self.max_total_tokens <= 0: - raise ValueError("`max_total_tokens` must be greater than 0 when set.") - if self.chunked_prefill_size is not None and self.chunked_prefill_size <= 0: - raise ValueError("`chunked_prefill_size` must be greater than 0 when set.") if self.schedule_conservativeness <= 0: - raise ValueError("`schedule_conservativeness` must be greater than 0.") - - def to_dict(self) -> dict[str, Any]: - """ - Serialize config to a plain dictionary. 
- - Path values are converted to string for easier JSON/YAML serialization. - """ - data = asdict(self) - for key in [ - "model_path", - "tokenizer_path", - "download_dir", - "file_storage_path", - "speculative_draft_model_path", - ]: - if data.get(key) is not None: - data[key] = str(data[key]) - return data + raise ValueError("`schedule_conservativeness` must be > 0.") diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index 7ce1be5e3..25ada7c70 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -1 +1,115 @@ -import multiprocessing as mp +import logging +from pathlib import Path +from typing import Optional + +import zmq +import torch +import torch.multiprocessing as mp +from transformers import AutoConfig +from huggingface_hub import snapshot_download +from pymllm.configs import get_global_config +from pymllm.orchestrator.tokenizer_process import TokenizerProcess +from pymllm.orchestrator.detokenizer_process import DetokenizerProcess +from pymllm.orchestrator.model_runner_process import ModelRunnerProcess +from pymllm.orchestrator.async_disk_io_process import AsyncDiskIoProcess +from pymllm.orchestrator.request_response_process import RequestResponseProcess + +logger = logging.getLogger(__name__) + + +class Engine: + def __init__(self): + self._config_logging() + self._set_default_torch_dtype() + self._check_model_and_tokenizer() + + # Orchestrator, shall we start the music here? + self._launch_processes() + + def _launch_processes(self): + """ + TODO issue processes here + """ + + # RR process is the main process + self._rr_process = RequestResponseProcess() + + def _set_default_torch_dtype(self): + """Set the default torch dtype based on the server configuration.""" + dtype = get_global_config().server.dtype + if dtype == "auto": + dtype = "bfloat16" if torch.cuda.is_available() else "float32" + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + torch_dtype = dtype_map.get(dtype) + if torch_dtype is None: + raise ValueError(f"Unsupported dtype for torch default dtype: {dtype!r}") + torch.set_default_dtype(torch_dtype) + + def _config_logging(self): + """Configure logging level from server configuration.""" + level_name = get_global_config().server.log_level.upper() + level = getattr(logging, level_name, logging.INFO) + root_logger = logging.getLogger() + if not root_logger.handlers: + logging.basicConfig( + level=level, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + else: + root_logger.setLevel(level) + logging.getLogger("pymllm").setLevel(level) + + def _check_model_and_tokenizer(self): + cfg = get_global_config() + if cfg.server.model_path is None or cfg.server.tokenizer_path is None: + logger.error("Model path or tokenizer path is not set") + raise ValueError("Model path or tokenizer path is not set") + model_path = cfg.server.model_path + tokenizer_path = cfg.server.tokenizer_path + download_dir = cfg.server.download_dir + trust_remote_code = cfg.server.trust_remote_code + + shared_path = model_path == tokenizer_path + + model_path = self._maybe_download(model_path, download_dir) + cfg.server.model_path = model_path + + if shared_path: + cfg.server.tokenizer_path = model_path + else: + cfg.server.tokenizer_path = self._maybe_download( + tokenizer_path, download_dir + ) + + cfg.model.hf_config = AutoConfig.from_pretrained( + str(model_path), + trust_remote_code=trust_remote_code, + ) + logger.info("Loaded model config: %s", cfg.model.hf_config.__class__.__name__) + + 
@staticmethod + def _maybe_download(path: Path, download_dir: Optional[Path] = None) -> Path: + """Return a local directory for *path*, downloading if necessary.""" + if path.is_dir(): + return path + + repo_id = str(path) + logger.info("Downloading '%s' ...", repo_id) + + kwargs = {} + if download_dir is not None: + kwargs["local_dir"] = str(download_dir / path.name) + + downloaded = snapshot_download(repo_id=repo_id, **kwargs) + logger.info("Downloaded '%s' to '%s'", repo_id, downloaded) + return Path(downloaded) + + def generate(self, stream: bool = True): + pass + + async def generate_async(self, stream: bool = True): + pass diff --git a/pymllm/orchestrator/scheduler.py b/pymllm/executor/eager_runner.py similarity index 100% rename from pymllm/orchestrator/scheduler.py rename to pymllm/executor/eager_runner.py diff --git a/pymllm/orchestrator/async_disk_io_process.py b/pymllm/orchestrator/async_disk_io_process.py new file mode 100644 index 000000000..598d93eb2 --- /dev/null +++ b/pymllm/orchestrator/async_disk_io_process.py @@ -0,0 +1,3 @@ +class AsyncDiskIoProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py new file mode 100644 index 000000000..47c1c5950 --- /dev/null +++ b/pymllm/orchestrator/detokenizer_process.py @@ -0,0 +1,3 @@ +class DetokenizerProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py new file mode 100644 index 000000000..45091b590 --- /dev/null +++ b/pymllm/orchestrator/model_runner_process.py @@ -0,0 +1,3 @@ +class ModelRunnerProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/parallel_state.py b/pymllm/orchestrator/parallel_state.py index 545c74a87..9fb208769 100644 --- a/pymllm/orchestrator/parallel_state.py +++ b/pymllm/orchestrator/parallel_state.py @@ -1,21 +1,31 @@ -"""Parallel state management for tensor and pipeline parallelism.""" +"""Minimal parallel state for single-GPU serving. + +pymllm targets single-GPU, high-concurrency inference. This module keeps +the TP / DP / PP scaffolding so the rest of the codebase can query ranks +and groups uniformly, but the default (and expected) case is world_size=1. +""" import logging +from typing import Optional + import torch import torch.distributed as dist -from typing import Optional -from pymllm.configs.global_config import get_global_config from pymllm.orchestrator.group_coordinator import GroupCoordinator logger = logging.getLogger(__name__) - -# Global groups _TP_GROUP: Optional[GroupCoordinator] = None _DP_GROUP: Optional[GroupCoordinator] = None _PP_GROUP: Optional[GroupCoordinator] = None +_TP_RANK: int = 0 +_TP_SIZE: int = 1 +_DP_RANK: int = 0 +_DP_SIZE: int = 1 +_PP_RANK: int = 0 +_PP_SIZE: int = 1 + def initialize_model_parallel( tensor_model_parallel_size: int = 1, @@ -23,15 +33,12 @@ def initialize_model_parallel( pipeline_model_parallel_size: int = 1, backend: str = "nccl", ) -> None: - """Initialize model parallel groups. 
- - Args: - tensor_model_parallel_size: Number of GPUs for tensor parallelism - data_parallel_size: Number of GPUs for data parallelism - pipeline_model_parallel_size: Number of stages for pipeline parallelism - backend: Communication backend (nccl for GPU, gloo for CPU) - """ global _TP_GROUP, _DP_GROUP, _PP_GROUP + global _TP_RANK, _TP_SIZE, _DP_RANK, _DP_SIZE, _PP_RANK, _PP_SIZE + + _TP_SIZE = tensor_model_parallel_size + _DP_SIZE = data_parallel_size + _PP_SIZE = pipeline_model_parallel_size if not dist.is_initialized(): return @@ -40,29 +47,6 @@ def initialize_model_parallel( world_rank = dist.get_rank() local_rank = int(torch.cuda.current_device()) if torch.cuda.is_available() else 0 - config = get_global_config() - - # Update runtime config - config.runtime.world_size = world_size - config.runtime.world_rank = world_rank - config.runtime.local_rank = local_rank - config.runtime.tp_size = tensor_model_parallel_size - config.runtime.dp_size = data_parallel_size - config.runtime.pp_size = pipeline_model_parallel_size - - # Logging - logger.info( - "Model parallel runtime config set: world_size=%s, world_rank=%s, " - "local_rank=%s, tp_size=%s, dp_size=%s, pp_size=%s", - config.runtime.world_size, - config.runtime.world_rank, - config.runtime.local_rank, - config.runtime.tp_size, - config.runtime.dp_size, - config.runtime.pp_size, - ) - - # Validate parallelism setup assert ( tensor_model_parallel_size * data_parallel_size * pipeline_model_parallel_size == world_size @@ -71,13 +55,22 @@ def initialize_model_parallel( f"PP({pipeline_model_parallel_size}) != World({world_size})" ) - # Create TP groups (intra-layer sharding) + logger.info( + "Parallel init: world=%d rank=%d tp=%d dp=%d pp=%d", + world_size, + world_rank, + tensor_model_parallel_size, + data_parallel_size, + pipeline_model_parallel_size, + ) + if tensor_model_parallel_size > 1: num_tp_groups = world_size // tensor_model_parallel_size for i in range(num_tp_groups): ranks = list( range( - i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size + i * tensor_model_parallel_size, + (i + 1) * tensor_model_parallel_size, ) ) if world_rank in ranks: @@ -86,13 +79,9 @@ def initialize_model_parallel( local_rank=local_rank, backend=backend, ) - config.runtime.tp_rank = _TP_GROUP.rank_in_group + _TP_RANK = _TP_GROUP.rank_in_group break - else: - _TP_GROUP = None - config.runtime.tp_rank = 0 - # Create DP groups (data replication) if data_parallel_size > 1: num_dp_groups = world_size // data_parallel_size for i in range(num_dp_groups): @@ -103,13 +92,9 @@ def initialize_model_parallel( local_rank=local_rank, backend=backend, ) - config.runtime.dp_rank = _DP_GROUP.rank_in_group + _DP_RANK = _DP_GROUP.rank_in_group break - else: - _DP_GROUP = None - config.runtime.dp_rank = 0 - # Create PP groups (inter-layer partitioning) if pipeline_model_parallel_size > 1: num_pp_groups = world_size // pipeline_model_parallel_size for i in range(num_pp_groups): @@ -121,67 +106,60 @@ def initialize_model_parallel( local_rank=local_rank, backend=backend, ) - config.runtime.pp_rank = _PP_GROUP.rank_in_group + _PP_RANK = _PP_GROUP.rank_in_group break - else: - _PP_GROUP = None - config.runtime.pp_rank = 0 + + +# ---- group accessors ------------------------------------------------------ def get_tp_group() -> Optional[GroupCoordinator]: - """Get the tensor model parallel group.""" return _TP_GROUP def get_dp_group() -> Optional[GroupCoordinator]: - """Get the data parallel group.""" return _DP_GROUP def get_pp_group() -> 
Optional[GroupCoordinator]: - """Get the pipeline parallel group.""" return _PP_GROUP -# Convenience functions for tensor parallelism +# ---- rank / size helpers -------------------------------------------------- + + def get_tensor_model_parallel_rank() -> int: - """Get current tensor model parallel rank.""" - return get_global_config().runtime.tp_rank + return _TP_RANK def get_tensor_model_parallel_world_size() -> int: - """Get tensor model parallel world size.""" - return get_global_config().runtime.tp_size + return _TP_SIZE def get_data_parallel_rank() -> int: - """Get current data parallel rank.""" - return get_global_config().runtime.dp_rank + return _DP_RANK def get_data_parallel_world_size() -> int: - """Get data parallel world size.""" - return get_global_config().runtime.dp_size + return _DP_SIZE def get_pipeline_model_parallel_rank() -> int: - """Get current pipeline parallel rank.""" - return get_global_config().runtime.pp_rank + return _PP_RANK def get_pipeline_model_parallel_world_size() -> int: - """Get pipeline parallel world size.""" - return get_global_config().runtime.pp_size + return _PP_SIZE def model_parallel_is_initialized() -> bool: - """Check if model parallel is initialized.""" return _TP_GROUP is not None or _DP_GROUP is not None or _PP_GROUP is not None -# Communication helpers +# ---- communication helpers ------------------------------------------------ + + def tensor_model_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: - """All-reduce across TP group.""" group = get_tp_group() if group is None: return tensor @@ -192,7 +170,6 @@ def tensor_model_parallel_all_gather( tensor: torch.Tensor, dim: int = 0, ) -> torch.Tensor: - """All-gather across TP group.""" group = get_tp_group() if group is None: return tensor @@ -200,7 +177,6 @@ def tensor_model_parallel_all_gather( def data_parallel_all_reduce(tensor: torch.Tensor) -> torch.Tensor: - """All-reduce across DP group.""" group = get_dp_group() if group is None: return tensor diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py new file mode 100644 index 000000000..998c2655e --- /dev/null +++ b/pymllm/orchestrator/request_response_process.py @@ -0,0 +1,10 @@ +""" +This module contains the request and response threads for the orchestrator. + +NOTE: This RR(request and response) threads can only be used as the main thread of the orchestrator. 
+""" + + +class RequestResponseProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py new file mode 100644 index 000000000..7a7783d57 --- /dev/null +++ b/pymllm/orchestrator/scheduler_process.py @@ -0,0 +1,3 @@ +class SchedulerProcess: + def __init__(self): + pass diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py new file mode 100644 index 000000000..0dca2155e --- /dev/null +++ b/pymllm/orchestrator/tokenizer_process.py @@ -0,0 +1,3 @@ +class TokenizerProcess: + def __init__(self): + pass diff --git a/pymllm/server/launch.py b/pymllm/server/launch.py index e69de29bb..83a222f7e 100644 --- a/pymllm/server/launch.py +++ b/pymllm/server/launch.py @@ -0,0 +1,17 @@ +from pymllm.engine.launch import Engine +from pymllm.configs.global_config import make_args, read_args + + +def _prepare_args(): + parser = make_args() + read_args(parser=parser) + + +def main(): + _prepare_args() + engine = Engine() + engine.launch() + + +if __name__ == "__main__": + main() diff --git a/pymllm/tests/test_vocab_parallel_embedding.py b/pymllm/tests/test_vocab_parallel_embedding.py index e22b52a57..44148f983 100644 --- a/pymllm/tests/test_vocab_parallel_embedding.py +++ b/pymllm/tests/test_vocab_parallel_embedding.py @@ -12,10 +12,11 @@ import torch.multiprocessing as mp from typing import Callable -from pymllm.configs import get_global_config from pymllm.layers import VocabParallelEmbedding -from pymllm.orchestrator import ( - initialize_model_parallel, +from pymllm.orchestrator import initialize_model_parallel +from pymllm.orchestrator.parallel_state import ( + get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size, ) # Show runtime init logs during test execution. 
@@ -91,10 +92,11 @@ def embedding_forward_tp8_worker_cuda(rank: int, local_rank: int, world_size: in local_rank: Local rank within this node (for logging/debugging) world_size: Total world size """ - config = get_global_config() + tp_size = get_tensor_model_parallel_world_size() + tp_rank = get_tensor_model_parallel_rank() - assert config.runtime.tp_size == 8, f"Rank {rank}: tp_size should be 8" - assert config.runtime.tp_rank == rank, f"Rank {rank}: tp_rank mismatch" + assert tp_size == 8, f"Rank {rank}: tp_size should be 8" + assert tp_rank == rank, f"Rank {rank}: tp_rank mismatch" vocab_size = 1024 embed_dim = 64 @@ -281,12 +283,12 @@ class TestVocabParallelEmbeddingCUDA: @pytest.fixture(autouse=True) def setup_config(self): - config = get_global_config() - config.runtime.tp_size = 1 - config.runtime.tp_rank = 0 + import pymllm.orchestrator.parallel_state as ps + ps._TP_SIZE = 1 + ps._TP_RANK = 0 yield - config.runtime.tp_size = 1 - config.runtime.tp_rank = 0 + ps._TP_SIZE = 1 + ps._TP_RANK = 0 def test_cuda_forward(self): layer = VocabParallelEmbedding(1000, 512).cuda() diff --git a/pyproject.toml b/pyproject.toml index 160341bad..d417b5790 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ dependencies=[ "packaging", "pytest", "pytest-html", - "apache-tvm-ffi == 0.1.8", + "apache-tvm-ffi == 0.1.8.post2", "pyyaml >= 6.0.2", "openai", "modelscope", @@ -33,12 +33,13 @@ dependencies=[ ] [project.optional-dependencies] -cuda = ["tilelang", "flashinfer-python"] +cuda = ["tilelang", "flashinfer-python", "pyzmq"] [project.scripts] pymllm = "pymllm.__main__:main" mllm-convertor = "pymllm.mobile.utils.mllm_convertor:main" mllm-service = "pymllm.mobile.service.tools:cli_app" +pymllm-server = "pymllm.server.launch:main" [tool.setuptools.exclude-package-data] "*" = ["*.pyc"] From 57ef3727446b9a72bbc5f22541ea875da2f977bf Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 21 Feb 2026 15:05:35 +0000 Subject: [PATCH 28/42] feat: implement store_cache functionality and related components - Added a new `store_cache` CUDA kernel for efficient key/value tensor storage in a cache. - Introduced Python interface for the `store_cache` kernel, enabling its use in PyTorch. - Created benchmarks to compare `store_cache` performance against standard PyTorch indexing. - Updated `.gitignore` to exclude `.claude` directory and added `.pytest_cache` to `mllm-kernel`. - Added tests for `store_cache` functionality to ensure correctness and performance. - Refactored memory management in `KVPool` to utilize the new `store_cache` kernel when applicable. 
--- .claude/skills/update-codeowners/SKILL.md | 44 + .gitignore | 1 - mllm-kernel/.gitignore | 1 + mllm-kernel/benchmarks/bench_store_cache.py | 164 ++++ .../mllm_kernel/cuda/csrc/store_cache.cuh | 202 +++++ mllm-kernel/mllm_kernel/cuda/jit/__init__.py | 3 +- .../mllm_kernel/cuda/jit/store_cache.py | 127 +++ mllm-kernel/tests/test_store_cache.py | 66 ++ pymllm/engine/io_struct.py | 196 +++++ pymllm/engine/launch.py | 308 ++++++- pymllm/mem_cache/__init__.py | 37 + pymllm/mem_cache/memory_pool.py | 480 +++++++++++ pymllm/mem_cache/param_disk_cache.py | 0 pymllm/mem_cache/radix_cache.py | 794 ++++++++++++++++++ pymllm/orchestrator/async_disk_io_process.py | 83 +- pymllm/orchestrator/detokenizer_process.py | 113 ++- pymllm/orchestrator/ipc_utils.py | 70 ++ pymllm/orchestrator/model_runner_process.py | 113 ++- .../orchestrator/request_response_process.py | 148 +++- pymllm/orchestrator/scheduler_process.py | 247 +++++- pymllm/orchestrator/tokenizer_process.py | 101 ++- 21 files changed, 3264 insertions(+), 34 deletions(-) create mode 100644 .claude/skills/update-codeowners/SKILL.md create mode 100644 mllm-kernel/benchmarks/bench_store_cache.py create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/store_cache.py create mode 100644 mllm-kernel/tests/test_store_cache.py create mode 100644 pymllm/engine/io_struct.py create mode 100644 pymllm/mem_cache/memory_pool.py delete mode 100644 pymllm/mem_cache/param_disk_cache.py create mode 100644 pymllm/orchestrator/ipc_utils.py diff --git a/.claude/skills/update-codeowners/SKILL.md b/.claude/skills/update-codeowners/SKILL.md new file mode 100644 index 000000000..286667045 --- /dev/null +++ b/.claude/skills/update-codeowners/SKILL.md @@ -0,0 +1,44 @@ +--- +name: update-codeowners +description: Updates CODEOWNERS entries safely with consistent path and owner formatting. Use when the user asks to add, remove, or modify CODEOWNERS rules, ownership mappings, reviewers, or module maintainers. +--- + +# Update CODEOWNERS + +## Goal +Maintain `CODEOWNERS` accurately while preserving the repository's existing section/comment style. + +## Workflow +1. Read the current `CODEOWNERS` file before editing. +2. Identify requested changes as one of: + - Add new path rule + - Modify owners for existing path rule + - Remove obsolete path rule + - Reorganize section comments (only if requested) +3. Update rules in place instead of creating duplicates for the same path. +4. Keep existing section headers and comment style unless the user asks to refactor structure. +5. Return a concise changelog describing which paths were added, changed, or removed. + +## Rule Format +- Use one rule per line: ` ...` +- Owners must be GitHub handles prefixed with `@`. +- Keep path style consistent with the file (in this repo, path patterns typically start with `/`). +- Do not leave rules with empty owner lists. + +## Editing Guidelines +- Prefer minimal edits near related sections. +- If a path already exists, update that line instead of adding a second conflicting line. +- If a new rule logically belongs to an existing section, place it in that section. +- Preserve human-readable grouping and blank lines. +- Keep comments intact unless they are clearly outdated and the user asked for cleanup. + +## Validation Checklist +- [ ] Every non-comment, non-empty line has at least one owner. +- [ ] Every owner token starts with `@`. +- [ ] No accidental duplicate rule for the exact same path pattern. 
+- [ ] Existing comments/sections were preserved unless explicitly changed. + +## Example Requests +- "Add `/mllm/models/new_model/ @alice @bob` under models." +- "Change `/core/Storage` owner to `@team-core`." +- "Remove ownership rule for deprecated path `/legacy/`." diff --git a/.gitignore b/.gitignore index 7397d6ecc..cdafc2707 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ .cache/ .tmp/ compile_commands.json -.claude/ # MLLM Team Specific tasks/mllmteam* diff --git a/mllm-kernel/.gitignore b/mllm-kernel/.gitignore index df61d0fae..3eefc8fba 100644 --- a/mllm-kernel/.gitignore +++ b/mllm-kernel/.gitignore @@ -3,3 +3,4 @@ build-py/ .vscode/settings.json compile_commands.json .clangd +.pytest_cache/ diff --git a/mllm-kernel/benchmarks/bench_store_cache.py b/mllm-kernel/benchmarks/bench_store_cache.py new file mode 100644 index 000000000..b96fa608b --- /dev/null +++ b/mllm-kernel/benchmarks/bench_store_cache.py @@ -0,0 +1,164 @@ +"""Benchmark store_cache vs torch index with torch.profiler. + +Example: +python benchmarks/bench_store_cache.py --warmup 20 --iters 200 --batch-size 512 --num-slots 8192 +""" + +import argparse + +import torch +from torch.profiler import ProfilerActivity, profile + +from mllm_kernel.cuda.jit import can_use_store_cache, store_cache + + +def _run_store_cache_once( + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, +): + store_cache(k, v, k_cache, v_cache, indices) + + +def _run_torch_index_once( + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, +): + k_cache[indices] = k + v_cache[indices] = v + + +def _profile_path( + name: str, + fn, + *, + warmup: int, + iters: int, + row_limit: int, + trace_path: str | None, +): + for _ in range(warmup): + fn() + torch.cuda.synchronize() + + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=False, + profile_memory=False, + with_stack=False, + ) as prof: + for _ in range(iters): + fn() + torch.cuda.synchronize() + + events = prof.key_averages() + # torch profiler times are in microseconds. + # PyTorch versions vary between *cuda* and *device* naming. 
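+    # Fall back to whichever attribute this PyTorch build exposes; the same
+    # name is reused as the sort key so the printed table matches the summed
+    # totals reported below.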
+ time_attr = ( + "self_cuda_time_total" + if events and hasattr(events[0], "self_cuda_time_total") + else "self_device_time_total" + ) + sort_key = ( + "self_cuda_time_total" + if time_attr == "self_cuda_time_total" + else "self_device_time_total" + ) + total_self_device_us = sum(float(getattr(evt, time_attr, 0.0)) for evt in events) + avg_self_device_us = total_self_device_us / max(iters, 1) + + print(f"\n=== {name} ===") + print( + prof.key_averages().table( + sort_by=sort_key, + row_limit=row_limit, + ) + ) + print(f"{name} total self device time: {total_self_device_us:.2f} us") + print(f"{name} avg self device time/iter: {avg_self_device_us:.2f} us") + + if trace_path: + prof.export_chrome_trace(trace_path) + print(f"{name} trace exported: {trace_path}") + + return avg_self_device_us + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark store_cache vs torch index using torch.profiler" + ) + parser.add_argument("--batch-size", type=int, default=1024) + parser.add_argument("--num-slots", type=int, default=16384) + parser.add_argument("--head-num", type=int, default=8) + parser.add_argument("--head-dim", type=int, default=128) + parser.add_argument( + "--dtype", + type=str, + default="float16", + choices=["float16", "bfloat16", "float32"], + ) + parser.add_argument("--warmup", type=int, default=50) + parser.add_argument("--iters", type=int, default=200) + parser.add_argument("--row-limit", type=int, default=20) + parser.add_argument("--export-trace-dir", type=str, default="") + parser.add_argument("--seed", type=int, default=0) + args = parser.parse_args() + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required for this benchmark") + + torch.manual_seed(args.seed) + device = torch.device("cuda") + dtype = getattr(torch, args.dtype) + + row_dim = args.head_num * args.head_dim + row_bytes = row_dim * torch.tensor([], dtype=dtype).element_size() + if not can_use_store_cache(row_bytes): + raise RuntimeError(f"store_cache is unavailable for row_bytes={row_bytes}") + + k = torch.randn(args.batch_size, row_dim, device=device, dtype=dtype) + v = torch.randn(args.batch_size, row_dim, device=device, dtype=dtype) + # Use unique indices to avoid write conflicts. 
+ indices = torch.randperm(args.num_slots, device=device)[: args.batch_size].to( + torch.int64 + ) + k_cache = torch.zeros(args.num_slots, row_dim, device=device, dtype=dtype) + v_cache = torch.zeros_like(k_cache) + print("=== store_cache profiler benchmark ===") + print( + f"shape: batch={args.batch_size}, row_dim={row_dim}, slots={args.num_slots}, dtype={dtype}" + ) + print(f"warmup={args.warmup}, iters={args.iters}, row_limit={args.row_limit}") + + trace_dir = args.export_trace_dir.strip() + store_trace = f"{trace_dir}/store_cache_trace.json" if trace_dir else None + torch_trace = f"{trace_dir}/torch_index_trace.json" if trace_dir else None + + store_avg_us = _profile_path( + "store_cache", + lambda: _run_store_cache_once(k, v, k_cache, v_cache, indices), + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=store_trace, + ) + torch_avg_us = _profile_path( + "torch_index", + lambda: _run_torch_index_once(k, v, k_cache, v_cache, indices), + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=torch_trace, + ) + speedup = torch_avg_us / max(store_avg_us, 1e-12) + print(f"\nSpeedup: {speedup:.3f}x") + + +if __name__ == "__main__": + main() diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh new file mode 100644 index 000000000..05daabee0 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/store_cache.cuh @@ -0,0 +1,202 @@ +// Copyright SGLang Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Store KV cache kernel: efficiently scatter key/value tensors into a +// pre-allocated KV cache pool using warp-level vectorized copies. +// +// Reference: sglang jit_kernel/csrc/elementwise/kvcache.cuh + +#pragma once + +#include +#include +#include + +#include +#include + +#include + +namespace { + +// ─────────────────────────────────────────────────────────────── +// Parameter block passed to the kernel via __grid_constant__ +// ─────────────────────────────────────────────────────────────── + +struct StoreKVCacheParams { + const void* __restrict__ k; + const void* __restrict__ v; + void* __restrict__ k_cache; + void* __restrict__ v_cache; + const void* __restrict__ indices; + int64_t stride_k_bytes; + int64_t stride_v_bytes; + int64_t stride_cache_bytes; + int64_t stride_indices; + uint32_t batch_size; +}; + +constexpr uint32_t kNumWarps = 4; +constexpr uint32_t kThreadsPerBlock = kNumWarps * device::kWarpThreads; + +// ─────────────────────────────────────────────────────────────── +// Vectorized warp-level KV copy +// ─────────────────────────────────────────────────────────────── +// +// Each warp copies kElementBytes of K data and kElementBytes of V +// data using the widest possible aligned vector type (uint4 = 16B, +// uint2 = 8B, or uint32_t = 4B). 
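+//
+// Illustrative sizing: an fp16 row of 8 heads x 128 dims gives
+// kElementBytes = 8 * 128 * 2 = 2048 bytes, i.e. 128 uint4 vectors per row,
+// or 4 vectors per lane of a 32-thread warp.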
+ +namespace detail { + +template +__device__ __forceinline__ void warp_copy_bytes(const void* __restrict__ src, void* __restrict__ dst, int64_t num_vecs) { + const int lane = threadIdx.x % device::kWarpThreads; + const auto* s = static_cast(src); + auto* d = static_cast(dst); + for (int64_t i = lane; i < num_vecs; i += device::kWarpThreads) { d[i] = s[i]; } +} + +} // namespace detail + +template +__device__ __forceinline__ void copy_kv_warp(const void* __restrict__ k_src, const void* __restrict__ v_src, + void* __restrict__ k_dst, void* __restrict__ v_dst) { + static_assert(kElementBytes > 0 && kElementBytes % 4 == 0, "Element size must be a positive multiple of 4 bytes"); + + // Pick the widest aligned vector type the element size supports. + if constexpr (kElementBytes % 16 == 0) { + constexpr int64_t N = kElementBytes / 16; + detail::warp_copy_bytes(k_src, k_dst, N); + detail::warp_copy_bytes(v_src, v_dst, N); + } else if constexpr (kElementBytes % 8 == 0) { + constexpr int64_t N = kElementBytes / 8; + detail::warp_copy_bytes(k_src, k_dst, N); + detail::warp_copy_bytes(v_src, v_dst, N); + } else { + constexpr int64_t N = kElementBytes / 4; + detail::warp_copy_bytes(k_src, k_dst, N); + detail::warp_copy_bytes(v_src, v_dst, N); + } +} + +// ─────────────────────────────────────────────────────────────── +// Main kernel +// ─────────────────────────────────────────────────────────────── +// +// Template parameters: +// kElementBytes total bytes per token row (head_num * head_dim * dtype_size) +// kSplit how many warps collaborate on one element (1, 2, or 4) +// kUsePDL whether to emit PDL synchronisation instructions +// T index dtype (int32_t or int64_t) + +template +__global__ void store_kvcache(const __grid_constant__ StoreKVCacheParams params) { + using namespace device; + constexpr auto kSplitSize = kElementBytes / kSplit; + + const uint32_t warp_id = blockIdx.x * kNumWarps + threadIdx.x / kWarpThreads; + const uint32_t item_id = warp_id / kSplit; + const uint32_t split_id = warp_id % kSplit; + + const auto& [k_input, v_input, k_cache, v_cache, indices, stride_k, stride_v, stride_cache, stride_indices, batch_size] = + params; + + if (item_id >= batch_size) return; + + const auto index_ptr = static_cast(indices) + item_id * stride_indices; + PDLWaitPrimary(); + + const auto index = *index_ptr; + const auto k_src = pointer::offset(k_input, item_id * stride_k, split_id * kSplitSize); + const auto v_src = pointer::offset(v_input, item_id * stride_v, split_id * kSplitSize); + const auto k_dst = pointer::offset(k_cache, index * stride_cache, split_id * kSplitSize); + const auto v_dst = pointer::offset(v_cache, index * stride_cache, split_id * kSplitSize); + + copy_kv_warp(k_src, v_src, k_dst, v_dst); + PDLTriggerSecondary(); +} + +template +struct StoreKVCacheKernel { + static_assert(kElementBytes > 0 && kElementBytes % 4 == 0); + + template + static constexpr auto store_kernel = store_kvcache; + + template + static auto get_kernel(int num_split) { + using namespace mllm_kernel::host; + if constexpr (kElementBytes % (4 * 128) == 0) { + if (num_split == 4) return store_kernel<4, T>; + } + if constexpr (kElementBytes % (2 * 128) == 0) { + if (num_split == 2) return store_kernel<2, T>; + } + if (num_split == 1) return store_kernel<1, T>; + Panic("Unsupported num_split ", num_split, " for element size ", kElementBytes); + } + + static void run(tvm::ffi::TensorView k, tvm::ffi::TensorView v, tvm::ffi::TensorView k_cache, tvm::ffi::TensorView v_cache, + tvm::ffi::TensorView indices, int 
num_split) { + using namespace mllm_kernel::host; + + auto B = SymbolicSize{"batch_size"}; + auto D = SymbolicSize{"element_size"}; + auto KS = SymbolicSize{"k_stride"}; + auto VS = SymbolicSize{"v_stride"}; + auto S = SymbolicSize{"cache_stride"}; + auto I = SymbolicSize{"indices_stride"}; + auto dtype = SymbolicDType{}; + auto device = SymbolicDevice{}; + auto indice_dtype = SymbolicDType{}; + device.set_options(); + + // k, v: [B, D] with strides [KS, 1] + (void)TensorMatcher({B, D}).with_strides({KS, 1}).with_dtype(dtype).with_device(device).verify(k); + (void)TensorMatcher({B, D}).with_strides({VS, 1}).with_dtype(dtype).with_device(device).verify(v); + + // k_cache, v_cache: [*, D] with strides [S, 1] + (void)TensorMatcher({-1, D}).with_strides({S, 1}).with_dtype(dtype).with_device(device).verify(k_cache).verify(v_cache); + + // indices: [B] with strides [I] + (void)TensorMatcher({B}).with_strides({I}).with_dtype(indice_dtype).with_device(device).verify(indices); + + const int64_t dtype_size = dtype_bytes(dtype.unwrap()); + const uint32_t num_elements = static_cast(B.unwrap()); + RuntimeCheck(kElementBytes == dtype_size * D.unwrap(), "Element size mismatch: expected ", kElementBytes, " but got ", + dtype_size * D.unwrap()); + + const auto params = StoreKVCacheParams{ + .k = k.data_ptr(), + .v = v.data_ptr(), + .k_cache = k_cache.data_ptr(), + .v_cache = v_cache.data_ptr(), + .indices = indices.data_ptr(), + .stride_k_bytes = KS.unwrap() * dtype_size, + .stride_v_bytes = VS.unwrap() * dtype_size, + .stride_cache_bytes = S.unwrap() * dtype_size, + .stride_indices = I.unwrap(), + .batch_size = num_elements, + }; + + const auto use_int32 = indice_dtype.is_type(); + const auto kernel = use_int32 ? get_kernel(num_split) : get_kernel(num_split); + const auto num_blocks = div_ceil(num_elements * num_split, kNumWarps); + + LaunchKernel(num_blocks, kThreadsPerBlock, device.unwrap()).enable_pdl(kUsePDL)(kernel, params); + } +}; + +} // namespace diff --git a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py index 696e73ea0..202ff3b36 100644 --- a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py +++ b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py @@ -1,3 +1,4 @@ from .add_constant import add_constant +from .store_cache import can_use_store_cache, store_cache -__all__ = ["add_constant"] +__all__ = ["add_constant", "can_use_store_cache", "store_cache"] diff --git a/mllm-kernel/mllm_kernel/cuda/jit/store_cache.py b/mllm-kernel/mllm_kernel/cuda/jit/store_cache.py new file mode 100644 index 000000000..96a73f5ef --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/store_cache.py @@ -0,0 +1,127 @@ +# Copyright (c) MLLM Team. +# Licensed under the MIT License. +# +# Python interface for the store_cache CUDA kernel. +# Efficiently scatters key/value tensors into a pre-allocated KV cache pool. 
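+#
+# Kernels are JIT-compiled once per distinct row size (head_num * head_dim *
+# dtype bytes) and cached in-process; see ``_get_kernel`` below.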
+ +from __future__ import annotations + +import logging +import torch +from mllm_kernel.jit_utils import jit +from mllm_kernel.jit_utils.compile import cache_once, make_cpp_args + + +logger = logging.getLogger(__name__) + + +@cache_once +def _is_arch_support_pdl() -> bool: + if not torch.cuda.is_available(): + return False + major, minor = torch.cuda.get_device_capability() + # PDL requires sm_90a (Hopper) or later + return major > 9 or (major == 9 and minor >= 0) + + +def _make_store_cache_kernel(row_bytes: int): + """Create a JIT-compiled store_cache kernel for the given row_bytes.""" + pdl = _is_arch_support_pdl() + cpp_args = make_cpp_args(row_bytes, pdl) + + @jit( + args=[row_bytes, pdl], + device="cuda", + cuda_files=["store_cache.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("store_cache", f"StoreKVCacheKernel<{cpp_args}>::run"), + ], + func_name="store_cache", + ) + def _kernel( + compiled_module, + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, + num_split: int, + ) -> None: + compiled_module.store_cache(k, v, k_cache, v_cache, indices, num_split) + + return _kernel + + +_KERNEL_CACHE: dict[int, object] = {} + + +def _get_kernel(row_bytes: int): + if row_bytes not in _KERNEL_CACHE: + _KERNEL_CACHE[row_bytes] = _make_store_cache_kernel(row_bytes) + return _KERNEL_CACHE[row_bytes] + + +@cache_once +def can_use_store_cache(row_bytes: int) -> bool: + """Check whether the JIT store_cache kernel supports the given row size. + + Returns ``False`` if *row_bytes* is not a multiple of 4 or if the JIT + compilation fails for any reason. + """ + if row_bytes % 4 != 0: + logger.warning( + "Unsupported row_bytes=%d for JIT store_cache kernel: " + "must be multiple of 4", + row_bytes, + ) + return False + try: + _get_kernel(row_bytes) + return True + except Exception as e: + logger.warning( + "Failed to load JIT store_cache kernel with row_bytes=%d: %s", + row_bytes, + e, + ) + return False + + +def store_cache( + k: torch.Tensor, + v: torch.Tensor, + k_cache: torch.Tensor, + v_cache: torch.Tensor, + indices: torch.Tensor, + *, + row_bytes: int = 0, + num_split: int = 0, +) -> None: + """Store key and value tensors into a KV cache at specified indices. + + Each row of *k* (and *v*) is scattered into *k_cache* (and *v_cache*) + at the location given by the corresponding entry in *indices*. + + Args: + k: Key tensor, shape ``(batch_size, head_num * head_dim)``. + v: Value tensor, shape ``(batch_size, head_num * head_dim)``. + k_cache: Key cache, shape ``(num_slots, head_num * head_dim)``. + v_cache: Value cache, shape ``(num_slots, head_num * head_dim)``. + indices: Index tensor, shape ``(batch_size,)``, dtype int32 or int64. + row_bytes: Bytes per row. Auto-detected from *k* when 0. + num_split: Number of warps that cooperate on each element (1, 2, or 4). + When 0 the best value is chosen automatically based on alignment. 
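+
+    Example (illustrative sketch; shapes and slot counts are arbitrary)::
+
+        k = torch.randn(256, 8 * 128, device="cuda", dtype=torch.float16)
+        v = torch.randn_like(k)
+        k_cache = torch.zeros(4096, 8 * 128, device="cuda", dtype=torch.float16)
+        v_cache = torch.zeros_like(k_cache)
+        slots = torch.randperm(4096, device="cuda")[:256]
+        store_cache(k, v, k_cache, v_cache, slots)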
+ """ + row_bytes = row_bytes or k.shape[-1] * k.element_size() + kernel = _get_kernel(row_bytes) + + if num_split <= 0: + if row_bytes % 2048 == 0: + num_split = 4 + elif row_bytes % 1024 == 0: + num_split = 2 + else: + num_split = 1 + + kernel(k, v, k_cache, v_cache, indices, num_split) diff --git a/mllm-kernel/tests/test_store_cache.py b/mllm-kernel/tests/test_store_cache.py new file mode 100644 index 000000000..5e4f1bcc3 --- /dev/null +++ b/mllm-kernel/tests/test_store_cache.py @@ -0,0 +1,66 @@ +from __future__ import annotations + +import pytest +import torch + +from mllm_kernel.cuda.jit import can_use_store_cache, store_cache + + +def _make_inputs( + *, + batch_size: int, + num_slots: int, + row_dim: int, + dtype: torch.dtype, + index_dtype: torch.dtype, + seed: int = 0, +): + torch.manual_seed(seed) + device = "cuda" + k = torch.randn(batch_size, row_dim, device=device, dtype=dtype) + v = torch.randn(batch_size, row_dim, device=device, dtype=dtype) + # Use unique indices to avoid write conflicts on the same cache slot. + indices = torch.randperm(num_slots, device=device)[:batch_size].to(index_dtype) + k_cache = torch.zeros(num_slots, row_dim, device=device, dtype=dtype) + v_cache = torch.zeros_like(k_cache) + return k, v, k_cache, v_cache, indices + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required") +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +@pytest.mark.parametrize("index_dtype", [torch.int32, torch.int64]) +def test_store_cache_matches_torch_index(dtype: torch.dtype, index_dtype: torch.dtype): + batch_size = 257 + num_slots = 4096 + row_dim = 8 * 128 # 1024 -> fp16 row_bytes=2048 + row_bytes = row_dim * torch.tensor([], dtype=dtype).element_size() + + assert can_use_store_cache(row_bytes), f"store_cache unavailable for row_bytes={row_bytes}" + + k, v, k_cache, v_cache, indices = _make_inputs( + batch_size=batch_size, + num_slots=num_slots, + row_dim=row_dim, + dtype=dtype, + index_dtype=index_dtype, + seed=2026, + ) + + k_ref = k_cache.clone() + v_ref = v_cache.clone() + k_ref[indices] = k + v_ref[indices] = v + + store_cache(k, v, k_cache, v_cache, indices) + torch.cuda.synchronize() + + assert torch.equal(k_cache, k_ref) + assert torch.equal(v_cache, v_ref) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required") +def test_can_use_store_cache_rejects_invalid_row_bytes(): + assert not can_use_store_cache(2) + assert not can_use_store_cache(6) + assert can_use_store_cache(4) + diff --git a/pymllm/engine/io_struct.py b/pymllm/engine/io_struct.py new file mode 100644 index 000000000..777186e28 --- /dev/null +++ b/pymllm/engine/io_struct.py @@ -0,0 +1,196 @@ +from __future__ import annotations + +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, Iterator, List, Optional, Union + + +@dataclass +class BaseReq: + rid: Optional[Union[str, List[str]]] = field(default=None, kw_only=True) + + def regenerate_rid(self) -> Union[str, List[str]]: + if isinstance(self.rid, list): + self.rid = [uuid.uuid4().hex for _ in range(len(self.rid))] + else: + self.rid = uuid.uuid4().hex + return self.rid + + +@dataclass +class BaseBatchReq: + rids: List[str] + + def regenerate_rids(self) -> List[str]: + self.rids = [uuid.uuid4().hex for _ in range(len(self.rids))] + return self.rids + + +@dataclass +class GenerateReqInput(BaseReq): + text: Optional[Union[List[str], str]] = None + input_ids: Optional[Union[List[List[int]], List[int]]] = None + sampling_params: Optional[Union[List[Dict[str, 
Any]], Dict[str, Any]]] = None + return_logprob: Optional[Union[List[bool], bool]] = None + logprob_start_len: Optional[Union[List[int], int]] = None + top_logprobs_num: Optional[Union[List[int], int]] = None + stream: bool = False + + # Multimodal placeholders. + image_data: Optional[Any] = None + video_data: Optional[Any] = None + audio_data: Optional[Any] = None + + # Runtime extension placeholders. + lora_path: Optional[Union[List[Optional[str]], str]] = None + session_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None + extra_options: Dict[str, Any] = field(default_factory=dict) + + # Derived fields populated by normalization. + is_single: bool = field(default=True, init=False) + batch_size: int = field(default=1, init=False) + + def normalize_batch_and_arguments(self) -> None: + self._validate_inputs() + self._determine_batch_size() + + def _validate_inputs(self) -> None: + has_text = self.text is not None + has_input_ids = self.input_ids is not None + if has_text == has_input_ids: + raise ValueError("Exactly one of `text` or `input_ids` must be provided.") + + def _determine_batch_size(self) -> None: + if self.text is not None: + if isinstance(self.text, str): + self.is_single = True + self.batch_size = 1 + else: + if len(self.text) == 0: + raise ValueError("`text` cannot be an empty list.") + self.is_single = False + self.batch_size = len(self.text) + return + + assert self.input_ids is not None + if len(self.input_ids) == 0: + raise ValueError("`input_ids` cannot be empty.") + if isinstance(self.input_ids[0], int): + self.is_single = True + self.batch_size = 1 + else: + self.is_single = False + self.batch_size = len(self.input_ids) + + def __getitem__(self, i: int) -> "GenerateReqInput": + if i < 0 or i >= self.batch_size: + raise IndexError(f"index {i} out of range for batch size {self.batch_size}") + if self.batch_size == 1: + return self + return GenerateReqInput( + rid=self._pick(self.rid, i), + text=self._pick(self.text, i), + input_ids=self._pick(self.input_ids, i), + sampling_params=self._pick(self.sampling_params, i), + return_logprob=self._pick(self.return_logprob, i), + logprob_start_len=self._pick(self.logprob_start_len, i), + top_logprobs_num=self._pick(self.top_logprobs_num, i), + stream=self.stream, + image_data=self._pick(self.image_data, i), + video_data=self._pick(self.video_data, i), + audio_data=self._pick(self.audio_data, i), + lora_path=self._pick(self.lora_path, i), + session_params=self._pick(self.session_params, i), + extra_options=self.extra_options.copy(), + ) + + @staticmethod + def _pick(value: Any, i: int) -> Any: + if isinstance(value, list): + return value[i] + return value + + def to_request_dict(self) -> Dict[str, Any]: + payload: Dict[str, Any] = {} + for key, value in { + "rid": self.rid, + "text": self.text, + "input_ids": self.input_ids, + "sampling_params": self.sampling_params, + "return_logprob": self.return_logprob, + "logprob_start_len": self.logprob_start_len, + "top_logprobs_num": self.top_logprobs_num, + "stream": self.stream, + "image_data": self.image_data, + "video_data": self.video_data, + "audio_data": self.audio_data, + "lora_path": self.lora_path, + "session_params": self.session_params, + }.items(): + if value is not None: + payload[key] = value + payload.update(self.extra_options) + return payload + + +@dataclass +class TokenizedGenerateReqInput(BaseReq): + input_text: str = "" + input_ids: List[int] = field(default_factory=list) + sampling_params: Dict[str, Any] = field(default_factory=dict) + stream: bool = 
False + return_logprob: bool = False + logprob_start_len: int = -1 + top_logprobs_num: int = 0 + lora_path: Optional[str] = None + session_params: Optional[Dict[str, Any]] = None + + +@dataclass +class BatchTokenizedGenerateReqInput(BaseBatchReq): + reqs: List[TokenizedGenerateReqInput] + + def __len__(self) -> int: + return len(self.reqs) + + def __getitem__(self, i: int) -> TokenizedGenerateReqInput: + return self.reqs[i] + + def __iter__(self) -> Iterator[TokenizedGenerateReqInput]: + return iter(self.reqs) + + +@dataclass +class BatchTokenIDOutput(BaseBatchReq): + finished_reasons: List[Optional[str]] + decode_ids: List[int] + read_offsets: List[int] + output_ids: Optional[List[int]] + skip_special_tokens: List[bool] + prompt_tokens: List[int] + completion_tokens: List[int] + input_token_logprobs_val: List[float] = field(default_factory=list) + input_token_logprobs_idx: List[int] = field(default_factory=list) + output_token_logprobs_val: List[float] = field(default_factory=list) + output_token_logprobs_idx: List[int] = field(default_factory=list) + input_top_logprobs_val: List[List[float]] = field(default_factory=list) + input_top_logprobs_idx: List[List[int]] = field(default_factory=list) + output_top_logprobs_val: List[List[float]] = field(default_factory=list) + output_top_logprobs_idx: List[List[int]] = field(default_factory=list) + + +@dataclass +class BatchStrOutput(BaseBatchReq): + finished_reasons: List[Optional[str]] + output_strs: List[str] + output_ids: Optional[List[int]] + prompt_tokens: List[int] + completion_tokens: List[int] + input_token_logprobs_val: List[float] = field(default_factory=list) + input_token_logprobs_idx: List[int] = field(default_factory=list) + output_token_logprobs_val: List[float] = field(default_factory=list) + output_token_logprobs_idx: List[int] = field(default_factory=list) + input_top_logprobs_val: List[List[float]] = field(default_factory=list) + input_top_logprobs_idx: List[List[int]] = field(default_factory=list) + output_top_logprobs_val: List[List[float]] = field(default_factory=list) + output_top_logprobs_idx: List[List[int]] = field(default_factory=list) diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index 25ada7c70..edad97af5 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -1,38 +1,310 @@ +import asyncio +import atexit import logging +import os +import uuid from pathlib import Path -from typing import Optional +from typing import Any, AsyncIterator, Dict, List, Optional, Union -import zmq import torch import torch.multiprocessing as mp from transformers import AutoConfig from huggingface_hub import snapshot_download + from pymllm.configs import get_global_config -from pymllm.orchestrator.tokenizer_process import TokenizerProcess -from pymllm.orchestrator.detokenizer_process import DetokenizerProcess -from pymllm.orchestrator.model_runner_process import ModelRunnerProcess -from pymllm.orchestrator.async_disk_io_process import AsyncDiskIoProcess -from pymllm.orchestrator.request_response_process import RequestResponseProcess +from pymllm.engine.io_struct import GenerateReqInput +from pymllm.orchestrator.ipc_utils import make_ipc_address +from pymllm.orchestrator.request_response_process import ( + ReqState, + RequestResponseProcess, +) +from pymllm.orchestrator.tokenizer_process import run_tokenizer_process +from pymllm.orchestrator.scheduler_process import run_scheduler_process +from pymllm.orchestrator.model_runner_process import run_model_runner_process +from pymllm.orchestrator.detokenizer_process 
import run_detokenizer_process +from pymllm.orchestrator.async_disk_io_process import run_async_disk_io_process logger = logging.getLogger(__name__) class Engine: def __init__(self): + self._subprocesses: List[mp.Process] = [] + self._rr_process: Optional[RequestResponseProcess] = None self._config_logging() self._set_default_torch_dtype() self._check_model_and_tokenizer() - # Orchestrator, shall we start the music here? + def launch(self) -> None: self._launch_processes() + atexit.register(self.shutdown) - def _launch_processes(self): - """ - TODO issue processes here + def _launch_processes(self) -> None: + """Spawn all subprocess workers and wire up ZMQ IPC channels.""" + mp.set_start_method("spawn", force=True) + uid = str(os.getpid()) + + # IPC addresses for ZMQ communication between processes + addr_request_response_to_tokenizer: str = make_ipc_address( + "request_response_to_tokenizer", uid + ) + addr_tokenizer_to_scheduler: str = make_ipc_address( + "tokenizer_to_scheduler", uid + ) + addr_scheduler_to_model_runner: str = make_ipc_address( + "scheduler_to_model_runner", uid + ) + addr_model_runner_to_scheduler: str = make_ipc_address( + "model_runner_to_scheduler", uid + ) + addr_scheduler_to_detokenizer: str = make_ipc_address( + "scheduler_to_detokenizer", uid + ) + addr_detokenizer_to_request_response: str = make_ipc_address( + "detokenizer_to_request_response", uid + ) + addr_scheduler_to_disk_io: str = make_ipc_address("scheduler_to_disk_io", uid) + + # Record all subprocesses + procs_and_readers: List[tuple] = [] + + # Tokenizer + tokenizer_reader, tokenizer_writer = mp.Pipe(duplex=False) + tokenizer_proc = mp.Process( + target=run_tokenizer_process, + args=( + addr_request_response_to_tokenizer, + addr_tokenizer_to_scheduler, + tokenizer_writer, + ), + daemon=True, + ) + procs_and_readers.append((tokenizer_proc, tokenizer_reader, "tokenizer")) + + # Scheduler + scheduler_reader, scheduler_writer = mp.Pipe(duplex=False) + scheduler_proc = mp.Process( + target=run_scheduler_process, + args=( + addr_tokenizer_to_scheduler, + addr_scheduler_to_model_runner, + addr_model_runner_to_scheduler, + addr_scheduler_to_detokenizer, + scheduler_writer, + ), + daemon=True, + ) + procs_and_readers.append((scheduler_proc, scheduler_reader, "scheduler")) + + # Model Runner + model_runner_reader, model_runner_writer = mp.Pipe(duplex=False) + model_runner_proc = mp.Process( + target=run_model_runner_process, + args=( + addr_scheduler_to_model_runner, + addr_model_runner_to_scheduler, + model_runner_writer, + ), + daemon=True, + ) + procs_and_readers.append( + (model_runner_proc, model_runner_reader, "model_runner") + ) + + # Detokenizer + detokenizer_reader, detokenizer_writer = mp.Pipe(duplex=False) + detokenizer_proc = mp.Process( + target=run_detokenizer_process, + args=( + addr_scheduler_to_detokenizer, + addr_detokenizer_to_request_response, + detokenizer_writer, + ), + daemon=True, + ) + procs_and_readers.append((detokenizer_proc, detokenizer_reader, "detokenizer")) + + # Async Disk I/O + if get_global_config().server.enable_disk_io_async: + disk_io_reader, disk_io_writer = mp.Pipe(duplex=False) + disk_io_proc = mp.Process( + target=run_async_disk_io_process, + args=(addr_scheduler_to_disk_io, disk_io_writer), + daemon=True, + ) + procs_and_readers.append((disk_io_proc, disk_io_reader, "async_disk_io")) + + # Start all subprocesses + for proc, _, name in procs_and_readers: + proc.start() + self._subprocesses.append(proc) + logger.info("Started %s process (pid=%s)", name, proc.pid) + + # 
Wait for readiness signals + for _, reader, name in procs_and_readers: + try: + msg = reader.recv() + except EOFError: + raise RuntimeError(f"{name} process died before signalling readiness") + if msg.get("status") != "ready": + raise RuntimeError(f"{name} process failed to initialise: {msg}") + logger.info("%s process ready", name) + + # RR Process is current main process + self._rr_process = RequestResponseProcess( + send_to_tokenizer_addr=addr_request_response_to_tokenizer, + recv_from_detokenizer_addr=addr_detokenizer_to_request_response, + ) + + try: + self._loop = asyncio.get_running_loop() + except RuntimeError: + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + + self._rr_process.start(self._loop) + logger.info("RequestResponseProcess started in main process") + + def generate( + self, + prompt: Optional[Union[List[str], str]] = None, + sampling_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None, + input_ids: Optional[Union[List[List[int]], List[int]]] = None, + image_data: Optional[Any] = None, + audio_data: Optional[Any] = None, + video_data: Optional[Any] = None, + return_logprob: Optional[Union[List[bool], bool]] = None, + logprob_start_len: Optional[Union[List[int], int]] = None, + top_logprobs_num: Optional[Union[List[int], int]] = None, + lora_path: Optional[Union[List[Optional[str]], str]] = None, + session_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None, + stream: bool = False, + rid: Optional[Union[List[str], str]] = None, + **kwargs, + ) -> Dict[str, Any]: + """Synchronous, non-streaming generation entry point.""" + if rid is None: + rid = uuid.uuid4().hex + request = GenerateReqInput( + rid=rid, + text=prompt, + input_ids=input_ids, + sampling_params=sampling_params, + return_logprob=return_logprob, + logprob_start_len=logprob_start_len, + top_logprobs_num=top_logprobs_num, + stream=stream, + image_data=image_data, + audio_data=audio_data, + video_data=video_data, + lora_path=lora_path, + session_params=session_params, + extra_options=kwargs, + ) + request.normalize_batch_and_arguments() + + async def _run() -> Dict[str, Any]: + state = await self._rr_process.add_request(request) + if isinstance(rid, list): + raise ValueError("Synchronous `generate` currently supports single request.") + return await self._wait_for_final_result(rid, state) + + return self._loop.run_until_complete(_run()) + + async def generate_async( + self, + prompt: Optional[Union[List[str], str]] = None, + sampling_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None, + input_ids: Optional[Union[List[List[int]], List[int]]] = None, + image_data: Optional[Any] = None, + audio_data: Optional[Any] = None, + video_data: Optional[Any] = None, + return_logprob: Optional[Union[List[bool], bool]] = None, + logprob_start_len: Optional[Union[List[int], int]] = None, + top_logprobs_num: Optional[Union[List[int], int]] = None, + lora_path: Optional[Union[List[Optional[str]], str]] = None, + session_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None, + stream: bool = False, + rid: Optional[Union[List[str], str]] = None, + **kwargs, + ) -> AsyncIterator[Dict[str, Any]]: + """Asynchronous generation entry point. + + When *stream* is ``False`` (default) the returned async iterator + yields a **single** final result dict. When *stream* is ``True`` + every incremental chunk from the detokenizer is yielded as it + arrives, following the ``Event + out_list`` pattern. 
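+
+        Example (illustrative; assumes ``engine`` is a launched ``Engine``)::
+
+            async for chunk in engine.generate_async("Hello", stream=True):
+                print(chunk)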
""" + if rid is None: + rid = uuid.uuid4().hex + request = GenerateReqInput( + rid=rid, + text=prompt, + input_ids=input_ids, + sampling_params=sampling_params, + return_logprob=return_logprob, + logprob_start_len=logprob_start_len, + top_logprobs_num=top_logprobs_num, + stream=stream, + image_data=image_data, + audio_data=audio_data, + video_data=video_data, + lora_path=lora_path, + session_params=session_params, + extra_options=kwargs, + ) + request.normalize_batch_and_arguments() + state = await self._rr_process.add_request(request) + + try: + if isinstance(rid, list): + raise ValueError("`generate_async` currently supports single request only.") + if stream: + async for chunk in self._stream_results(rid, state): + yield chunk + else: + yield await self._wait_for_final_result(rid, state) + finally: + self._rr_process.remove_state(rid) + + @staticmethod + async def _wait_for_final_result(rid: str, state: ReqState) -> Dict[str, Any]: + """Block until the request is finished and return the last output.""" + while True: + await state.event.wait() + if state.finished: + return state.out_list[-1] + state.event.clear() + + @staticmethod + async def _stream_results( + rid: str, state: ReqState + ) -> AsyncIterator[Dict[str, Any]]: + """Yield incremental chunks as they arrive, until finished.""" + while True: + await state.event.wait() + for item in state.out_list: + yield item + state.out_list.clear() + if state.finished: + return + state.event.clear() - # RR process is the main process - self._rr_process = RequestResponseProcess() + def shutdown(self) -> None: + """Terminate all subprocesses.""" + if self._rr_process is not None: + try: + self._loop.run_until_complete(self._rr_process.shutdown()) + except Exception: + pass + for proc in self._subprocesses: + if proc.is_alive(): + proc.terminate() + proc.join(timeout=5) + if proc.is_alive(): + proc.kill() + self._subprocesses.clear() + logger.info("All subprocesses shut down") def _set_default_torch_dtype(self): """Set the default torch dtype based on the server configuration.""" @@ -93,23 +365,13 @@ def _check_model_and_tokenizer(self): @staticmethod def _maybe_download(path: Path, download_dir: Optional[Path] = None) -> Path: - """Return a local directory for *path*, downloading if necessary.""" if path.is_dir(): return path - repo_id = str(path) logger.info("Downloading '%s' ...", repo_id) - kwargs = {} if download_dir is not None: kwargs["local_dir"] = str(download_dir / path.name) - downloaded = snapshot_download(repo_id=repo_id, **kwargs) logger.info("Downloaded '%s' to '%s'", repo_id, downloaded) return Path(downloaded) - - def generate(self, stream: bool = True): - pass - - async def generate_async(self, stream: bool = True): - pass diff --git a/pymllm/mem_cache/__init__.py b/pymllm/mem_cache/__init__.py index e69de29bb..c2ce06eba 100644 --- a/pymllm/mem_cache/__init__.py +++ b/pymllm/mem_cache/__init__.py @@ -0,0 +1,37 @@ +from pymllm.mem_cache.memory_pool import ( + KVPool, + ReqToTokenPool, + TokenToKVPoolAllocator, + make_full_attention_net_mem_pool, + make_req_to_token_pool, +) +from pymllm.mem_cache.radix_cache import ( + EvictResult, + InsertResult, + MatchResult, + RadixCache, + RadixKey, + TreeNode, + hash_bytes, + hash_to_int64, + hash_token_ids, +) + +__all__ = [ + # memory_pool + "KVPool", + "TokenToKVPoolAllocator", + "ReqToTokenPool", + "make_full_attention_net_mem_pool", + "make_req_to_token_pool", + # radix_cache + "RadixCache", + "RadixKey", + "TreeNode", + "MatchResult", + "InsertResult", + "EvictResult", + 
"hash_token_ids", + "hash_to_int64", + "hash_bytes", +] diff --git a/pymllm/mem_cache/memory_pool.py b/pymllm/mem_cache/memory_pool.py new file mode 100644 index 000000000..0721fd71d --- /dev/null +++ b/pymllm/mem_cache/memory_pool.py @@ -0,0 +1,480 @@ +"""Lightweight KV-cache memory pools + +Three-layer architecture:: + + ReqToTokenPool maps (req_slot, position) → kv_index + TokenToKVPoolAllocator manages a free-list of integer indices + KVPool holds the actual GPU K/V tensors + +All indices are **int64** tensors on the target device. Slot 0 in the KV +buffers is reserved as a padding / dummy-output slot and is never allocated. +""" + +import logging +from typing import List, Optional, Tuple, Union + +import torch + +from mllm_kernel.cuda.jit.store_cache import store_cache, can_use_store_cache + +logger = logging.getLogger(__name__) + + +class KVPool: + """GPU (or CPU) storage for per-layer key and value caches. + + Layout per layer:: + + JIT: + k_buffer[layer][slot, k_head_num * k_head_dim] + v_buffer[layer][slot, v_head_num * v_head_dim] + + PyTorch: + k_buffer[layer][slot, k_head_num, k_head_dim] + v_buffer[layer][slot, v_head_num, v_head_dim] + + K and V may have **independent** head counts and head dimensions, which + covers standard MHA, GQA / MQA, and architectures like MLA where value + projection uses a different dimensionality. + + ``size`` usable slots are numbered ``[1, size]``. Slot 0 is a dummy + padding slot that absorbs writes from padded tokens. + + Parameters + ---------- + size : int + Number of usable token slots (total buffer length = ``size + 1``). + layer_num : int + Number of transformer layers (one K buffer + one V buffer per layer). + k_head_num : int + Number of key heads. + k_head_dim : int + Dimension of each key head. + device : str | torch.device + Target device (``"cuda"``, ``"cpu"``, …). + dtype : torch.dtype + Storage data type. + v_head_num : int, optional + Number of value heads. Defaults to *k_head_num*. + v_head_dim : int, optional + Dimension of each value head. Defaults to *k_head_dim*. + pin_memory : bool, optional + Whether to use pinned memory. Defaults to True. 
+ """ + + def __init__( + self, + size: int, + layer_num: int, + k_head_num: int, + k_head_dim: int, + device: Union[str, torch.device] = "cuda", + dtype: torch.dtype = torch.float16, + v_head_num: Optional[int] = None, + v_head_dim: Optional[int] = None, + pin_memory: bool = True, + ): + self.size = size + self.layer_num = layer_num + self.k_head_num = k_head_num + self.k_head_dim = k_head_dim + self.v_head_num = v_head_num if v_head_num is not None else k_head_num + self.v_head_dim = v_head_dim if v_head_dim is not None else k_head_dim + self.device = torch.device(device) + self.dtype = dtype + + buf_len = size + 1 # slot 0 is padding + + if buf_len % 8 != 0: + logger.warning( + "KVPool buffer length is not divisible by 8, padding to the next multiple of 8" + ) + buf_len = (buf_len + 7) & ~7 + + k_row_dim = self.k_head_num * self.k_head_dim + v_row_dim = self.v_head_num * self.v_head_dim + self._same_kv_dim = k_row_dim == v_row_dim + self._row_bytes = k_row_dim * torch.tensor([], dtype=dtype).element_size() + self._use_jit = ( + self.device.type == "cuda" + and self._same_kv_dim + and can_use_store_cache(self._row_bytes) + ) + if not self._use_jit: + logger.warning( + f"Fallback to PyTorch index for KVPool, which is slower than the mllm-kernel's implementation, same_kv_dim={self._same_kv_dim}, row_bytes={self._row_bytes}" + ) + + self.k_buffer: List[torch.Tensor] = [ + torch.zeros( + (buf_len, self.k_head_num, self.k_head_dim), + dtype=dtype, + device=self.device, + pin_memory=pin_memory, + ) + for _ in range(layer_num) + ] + self.v_buffer: List[torch.Tensor] = [ + torch.zeros( + (buf_len, self.v_head_num, self.v_head_dim), + dtype=dtype, + device=self.device, + pin_memory=pin_memory, + ) + for _ in range(layer_num) + ] + + # Pre-computed 2D views for the JIT store_cache kernel. + # Zero-copy: same underlying storage as k_buffer / v_buffer. + if self._use_jit: + self._k_buffer_2d = [b.view(buf_len, -1) for b in self.k_buffer] + self._v_buffer_2d = [b.view(buf_len, -1) for b in self.v_buffer] + + logger.info( + "KVPool allocated: %d layers, %d slots, K=[%d,%d] V=[%d,%d], %.2f GB", + layer_num, + size, + self.k_head_num, + self.k_head_dim, + self.v_head_num, + self.v_head_dim, + self._mem_bytes() / (1 << 30), + ) + + def get_key_buffer(self, layer_id: int) -> torch.Tensor: + return self.k_buffer[layer_id] + + def get_value_buffer(self, layer_id: int) -> torch.Tensor: + return self.v_buffer[layer_id] + + def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]: + return self.k_buffer[layer_id], self.v_buffer[layer_id] + + def set_kv_buffer( + self, + layer_id: int, + indices: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + ) -> None: + """Write K/V vectors into the cache at the given *indices*. + + ``k`` / ``v`` can be any shape as long as the trailing dimensions + multiply to ``head_num * head_dim`` (the row dimension). All leading + dimensions are treated as the batch axis and must match ``indices`` + after flattening. 
Typical shapes:: + + k: [num_tokens, head_num, head_dim] indices: [num_tokens] + k: [batch, seq_len, head_num, head_dim] indices: [batch, seq_len] + k: [num_tokens, head_num * head_dim] indices: [num_tokens] + """ + if self._use_jit: + row_dim = self.k_head_num * self.k_head_dim + store_cache( + k.reshape(-1, row_dim), + v.reshape(-1, row_dim), + self._k_buffer_2d[layer_id], + self._v_buffer_2d[layer_id], + indices.reshape(-1), + row_bytes=self._row_bytes, + ) + else: + self.k_buffer[layer_id][indices] = k + self.v_buffer[layer_id][indices] = v + + def _mem_bytes(self) -> int: + total = 0 + for buf in self.k_buffer + self.v_buffer: + total += buf.nelement() * buf.element_size() + return total + + +class TokenToKVPoolAllocator: + """Manages allocation / deallocation of integer indices into a :class:`KVPool`. + + Each ``alloc(n)`` returns *n* free indices; each ``free(indices)`` returns + them to the pool. + + Uses a **dual-buffer** strategy (``free_slots`` + ``release_slots``) so + that ``free()`` never cats onto the large main free-list. Freed indices + accumulate in the smaller ``release_slots`` and are merged lazily (with an + optional sort) only when ``alloc()`` cannot be satisfied from + ``free_slots`` alone. + + A **batch-free** API (``free_group_begin`` / ``free_group_end``) further + amortises cost when many ``free()`` calls happen in a tight loop (e.g. + during scheduling or eviction). + + Typical usage:: + + allocator = TokenToKVPoolAllocator(size=4096, device="cuda") + + # --- basic alloc / free --- + indices = allocator.alloc(128) # 128 free slot indices (int64) + allocator.free(indices[:64]) # return 64 slots + + # --- batch free (amortised) --- + allocator.free_group_begin() + for req in finished_requests: + allocator.free(req.kv_indices) # O(1) list append each + allocator.free_group_end() # single torch.cat + release + + Parameters + ---------- + size : int + Total number of allocatable slots (must match ``KVPool.size``). + device : str | torch.device + Device for the free-list tensor. + page_size : int + When > 1 the allocator works in page-aligned mode: ``alloc`` returns + multiples of ``page_size`` contiguous within each page, and ``free`` + deduplicates by page. + need_sort : bool + When ``True`` (default), ``merge_and_sort_free`` sorts after merging + so that lower-index slots are allocated first (better memory locality). + """ + + def __init__( + self, + size: int, + device: Union[str, torch.device] = "cuda", + page_size: int = 1, + need_sort: bool = True, + ): + self.size = size + self.page_size = page_size + self.device = torch.device(device) + self.need_sort = need_sort + self.clear() + + def clear(self) -> None: + """Reset the allocator so that all slots ``[1, size]`` are free. 
The first slot is reserved for padding.""" + if self.page_size == 1: + self.free_slots = torch.arange( + 1, self.size + 1, dtype=torch.int64, device=self.device + ) + else: + num_pages = self.size // self.page_size + self.free_slots = torch.arange( + 1, num_pages + 1, dtype=torch.int64, device=self.device + ) + self.release_slots = torch.empty((0,), dtype=torch.int64, device=self.device) + self._is_not_in_free_group = True + self._free_group: List[torch.Tensor] = [] + + def available_size(self) -> int: + """Number of tokens that can still be allocated.""" + return (len(self.free_slots) + len(self.release_slots)) * self.page_size + + def merge_and_sort_free(self) -> None: + """Merge ``release_slots`` into ``free_slots`` (and sort if ``need_sort``).""" + if len(self.release_slots) == 0: + return + self.free_slots = torch.cat((self.free_slots, self.release_slots)) + if self.need_sort: + self.free_slots, _ = torch.sort(self.free_slots) + self.release_slots = torch.empty((0,), dtype=torch.int64, device=self.device) + + def free_group_begin(self) -> None: + """Start collecting ``free()`` calls; actual release is deferred to ``free_group_end``.""" + self._is_not_in_free_group = False + self._free_group = [] + + def free_group_end(self) -> None: + """Flush all ``free()`` calls collected since ``free_group_begin``.""" + self._is_not_in_free_group = True + if self._free_group: + self.free(torch.cat(self._free_group)) + self._free_group = [] + + def alloc(self, need_size: int) -> Optional[torch.Tensor]: + """Allocate *need_size* token indices. + + Returns a 1-D ``int64`` tensor on success, or ``None`` if the pool is + exhausted. + """ + if self.page_size == 1: + if need_size > len(self.free_slots): + self.merge_and_sort_free() + if need_size > len(self.free_slots): + return None + out = self.free_slots[:need_size] + self.free_slots = self.free_slots[need_size:] + return out + + num_pages = (need_size + self.page_size - 1) // self.page_size + if num_pages > len(self.free_slots): + self.merge_and_sort_free() + if num_pages > len(self.free_slots): + return None + pages = self.free_slots[:num_pages] + self.free_slots = self.free_slots[num_pages:] + offsets = torch.arange(self.page_size, device=self.device) + out = (pages[:, None] * self.page_size + offsets).reshape(-1) + return out[:need_size] + + def free(self, indices: torch.Tensor) -> None: + """Return *indices* to the free pool.""" + if indices.numel() == 0: + return + + if not self._is_not_in_free_group: + self._free_group.append(indices) + return + + if self.page_size != 1: + indices = torch.unique(indices // self.page_size) + + if self.need_sort: + self.release_slots = torch.cat((self.release_slots, indices)) + else: + self.free_slots = torch.cat((self.free_slots, indices)) + + +class ReqToTokenPool: + """Maps each live request to its per-position KV-pool indices. + + Internally a 2-D tensor ``req_to_token[slot, position]`` stores the + KV-pool index for every token position of every active request. + Slots are recycled via a simple free-list. + + This class is a **pure mapping table** -- it does **not** track per-request + sequence lengths. The caller (typically the ``Req`` / IO-struct object) + must store ``req_pool_idx`` and ``seq_len`` and use them to slice into + ``req_to_token`` when reading back KV indices. 
+ + Typical usage:: + + pool = ReqToTokenPool(max_reqs=256, max_context_len=4096) + + # --- on new request arrival --- + [slot] = pool.alloc(1) # slot = req_pool_idx + kv_indices = kv_allocator.alloc(seq_len) # from TokenToKVPoolAllocator + pool.write((slot, slice(0, seq_len)), kv_indices) + + # --- read back (caller tracks seq_len) --- + kv_indices = pool.req_to_token[slot, :seq_len] + + # --- on request completion --- + kv_allocator.free(pool.req_to_token[slot, :seq_len]) + pool.free(slot) + + Parameters + ---------- + max_reqs : int + Maximum number of concurrent requests (number of rows). + max_context_len : int + Maximum sequence length any single request can reach (number of cols). + device : str | torch.device + Target device for the mapping tensor. + """ + + def __init__( + self, + max_reqs: int, + max_context_len: int, + device: Union[str, torch.device] = "cuda", + ): + self.size = max_reqs + self.max_context_len = max_context_len + self.device = torch.device(device) + + self.req_to_token = torch.zeros( + (max_reqs, max_context_len), dtype=torch.int64, device=self.device + ) + self._free_slots: List[int] = list(range(max_reqs)) + + def available_size(self) -> int: + return len(self._free_slots) + + def alloc(self, n: int = 1) -> Optional[List[int]]: + """Allocate *n* request slots. Returns a list of slot indices.""" + if n > len(self._free_slots): + return None + out = self._free_slots[:n] + self._free_slots = self._free_slots[n:] + return out + + def free(self, slot: int) -> None: + """Return a single request slot to the pool.""" + self._free_slots.append(slot) + + def write(self, index: Tuple, values: torch.Tensor) -> None: + """Write KV indices into the mapping table. + + ``index`` is typically ``(req_pool_idx, slice(start, end))``. + """ + self.req_to_token[index] = values + + def clear(self) -> None: + self._free_slots = list(range(self.size)) + self.req_to_token.zero_() + + +def make_full_attention_net_mem_pool( + size: int, + layer_num: int, + k_head_num: int, + k_head_dim: int, + v_head_num: int, + v_head_dim: int, + device: Union[str, torch.device] = "cuda", + dtype: torch.dtype = torch.float16, + page_size: int = 1, + need_sort: bool = True, + pin_memory: bool = True, +) -> Tuple[KVPool, TokenToKVPoolAllocator]: + """Create a :class:`KVPool` and its :class:`TokenToKVPoolAllocator` for a + full-attention (non-SWA) model. + + Parameters + ---------- + size : int + Number of usable token slots in the KV cache. + layer_num : int + Number of transformer layers. + k_head_num / k_head_dim : int + Key head count and dimension. + v_head_num / v_head_dim : int + Value head count and dimension. + device : str | torch.device + Target device. + dtype : torch.dtype + Storage data type for the KV buffers. + page_size : int + Allocator page size (1 = per-token, >1 = page-aligned). + need_sort : bool + Whether the allocator sorts on merge for memory locality. + pin_memory : bool + Whether to use pinned memory for the KV buffers. 
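+
+    A minimal call sketch (the sizes below are purely illustrative)::
+
+        pool, allocator = make_full_attention_net_mem_pool(
+            size=65536, layer_num=32,
+            k_head_num=8, k_head_dim=128,
+            v_head_num=8, v_head_dim=128,
+        )
+        indices = allocator.alloc(256)  # 256 token slots for a new request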
+ + Returns + ------- + (KVPool, TokenToKVPoolAllocator) + """ + pool = KVPool( + size=size, + layer_num=layer_num, + k_head_num=k_head_num, + k_head_dim=k_head_dim, + device=device, + dtype=dtype, + v_head_num=v_head_num, + v_head_dim=v_head_dim, + pin_memory=pin_memory, + ) + allocator = TokenToKVPoolAllocator( + size=size, + device=device, + page_size=page_size, + need_sort=need_sort, + ) + return pool, allocator + + +def make_req_to_token_pool( + max_reqs: int, + max_context_len: int, + device: Union[str, torch.device] = "cuda", +) -> ReqToTokenPool: + return ReqToTokenPool(max_reqs, max_context_len, device) diff --git a/pymllm/mem_cache/param_disk_cache.py b/pymllm/mem_cache/param_disk_cache.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py index e69de29bb..997790ea5 100644 --- a/pymllm/mem_cache/radix_cache.py +++ b/pymllm/mem_cache/radix_cache.py @@ -0,0 +1,794 @@ +"""Lightweight radix-tree KV cache with SWA and multimodal support. + + +Supports: + - Multi-batch serving on a single GPU + - Sliding Window Attention (SWA) via tombstone mechanism + - Multimodal namespace isolation via ``extra_key`` + - SHA256 position-aware hashing + - Page-aligned operations (page_size >= 1) + - LRU leaf eviction +""" + +from __future__ import annotations + +import hashlib +import heapq +import logging +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union + +import torch + +logger = logging.getLogger(__name__) + + +def hash_token_ids( + token_ids: List[Union[int, Tuple[int, ...]]], + prior_hash: Optional[str] = None, +) -> str: + """SHA-256 hash of a token-id page with optional chain-hash. + + Each token is encoded as a 4-byte little-endian unsigned integer; + tuples (bigram / EAGLE) hash each element in order. When *prior_hash* + is supplied the digest is seeded with the raw bytes of the previous + hash, making the result position-aware. + """ + hasher = hashlib.sha256() + if prior_hash: + hasher.update(bytes.fromhex(prior_hash)) + for t in token_ids: + if isinstance(t, tuple): + for elem in t: + hasher.update(elem.to_bytes(4, byteorder="little", signed=False)) + else: + hasher.update(t.to_bytes(4, byteorder="little", signed=False)) + return hasher.hexdigest() + + +def hash_to_int64(hex_str: str) -> int: + """Convert a hex digest to a signed 64-bit integer (first 16 hex chars).""" + val = int(hex_str[:16], 16) + return val - (1 << 64) if val >= (1 << 63) else val + + +def hash_bytes(data: bytes) -> int: + """SHA-256 → unsigned 64-bit int. Useful for multimodal embedding keys.""" + return int.from_bytes(hashlib.sha256(data).digest()[:8], "big", signed=False) + + +class RadixKey: + """Compound lookup key: token-id sequence + optional namespace tag. + + ``extra_key`` isolates independent namespaces so that sequences with + identical leading tokens but different adapters / LoRA ids / multimodal + context hashes never share prefix nodes. 
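+
+    For example (token ids and namespace below are purely illustrative)::
+
+        base = RadixKey([1, 2, 3])
+        lora = RadixKey([1, 2, 3], extra_key="lora-A")
+        # same leading tokens, different namespace -> no shared prefix
+        assert _key_match(base, lora, page_size=1) == 0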
+ """ + + __slots__ = ("token_ids", "extra_key") + + def __init__( + self, + token_ids: List[Union[int, Tuple[int, ...]]], + extra_key: Optional[str] = None, + ): + self.token_ids = token_ids + self.extra_key = extra_key + + def __len__(self) -> int: + return len(self.token_ids) + + def __iter__(self) -> Iterator: + return iter(self.token_ids) + + def __getitem__(self, idx: Union[int, slice]) -> RadixKey: + if isinstance(idx, slice): + return RadixKey(self.token_ids[idx], self.extra_key) + return RadixKey([self.token_ids[idx]], self.extra_key) + + def __repr__(self) -> str: + preview = self.token_ids[:10] + tail = "..." if len(self.token_ids) > 10 else "" + return f"RadixKey(extra={self.extra_key!r}, toks={preview}{tail})" + + +_node_counter: int = 0 + + +def _next_node_id() -> int: + global _node_counter + _node_counter += 1 + return _node_counter + + +class TreeNode: + """A single node in the radix tree. + + ``value`` holds a 1-D ``int64`` tensor of KV-pool indices (one per token + in ``key``). When the node has been evicted, ``value`` is ``None``. + """ + + __slots__ = ( + "children", + "parent", + "key", + "value", + "lock_ref", + "swa_lock_ref", + "swa_tombstone", + "swa_boundary_id", + "last_access_time", + "hit_count", + "hash_values", + "id", + ) + + def __init__(self) -> None: + self.children: Dict[Any, TreeNode] = defaultdict(TreeNode) + self.parent: Optional[TreeNode] = None + self.key: Optional[RadixKey] = None + self.value: Optional[torch.Tensor] = None + + self.lock_ref: int = 0 + self.swa_lock_ref: int = 0 + self.swa_tombstone: bool = False + self.swa_boundary_id: Optional[int] = None + + self.last_access_time: float = time.monotonic() + self.hit_count: int = 0 + self.hash_values: Optional[List[str]] = None + self.id: int = _next_node_id() + + @property + def evicted(self) -> bool: + return self.value is None + + def __lt__(self, other: TreeNode) -> bool: + return self.last_access_time < other.last_access_time + + +def _key_match(key0: RadixKey, key1: RadixKey, page_size: int) -> int: + """Return the length of the common prefix (page-aligned when *page_size* > 1).""" + if key0.extra_key != key1.extra_key: + return 0 + if page_size == 1: + i = 0 + for a, b in zip(key0.token_ids, key1.token_ids): + if a != b: + break + i += 1 + return i + min_len = min(len(key0), len(key1)) + i = 0 + while i < min_len: + if key0.token_ids[i : i + page_size] != key1.token_ids[i : i + page_size]: + break + i += page_size + return i + + +def _child_key(key: RadixKey, page_size: int) -> Any: + """Derive the dict key used in ``node.children``.""" + plain = key.token_ids[0] if page_size == 1 else tuple(key.token_ids[:page_size]) + return (key.extra_key, plain) if key.extra_key is not None else plain + + +@dataclass +class MatchResult: + """Returned by :meth:`RadixCache.match_prefix`.""" + + indices: torch.Tensor + last_node: TreeNode + prefix_len: int = 0 + + +@dataclass +class InsertResult: + """Returned by :meth:`RadixCache.insert`.""" + + prefix_len: int = 0 + + +@dataclass +class EvictResult: + """Returned by :meth:`RadixCache.evict`.""" + + full_evicted: int = 0 + swa_evicted: int = 0 + + +class RadixCache: + """Lightweight radix tree for KV-cache prefix sharing. + + Parameters + ---------- + page_size: + Number of tokens per KV-pool page. Keys and values are aligned to + this granularity. + sliding_window_size: + If set, enables SWA mode. The cache tracks which nodes have had + their SWA KV freed (tombstoned) and constrains prefix matching + so that the sliding-window invariant is maintained. 
+ disable: + When *True* every public method is a no-op (useful for ablation). + token_to_kv_pool_allocator: + Optional pool allocator with ``free(indices)`` (and ``free_swa`` for + SWA mode). When *None*, index tensors are simply discarded. + """ + + def __init__( + self, + page_size: int = 1, + sliding_window_size: Optional[int] = None, + disable: bool = False, + token_to_kv_pool_allocator: Any = None, + ): + self.page_size = page_size + self.sliding_window_size = sliding_window_size + self.disable = disable + self.pool = token_to_kv_pool_allocator + + if self.pool is not None and hasattr(self.pool, "device"): + self.device = self.pool.device + else: + self.device = torch.device("cpu") + + self._swa_boundary_counter: int = 0 + self.reset() + + @property + def supports_swa(self) -> bool: + return self.sliding_window_size is not None + + def evictable_size(self) -> int: + return self._evictable_size + + def swa_evictable_size(self) -> int: + return self._swa_evictable_size + + def protected_size(self) -> int: + return self._protected_size + + def swa_protected_size(self) -> int: + return self._swa_protected_size + + def reset(self) -> None: + """Clear all cached state and re-initialise the root node.""" + self.root_node = TreeNode() + self.root_node.key = RadixKey([]) + self.root_node.value = torch.tensor([], dtype=torch.int64) + self.root_node.lock_ref = 1 + self.root_node.swa_lock_ref = 1 + self._evictable_size: int = 0 + self._swa_evictable_size: int = 0 + self._protected_size: int = 0 + self._swa_protected_size: int = 0 + + def match_prefix(self, key: RadixKey) -> MatchResult: + """Find the longest cached prefix of *key*. + + For SWA mode the match is further constrained: the path from the + returned ``last_node`` to root must have at least + ``sliding_window_size`` non-tombstone tokens (or be entirely + tombstone-free back to root). + + Accessing a prefix refreshes LRU timestamps along the matched path. + """ + empty = MatchResult( + indices=torch.empty(0, dtype=torch.int64, device=self.device), + last_node=self.root_node, + ) + if self.disable or len(key) == 0: + return empty + + key = self._page_align_key(key) + if len(key) == 0: + return empty + + if self.supports_swa: + values, last_node, best_count = self._match_swa(key) + values = values[:best_count] + else: + values, last_node = self._match_normal(key) + + cat = ( + torch.cat(values) + if values + else torch.empty(0, dtype=torch.int64, device=self.device) + ) + return MatchResult(indices=cat, last_node=last_node, prefix_len=len(cat)) + + def insert( + self, + key: RadixKey, + value: Optional[torch.Tensor] = None, + *, + prev_prefix_len: int = 0, + swa_evicted_seqlen: int = 0, + ) -> InsertResult: + """Insert *key*/*value* into the tree. + + Returns how many leading tokens were already present (the prefix + length). The caller is responsible for freeing duplicate KV indices + in the range ``[cache_protected_len, prefix_len)``. + + Parameters + ---------- + prev_prefix_len: + (SWA mode) tokens before this offset are already protected and + should not have their values overwritten. + swa_evicted_seqlen: + (SWA mode) the sequence length up to which SWA KV has been + previously evicted. Used to decide whether a tombstoned node can + be un-tombstoned with the incoming value. 
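+
+        A rough usage sketch (``already_cached_len`` stands for whatever
+        prefix length the caller had matched and locked beforehand)::
+
+            res = cache.insert(RadixKey(token_ids), kv_indices)
+            # slots in [already_cached_len, res.prefix_len) duplicate entries
+            # already in the tree, so hand them back to the allocator
+            allocator.free(kv_indices[already_cached_len:res.prefix_len])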
+ """ + if self.disable: + return InsertResult() + if value is None: + value = torch.tensor(key.token_ids, dtype=torch.int64) + if self.supports_swa: + plen = self._insert_swa( + self.root_node, key, value, prev_prefix_len, swa_evicted_seqlen + ) + else: + plen = self._insert_normal(self.root_node, key, value) + return InsertResult(prefix_len=plen) + + def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: + """Evict up to *num_tokens* (full) and *swa_num_tokens* (SWA) tokens. + + Full eviction removes leaf nodes entirely; SWA eviction tombstones + internal nodes (freeing SWA KV but retaining full-attn KV). + """ + if self.disable: + return EvictResult() + + full_evicted = 0 + swa_evicted = 0 + + # Phase 1: full leaf eviction + if num_tokens > 0: + leaves = self._collect_evictable_leaves() + heap: List[Tuple[float, TreeNode]] = [ + (n.last_access_time, n) for n in leaves + ] + heapq.heapify(heap) + + while full_evicted < num_tokens and heap: + _, node = heapq.heappop(heap) + if node.evicted or node.lock_ref > 0: + continue + n = len(node.value) + self._free_indices(node.value) + full_evicted += n + swa_evicted += n + self._delete_leaf(node) + + p = node.parent + if ( + p is not None + and p != self.root_node + and len(p.children) == 0 + and p.lock_ref == 0 + ): + if self.supports_swa and p.swa_tombstone: + self._free_indices(p.value) + full_evicted += len(p.value) + self._delete_leaf(p) + else: + heapq.heappush(heap, (p.last_access_time, p)) + + # Phase 2: SWA tombstone eviction (internal nodes) + if self.supports_swa and swa_evicted < swa_num_tokens: + candidates = self._collect_swa_evictable() + heap2: List[Tuple[float, TreeNode]] = [ + (n.last_access_time, n) for n in candidates + ] + heapq.heapify(heap2) + + while swa_evicted < swa_num_tokens and heap2: + _, node = heapq.heappop(heap2) + if node.swa_tombstone or node.swa_lock_ref > 0 or node.evicted: + continue + n = len(node.value) + if len(node.children) == 0 and node.lock_ref == 0: + self._free_indices(node.value) + full_evicted += n + swa_evicted += n + self._delete_leaf(node) + elif len(node.children) > 0: + self._free_swa_indices(node.value) + swa_evicted += n + self._tombstone_node(node) + + return EvictResult(full_evicted=full_evicted, swa_evicted=swa_evicted) + + def inc_lock_ref(self, node: TreeNode) -> Optional[int]: + """Lock nodes from *node* up to root (prevents eviction). + + Returns ``swa_boundary_id`` that must be passed back to + :meth:`dec_lock_ref`. In non-SWA mode, returns ``None``. 
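+
+        Typical pairing (sketch)::
+
+            boundary_id = cache.inc_lock_ref(match.last_node)  # protect path
+            # ... run the request using the matched KV indices ...
+            cache.dec_lock_ref(match.last_node, boundary_id)   # release path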
+ """ + if self.disable or node is None: + return None + + swa_locked = 0 + swa_boundary_id: Optional[int] = None + cur = node + while cur != self.root_node: + if cur.lock_ref == 0: + self._evictable_size -= len(cur.key) + self._protected_size += len(cur.key) + cur.lock_ref += 1 + + if ( + self.supports_swa + and swa_locked < self.sliding_window_size + and not cur.swa_tombstone + ): + if cur.swa_lock_ref == 0: + self._swa_evictable_size -= len(cur.key) + self._swa_protected_size += len(cur.key) + cur.swa_lock_ref += 1 + swa_locked += len(cur.key) + if swa_locked >= self.sliding_window_size: + if cur.swa_boundary_id is None: + self._swa_boundary_counter += 1 + cur.swa_boundary_id = self._swa_boundary_counter + swa_boundary_id = cur.swa_boundary_id + + cur = cur.parent + return swa_boundary_id + + def dec_lock_ref( + self, node: TreeNode, swa_boundary_id: Optional[int] = None + ) -> None: + """Unlock nodes from *node* up to root.""" + if self.disable or node is None: + return + + dec_swa = True + cur = node + while cur != self.root_node: + if cur.lock_ref == 1: + self._evictable_size += len(cur.key) + self._protected_size -= len(cur.key) + cur.lock_ref -= 1 + + if self.supports_swa and dec_swa and not cur.swa_tombstone: + if cur.swa_lock_ref == 1: + self._swa_evictable_size += len(cur.key) + self._swa_protected_size -= len(cur.key) + cur.swa_lock_ref -= 1 + if swa_boundary_id and cur.swa_boundary_id == swa_boundary_id: + dec_swa = False + + cur = cur.parent + + def total_size(self) -> int: + """Total number of cached tokens (including tombstoned).""" + total = 0 + stack: List[TreeNode] = [self.root_node] + while stack: + n = stack.pop() + if n.value is not None: + total += len(n.value) + stack.extend(c for c in n.children.values() if not c.evicted) + return total + + def compute_node_hash(self, node: TreeNode) -> List[str]: + """Compute position-aware SHA-256 hashes for *node* (one per page). + + Lazily computed and cached on ``node.hash_values``. 
+ """ + if node.hash_values is not None: + return node.hash_values + + parent_hash: Optional[str] = None + if ( + node.parent is not None + and node.parent.hash_values is not None + and len(node.parent.key) > 0 + and len(node.parent.hash_values) > 0 + ): + parent_hash = node.parent.hash_values[-1] + + hashes: List[str] = [] + for start in range(0, len(node.key), self.page_size): + page = node.key.token_ids[start : start + self.page_size] + if not page: + continue + h = hash_token_ids(page, prior_hash=parent_hash) + hashes.append(h) + parent_hash = h + + node.hash_values = hashes + return hashes + + def pretty_print(self) -> None: + """Print the tree structure to stdout.""" + self._print_helper(self.root_node, 0) + print( + f"total={self.total_size()} evictable={self._evictable_size}" + + ( + f" swa_evictable={self._swa_evictable_size}" + if self.supports_swa + else "" + ) + ) + + def _match_normal(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode]: + node = self.root_node + now = time.monotonic() + node.last_access_time = now + values: List[torch.Tensor] = [] + + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + child = node.children[ck] + child.last_access_time = now + child.hit_count += 1 + plen = _key_match(child.key, key, self.page_size) + if plen < len(child.key): + new_node = self._split_node(child.key, child, plen) + values.append(new_node.value) + node = new_node + break + values.append(child.value) + node = child + key = key[plen:] + + return values, node + + def _match_swa(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode, int]: + """SWA-aware match. Returns *(values, last_node, best_value_count)*. + + ``best_value_count`` is the number of value tensors from *values* + that form a valid SWA-safe prefix (enough non-tombstone tokens within + the sliding window, or a tombstone-free path to root). 
+ """ + node = self.root_node + values: List[torch.Tensor] = [] + non_tomb_len: float = float("inf") + best_count = 0 + best_node = node + + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + child = node.children[ck] + + if child.swa_tombstone: + if non_tomb_len >= self.sliding_window_size: + best_count = len(values) + best_node = node + non_tomb_len = 0 + + plen = _key_match(child.key, key, self.page_size) + if plen < len(child.key): + new_node = self._split_node(child.key, child, plen) + values.append(new_node.value) + if not new_node.swa_tombstone: + non_tomb_len += len(new_node.value) + node = new_node + break + values.append(child.value) + if not child.swa_tombstone: + non_tomb_len += len(child.value) + node = child + key = key[plen:] + + if non_tomb_len >= self.sliding_window_size: + best_count = len(values) + best_node = node + + return values, best_node, best_count + + def _insert_normal(self, node: TreeNode, key: RadixKey, value: torch.Tensor) -> int: + now = time.monotonic() + node.last_access_time = now + if len(key) == 0: + return 0 + + total_prefix = 0 + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + node = node.children[ck] + node.last_access_time = now + plen = _key_match(node.key, key, self.page_size) + if plen < len(node.key): + self._split_node(node.key, node, plen) + total_prefix += plen + key = key[plen:] + value = value[plen:] + + if len(key) > 0: + self._add_leaf(node, key, value) + + return total_prefix + + def _insert_swa( + self, + node: TreeNode, + key: RadixKey, + value: torch.Tensor, + prev_prefix_len: int, + swa_evicted_seqlen: int, + ) -> int: + """Insert with SWA tombstone awareness. + + When an existing node is tombstoned and the incoming *value* carries + fresh SWA KV (i.e. beyond *swa_evicted_seqlen*), the node is + un-tombstoned and its value is replaced. 
+ """ + now = time.monotonic() + node.last_access_time = now + if len(key) == 0: + return 0 + + total_prefix = 0 + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + node = node.children[ck] + node.last_access_time = now + plen = _key_match(node.key, key, self.page_size) + + if plen < len(node.key): + self._split_node(node.key, node, plen) + + beyond_protected = prev_prefix_len < total_prefix + plen + if beyond_protected and node.swa_tombstone: + if swa_evicted_seqlen <= total_prefix: + self._free_indices(node.value[:plen]) + node.value = value[:plen].clone() + node.swa_tombstone = False + self._swa_evictable_size += len(node.value) + else: + self._free_indices(value[:plen]) + elif beyond_protected: + self._free_indices(value[:plen]) + + total_prefix += plen + key = key[plen:] + value = value[plen:] + + if len(key) > 0: + if ( + swa_evicted_seqlen > total_prefix + and swa_evicted_seqlen < total_prefix + len(key) + ): + tomb_len = swa_evicted_seqlen - total_prefix + self._add_leaf( + node, key[:tomb_len], value[:tomb_len], swa_tombstone=True + ) + node = node.children[_child_key(key, self.page_size)] + key = key[tomb_len:] + value = value[tomb_len:] + + if len(key) > 0: + self._add_leaf(node, key, value, swa_tombstone=False) + + return total_prefix + + def _add_leaf( + self, + parent: TreeNode, + key: RadixKey, + value: torch.Tensor, + swa_tombstone: bool = False, + ) -> TreeNode: + new_node = TreeNode() + new_node.parent = parent + new_node.key = key + new_node.value = value.clone() + new_node.swa_tombstone = swa_tombstone + parent.children[_child_key(key, self.page_size)] = new_node + self._evictable_size += len(key) + if self.supports_swa and not swa_tombstone: + self._swa_evictable_size += len(key) + return new_node + + def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode: + """Split *child* at *split_len*, returning the new parent node.""" + new_node = TreeNode() + new_node.children[_child_key(key[split_len:], self.page_size)] = child + new_node.parent = child.parent + new_node.lock_ref = child.lock_ref + new_node.swa_lock_ref = child.swa_lock_ref + new_node.swa_tombstone = child.swa_tombstone + new_node.swa_boundary_id = child.swa_boundary_id + child.swa_boundary_id = None + new_node.key = child.key[:split_len] + new_node.value = child.value[:split_len].clone() + + # Split hash values if they exist + if child.hash_values is not None: + pages = split_len // self.page_size if self.page_size > 1 else split_len + new_node.hash_values = child.hash_values[:pages] + child.hash_values = child.hash_values[pages:] + else: + new_node.hash_values = None + + child.parent = new_node + child.key = child.key[split_len:] + child.value = child.value[split_len:].clone() + new_node.parent.children[_child_key(key, self.page_size)] = new_node + return new_node + + def _delete_leaf(self, node: TreeNode) -> None: + ck = _child_key(node.key, self.page_size) + node.parent.children.pop(ck, None) + self._evictable_size -= len(node.key) + if self.supports_swa and not node.swa_tombstone: + self._swa_evictable_size -= len(node.key) + + def _tombstone_node(self, node: TreeNode) -> None: + node.swa_tombstone = True + self._swa_evictable_size -= len(node.key) + + def _collect_evictable_leaves(self) -> List[TreeNode]: + leaves: List[TreeNode] = [] + stack: List[TreeNode] = [self.root_node] + while stack: + n = stack.pop() + if n.evicted: + continue + has_live_child = False + for c in n.children.values(): + if not c.evicted: + has_live_child = 
True + stack.append(c) + if not has_live_child and n.lock_ref == 0 and n != self.root_node: + leaves.append(n) + return leaves + + def _collect_swa_evictable(self) -> List[TreeNode]: + nodes: List[TreeNode] = [] + stack: List[TreeNode] = [self.root_node] + while stack: + n = stack.pop() + if n.evicted: + continue + if n != self.root_node and not n.swa_tombstone and n.swa_lock_ref == 0: + nodes.append(n) + stack.extend(c for c in n.children.values() if not c.evicted) + return nodes + + def _page_align_key(self, key: RadixKey) -> RadixKey: + if self.page_size == 1: + return key + aligned = len(key) // self.page_size * self.page_size + return key[:aligned] + + def _free_indices(self, indices: torch.Tensor) -> None: + if self.pool is not None and len(indices) > 0: + self.pool.free(indices) + + def _free_swa_indices(self, indices: torch.Tensor) -> None: + if self.pool is not None and len(indices) > 0: + if hasattr(self.pool, "free_swa"): + self.pool.free_swa(indices) + else: + self.pool.free(indices) + + def _print_helper(self, node: TreeNode, indent: int) -> None: + stack = [(node, indent)] + while stack: + n, ind = stack.pop() + toks = n.key.token_ids[:10] if n.key else [] + klen = len(n.key) if n.key else 0 + flags = f"lock={n.lock_ref}" + if self.supports_swa: + flags += f" swa={n.swa_lock_ref} tomb={n.swa_tombstone}" + print(f"{' ' * ind}[{klen}] {toks} {flags}") + for c in n.children.values(): + stack.append((c, ind + 1)) diff --git a/pymllm/orchestrator/async_disk_io_process.py b/pymllm/orchestrator/async_disk_io_process.py index 598d93eb2..ef3fd5f00 100644 --- a/pymllm/orchestrator/async_disk_io_process.py +++ b/pymllm/orchestrator/async_disk_io_process.py @@ -1,3 +1,84 @@ +""" +AsyncDiskIoProcess -- optional subprocess for asynchronous disk I/O. + +Handles weight loading, checkpoint saving, or other heavy disk operations +without blocking the scheduler or model runner. +""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class AsyncDiskIoProcess: - def __init__(self): + """Runs inside a subprocess. Performs disk I/O on behalf of the scheduler.""" + + def __init__(self, recv_addr: str): + self._recv_addr = recv_addr + + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_sock: Optional[zmq.Socket] = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_sock = create_zmq_socket( + self._zmq_ctx, zmq.PULL, self._recv_addr, bind=True, + ) + + def event_loop(self) -> None: + """Infinite loop: recv I/O request -> execute -> (optionally reply).""" + logger.info("AsyncDiskIoProcess event loop started") + while True: + io_request: Dict[str, Any] = self._recv_sock.recv_pyobj() + self._handle(io_request) + + # ------------------------------------------------------------------ + # I/O handling (placeholder) + # ------------------------------------------------------------------ + + def _handle(self, io_request: Dict[str, Any]) -> None: + """Dispatch an I/O request. + + TODO: implement weight loading, checkpoint save, etc. 
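+
+        A possible dispatch shape (the ``kind`` values and fields below are
+        hypothetical)::
+
+            if kind == "save_checkpoint":
+                path = io_request["path"]   # hypothetical field
+                ...                         # perform the blocking disk write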
+ """ + kind = io_request.get("kind") + logger.debug("AsyncDiskIoProcess received request kind=%s", kind) + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + if self._recv_sock is not None: + self._recv_sock.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_async_disk_io_process( + recv_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = AsyncDiskIoProcess(recv_addr) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "async_disk_io"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py index 47c1c5950..e9d5184b6 100644 --- a/pymllm/orchestrator/detokenizer_process.py +++ b/pymllm/orchestrator/detokenizer_process.py @@ -1,3 +1,114 @@ +""" +DetokenizerProcess -- subprocess that converts token IDs back to text. + +Receives ``BatchTokenIDOut``-style dicts from the SchedulerProcess, +detokenizes them, and forwards the decoded strings to the +RequestResponseProcess. +""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, List, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class DetokenizerProcess: - def __init__(self): + """Runs inside a subprocess. Detokenizes finished outputs.""" + + def __init__( + self, + recv_from_scheduler_addr: str, + send_to_rr_addr: str, + ): + self._recv_from_scheduler_addr = recv_from_scheduler_addr + self._send_to_rr_addr = send_to_rr_addr + + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_scheduler: Optional[zmq.Socket] = None + self._send_to_rr: Optional[zmq.Socket] = None + + # TODO: initialise the tokenizer (needed for decode) + self._tokenizer = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_from_scheduler = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_scheduler_addr, + bind=False, + ) + self._send_to_rr = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_rr_addr, + bind=False, + ) + + def event_loop(self) -> None: + """Infinite loop: recv token IDs -> detokenize -> send text to RR.""" + logger.info("DetokenizerProcess event loop started") + while True: + token_id_out = self._recv_from_scheduler.recv_pyobj() + str_out = self._detokenize(token_id_out) + self._send_to_rr.send_pyobj(str_out) + + # ------------------------------------------------------------------ + # Detokenization (placeholder) + # ------------------------------------------------------------------ + + def _detokenize(self, token_id_out: Dict[str, Any]) -> Dict[str, Any]: + """Convert token IDs to text. + + TODO: replace with real tokenizer.decode() call and incremental + detokenization logic. 
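+
+        A minimal, non-incremental sketch (assuming a HuggingFace-style
+        tokenizer has been loaded into ``self._tokenizer``)::
+
+            text = self._tokenizer.decode(output_ids, skip_special_tokens=True)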
+ """ + output_ids: List[int] = token_id_out.get("output_token_ids", []) + # placeholder: join ids as string + text = "" # TODO: self._tokenizer.decode(output_ids) + return { + "rid": token_id_out.get("rid"), + "text": text, + "output_token_ids": output_ids, + } + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + if self._recv_from_scheduler is not None: + self._recv_from_scheduler.close() + if self._send_to_rr is not None: + self._send_to_rr.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_detokenizer_process( + recv_from_scheduler_addr: str, + send_to_rr_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = DetokenizerProcess(recv_from_scheduler_addr, send_to_rr_addr) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "detokenizer"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/ipc_utils.py b/pymllm/orchestrator/ipc_utils.py new file mode 100644 index 000000000..faaf7a6d9 --- /dev/null +++ b/pymllm/orchestrator/ipc_utils.py @@ -0,0 +1,70 @@ +"""ZMQ IPC utilities for inter-process communication. + +Provides helpers to generate unique IPC addresses and create pre-configured +ZMQ sockets so that every process uses the same conventions. +""" + +import os +import tempfile +from typing import Optional + +import zmq + + +_IPC_DIR = os.path.join(tempfile.gettempdir(), "pymllm_ipc") + + +def _ensure_ipc_dir() -> None: + os.makedirs(_IPC_DIR, exist_ok=True) + + +def make_ipc_address(name: str, unique_id: Optional[str] = None) -> str: + """Return an ``ipc://`` address for *name*, optionally scoped by *unique_id*. + + Parameters + ---------- + name + Logical channel name, e.g. ``"rr_to_tokenizer"``. + unique_id + Per-engine identifier (typically ``str(os.getpid())``) to avoid + collisions when multiple engines run on the same host. + """ + _ensure_ipc_dir() + suffix = f"_{unique_id}" if unique_id else "" + return f"ipc://{_IPC_DIR}/pymllm_{name}{suffix}" + + +def create_zmq_socket( + ctx: zmq.Context, + socket_type: int, + address: str, + bind: bool, +) -> zmq.Socket: + """Create a ZMQ socket, bind or connect it, and return it. + + Parameters + ---------- + ctx + A ``zmq.Context`` shared within the process. + socket_type + One of ``zmq.PUSH``, ``zmq.PULL``, ``zmq.PAIR``, etc. + address + The ``ipc://`` address string. + bind + If ``True`` the socket calls ``bind``; otherwise ``connect``. + """ + sock = ctx.socket(socket_type) + sock.setsockopt(zmq.LINGER, 0) + if bind: + sock.bind(address) + else: + sock.connect(address) + return sock + + +def close_zmq_socket(sock: zmq.Socket) -> None: + """Close a ZMQ socket, ignoring errors.""" + try: + sock.close() + except zmq.ZMQError: + pass diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index 45091b590..4b28645eb 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -1,3 +1,114 @@ +""" +ModelRunnerProcess -- subprocess that executes model forward passes. + +Receives batches from the SchedulerProcess, runs the model forward + sampling, +and returns the results (logits, next_token_ids) back to the scheduler. 
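+
+Current message shapes (placeholder fields, mirroring the scheduler side)::
+
+    batch  = {"batch_id": ..., "created_at": ..., "requests": [...]}
+    result = {"batch_id": ..., "finished": [...], "unfinished": [...]}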
+""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class ModelRunnerProcess: - def __init__(self): + """Runs inside a subprocess. Owns the model and performs forward passes.""" + + def __init__( + self, + recv_from_scheduler_addr: str, + send_to_scheduler_addr: str, + ): + self._recv_from_scheduler_addr = recv_from_scheduler_addr + self._send_to_scheduler_addr = send_to_scheduler_addr + + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_scheduler: Optional[zmq.Socket] = None + self._send_to_scheduler: Optional[zmq.Socket] = None + + # TODO: initialise model, attention backend, memory pool, etc. + self._model = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_from_scheduler = create_zmq_socket( + self._zmq_ctx, zmq.PULL, self._recv_from_scheduler_addr, bind=False, + ) + self._send_to_scheduler = create_zmq_socket( + self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=False, + ) + + def event_loop(self) -> None: + """Infinite loop: recv batch -> forward -> sample -> send result.""" + logger.info("ModelRunnerProcess event loop started") + while True: + batch = self._recv_from_scheduler.recv_pyobj() + result = self._forward_batch(batch) + self._send_to_scheduler.send_pyobj(result) + + # ------------------------------------------------------------------ + # Forward pass (placeholder) + # ------------------------------------------------------------------ + + def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: + """Run the model forward pass and sampling for *batch*. + + TODO: implement real forward pass, logits processing, and sampling. + """ + requests = batch.get("requests", []) + finished = [] + unfinished = [] + + for req in requests: + # TODO: actual model forward, logits -> next_token_ids + next_token_ids = [] # placeholder + req["output_token_ids"] = req.get("output_token_ids", []) + next_token_ids + # TODO: check EOS / max_tokens to decide finished vs. 
unfinished + finished.append(req) + + return { + "batch_id": batch.get("batch_id"), + "finished": finished, + "unfinished": unfinished, + } + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + if self._recv_from_scheduler is not None: + self._recv_from_scheduler.close() + if self._send_to_scheduler is not None: + self._send_to_scheduler.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_model_runner_process( + recv_from_scheduler_addr: str, + send_to_scheduler_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = ModelRunnerProcess(recv_from_scheduler_addr, send_to_scheduler_addr) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "model_runner"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py index 998c2655e..743354280 100644 --- a/pymllm/orchestrator/request_response_process.py +++ b/pymllm/orchestrator/request_response_process.py @@ -1,10 +1,150 @@ """ -This module contains the request and response threads for the orchestrator. +RequestResponseProcess -- the main-process entry point for user requests. -NOTE: This RR(request and response) threads can only be used as the main thread of the orchestrator. +This process is **not** a subprocess; it lives in the engine's main process. +Incoming requests are placed into an ``asyncio.Queue`` and forwarded to the +TokenizerProcess via ZMQ. Decoded results arrive back from the +DetokenizerProcess and are dispatched to the waiting callers. + +The request-tracking model uses ``ReqState`` pattern: each request +gets an ``asyncio.Event`` + output list so that streaming (multiple incremental +chunks) and one-shot responses are both supported. """ +import asyncio +import dataclasses +import logging +from typing import Any, Dict, List, Optional + +import zmq +import zmq.asyncio + +from pymllm.engine.io_struct import GenerateReqInput +from pymllm.orchestrator.ipc_utils import create_zmq_socket, close_zmq_socket + +logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class ReqState: + """Per-request state that supports both streaming and one-shot responses. + + ``ReqState`` (Event + out_list). + + The recv loop appends results to *out_list* and signals *event*; + callers ``await event.wait()`` in a loop, consuming results until + *finished* is ``True``. 
+ """ + + out_list: List[Dict[str, Any]] = dataclasses.field(default_factory=list) + finished: bool = False + event: asyncio.Event = dataclasses.field(default_factory=asyncio.Event) + class RequestResponseProcess: - def __init__(self): - pass + """Sits in the main process; bridges user-facing API and subprocess pipeline.""" + + def __init__( + self, + send_to_tokenizer_addr: str, + recv_from_detokenizer_addr: str, + ): + self._send_to_tokenizer_addr: str = send_to_tokenizer_addr + self._recv_from_detokenizer_addr: str = recv_from_detokenizer_addr + + # asyncio queue that buffers incoming user requests + self._request_queue: asyncio.Queue[Dict[str, Any]] = asyncio.Queue() + + # rid -> ReqState (replaces the old rid -> Future dict) + self._rid_to_state: Dict[str, ReqState] = {} + + # ZMQ (async context, sockets created lazily in the event loop) + self._zmq_ctx: Optional[zmq.asyncio.Context] = None + self._send_to_tokenizer: Optional[zmq.asyncio.Socket] = None + self._recv_from_detokenizer: Optional[zmq.asyncio.Socket] = None + + self._loop_task: Optional[asyncio.Task] = None + + def start(self, loop: asyncio.AbstractEventLoop) -> None: + """Kick off the background send/recv tasks on *loop*.""" + self._zmq_ctx = zmq.asyncio.Context() + self._send_to_tokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_tokenizer_addr, + bind=True, + ) + self._recv_from_detokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_detokenizer_addr, + bind=True, + ) + self._loop_task = loop.create_task(self._run()) + + async def add_request(self, request: GenerateReqInput) -> ReqState: + """Enqueue a request and return its :class:`ReqState`. + + Callers should ``await state.event.wait()`` in a loop, consuming + ``state.out_list`` entries until ``state.finished`` is ``True``. 
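+
+        One-shot consumption sketch (a streaming caller would instead drain
+        ``state.out_list`` on every wakeup)::
+
+            state = await rr.add_request(req)
+            while not state.finished:
+                await state.event.wait()
+                state.event.clear()
+            results = state.out_list
+            rr.remove_state(req.rid)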
+ """ + if not isinstance(request.rid, str): + raise ValueError("RequestResponseProcess currently accepts single requests only.") + rid = request.rid + state = ReqState() + self._rid_to_state[rid] = state + await self._request_queue.put(request.to_request_dict()) + return state + + def remove_state(self, rid: str) -> None: + """Remove the ``ReqState`` for *rid* (called by the caller once done).""" + self._rid_to_state.pop(rid, None) + + async def abort_request(self, rid: str) -> None: + """Cancel a pending request and notify downstream processes.""" + state = self._rid_to_state.pop(rid, None) + if state is not None and not state.finished: + state.finished = True + state.out_list.append({"rid": rid, "error": "aborted", "finished": True}) + state.event.set() + await self._send_to_tokenizer.send_pyobj({"rid": rid, "abort": True}) + + async def shutdown(self) -> None: + if self._loop_task is not None: + self._loop_task.cancel() + if self._send_to_tokenizer is not None: + close_zmq_socket(self._send_to_tokenizer) + if self._recv_from_detokenizer is not None: + close_zmq_socket(self._recv_from_detokenizer) + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + # ------------------------------------------------------------------ + # Internal loops + # ------------------------------------------------------------------ + + async def _run(self) -> None: + """Main loop: forward requests to tokenizer, receive results from detokenizer.""" + send_task = asyncio.create_task(self._send_loop()) + recv_task = asyncio.create_task(self._recv_loop()) + await asyncio.gather(send_task, recv_task) + + async def _send_loop(self) -> None: + """Drain the asyncio queue and push requests to the TokenizerProcess.""" + while True: + request = await self._request_queue.get() + await self._send_to_tokenizer.send_pyobj(request) + + async def _recv_loop(self) -> None: + """Receive decoded results from DetokenizerProcess and dispatch to ReqStates.""" + while True: + result = await self._recv_from_detokenizer.recv_pyobj() + rid = result.get("rid") + state = self._rid_to_state.get(rid) + if state is None: + logger.warning("Received result for unknown rid=%s", rid) + continue + state.out_list.append(result) + if result.get("finished", False): + state.finished = True + state.event.set() diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 7a7783d57..e7394dab3 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -1,3 +1,248 @@ +""" +SchedulerProcess -- the central scheduling hub. + +Receives tokenized requests from the TokenizerProcess, organises them into +batches, dispatches batches to the ModelRunnerProcess for forward passes, +collects results, and streams finished token IDs to the DetokenizerProcess. + +The main ``event_loop`` scheduler flow:: + + while True: + recv_requests() + process_input_requests() + batch = get_next_batch_to_run() + if batch: + run_batch(batch) + process_batch_result(batch) + stream_output() +""" + +import logging +import time +from collections import deque +from multiprocessing.connection import Connection +from typing import Any, Deque, Dict, List, Optional + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class SchedulerProcess: - def __init__(self): + """Runs inside a subprocess. 
Central hub that drives the inference loop.""" + + def __init__( + self, + recv_from_tokenizer_addr: str, + send_to_model_runner_addr: str, + recv_from_model_runner_addr: str, + send_to_detokenizer_addr: str, + ): + # ZMQ addresses + self._recv_from_tokenizer_addr = recv_from_tokenizer_addr + self._send_to_model_runner_addr = send_to_model_runner_addr + self._recv_from_model_runner_addr = recv_from_model_runner_addr + self._send_to_detokenizer_addr = send_to_detokenizer_addr + + # ZMQ runtime objects (initialised in init_sockets) + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_tokenizer: Optional[zmq.Socket] = None + self._send_to_model_runner: Optional[zmq.Socket] = None + self._recv_from_model_runner: Optional[zmq.Socket] = None + self._send_to_detokenizer: Optional[zmq.Socket] = None + self._poller: Optional[zmq.Poller] = None + + # Request management + self._waiting_queue: Deque[Dict[str, Any]] = deque() + self._running_batch: Optional[Dict[str, Any]] = None + self._finished: List[Dict[str, Any]] = [] + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + + self._recv_from_tokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_tokenizer_addr, + bind=False, + ) + self._send_to_model_runner = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_model_runner_addr, + bind=True, + ) + self._recv_from_model_runner = create_zmq_socket( + self._zmq_ctx, + zmq.PULL, + self._recv_from_model_runner_addr, + bind=True, + ) + self._send_to_detokenizer = create_zmq_socket( + self._zmq_ctx, + zmq.PUSH, + self._send_to_detokenizer_addr, + bind=True, + ) + + # Poller for non-blocking recv from tokenizer + self._poller = zmq.Poller() + self._poller.register(self._recv_from_tokenizer, zmq.POLLIN) + + def event_loop(self) -> None: + """Infinite scheduling loop.""" + logger.info("SchedulerProcess event loop started") + while True: + self.recv_requests() + self.process_input_requests() + batch = self.get_next_batch_to_run() + if batch is not None: + result = self.run_batch(batch) + self.process_batch_result(batch, result) + self.stream_output() + + # ------------------------------------------------------------------ + # Step 1: receive tokenized requests (non-blocking) + # ------------------------------------------------------------------ + + def recv_requests(self) -> None: + """Non-blocking receive of tokenized requests from TokenizerProcess. + + Uses ``zmq.Poller`` with a short timeout so the scheduler is never + stuck waiting when there are batches to run. + """ + while True: + events = dict(self._poller.poll(timeout=0)) # non-blocking + if self._recv_from_tokenizer not in events: + break + req = self._recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK) + self._waiting_queue.append(req) + + # ------------------------------------------------------------------ + # Step 2: process input requests + # ------------------------------------------------------------------ + + def process_input_requests(self) -> None: + """Pre-process and validate requests sitting in ``_waiting_queue``. + + TODO: attach sampling params, allocate KV-cache slots, etc. 
+ """ pass + + # ------------------------------------------------------------------ + # Step 3: build the next batch + # ------------------------------------------------------------------ + + def get_next_batch_to_run(self) -> Optional[Dict[str, Any]]: + """Select requests from ``_waiting_queue`` and form a batch. + + TODO: implement real batching / scheduling policy. + """ + if not self._waiting_queue: + return None + + batch_requests: List[Dict[str, Any]] = [] + # TODO: respect max_running_requests, memory budget, etc. + while self._waiting_queue: + batch_requests.append(self._waiting_queue.popleft()) + + batch = { + "requests": batch_requests, + "batch_id": id(batch_requests), + "created_at": time.time(), + } + return batch + + # ------------------------------------------------------------------ + # Step 4: run the batch via ModelRunnerProcess + # ------------------------------------------------------------------ + + def run_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: + """Send *batch* to ModelRunnerProcess and wait for the result. + + This is a **blocking** call: the scheduler is synchronous with the + model runner for simplicity. Overlap scheduling can be added later. + """ + self._send_to_model_runner.send_pyobj(batch) + result = self._recv_from_model_runner.recv_pyobj() + return result + + # ------------------------------------------------------------------ + # Step 5: process batch result + # ------------------------------------------------------------------ + + def process_batch_result( + self, batch: Dict[str, Any], result: Dict[str, Any] + ) -> None: + """Handle the result returned by the ModelRunnerProcess. + + TODO: check completion status (EOS, max_tokens), manage KV-cache, + split finished vs. unfinished requests. + """ + finished_requests = result.get("finished", []) + unfinished_requests = result.get("unfinished", []) + + self._finished.extend(finished_requests) + + # Put unfinished requests back for the next iteration + for req in unfinished_requests: + self._waiting_queue.appendleft(req) + + # ------------------------------------------------------------------ + # Step 6: stream output to DetokenizerProcess + # ------------------------------------------------------------------ + + def stream_output(self) -> None: + """Send finished token-ID outputs to the DetokenizerProcess.""" + while self._finished: + item = self._finished.pop(0) + self._send_to_detokenizer.send_pyobj(item) + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + for sock in ( + self._recv_from_tokenizer, + self._send_to_model_runner, + self._recv_from_model_runner, + self._send_to_detokenizer, + ): + if sock is not None: + sock.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_scheduler_process( + recv_from_tokenizer_addr: str, + send_to_model_runner_addr: str, + recv_from_model_runner_addr: str, + send_to_detokenizer_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = SchedulerProcess( + recv_from_tokenizer_addr, + send_to_model_runner_addr, + recv_from_model_runner_addr, + send_to_detokenizer_addr, + ) + proc.init_sockets() + + pipe_writer.send({"status": "ready", "process": "scheduler"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: + pass + finally: + proc.shutdown() diff --git a/pymllm/orchestrator/tokenizer_process.py 
b/pymllm/orchestrator/tokenizer_process.py index 0dca2155e..852fac115 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -1,3 +1,102 @@ +""" +TokenizerProcess -- subprocess that tokenizes incoming raw requests. + +Receives raw requests from RequestResponseProcess via ZMQ, tokenizes them, +and forwards the tokenized payloads to the SchedulerProcess. +""" + +import logging +from multiprocessing.connection import Connection +from typing import Any, Dict, List + +import zmq + +from pymllm.orchestrator.ipc_utils import create_zmq_socket + +logger = logging.getLogger(__name__) + + class TokenizerProcess: - def __init__(self): + """Runs inside a subprocess spawned by ``torch.multiprocessing``.""" + + def __init__( + self, + recv_from_rr_addr: str, + send_to_scheduler_addr: str, + ): + self._recv_from_rr_addr = recv_from_rr_addr + self._send_to_scheduler_addr = send_to_scheduler_addr + + self._zmq_ctx: zmq.Context = None + self._recv_from_rr: zmq.Socket = None + self._send_to_scheduler: zmq.Socket = None + + # TODO: initialise the actual tokenizer (HuggingFace / custom) + self._tokenizer = None + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def init_sockets(self) -> None: + self._zmq_ctx = zmq.Context() + self._recv_from_rr = create_zmq_socket( + self._zmq_ctx, zmq.PULL, self._recv_from_rr_addr, bind=False, + ) + self._send_to_scheduler = create_zmq_socket( + self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=True, + ) + + def event_loop(self) -> None: + """Infinite loop: recv raw request -> tokenize -> send to scheduler.""" + logger.info("TokenizerProcess event loop started") + while True: + raw_request: Dict[str, Any] = self._recv_from_rr.recv_pyobj() + tokenized = self._tokenize(raw_request) + self._send_to_scheduler.send_pyobj(tokenized) + + # ------------------------------------------------------------------ + # Tokenization (placeholder) + # ------------------------------------------------------------------ + + def _tokenize(self, raw_request: Dict[str, Any]) -> Dict[str, Any]: + """Tokenize a single raw request and return the tokenized payload. + + TODO: replace with real tokenizer call. 
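+
+        A minimal sketch (assuming a HuggingFace-style tokenizer has been
+        loaded into ``self._tokenizer``)::
+
+            input_ids = self._tokenizer(text)["input_ids"]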
+ """ + text = raw_request.get("text", "") + # placeholder: produce fake token ids + input_ids: List[int] = [] # TODO: self._tokenizer.encode(text) + return { + **raw_request, + "input_ids": input_ids, + } + + def shutdown(self) -> None: + if self._recv_from_rr is not None: + self._recv_from_rr.close() + if self._send_to_scheduler is not None: + self._send_to_scheduler.close() + if self._zmq_ctx is not None: + self._zmq_ctx.term() + + +def run_tokenizer_process( + recv_from_rr_addr: str, + send_to_scheduler_addr: str, + pipe_writer: Connection, +) -> None: + """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + proc = TokenizerProcess(recv_from_rr_addr, send_to_scheduler_addr) + proc.init_sockets() + + # Signal readiness to the parent process + pipe_writer.send({"status": "ready", "process": "tokenizer"}) + pipe_writer.close() + + try: + proc.event_loop() + except KeyboardInterrupt: pass + finally: + proc.shutdown() From 7f78efa78fb1c2087ff27ef03f894be8b083f876 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Sat, 21 Feb 2026 15:05:58 +0000 Subject: [PATCH 29/42] refactor: improve socket initialization in TokenizerProcess - Enhanced readability by formatting socket creation parameters across multiple lines in the `init_sockets` method of `TokenizerProcess`. - Maintained functionality while improving code clarity for future maintenance. --- pymllm/orchestrator/tokenizer_process.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 852fac115..53714bb60 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -41,10 +41,16 @@ def __init__( def init_sockets(self) -> None: self._zmq_ctx = zmq.Context() self._recv_from_rr = create_zmq_socket( - self._zmq_ctx, zmq.PULL, self._recv_from_rr_addr, bind=False, + self._zmq_ctx, + zmq.PULL, + self._recv_from_rr_addr, + bind=False, ) self._send_to_scheduler = create_zmq_socket( - self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=True, + self._zmq_ctx, + zmq.PUSH, + self._send_to_scheduler_addr, + bind=True, ) def event_loop(self) -> None: From 7f5d7d9a84d04925a15af9fe8e2a273040cb6d0f Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Fri, 27 Feb 2026 11:23:17 +0000 Subject: [PATCH 30/42] feat(engine): support batch generation and enable shared memory queue for IPC - Add enable_shared_queue config option to server configuration - Implement shared memory queue for fast IPC between tokenizer and scheduler - Refactor Engine.generate and generate_async to support single and batch requests - Add colorful ASCII art banner on engine startup if dependencies are available - Add _make_rids utility to auto-generate request IDs for batch and single requests - Implement TokenizedGenerateReqInput with multimodal inputs support - Refactor RequestResponseProcess to handle batch requests and return list of ReqStates - Enhance SchedulerProcess to receive requests from shared queue or legacy ZMQ - Introduce SharedMemoryManager for managing metadata in shared memory segments - Create TensorQueue to support fast IPC of tensors via shared memory and queues - Add CUDA IPC Transport module for zero-copy GPU tensor sharing with workspace buffer - Refactor ModelRunnerProcess to handle batch requests with actual output structure placeholders - Improve resource management and error handling in shared memory IPC utilities --- pymllm/configs/server_config.py 
| 1 + .../normal.py => engine/__init__.py} | 0 pymllm/engine/forward_batch.py | 0 pymllm/engine/io_struct.py | 9 + pymllm/engine/launch.py | 181 +++++++-- pymllm/layers/attention/attention_backend.py | 0 pymllm/layers/attention/flashinfer_backend.py | 0 pymllm/layers/attention/radix_attention.py | 0 pymllm/orchestrator/cuda_ipc_transport.py | 373 ++++++++++++++++++ pymllm/orchestrator/model_runner_process.py | 49 ++- .../orchestrator/request_response_process.py | 40 +- pymllm/orchestrator/scheduler_process.py | 97 ++++- pymllm/orchestrator/shared_memory_queue.py | 190 +++++++++ pymllm/orchestrator/tokenizer_process.py | 311 ++++++++++++++- pyproject.toml | 2 + 15 files changed, 1184 insertions(+), 69 deletions(-) rename pymllm/{layers/attention/normal.py => engine/__init__.py} (100%) create mode 100644 pymllm/engine/forward_batch.py create mode 100644 pymllm/layers/attention/attention_backend.py create mode 100644 pymllm/layers/attention/flashinfer_backend.py create mode 100644 pymllm/layers/attention/radix_attention.py create mode 100644 pymllm/orchestrator/cuda_ipc_transport.py create mode 100644 pymllm/orchestrator/shared_memory_queue.py diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 7cda9c3b8..9e399d62d 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -78,6 +78,7 @@ class ServerConfig: # --------------------------------------------------------------------- # # Feature switches # --------------------------------------------------------------------- # + enable_shared_queue: bool = False # Use shared memory queue for fast IPC # enable_lora: bool = False # max_loaded_loras: Optional[int] = None # max_loras_per_batch: int = 8 diff --git a/pymllm/layers/attention/normal.py b/pymllm/engine/__init__.py similarity index 100% rename from pymllm/layers/attention/normal.py rename to pymllm/engine/__init__.py diff --git a/pymllm/engine/forward_batch.py b/pymllm/engine/forward_batch.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/engine/io_struct.py b/pymllm/engine/io_struct.py index 777186e28..06c8d78d6 100644 --- a/pymllm/engine/io_struct.py +++ b/pymllm/engine/io_struct.py @@ -135,8 +135,17 @@ def to_request_dict(self) -> Dict[str, Any]: @dataclass class TokenizedGenerateReqInput(BaseReq): + # The decoded text passed to the tokenizer (empty string if only input_ids + # were provided by the caller). input_text: str = "" + # Token IDs produced by the tokenizer. input_ids: List[int] = field(default_factory=list) + # Multimodal inputs (processor output, e.g. pixel_values, or raw image / + # audio / video data when no processor is available). ``None`` means the + # request is text-only. + mm_inputs: Optional[Dict[str, Any]] = None + # Raw sampling parameters dict (parsed into a SamplingParams object by the + # model runner when needed). 
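+    # Illustrative example only (key names are assumptions, not a fixed schema):
+    #   {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 64}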
sampling_params: Dict[str, Any] = field(default_factory=dict) stream: bool = False return_logprob: bool = False diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index edad97af5..2200d7f33 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -11,6 +11,14 @@ from transformers import AutoConfig from huggingface_hub import snapshot_download +try: + from pyfiglet import figlet_format + from termcolor import colored + + HAS_BANNER_LIBS = True +except ImportError: + HAS_BANNER_LIBS = False + from pymllm.configs import get_global_config from pymllm.engine.io_struct import GenerateReqInput from pymllm.orchestrator.ipc_utils import make_ipc_address @@ -18,6 +26,7 @@ ReqState, RequestResponseProcess, ) +from pymllm.orchestrator.shared_memory_queue import TensorQueue from pymllm.orchestrator.tokenizer_process import run_tokenizer_process from pymllm.orchestrator.scheduler_process import run_scheduler_process from pymllm.orchestrator.model_runner_process import run_model_runner_process @@ -68,6 +77,26 @@ def _launch_processes(self) -> None: # Record all subprocesses procs_and_readers: List[tuple] = [] + # Config dict for the tokenizer subprocess (must be picklable). + cfg = get_global_config() + enable_shared_queue = cfg.server.enable_shared_queue + + # Create shared queue if enabled + shared_queue = None + if enable_shared_queue: + # TODO: WCH init CUDA IPC things. + shared_queue = TensorQueue(maxsize=1000) # Configurable max size + logger.info("Shared memory queue enabled for fast IPC") + + tokenizer_cfg: Dict[str, Any] = { + "tokenizer_path": str(cfg.server.tokenizer_path), + "tokenizer_mode": cfg.server.tokenizer_mode, + "trust_remote_code": cfg.server.trust_remote_code, + "context_length": cfg.server.context_length, + "hf_config": cfg.model.hf_config, + "enable_shared_queue": enable_shared_queue, + } + # Tokenizer tokenizer_reader, tokenizer_writer = mp.Pipe(duplex=False) tokenizer_proc = mp.Process( @@ -76,6 +105,8 @@ def _launch_processes(self) -> None: addr_request_response_to_tokenizer, addr_tokenizer_to_scheduler, tokenizer_writer, + tokenizer_cfg, + shared_queue, # Pass shared queue ), daemon=True, ) @@ -91,6 +122,8 @@ def _launch_processes(self) -> None: addr_model_runner_to_scheduler, addr_scheduler_to_detokenizer, scheduler_writer, + shared_queue, # Pass shared queue + enable_shared_queue, # Pass flag ), daemon=True, ) @@ -165,6 +198,29 @@ def _launch_processes(self) -> None: self._rr_process.start(self._loop) logger.info("RequestResponseProcess started in main process") + # Print colorful gradient ASCII art banner + if HAS_BANNER_LIBS: + try: + text = figlet_format("pymllm", font="slant") + fired_up = figlet_format("FIRED UP!", font="slant") + + # Apply blue-purple gradient + lines = text.strip().split("\n") + colors_cycle = ["blue", "cyan", "blue", "magenta", "magenta"] + for i, line in enumerate(lines): + color = colors_cycle[i % len(colors_cycle)] + print(colored(line, color, attrs=["bold"])) + + # Print "FIRED UP!" in bright magenta + for line in fired_up.strip().split("\n"): + print(colored(line, "magenta", attrs=["bold"])) + print() + except Exception as e: + logger.debug(f"Failed to print banner: {e}") + print("🚀 pymllm FIRED UP! 🚀\n") + else: + print("🚀 pymllm FIRED UP! 
🚀\n") + def generate( self, prompt: Optional[Union[List[str], str]] = None, @@ -181,10 +237,14 @@ def generate( stream: bool = False, rid: Optional[Union[List[str], str]] = None, **kwargs, - ) -> Dict[str, Any]: - """Synchronous, non-streaming generation entry point.""" - if rid is None: - rid = uuid.uuid4().hex + ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + """Synchronous, non-streaming generation entry point. + + Accepts a single prompt (``str``) or a batch (``List[str]``). Returns a + single result dict for single inputs and a list of result dicts for batch + inputs, preserving the input order. + """ + rid = self._make_rids(rid, prompt, input_ids) request = GenerateReqInput( rid=rid, text=prompt, @@ -203,11 +263,18 @@ def generate( ) request.normalize_batch_and_arguments() - async def _run() -> Dict[str, Any]: - state = await self._rr_process.add_request(request) - if isinstance(rid, list): - raise ValueError("Synchronous `generate` currently supports single request.") - return await self._wait_for_final_result(rid, state) + async def _run() -> Union[Dict[str, Any], List[Dict[str, Any]]]: + result = await self._rr_process.add_request(request) + if request.is_single: + single_rid = rid if isinstance(rid, str) else rid[0] + return await self._wait_for_final_result(single_rid, result) # type: ignore[arg-type] + # Batch: wait for every sub-request concurrently. + rids_list: List[str] = rid if isinstance(rid, list) else [rid] # type: ignore[assignment] + states: List[ReqState] = result # type: ignore[assignment] + outputs = await asyncio.gather( + *(self._wait_for_final_result(r, s) for r, s in zip(rids_list, states)) + ) + return list(outputs) return self._loop.run_until_complete(_run()) @@ -230,13 +297,14 @@ async def generate_async( ) -> AsyncIterator[Dict[str, Any]]: """Asynchronous generation entry point. - When *stream* is ``False`` (default) the returned async iterator - yields a **single** final result dict. When *stream* is ``True`` - every incremental chunk from the detokenizer is yielded as it - arrives, following the ``Event + out_list`` pattern. + For a **single** request and ``stream=False`` yields one final result + dict; with ``stream=True`` yields incremental chunks. + + For a **batch** request the iterator yields the final result for each + sub-request as it completes (order not guaranteed); streaming mode yields + incremental chunks from all sub-requests interleaved. 
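+
+        Illustrative sketch (assumes ``engine`` is an already-constructed
+        engine instance; the prompts are placeholders)::
+
+            async for out in engine.generate_async(prompt=["hi", "hello"]):
+                print(out)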
""" - if rid is None: - rid = uuid.uuid4().hex + rid = self._make_rids(rid, prompt, input_ids) request = GenerateReqInput( rid=rid, text=prompt, @@ -254,18 +322,55 @@ async def generate_async( extra_options=kwargs, ) request.normalize_batch_and_arguments() - state = await self._rr_process.add_request(request) + result = await self._rr_process.add_request(request) - try: - if isinstance(rid, list): - raise ValueError("`generate_async` currently supports single request only.") - if stream: - async for chunk in self._stream_results(rid, state): - yield chunk - else: - yield await self._wait_for_final_result(rid, state) - finally: - self._rr_process.remove_state(rid) + if request.is_single: + single_rid = rid if isinstance(rid, str) else rid[0] # type: ignore[index] + state: ReqState = result # type: ignore[assignment] + try: + if stream: + async for chunk in self._stream_results(single_rid, state): + yield chunk + else: + yield await self._wait_for_final_result(single_rid, state) + finally: + self._rr_process.remove_state(single_rid) + else: + rids_list: List[str] = rid if isinstance(rid, list) else [rid] # type: ignore[assignment] + states: List[ReqState] = result # type: ignore[assignment] + try: + if stream: + # Merge streams from all sub-requests using an asyncio queue. + queue: asyncio.Queue = asyncio.Queue() + + async def _forward(r: str, s: ReqState) -> None: + async for chunk in self._stream_results(r, s): + await queue.put(chunk) + await queue.put(None) # sentinel + + tasks = [ + asyncio.create_task(_forward(r, s)) + for r, s in zip(rids_list, states) + ] + done_count = 0 + while done_count < len(tasks): + item = await queue.get() + if item is None: + done_count += 1 + else: + yield item + await asyncio.gather(*tasks) + else: + for coro in asyncio.as_completed( + [ + self._wait_for_final_result(r, s) + for r, s in zip(rids_list, states) + ] + ): + yield await coro + finally: + for r in rids_list: + self._rr_process.remove_state(r) @staticmethod async def _wait_for_final_result(rid: str, state: ReqState) -> Dict[str, Any]: @@ -290,6 +395,30 @@ async def _stream_results( return state.event.clear() + @staticmethod + def _make_rids( + rid: Optional[Union[str, List[str]]], + prompt: Optional[Union[str, List[str]]], + input_ids: Optional[Union[List[int], List[List[int]]]], + ) -> Union[str, List[str]]: + """Return rids, auto-generating UUIDs when *rid* is ``None``. + + The helper infers whether the call is a batch from *prompt* / *input_ids* + so callers don't have to handle this case themselves. + """ + if rid is not None: + return rid + # Determine batch size from the text/input_ids argument. 
+ is_batch = isinstance(prompt, list) or ( + isinstance(input_ids, list) + and len(input_ids) > 0 + and isinstance(input_ids[0], list) + ) + if is_batch: + n = len(prompt) if prompt is not None else len(input_ids) # type: ignore[arg-type] + return [uuid.uuid4().hex for _ in range(n)] + return uuid.uuid4().hex + def shutdown(self) -> None: """Terminate all subprocesses.""" if self._rr_process is not None: diff --git a/pymllm/layers/attention/attention_backend.py b/pymllm/layers/attention/attention_backend.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/attention/flashinfer_backend.py b/pymllm/layers/attention/flashinfer_backend.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/layers/attention/radix_attention.py b/pymllm/layers/attention/radix_attention.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/orchestrator/cuda_ipc_transport.py b/pymllm/orchestrator/cuda_ipc_transport.py new file mode 100644 index 000000000..7052f0e8f --- /dev/null +++ b/pymllm/orchestrator/cuda_ipc_transport.py @@ -0,0 +1,373 @@ +""" +CUDA IPC Transport for zero-copy tensor sharing between processes. + +This module implements CUDA IPC with workspace buffer management +to avoid PyTorch's memory leak issue when sharing IPC handles. + +1. Create a workspace buffer on GPU (pre-allocated memory pool) +2. Copy tensor data to a chunk in the workspace +3. Get CUDA IPC handle for the chunk +4. Send handle + metadata (shape, dtype, offset) to another process +5. Reconstruct tensor in target process from IPC handle +6. Copy to local tensor and mark chunk as reusable + +Key Problem Solved: + PyTorch never releases tensors whose IPC handles are shared until process ends. + Solution: Use a fixed-size workspace buffer and recycle chunks. +""" + +import logging +import struct +import uuid +from dataclasses import dataclass +from multiprocessing import Queue +from multiprocessing.shared_memory import SharedMemory +from typing import Any, Dict, List, Optional, Tuple + +import torch +import torch.cuda as cuda + +logger = logging.getLogger(__name__) + + +@dataclass +class MemoryChunk: + """Represents a chunk in the workspace buffer.""" + + offset: int # Offset in bytes from workspace start + size: int # Size in bytes + in_use: bool # Whether the chunk is currently occupied + sync_shm_name: Optional[str] = None # Shared memory name for sync flag + + +class WorkspaceBuffer: + """GPU memory pool for storing multimodal tensors temporarily. + + This prevents the PyTorch IPC handle memory leak by using a fixed-size + pre-allocated buffer and recycling chunks. + """ + + def __init__(self, size_gb: float = 4.0, device: int = 0): + """Initialize workspace buffer. 
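+
+        The pool is a single contiguous float32 tensor; chunks are carved out
+        of it by byte offset, and a per-chunk shared-memory sync flag signals
+        when a chunk may be reused.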
+ + Args: + size_gb: Total size of workspace in GB + device: CUDA device ID + """ + self.device = device + self.total_size = int(size_gb * 1024 * 1024 * 1024) # Convert GB to bytes + + # Allocate workspace on GPU + with torch.cuda.device(device): + self.workspace = torch.empty( + self.total_size // 4, # Divide by 4 because we use float32 + dtype=torch.float32, + device=f"cuda:{device}", + ) + + # Initialize chunk management + self.chunks: List[MemoryChunk] = [ + MemoryChunk(offset=0, size=self.total_size, in_use=False) + ] + + # Container for reusable sync buffers + self.sync_buffer_pool: List[str] = [] + + logger.info( + f"WorkspaceBuffer initialized: {size_gb}GB on cuda:{device}, " + f"ptr={self.workspace.data_ptr():#x}" + ) + + def allocate(self, size_bytes: int) -> Optional[Tuple[int, str]]: + """Allocate a chunk from the workspace. + + Args: + size_bytes: Required size in bytes + + Returns: + Tuple of (offset, sync_shm_name) if successful, None if no space + """ + # Find a free chunk that's large enough + for i, chunk in enumerate(self.chunks): + if not chunk.in_use and chunk.size >= size_bytes: + # Mark chunk as in use + chunk.in_use = True + + # Get or create sync buffer + if self.sync_buffer_pool: + sync_shm_name = self.sync_buffer_pool.pop() + # Reset sync flag to 0 (not ready) + self._reset_sync_buffer(sync_shm_name) + else: + sync_shm_name = self._create_sync_buffer() + + chunk.sync_shm_name = sync_shm_name + + # If chunk is larger than needed, split it + if chunk.size > size_bytes: + # Create a new free chunk for the remaining space + new_chunk = MemoryChunk( + offset=chunk.offset + size_bytes, + size=chunk.size - size_bytes, + in_use=False, + ) + chunk.size = size_bytes + self.chunks.insert(i + 1, new_chunk) + + logger.debug( + f"Allocated chunk: offset={chunk.offset}, size={size_bytes}, " + f"sync_shm={sync_shm_name}" + ) + return chunk.offset, sync_shm_name + + logger.warning(f"WorkspaceBuffer: No space for {size_bytes} bytes") + return None + + def release(self, offset: int) -> None: + """Release a chunk back to the pool. 
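+
+        The chunk's sync buffer is returned to the reuse pool and adjacent
+        free chunks are merged to limit fragmentation.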
+ + Args: + offset: Offset of the chunk to release + """ + for i, chunk in enumerate(self.chunks): + if chunk.offset == offset and chunk.in_use: + chunk.in_use = False + + # Return sync buffer to pool + if chunk.sync_shm_name: + self.sync_buffer_pool.append(chunk.sync_shm_name) + chunk.sync_shm_name = None + + # Try to merge with adjacent free chunks + self._merge_chunks() + + logger.debug(f"Released chunk: offset={offset}") + return + + logger.warning(f"Attempted to release unknown chunk at offset {offset}") + + def _merge_chunks(self) -> None: + """Merge adjacent free chunks to reduce fragmentation.""" + i = 0 + while i < len(self.chunks) - 1: + current = self.chunks[i] + next_chunk = self.chunks[i + 1] + + if not current.in_use and not next_chunk.in_use: + # Merge chunks + current.size += next_chunk.size + + # Keep first chunk's sync buffer, return second to pool + if next_chunk.sync_shm_name: + self.sync_buffer_pool.append(next_chunk.sync_shm_name) + + self.chunks.pop(i + 1) + else: + i += 1 + + def _create_sync_buffer(self) -> str: + """Create a new shared memory sync buffer (8 bytes, initialized to 0).""" + shm_name = f"pymllm_sync_{uuid.uuid4().hex[:12]}" + shm = SharedMemory(name=shm_name, create=True, size=8) + # Initialize to 0 (not ready) + shm.buf[:8] = struct.pack("Q", 0) + shm.close() + logger.debug(f"Created sync buffer: {shm_name}") + return shm_name + + def _reset_sync_buffer(self, shm_name: str) -> None: + """Reset sync buffer to 0 (not ready).""" + try: + shm = SharedMemory(name=shm_name, create=False) + shm.buf[:8] = struct.pack("Q", 0) + shm.close() + except Exception as e: + logger.warning(f"Failed to reset sync buffer {shm_name}: {e}") + + def copy_tensor_to_workspace(self, tensor: torch.Tensor, offset: int) -> None: + """Copy tensor data to workspace at given offset. + + Args: + tensor: Source tensor (must be on same CUDA device) + offset: Byte offset in workspace + """ + if not tensor.is_cuda or tensor.device.index != self.device: + raise ValueError(f"Tensor must be on cuda:{self.device}") + + size_bytes = tensor.numel() * tensor.element_size() + + # Get view of workspace at offset + offset_elements = offset // 4 # Workspace is float32 + num_elements = (size_bytes + 3) // 4 # Round up + + workspace_view = self.workspace[ + offset_elements : offset_elements + num_elements + ] + + # Copy tensor data (flatten and cast to float32 view) + tensor_flat = tensor.flatten().view(torch.uint8) + workspace_flat = workspace_view.view(torch.uint8)[: tensor_flat.numel()] + workspace_flat.copy_(tensor_flat) + + logger.debug(f"Copied tensor {tensor.shape} to workspace offset {offset}") + + def get_ipc_handle(self) -> bytes: + """Get CUDA IPC handle for the workspace buffer. 
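+
+        The handle covers the whole workspace buffer; consumers combine it
+        with the per-chunk byte offset carried in ``TensorMetadata``.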
+ + Returns: + CUDA IPC handle as bytes + """ + # Get IPC handle using torch.cuda API + # Note: This requires CUDA-capable device with IPC support + handle = cuda.cudart().cudaIpcGetMemHandle(self.workspace.data_ptr()) + return bytes(handle) + + def cleanup(self) -> None: + """Cleanup all sync buffers.""" + all_shm_names = set() + for chunk in self.chunks: + if chunk.sync_shm_name: + all_shm_names.add(chunk.sync_shm_name) + all_shm_names.update(self.sync_buffer_pool) + + for shm_name in all_shm_names: + try: + shm = SharedMemory(name=shm_name, create=False) + shm.close() + shm.unlink() + except FileNotFoundError: + pass + except Exception as e: + logger.warning(f"Failed to cleanup sync buffer {shm_name}: {e}") + + logger.info("WorkspaceBuffer cleaned up") + + +@dataclass +class TensorMetadata: + """Metadata for reconstructing a tensor from CUDA IPC handle.""" + + shape: Tuple[int, ...] + dtype: torch.dtype + offset: int # Byte offset in workspace + size_bytes: int + sync_shm_name: str # Shared memory name for sync flag + + +class CudaIPCTransport: + """Transport for sharing CUDA tensors via IPC handles.""" + + def __init__( + self, + workspace_size_gb: float = 4.0, + device: int = 0, + ): + """Initialize CUDA IPC transport. + + Args: + workspace_size_gb: Size of workspace buffer in GB + device: CUDA device ID + """ + self.device = device + self.workspace = WorkspaceBuffer(workspace_size_gb, device) + self.ipc_handle = self.workspace.get_ipc_handle() + self.queue: Queue = Queue() + + def send_tensor(self, rid: str, tensor: torch.Tensor) -> bool: + """Send a tensor via CUDA IPC. + + Args: + rid: Request ID + tensor: Tensor to send (must be on CUDA) + + Returns: + True if sent via CUDA IPC, False if fallback needed + """ + if not tensor.is_cuda: + logger.debug(f"Tensor for {rid} not on CUDA, skipping IPC") + return False + + size_bytes = tensor.numel() * tensor.element_size() + + # Try to allocate from workspace + result = self.workspace.allocate(size_bytes) + if result is None: + logger.warning( + f"WorkspaceBuffer full, falling back to shared queue for {rid}" + ) + return False + + offset, sync_shm_name = result + + # Copy tensor to workspace + self.workspace.copy_tensor_to_workspace(tensor, offset) + + # Create metadata + metadata = TensorMetadata( + shape=tuple(tensor.shape), + dtype=tensor.dtype, + offset=offset, + size_bytes=size_bytes, + sync_shm_name=sync_shm_name, + ) + + # Send metadata through queue + self.queue.put((rid, metadata, self.ipc_handle)) + + logger.debug(f"Sent tensor {tensor.shape} for {rid} via CUDA IPC") + return True + + def receive_tensor( + self, timeout: float = 0.0001 + ) -> Optional[Tuple[str, torch.Tensor]]: + """Receive a tensor via CUDA IPC. 
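+
+        Currently a placeholder: the metadata is consumed, an empty local
+        tensor with the advertised shape/dtype is returned, and the chunk is
+        immediately marked reusable; opening the remote handle via
+        ``cudaIpcOpenMemHandle`` is still TODO.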
+ + Args: + timeout: Timeout for queue.get + + Returns: + Tuple of (rid, tensor) or None if queue empty + """ + try: + rid, metadata, ipc_handle = self.queue.get(timeout=timeout) + except Exception: + return None + + # Open IPC memory handle + # Note: This creates a tensor view into the remote process's workspace + with torch.cuda.device(self.device): + # Reconstruct tensor from IPC handle + # This is a view into remote memory, we need to copy it locally + + # For now, use a simpler approach: signal to copy later + # In production, you'd use cuda.cudart().cudaIpcOpenMemHandle + + logger.warning( + "CUDA IPC receive not fully implemented - requires cudaIpcOpenMemHandle" + ) + # TODO: Implement actual IPC handle opening + + # Create local tensor and signal copy completion + tensor = torch.empty( + metadata.shape, dtype=metadata.dtype, device=f"cuda:{self.device}" + ) + + # Mark chunk as ready for reuse by setting sync flag + self._mark_chunk_reusable(metadata.sync_shm_name) + + return rid, tensor + + def _mark_chunk_reusable(self, sync_shm_name: str) -> None: + """Mark a chunk as reusable by setting sync flag to 1.""" + try: + shm = SharedMemory(name=sync_shm_name, create=False) + shm.buf[:8] = struct.pack("Q", 1) # Set to 1 (ready for reuse) + shm.close() + logger.debug(f"Marked chunk reusable: {sync_shm_name}") + except Exception as e: + logger.error(f"Failed to mark chunk reusable {sync_shm_name}: {e}") + + def cleanup(self) -> None: + """Cleanup resources.""" + self.workspace.cleanup() + self.queue.close() diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index 4b28645eb..b60966dd7 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -7,7 +7,7 @@ import logging from multiprocessing.connection import Connection -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import zmq @@ -41,10 +41,16 @@ def __init__( def init_sockets(self) -> None: self._zmq_ctx = zmq.Context() self._recv_from_scheduler = create_zmq_socket( - self._zmq_ctx, zmq.PULL, self._recv_from_scheduler_addr, bind=False, + self._zmq_ctx, + zmq.PULL, + self._recv_from_scheduler_addr, + bind=False, ) self._send_to_scheduler = create_zmq_socket( - self._zmq_ctx, zmq.PUSH, self._send_to_scheduler_addr, bind=False, + self._zmq_ctx, + zmq.PUSH, + self._send_to_scheduler_addr, + bind=False, ) def event_loop(self) -> None: @@ -62,18 +68,41 @@ def event_loop(self) -> None: def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: """Run the model forward pass and sampling for *batch*. + *batch* is a dict produced by ``SchedulerProcess.get_next_batch_to_run`` + whose ``"requests"`` list contains + :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput` objects. + + Returns a dict ``{"batch_id": ..., "finished": [...], "unfinished": [...]}`` + where each element of *finished* / *unfinished* is a plain output dict + containing at least ``"rid"`` and ``"output_token_ids"``. + TODO: implement real forward pass, logits processing, and sampling. """ requests = batch.get("requests", []) - finished = [] - unfinished = [] + finished: List[Dict[str, Any]] = [] + unfinished: List[Dict[str, Any]] = [] for req in requests: - # TODO: actual model forward, logits -> next_token_ids - next_token_ids = [] # placeholder - req["output_token_ids"] = req.get("output_token_ids", []) + next_token_ids - # TODO: check EOS / max_tokens to decide finished vs. 
unfinished - finished.append(req) + # Support both TokenizedGenerateReqInput dataclass (normal path) and + # legacy plain dicts (defensive). + rid: str = req.rid if hasattr(req, "rid") else req.get("rid") + input_ids: List[int] = ( + req.input_ids if hasattr(req, "input_ids") else req.get("input_ids", []) + ) + mm_inputs: Optional[Dict[str, Any]] = ( + req.mm_inputs if hasattr(req, "mm_inputs") else req.get("mm_inputs") + ) + + # TODO: actual model forward; pass input_ids and mm_inputs to the model. + next_token_ids: List[int] = [] # placeholder + + output: Dict[str, Any] = { + "rid": rid, + "output_token_ids": next_token_ids, + "finished": True, + } + # TODO: check EOS / max_tokens to decide finished vs. unfinished. + finished.append(output) return { "batch_id": batch.get("batch_id"), diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py index 743354280..fa9d92ece 100644 --- a/pymllm/orchestrator/request_response_process.py +++ b/pymllm/orchestrator/request_response_process.py @@ -14,7 +14,7 @@ import asyncio import dataclasses import logging -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import zmq import zmq.asyncio @@ -82,19 +82,39 @@ def start(self, loop: asyncio.AbstractEventLoop) -> None: ) self._loop_task = loop.create_task(self._run()) - async def add_request(self, request: GenerateReqInput) -> ReqState: - """Enqueue a request and return its :class:`ReqState`. + async def add_request( + self, request: GenerateReqInput + ) -> Union[ReqState, List[ReqState]]: + """Enqueue request(s) and return the corresponding :class:`ReqState`(s). + + * **Single request** (``request.is_single is True``): behaves exactly as + before – registers one ``ReqState`` and enqueues one message. + * **Batch request** (``request.is_single is False``): splits the batch + into *N* individual sub-requests, registers a ``ReqState`` per rid, and + enqueues each sub-request separately so the downstream pipeline sees + independent messages. Returns a ``List[ReqState]`` in the same order + as the input rids. Callers should ``await state.event.wait()`` in a loop, consuming ``state.out_list`` entries until ``state.finished`` is ``True``. """ - if not isinstance(request.rid, str): - raise ValueError("RequestResponseProcess currently accepts single requests only.") - rid = request.rid - state = ReqState() - self._rid_to_state[rid] = state - await self._request_queue.put(request.to_request_dict()) - return state + if request.is_single: + rid = request.rid if isinstance(request.rid, str) else request.rid[0] + state = ReqState() + self._rid_to_state[rid] = state + await self._request_queue.put(request.to_request_dict()) + return state + + # Batch path: fan-out into individual sub-requests. 
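+        # Each sub-request gets its own ReqState and its own queue entry, so
+        # the downstream tokenizer/scheduler never see the batch as a unit.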
+ states: List[ReqState] = [] + for i in range(request.batch_size): + sub = request[i] + rid = sub.rid if isinstance(sub.rid, str) else str(sub.rid) + state = ReqState() + self._rid_to_state[rid] = state + await self._request_queue.put(sub.to_request_dict()) + states.append(state) + return states def remove_state(self, rid: str) -> None: """Remove the ``ReqState`` for *rid* (called by the caller once done).""" diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index e7394dab3..64ea55b0d 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -5,6 +5,10 @@ batches, dispatches batches to the ModelRunnerProcess for forward passes, collects results, and streams finished token IDs to the DetokenizerProcess. +Supports two modes: + 1. Legacy ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj + 2. Shared queue fast path: Read rid from shared queue and metadata from shared memory + The main ``event_loop`` scheduler flow:: while True: @@ -18,6 +22,7 @@ """ import logging +import queue as stdlib_queue import time from collections import deque from multiprocessing.connection import Connection @@ -25,7 +30,9 @@ import zmq +from pymllm.engine.io_struct import TokenizedGenerateReqInput from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) @@ -39,6 +46,8 @@ def __init__( send_to_model_runner_addr: str, recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, + shared_queue: Optional[TensorQueue] = None, + enable_shared_queue: bool = False, ): # ZMQ addresses self._recv_from_tokenizer_addr = recv_from_tokenizer_addr @@ -46,6 +55,10 @@ def __init__( self._recv_from_model_runner_addr = recv_from_model_runner_addr self._send_to_detokenizer_addr = send_to_detokenizer_addr + # Shared queue configuration + self._shared_queue = shared_queue + self._enable_shared_queue = enable_shared_queue + # ZMQ runtime objects (initialised in init_sockets) self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_tokenizer: Optional[zmq.Socket] = None @@ -55,7 +68,7 @@ def __init__( self._poller: Optional[zmq.Poller] = None # Request management - self._waiting_queue: Deque[Dict[str, Any]] = deque() + self._waiting_queue: Deque[TokenizedGenerateReqInput] = deque() self._running_batch: Optional[Dict[str, Any]] = None self._finished: List[Dict[str, Any]] = [] @@ -97,7 +110,10 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite scheduling loop.""" - logger.info("SchedulerProcess event loop started") + logger.info( + "SchedulerProcess event loop started (shared_queue=%s)", + self._enable_shared_queue, + ) while True: self.recv_requests() self.process_input_requests() @@ -114,15 +130,80 @@ def event_loop(self) -> None: def recv_requests(self) -> None: """Non-blocking receive of tokenized requests from TokenizerProcess. - Uses ``zmq.Poller`` with a short timeout so the scheduler is never - stuck waiting when there are batches to run. + Supports two modes: + 1. Legacy ZMQ: Uses ``zmq.Poller`` with a short timeout + 2. Shared queue: Non-blocking get from multiprocessing.Queue + + Messages are either: + * A :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput` + dataclass – appended to ``_waiting_queue``. + * A plain abort sentinel dict ``{"rid": ..., "abort": True}`` – handled + inline by removing the matching rid from the waiting queue. 
""" + if self._enable_shared_queue and self._shared_queue is not None: + self._recv_from_shared_queue() + else: + self._recv_from_zmq() + + def _recv_from_zmq(self) -> None: + """Receive requests via legacy ZMQ path.""" while True: events = dict(self._poller.poll(timeout=0)) # non-blocking if self._recv_from_tokenizer not in events: break - req = self._recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK) - self._waiting_queue.append(req) + msg = self._recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK) + # Abort sentinel: plain dict with "abort" key. + if isinstance(msg, dict) and msg.get("abort"): + rid = msg.get("rid") + logger.debug("Scheduler received abort for rid=%s", rid) + self._waiting_queue = type(self._waiting_queue)( + r for r in self._waiting_queue if r.rid != rid + ) + else: + self._waiting_queue.append(msg) + + def _recv_from_shared_queue(self) -> None: + """Receive requests via shared memory + shared queue fast path.""" + while True: + try: + # Non-blocking get from shared queue + rid, shm_name, mm_inputs = self._shared_queue.get(timeout=0.0001) + + # Read metadata from shared memory (and unlink immediately) + metadata: TokenizedGenerateReqInput = SharedMemoryManager.read_metadata( + shm_name, unlink=True + ) + + # Reconstruct the full TokenizedGenerateReqInput with mm_inputs + full_request = TokenizedGenerateReqInput( + rid=metadata.rid, + input_text=metadata.input_text, + input_ids=metadata.input_ids, + mm_inputs=mm_inputs, # Restored from shared queue + sampling_params=metadata.sampling_params, + stream=metadata.stream, + return_logprob=metadata.return_logprob, + logprob_start_len=metadata.logprob_start_len, + top_logprobs_num=metadata.top_logprobs_num, + lora_path=metadata.lora_path, + session_params=metadata.session_params, + ) + + self._waiting_queue.append(full_request) + logger.debug(f"Received request {rid} from shared queue") + + except stdlib_queue.Empty: + # No more requests available + break + except Exception as e: + logger.error(f"Error receiving from shared queue: {e}", exc_info=True) + # Try to cleanup shared memory if possible + try: + if "shm_name" in locals(): + SharedMemoryManager.cleanup(shm_name) + except: + pass + break # ------------------------------------------------------------------ # Step 2: process input requests @@ -227,6 +308,8 @@ def run_scheduler_process( recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, pipe_writer: Connection, + shared_queue: Optional[TensorQueue] = None, + enable_shared_queue: bool = False, ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" proc = SchedulerProcess( @@ -234,6 +317,8 @@ def run_scheduler_process( send_to_model_runner_addr, recv_from_model_runner_addr, send_to_detokenizer_addr, + shared_queue=shared_queue, + enable_shared_queue=enable_shared_queue, ) proc.init_sockets() diff --git a/pymllm/orchestrator/shared_memory_queue.py b/pymllm/orchestrator/shared_memory_queue.py new file mode 100644 index 000000000..3d26ebf14 --- /dev/null +++ b/pymllm/orchestrator/shared_memory_queue.py @@ -0,0 +1,190 @@ +""" +Shared memory and queue utilities for fast IPC between tokenizer and scheduler. + +This module implements shared-queue fast path to avoid expensive +ZMQ serialization of large multimodal tensors. 
+ +Design: + - Metadata lane: Small tokenized objects stored in shared memory keyed by rid + - Tensor lane: Large tensors made shareable via share_memory_() and passed by handle +""" + +import logging +import pickle +import uuid +from multiprocessing import Queue +from multiprocessing.shared_memory import SharedMemory +from typing import Any, Dict, Optional + +import torch + +logger = logging.getLogger(__name__) + + +class SharedMemoryManager: + """Manages shared memory segments for passing metadata between processes. + + Each tokenized request's metadata is written to a unique shared memory segment + keyed by its request ID (rid). The scheduler reads and immediately unlinks the + segment to prevent memory leaks. + """ + + @staticmethod + def write_metadata(rid: str, metadata: Any) -> str: + """Write metadata to shared memory and return the segment name. + + Args: + rid: Request ID (used as part of the shared memory name) + metadata: Serializable metadata object + + Returns: + str: The shared memory segment name + """ + # Serialize the metadata + data = pickle.dumps(metadata) + size = len(data) + + # Create unique shared memory segment name + shm_name = f"pymllm_meta_{rid}_{uuid.uuid4().hex[:8]}" + + try: + # Create shared memory segment + shm = SharedMemory(name=shm_name, create=True, size=size) + # Write data + shm.buf[:size] = data + shm.close() + logger.debug(f"Wrote {size} bytes to shared memory {shm_name}") + return shm_name + except Exception as e: + logger.error(f"Failed to write metadata to shared memory: {e}") + raise + + @staticmethod + def read_metadata(shm_name: str, unlink: bool = True) -> Any: + """Read metadata from shared memory and optionally unlink it. + + Args: + shm_name: The shared memory segment name + unlink: If True, immediately unlink the segment after reading + + Returns: + The deserialized metadata object + """ + try: + # Open existing shared memory segment + shm = SharedMemory(name=shm_name, create=False) + # Read and deserialize data + data = bytes(shm.buf[:]) + metadata = pickle.loads(data) + shm.close() + + # Unlink to free memory immediately + if unlink: + try: + shm.unlink() + logger.debug(f"Read and unlinked shared memory {shm_name}") + except FileNotFoundError: + # Already unlinked, ignore + pass + + return metadata + except Exception as e: + logger.error(f"Failed to read metadata from shared memory {shm_name}: {e}") + raise + + @staticmethod + def cleanup(shm_name: str) -> None: + """Manually cleanup a shared memory segment (for error recovery).""" + try: + shm = SharedMemory(name=shm_name, create=False) + shm.close() + shm.unlink() + logger.debug(f"Cleaned up shared memory {shm_name}") + except FileNotFoundError: + pass # Already cleaned up + except Exception as e: + logger.warning(f"Failed to cleanup shared memory {shm_name}: {e}") + + +class TensorQueue: + """Queue for passing large tensors between processes using shared memory. + + Tensors are made shareable via .share_memory_() and passed through a + multiprocessing.Queue by handle (metadata only, not the actual data). + """ + + def __init__(self, maxsize: int = 0): + """Initialize the tensor queue. + + Args: + maxsize: Maximum queue size (0 for unlimited) + """ + self._queue: Queue = Queue(maxsize=maxsize) + + def put(self, rid: str, shm_name: str, mm_inputs: Optional[Dict[str, Any]]) -> None: + """Put a request with multimodal inputs into the queue. 
+ + Args: + rid: Request ID + shm_name: Shared memory segment name for metadata + mm_inputs: Multimodal inputs dict (can contain torch tensors) + """ + # Make tensors shareable if present + if mm_inputs is not None: + mm_inputs = self._make_tensors_shareable(mm_inputs) + + self._queue.put((rid, shm_name, mm_inputs)) + logger.debug(f"Put request {rid} into tensor queue (shm={shm_name})") + + def get( + self, timeout: Optional[float] = None + ) -> tuple[str, str, Optional[Dict[str, Any]]]: + """Get a request from the queue. + + Args: + timeout: Timeout in seconds (None for blocking indefinitely) + + Returns: + Tuple of (rid, shm_name, mm_inputs) + """ + rid, shm_name, mm_inputs = self._queue.get(timeout=timeout) + logger.debug(f"Got request {rid} from tensor queue (shm={shm_name})") + return rid, shm_name, mm_inputs + + def empty(self) -> bool: + """Check if the queue is empty.""" + return self._queue.empty() + + def qsize(self) -> int: + """Return the approximate size of the queue.""" + try: + return self._queue.qsize() + except NotImplementedError: + return 0 # Some platforms don't support qsize + + def close(self) -> None: + """Close the queue.""" + self._queue.close() + + @staticmethod + def _make_tensors_shareable(data: Any) -> Any: + """Recursively make all torch tensors in a data structure shareable. + + Args: + data: Nested dict/list/tensor structure + + Returns: + The same structure with tensors made shareable via share_memory_() + """ + if isinstance(data, torch.Tensor): + # Make tensor shareable across processes + if not data.is_shared(): + data = data.share_memory_() + return data + elif isinstance(data, dict): + return {k: TensorQueue._make_tensors_shareable(v) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + result = [TensorQueue._make_tensors_shareable(item) for item in data] + return type(data)(result) + else: + return data diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 53714bb60..43db5ba00 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -3,15 +3,22 @@ Receives raw requests from RequestResponseProcess via ZMQ, tokenizes them, and forwards the tokenized payloads to the SchedulerProcess. + +Supports two modes: + 1. Legacy ZMQ path: Send TokenizedGenerateReqInput via ZMQ send_pyobj + 2. Shared queue fast path: Write metadata to shared memory and put rid in shared queue """ import logging from multiprocessing.connection import Connection -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional, Union import zmq +from transformers import AutoProcessor, AutoTokenizer +from pymllm.engine.io_struct import TokenizedGenerateReqInput from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) @@ -23,16 +30,42 @@ def __init__( self, recv_from_rr_addr: str, send_to_scheduler_addr: str, + tokenizer_cfg: Dict[str, Any], + shared_queue: Optional[TensorQueue] = None, ): + """ + Parameters + ---------- + tokenizer_cfg: + Serialisable dict built by the parent process (``Engine``) before + spawning. Required keys: + + * ``tokenizer_path`` – str, path to the tokenizer directory. + * ``tokenizer_mode`` – ``"auto" | "slow" | "fast"``. + * ``trust_remote_code`` – bool. + * ``context_length`` – Optional[int], explicit cap; inferred from + ``hf_config`` when ``None``. 
+ * ``hf_config`` – Optional HuggingFace PretrainedConfig + (pickled by multiprocessing); used only to infer ``context_length``. + * ``enable_shared_queue`` – bool, whether to use shared memory fast path. + shared_queue: + Optional TensorQueue for shared memory fast path communication. + """ self._recv_from_rr_addr = recv_from_rr_addr self._send_to_scheduler_addr = send_to_scheduler_addr + self._tokenizer_cfg = tokenizer_cfg + self._enable_shared_queue = tokenizer_cfg.get("enable_shared_queue", False) + self._shared_queue = shared_queue - self._zmq_ctx: zmq.Context = None - self._recv_from_rr: zmq.Socket = None - self._send_to_scheduler: zmq.Socket = None + self._zmq_ctx: Optional[zmq.Context] = None + self._recv_from_rr: Optional[zmq.Socket] = None + self._send_to_scheduler: Optional[zmq.Socket] = None - # TODO: initialise the actual tokenizer (HuggingFace / custom) self._tokenizer = None + self._mm_processor = None + self._context_length: Optional[int] = None + + self._init_tokenizers() # ------------------------------------------------------------------ # Lifecycle @@ -55,29 +88,269 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite loop: recv raw request -> tokenize -> send to scheduler.""" - logger.info("TokenizerProcess event loop started") + logger.info( + "TokenizerProcess event loop started (shared_queue=%s)", + self._enable_shared_queue, + ) while True: raw_request: Dict[str, Any] = self._recv_from_rr.recv_pyobj() tokenized = self._tokenize(raw_request) + + if self._enable_shared_queue and self._shared_queue is not None: + # Shared queue fast path + self._send_via_shared_queue(tokenized) + else: + # Legacy ZMQ path + self._send_to_scheduler.send_pyobj(tokenized) + + def _send_via_shared_queue( + self, tokenized: Union[TokenizedGenerateReqInput, Dict[str, Any]] + ) -> None: + """Send tokenized request via shared memory + shared queue fast path. + + Args: + tokenized: Either TokenizedGenerateReqInput dataclass or abort dict + """ + # Handle abort sentinel + if isinstance(tokenized, dict) and tokenized.get("abort"): + # Fallback to ZMQ for abort messages self._send_to_scheduler.send_pyobj(tokenized) + return + + assert isinstance(tokenized, TokenizedGenerateReqInput), ( + f"Expected TokenizedGenerateReqInput, got {type(tokenized)}" + ) + + rid = tokenized.rid + mm_inputs = tokenized.mm_inputs + + # Create a lightweight metadata object (without mm_inputs) + metadata = TokenizedGenerateReqInput( + rid=tokenized.rid, + input_text=tokenized.input_text, + input_ids=tokenized.input_ids, + mm_inputs=None, # Will be passed separately via shared queue + sampling_params=tokenized.sampling_params, + stream=tokenized.stream, + return_logprob=tokenized.return_logprob, + logprob_start_len=tokenized.logprob_start_len, + top_logprobs_num=tokenized.top_logprobs_num, + lora_path=tokenized.lora_path, + session_params=tokenized.session_params, + ) + + # Write metadata to shared memory + shm_name = SharedMemoryManager.write_metadata(rid, metadata) + + # Put (rid, shm_name, mm_inputs) into shared queue + self._shared_queue.put(rid, shm_name, mm_inputs) + + logger.debug(f"Sent request {rid} via shared queue (shm={shm_name})") # ------------------------------------------------------------------ - # Tokenization (placeholder) + # Tokenization and multimodal preprocessing # ------------------------------------------------------------------ - def _tokenize(self, raw_request: Dict[str, Any]) -> Dict[str, Any]: - """Tokenize a single raw request and return the tokenized payload. 
+ def _init_tokenizers(self) -> None: + """Initialise text tokenizer and (optionally) multimodal processor. - TODO: replace with real tokenizer call. + All configuration is read from ``self._tokenizer_cfg`` which was + serialised by the parent process before ``spawn``. No global config + access happens inside the subprocess. """ - text = raw_request.get("text", "") - # placeholder: produce fake token ids - input_ids: List[int] = [] # TODO: self._tokenizer.encode(text) - return { - **raw_request, - "input_ids": input_ids, + cfg = self._tokenizer_cfg + tokenizer_path: str = cfg["tokenizer_path"] + tokenizer_mode: str = cfg.get("tokenizer_mode", "auto") + trust_remote_code: bool = bool(cfg.get("trust_remote_code", False)) + + tokenizer_kwargs: Dict[str, Any] = { + "use_fast": tokenizer_mode != "slow", + "trust_remote_code": trust_remote_code, } + self._tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path, + **tokenizer_kwargs, + ) + + # Default to left padding for generation. + try: + self._tokenizer.padding_side = "left" + except Exception: + pass + + # Context length: explicit config value takes priority; fall back to + # common HF config field names. + context_len: Optional[int] = cfg.get("context_length") + if context_len is None: + hf_cfg = cfg.get("hf_config") + for name in ("max_position_embeddings", "max_sequence_length", "seq_len"): + if hf_cfg is not None and hasattr(hf_cfg, name): + context_len = int(getattr(hf_cfg, name)) + break + self._context_length = context_len + + # Try to load multimodal processor (optional). + try: + self._mm_processor = AutoProcessor.from_pretrained( + tokenizer_path, + trust_remote_code=trust_remote_code, + ) + except Exception: + # Text-only models don't provide a processor; that's fine. + self._mm_processor = None + + def _tokenize( + self, raw_request: Dict[str, Any] + ) -> Union[TokenizedGenerateReqInput, Dict[str, Any]]: + """Tokenize one raw request dict and return a typed object. + + * **Abort** messages (``{"rid": ..., "abort": True}``) are returned as + plain dicts so the scheduler can intercept them without importing the + io_struct. + * Normal requests are returned as a :class:`TokenizedGenerateReqInput` + dataclass instance that carries ``input_ids``, ``mm_inputs``, and all + sampling meta-data in typed fields. + + Each message arriving here corresponds to exactly one sub-request + because batch splitting happens upstream in ``RequestResponseProcess``. + """ + # Abort: propagate as a plain sentinel dict. + if raw_request.get("abort"): + return {"rid": raw_request.get("rid"), "abort": True} + + # ------------------------------------------------------------------ # + # 1. Text tokenization + # ------------------------------------------------------------------ # + if raw_request.get("input_ids") is not None: + # Caller already tokenized – skip text processing. + input_ids: List[int] = list(raw_request["input_ids"]) + raw_text = raw_request.get("text") + input_text: str = ( + str(raw_text[0]) if isinstance(raw_text, list) else str(raw_text or "") + ) + else: + text = raw_request.get("text") + if text is None: + raise ValueError( + "TokenizerProcess expects either `text` or `input_ids`." + ) + # Accept a list for robustness; take the first element. 
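+            # (Batch fan-out happens upstream in RequestResponseProcess, so a
+            #  list here is expected to hold a single prompt.)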
+ input_text = str(text[0]) if isinstance(text, list) else str(text) + + encode_kwargs: Dict[str, Any] = { + "add_special_tokens": True, + "return_attention_mask": False, + } + if self._context_length is not None: + encode_kwargs.update( + {"truncation": True, "max_length": self._context_length} + ) + + encoding = self._tokenizer(input_text, **encode_kwargs) + input_ids = encoding["input_ids"] + + # ------------------------------------------------------------------ # + # 2. Multimodal pre-processing + # ------------------------------------------------------------------ # + mm_inputs = self._collect_mm_inputs(raw_request, text=input_text) + + # ------------------------------------------------------------------ # + # 3. Pack into the typed dataclass + # ------------------------------------------------------------------ # + return TokenizedGenerateReqInput( + rid=raw_request.get("rid"), + input_text=input_text, + input_ids=input_ids, + mm_inputs=mm_inputs, + sampling_params=raw_request.get("sampling_params") or {}, + stream=bool(raw_request.get("stream", False)), + return_logprob=bool(raw_request.get("return_logprob", False)), + logprob_start_len=int(raw_request.get("logprob_start_len", -1)), + top_logprobs_num=int(raw_request.get("top_logprobs_num", 0)), + lora_path=raw_request.get("lora_path"), + session_params=raw_request.get("session_params"), + ) + + def _normalize_image_input(self, image_data: Any) -> List[Any]: + """Normalise ``image_data`` into a list of image-like objects. + + Supported input forms: + - single PIL.Image / numpy array / torch.Tensor + - path string or bytes + - list/tuple of the above + """ + + def _to_image(obj: Any) -> Any: + # Lazily import Pillow to avoid hard dependency for text-only models. + try: + from PIL import Image # type: ignore + except Exception as exc: # pragma: no cover - optional dependency + raise RuntimeError( + "Pillow is required for image preprocessing in TokenizerProcess" + ) from exc + + if obj is None: + return None + if isinstance(obj, Image.Image): + return obj + if isinstance(obj, (str, bytes)): + return Image.open(obj) + return obj + + if isinstance(image_data, (list, tuple)): + return [ + img for img in (_to_image(x) for x in image_data) if img is not None + ] + return [img for img in (_to_image(image_data),) if img is not None] + + def _collect_mm_inputs( + self, raw_request: Dict[str, Any], text: Optional[str] = None + ) -> Optional[Dict[str, Any]]: + """Pre-process multimodal data and return a consolidated ``mm_inputs`` dict. + + Returns ``None`` for text-only requests. Otherwise returns a flat dict + whose keys are ready to be unpacked by the model runner: + + * ``image_inputs`` – output of ``AutoProcessor`` (contains + ``pixel_values``, etc.) when a processor is available. + * ``image_data`` – raw image objects when no processor is available. + * ``audio_data`` – forwarded verbatim (no processor yet). + * ``video_data`` – forwarded verbatim (no processor yet). + """ + image_data = raw_request.get("image_data") + video_data = raw_request.get("video_data") + audio_data = raw_request.get("audio_data") + + if not any(x is not None for x in (image_data, video_data, audio_data)): + return None # text-only request + + mm: Dict[str, Any] = {} + + # Image: prefer AutoProcessor output; fall back to raw data. 
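+        # (A failing processor call also falls back to forwarding the raw
+        #  ``image_data`` so downstream components still receive the images.)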
+ if image_data is not None: + if self._mm_processor is not None: + images = self._normalize_image_input(image_data) + try: + processor_inputs = self._mm_processor( + images=images, + text=text if text is not None else raw_request.get("text"), + return_tensors="pt", + ) + mm["image_inputs"] = processor_inputs + except Exception: + mm["image_data"] = image_data + else: + mm["image_data"] = image_data + + # Audio / video forwarded verbatim for now. + if audio_data is not None: + mm["audio_data"] = audio_data + if video_data is not None: + mm["video_data"] = video_data + + return mm + def shutdown(self) -> None: if self._recv_from_rr is not None: self._recv_from_rr.close() @@ -91,9 +364,13 @@ def run_tokenizer_process( recv_from_rr_addr: str, send_to_scheduler_addr: str, pipe_writer: Connection, + tokenizer_cfg: Dict[str, Any], + shared_queue: Optional[TensorQueue] = None, ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = TokenizerProcess(recv_from_rr_addr, send_to_scheduler_addr) + proc = TokenizerProcess( + recv_from_rr_addr, send_to_scheduler_addr, tokenizer_cfg, shared_queue + ) proc.init_sockets() # Signal readiness to the parent process diff --git a/pyproject.toml b/pyproject.toml index d417b5790..d752ddc1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ dependencies=[ "typer", "torch", "torchao", + "pyfiglet", + "termcolor", ] [project.optional-dependencies] From 5d134113c344048c0427c9ae33bd971705993ab6 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 2 Mar 2026 06:45:16 +0000 Subject: [PATCH 31/42] feat(mllm-kernel): add high-performance create_kv_indices CUDA kernel and benchmark - Implement CUDA kernel to convert ReqToTokenPool mapping into flat KV index arrays - Use block-per-sequence parallelism for fully coalesced memory access - Validate tensor shapes, dtypes, and devices with TensorMatcher utilities - Provide Python JIT wrapper using mllm-kernel JIT system for easy integration - Add detailed documentation and usage guide for kernel implementation - Create benchmark script comparing kernel against naive PyTorch gather - Support optional start offsets for sliding-window decode scenarios - Ensure robust out-of-bounds checks to prevent segmentation faults - Establish testing and benchmarking patterns for future kernel development --- .claude/skills/impl-jit-kernel/SKILL.md | 486 +++++++++ .../benchmarks/bench_create_kv_indices.py | 218 ++++ .../cuda/csrc/create_kv_indices.cuh | 282 +++++ .../mllm_kernel/cuda/csrc/vocab_embedding.cuh | 0 .../mllm_kernel/cuda/jit/create_kv_indices.py | 118 +++ mllm-kernel/pyproject.toml | 2 +- mllm-kernel/tests/test_create_kv_indices.py | 191 ++++ pymllm/configs/server_config.py | 45 + pymllm/engine/__init__.py | 8 + pymllm/engine/forward_batch.py | 182 ++++ pymllm/engine/launch.py | 32 +- pymllm/layers/attention/__init__.py | 25 + pymllm/layers/attention/attention_backend.py | 143 +++ pymllm/layers/attention/flashinfer_backend.py | 964 ++++++++++++++++++ pymllm/layers/attention/radix_attention.py | 171 ++++ pymllm/layers/sampling.py | 0 pymllm/mem_cache/memory_pool.py | 16 +- pymllm/orchestrator/cuda_ipc_transport.py | 859 ++++++++++------ pymllm/orchestrator/scheduler_process.py | 50 +- pymllm/orchestrator/shared_memory_queue.py | 226 ++-- pymllm/orchestrator/tokenizer_process.py | 153 ++- 21 files changed, 3773 insertions(+), 398 deletions(-) create mode 100644 .claude/skills/impl-jit-kernel/SKILL.md create mode 100644 
mllm-kernel/benchmarks/bench_create_kv_indices.py create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/vocab_embedding.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py create mode 100644 mllm-kernel/tests/test_create_kv_indices.py create mode 100644 pymllm/layers/sampling.py diff --git a/.claude/skills/impl-jit-kernel/SKILL.md b/.claude/skills/impl-jit-kernel/SKILL.md new file mode 100644 index 000000000..39cc02b6f --- /dev/null +++ b/.claude/skills/impl-jit-kernel/SKILL.md @@ -0,0 +1,486 @@ +--- +name: impl-jit-kernel +description: Guide for implementing CUDA or CPU JIT kernels in mllm-kernel. Use when the user asks to create, add, or implement a new kernel in mllm-kernel. +--- + +# Implementing a JIT Kernel in mllm-kernel + +## Overview + +mllm-kernel uses a JIT (Just-In-Time) compilation system built on `tvm_ffi`. Kernels are written in C++20 (`.cuh` for CUDA, `.cpp` for CPU), validated at runtime via `TensorMatcher`, and exposed to Python through a `@jit` decorator. No pre-compilation is needed -- kernels compile on first call and are cached at `~/.cache/mllm_kernel/`. + +## File Layout + +For a kernel named `my_kernel`: + +``` +mllm-kernel/ + mllm_kernel/ + cuda/ + csrc/my_kernel.cuh # CUDA kernel implementation + jit/my_kernel.py # Python JIT wrapper + jit/__init__.py # Add export here + cpu/ + csrc/my_kernel.cpp # CPU kernel implementation (Highway SIMD) + include/mllm_kernel/cpu/ + my_kernel.hpp # CPU SIMD body (NO #pragma once) + jit/my_kernel.py # Python JIT wrapper + jit/__init__.py # Add export here + tests/test_my_kernel.py # Pytest correctness tests + benchmarks/bench_my_kernel.py # Profiler benchmark vs PyTorch reference +``` + +--- + +## CUDA Kernel Walkthrough + +### Step 1: Write the `.cuh` kernel + +Create `mllm_kernel/cuda/csrc/my_kernel.cuh`: + +```cpp +#pragma once + +#include // TensorMatcher, SymbolicSize, SymbolicDevice, SymbolicDType +#include // RuntimeCheck, Panic, div_ceil +#include // LaunchKernel, fp16_t, bf16_t, PDL helpers + +#include +#include + +#include + +namespace { + +// --------------------------------------------------------------------------- +// 1. Parameter struct (trivially copyable, passed to kernel by value) +// --------------------------------------------------------------------------- +struct MyKernelParams { + const float* __restrict__ input; + float* __restrict__ output; + int32_t num_elements; +}; + +// --------------------------------------------------------------------------- +// 2. CUDA kernel +// --------------------------------------------------------------------------- +__global__ void my_kernel(const MyKernelParams params) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= params.num_elements) return; + params.output[idx] = params.input[idx] * 2.0f; +} + +// --------------------------------------------------------------------------- +// 3. 
Host-side launcher (entry point for TVM FFI binding) +// --------------------------------------------------------------------------- +struct MyKernel { + static void run(tvm::ffi::TensorView input, tvm::ffi::TensorView output) { + using namespace mllm_kernel::host; + + // --- Validate tensors --- + SymbolicSize N{"num_elements"}; + SymbolicDevice device; + + (void)TensorMatcher({N}) + .with_dtype() + .with_device(device) + .verify(input); + + (void)TensorMatcher({N}) + .with_dtype() + .with_device(device) + .verify(output); + + const int64_t n = N.unwrap(); + RuntimeCheck(n > 0, "num_elements must be positive, got ", n); + + // --- Build params --- + MyKernelParams params{ + .input = static_cast(input.data_ptr()), + .output = static_cast(output.data_ptr()), + .num_elements = static_cast(n), + }; + + // --- Launch --- + constexpr int kBlock = 256; + const int grid = static_cast(div_ceil(n, kBlock)); + LaunchKernel(grid, kBlock, device.unwrap())(my_kernel, params); + } +}; + +} // namespace +``` + +**Key rules:** + +- **Always wrap in `namespace {}`** (anonymous namespace). +- **Entry point** is a `static void run(tvm::ffi::TensorView ...)` method. +- **Validate every tensor** with `TensorMatcher` before reading `.data_ptr()`. +- **Never dereference device pointers on host** -- `data_ptr()` returns a GPU pointer. +- **Use `LaunchKernel`** to launch -- it handles stream resolution and error checking. + +### Step 2: Write the Python JIT wrapper + +Create `mllm_kernel/cuda/jit/my_kernel.py`: + +```python +"""JIT wrapper for my_kernel CUDA kernel.""" + +import torch +from mllm_kernel.jit_utils import jit + + +@jit( + args=[], + device="cuda", + cuda_files=["my_kernel.cuh"], + cpp_wrappers=[], + cuda_wrappers=[("my_kernel", "MyKernel::run")], + func_name="my_kernel", +) +def _kernel(compiled_module, input: torch.Tensor, output: torch.Tensor) -> None: + compiled_module.my_kernel(input, output) + + +def my_kernel(input: torch.Tensor) -> torch.Tensor: + """Double every element in *input*. + + Parameters + ---------- + input : torch.Tensor + 1-D float32 tensor on CUDA. + + Returns + ------- + torch.Tensor + Same shape and dtype as *input*. + """ + output = torch.empty_like(input) + _kernel(input, output) + return output +``` + +### Step 3: Export in `__init__.py` + +Edit `mllm_kernel/cuda/jit/__init__.py` and add: + +```python +from mllm_kernel.cuda.jit.my_kernel import my_kernel +``` + +### Step 4: Clear JIT cache after editing `.cuh` + +Any time you modify the `.cuh` file, delete the cached `.so`: + +```bash +rm -rf ~/.cache/mllm_kernel/cuda_my_kernel* +``` + +The next Python call will trigger recompilation automatically. + +--- + +## Template-Parameterized CUDA Kernels + +When the kernel takes compile-time constants (e.g. 
block size, dtype), use `make_cpp_args`: + +```python +from mllm_kernel.jit_utils import jit, make_cpp_args + +def _make_kernel(block_size: int, use_pdl: bool): + cpp_args = make_cpp_args(block_size, use_pdl) # -> "256, true" + + @jit( + args=[block_size, use_pdl], + device="cuda", + cuda_files=["my_kernel.cuh"], + cpp_wrappers=[], + cuda_wrappers=[("my_kernel", f"MyKernel<{cpp_args}>::run")], + func_name="my_kernel", + ) + def _kernel(compiled_module, input, output): + compiled_module.my_kernel(input, output) + return _kernel +``` + +`make_cpp_args` converts Python types to C++ literals: +- `int/float` -> string literal +- `bool` -> `"true"` / `"false"` +- `torch.dtype` -> C++ type (`torch.float32` -> `"fp32_t"`, `torch.float16` -> `"fp16_t"`, `torch.bfloat16` -> `"bf16_t"`, `torch.int32` -> `"int32_t"`, etc.) + +--- + +## CPU Kernel Walkthrough + +CPU kernels use **Google Highway** for portable SIMD. The key difference: the `.hpp` body is included **multiple times** by Highway's `foreach_target` dispatch, so it must NOT have `#pragma once`. + +### Step 1: Write the SIMD body (`.hpp`) + +Create `mllm_kernel/cpu/include/mllm_kernel/cpu/my_kernel.hpp`: + +```cpp +// NOTE: NO #pragma once -- this file is included multiple times by Highway. + +#include + +HWY_BEFORE_NAMESPACE(); +namespace mllm_kernel::cpu { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +template +inline void my_kernel_impl(float* HWY_RESTRICT dst, + const float* HWY_RESTRICT src, + size_t count) { + const hn::ScalableTag d; + const size_t lanes = hn::Lanes(d); + const auto vc = hn::Set(d, static_cast(Constant)); + size_t i = 0; + for (; i + lanes <= count; i += lanes) { + const auto v = hn::Load(d, src + i); + hn::Store(hn::Add(v, vc), d, dst + i); + } + for (; i < count; ++i) { + dst[i] = src[i] + static_cast(Constant); + } +} + +// Named entry points for HWY_EXPORT +static HWY_NOINLINE HWY_MAYBE_UNUSED void my_kernel_1(float* d, const float* s, size_t n) { + my_kernel_impl<1>(d, s, n); +} + +} // namespace HWY_NAMESPACE +} // namespace mllm_kernel::cpu +HWY_AFTER_NAMESPACE(); +``` + +### Step 2: Write the `.cpp` source + +Create `mllm_kernel/cpu/csrc/my_kernel.cpp`: + +```cpp +#include +#include +#include + +#undef HWY_TARGET_INCLUDE +#define HWY_TARGET_INCLUDE "../csrc/my_kernel.cpp" +#include + +#include + +#if HWY_ONCE +#include +#endif + +namespace mllm_kernel::cpu { +#if HWY_ONCE + +HWY_EXPORT(my_kernel_1); + +template +void my_kernel(tvm::ffi::TensorView dst, tvm::ffi::TensorView src) { + using namespace mllm_kernel::host; + SymbolicSize N{"num_elements"}; + SymbolicDevice device_; + (void)TensorMatcher({N}) + .with_dtype() + .with_device(device_) + .verify(dst) + .verify(src); + const size_t n = N.unwrap(); + auto* dst_ptr = static_cast(dst.data_ptr()); + const auto* src_ptr = static_cast(src.data_ptr()); + HWY_DYNAMIC_DISPATCH(my_kernel_1)(dst_ptr, src_ptr, n); +} + +// Explicit instantiation +template void my_kernel<1>(tvm::ffi::TensorView, tvm::ffi::TensorView); + +#endif +} // namespace mllm_kernel::cpu +``` + +### Step 3: Write the Python JIT wrapper + +Create `mllm_kernel/cpu/jit/my_kernel.py`: + +```python +import torch +from mllm_kernel.jit_utils import jit + +@jit( + args=1, + device="cpu", + cpp_files=["my_kernel.cpp"], + cpp_wrappers=[("my_kernel", "mllm_kernel::cpu::my_kernel<1>")], + func_name="my_kernel", +) +def _kernel_1(compiled_module, dst, src): + compiled_module.my_kernel(dst, src) + +def my_kernel(src: torch.Tensor) -> torch.Tensor: + dst = torch.empty_like(src) + 
_kernel_1(dst, src) + return dst +``` + +**Key CPU differences from CUDA:** + +| Aspect | CUDA | CPU | +|--------|------|-----| +| Source file | `.cuh` in `cuda/csrc/` | `.cpp` + `.hpp` in `cpu/csrc/` and `cpu/include/` | +| Namespace | Anonymous `namespace {}` | `mllm_kernel::cpu` | +| Device check | `with_device` | `with_device` | +| Launch | `LaunchKernel(grid, block, device)(...)` | Direct function call via `HWY_DYNAMIC_DISPATCH` | +| SIMD | CUDA warps | Highway `ScalableTag` | +| Wrapper fields | `cuda_files`, `cuda_wrappers` | `cpp_files`, `cpp_wrappers` | +| Wrapper name | `"MyKernel::run"` | `"mllm_kernel::cpu::my_kernel<1>"` (fully qualified) | + +--- + +## TensorMatcher Reference + +`TensorMatcher` validates shape, dtype, device, and strides of `tvm::ffi::TensorView` arguments. + +```cpp +using namespace mllm_kernel::host; + +// Symbolic dimensions -- bind on first .verify(), check consistency on subsequent calls +SymbolicSize B{"batch"}, N{"seq_len"}, D{"dim"}; +SymbolicSize Stride0{"stride0"}; +SymbolicDType dtype; +SymbolicDevice device; + +// Shape [B, N, D], contiguous, float32, on CUDA +(void)TensorMatcher({B, N, D}) + .with_dtype(dtype) + .with_device(device) + .verify(tensor_a); + +// Shape [B, N, D], same dtype and device (already bound) +(void)TensorMatcher({B, N, D}) + .with_dtype(dtype) + .with_device(device) + .verify(tensor_b); + +// Shape [B, D] with explicit strides (non-contiguous OK) +(void)TensorMatcher({B, D}) + .with_strides({Stride0, 1}) + .with_dtype() + .with_device(device) + .verify(indices); + +// Multiple acceptable dtypes +SymbolicDType flex_dtype; +(void)TensorMatcher({N}) + .with_dtype(flex_dtype) + .with_device(device) + .verify(mixed_tensor); + +// Extract bound values +int64_t batch = B.unwrap(); +int64_t dim = D.unwrap(); +DLDevice dev = device.unwrap(); +``` + +--- + +## LaunchKernel Reference + +```cpp +using namespace mllm_kernel::host; + +// Basic launch (resolves CUDA stream from DLDevice) +DLDevice dev = device.unwrap(); +LaunchKernel(grid_dim, block_dim, dev)(kernel_func, param_struct); + +// With shared memory +LaunchKernel(grid, block, dev, shared_mem_bytes)(kernel, params); + +// With PDL (Programmatic Dependent Launch, sm_90+) +LaunchKernel(grid, block, dev).enable_pdl(true)(kernel, params); +``` + +--- + +## Utility Reference (`mllm_kernel::host`) + +| Function | Description | +|----------|-------------| +| `RuntimeCheck(cond, msg...)` | Throws `PanicError` if `cond` is false | +| `Panic(msg...)` | Always throws (unreachable code) | +| `div_ceil(a, b)` | Integer ceiling division | +| `dtype_bytes(DLDataType)` | Byte size of a DLPack dtype | + +CUDA-only (`mllm_kernel::device`): + +| Symbol | Value | +|--------|-------| +| `kWarpThreads` | 32 | +| `kFullMask` | 0xffffffff | +| `fp16_t` | `__half` | +| `bf16_t` | `__nv_bfloat16` | + +--- + +## Testing Pattern + +Create `tests/test_my_kernel.py`: + +```python +import pytest +import torch +from mllm_kernel.cuda.jit.my_kernel import my_kernel + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") +@pytest.mark.parametrize("n", [1, 128, 1024, 65536]) +def test_my_kernel(n): + x = torch.randn(n, dtype=torch.float32, device="cuda") + result = my_kernel(x) + torch.cuda.synchronize() + expected = x * 2.0 + assert torch.allclose(result, expected) +``` + +Run: +```bash +pytest tests/test_my_kernel.py -v +``` + +--- + +## Benchmark Pattern + +Create `benchmarks/bench_my_kernel.py`. Use `torch.profiler.profile` with `ProfilerActivity.CPU` and `ProfilerActivity.CUDA`. 
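+A minimal sketch of the benchmark body, assuming the `my_kernel` wrapper from the walkthrough above (the element count and iteration count are arbitrary):
+
+```python
+import torch
+from torch.profiler import ProfilerActivity, profile
+
+from mllm_kernel.cuda.jit.my_kernel import my_kernel
+
+x = torch.randn(1_000_000, dtype=torch.float32, device="cuda")
+
+# Warm up once so JIT compilation is excluded from the measurement.
+my_kernel(x)
+torch.cuda.synchronize()
+
+with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+    for _ in range(100):
+        my_kernel(x)
+    torch.cuda.synchronize()
+
+print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=10))
+```
+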
Compare the JIT kernel against a naive PyTorch implementation and report speedup. + +Run: +```bash +python benchmarks/bench_my_kernel.py --num-elements 1000000 +``` + +--- + +## Checklist for a New Kernel + +- [ ] `.cuh` / `.cpp` + `.hpp` kernel source created +- [ ] `TensorMatcher` validates all tensor arguments (shape, dtype, device) +- [ ] No host-side dereference of device pointers +- [ ] Python `@jit` wrapper created with correct `cuda_wrappers` or `cpp_wrappers` +- [ ] Public API function added (allocates output, calls internal `_kernel`) +- [ ] Exported in `jit/__init__.py` +- [ ] JIT cache cleared after `.cuh` edits (`rm -rf ~/.cache/mllm_kernel/cuda_*`) +- [ ] Pytest test with `@pytest.mark.parametrize` and PyTorch reference +- [ ] Benchmark with `torch.profiler` (optional but recommended) + +--- + +## Common Pitfalls + +1. **Segfault from dereferencing device pointer on host** -- `tensor.data_ptr()` returns a GPU pointer for CUDA tensors. Never read its contents in host code. Use `TensorMatcher` for validation instead. +2. **Stale JIT cache** -- After editing `.cuh`, delete `~/.cache/mllm_kernel/cuda_*/`. The old `.so` will be reused otherwise. +3. **Missing `#include `** -- CPU kernels must include this inside `#if HWY_ONCE` to provide `GetChosenTarget` for the JIT-built module. +4. **`#pragma once` in Highway `.hpp`** -- Highway's `foreach_target` includes the file multiple times for different SIMD targets. `#pragma once` breaks this. +5. **Wrong wrapper name** -- CUDA uses short names (`"MyKernel::run"`); CPU uses fully qualified names (`"mllm_kernel::cpu::my_kernel<1>"`). +6. **Generator device mismatch in tests** -- `torch.randperm` needs a CUDA generator on CUDA; `torch.randint` only accepts CPU generators. Use separate generators. diff --git a/mllm-kernel/benchmarks/bench_create_kv_indices.py b/mllm-kernel/benchmarks/bench_create_kv_indices.py new file mode 100644 index 000000000..f570e66de --- /dev/null +++ b/mllm-kernel/benchmarks/bench_create_kv_indices.py @@ -0,0 +1,218 @@ +"""Benchmark create_kv_indices vs naive torch gather using torch.profiler. 
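+
+The naive reference loops over sequences in Python, pulling scalars to the host
+with .item() and slicing each req_to_token row before a final torch.cat, so the
+reported speedup largely reflects that per-sequence host-side overhead.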
+ +Example: + python benchmarks/bench_create_kv_indices.py --batch-size 512 --max-reqs 2048 --max-ctx 4096 +""" + +from __future__ import annotations + +import argparse + +import torch +from torch.profiler import ProfilerActivity, profile + +from mllm_kernel.cuda.jit.create_kv_indices import create_kv_indices + + +def _make_batch( + *, + max_reqs: int, + max_ctx: int, + batch_size: int, + use_start_offsets: bool, + device: torch.device, + seed: int, +): + g_cuda = torch.Generator(device=device).manual_seed(seed) + g_cpu = torch.Generator(device="cpu").manual_seed(seed) + + req_to_token = torch.arange( + max_reqs * max_ctx, dtype=torch.int32, device=device + ).reshape(max_reqs, max_ctx) + + assert batch_size <= max_reqs + req_pool_indices = torch.randperm(max_reqs, generator=g_cuda, device=device)[ + :batch_size + ].to(torch.int32) + + page_kernel_lens_list = [] + kv_start_idx_list = [] + for _ in range(batch_size): + L = int(torch.randint(1, max_ctx, (1,), generator=g_cpu).item()) + if use_start_offsets: + start_max = max_ctx - L + start = int(torch.randint(0, max(start_max, 1), (1,), generator=g_cpu).item()) + else: + start = 0 + page_kernel_lens_list.append(L) + kv_start_idx_list.append(start) + + page_kernel_lens = torch.tensor( + page_kernel_lens_list, dtype=torch.int32, device=device + ) + kv_start_idx = torch.tensor(kv_start_idx_list, dtype=torch.int32, device=device) + + kv_indptr = torch.empty(batch_size + 1, dtype=torch.int32, device=device) + kv_indptr[0] = 0 + kv_indptr[1:] = torch.cumsum(page_kernel_lens, dim=0) + + kv_indices = torch.empty( + int(kv_indptr[-1].item()), dtype=torch.int32, device=device + ) + + return ( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) + + +def _profile( + name: str, fn, *, warmup: int, iters: int, row_limit: int, trace_path: str | None +): + for _ in range(warmup): + fn() + torch.cuda.synchronize() + + with profile( + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=False, + profile_memory=False, + with_stack=False, + ) as prof: + for _ in range(iters): + fn() + torch.cuda.synchronize() + + events = prof.key_averages() + time_attr = ( + "self_cuda_time_total" + if events and hasattr(events[0], "self_cuda_time_total") + else "self_device_time_total" + ) + sort_key = ( + "self_cuda_time_total" + if time_attr == "self_cuda_time_total" + else "self_device_time_total" + ) + total_us = sum(float(getattr(evt, time_attr, 0.0)) for evt in events) + avg_us = total_us / max(iters, 1) + + print(f"\n=== {name} ===") + print( + prof.key_averages().table( + sort_by=sort_key, + row_limit=row_limit, + ) + ) + print(f"{name} total self device time: {total_us:.2f} us") + print(f"{name} avg self device time/iter: {avg_us:.2f} us") + + if trace_path: + prof.export_chrome_trace(trace_path) + print(f"{name} trace exported: {trace_path}") + + return avg_us + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Benchmark create_kv_indices vs naive torch gather", + ) + parser.add_argument("--batch-size", type=int, default=512) + parser.add_argument("--max-reqs", type=int, default=2048) + parser.add_argument("--max-ctx", type=int, default=4096) + parser.add_argument("--warmup", type=int, default=50) + parser.add_argument("--iters", type=int, default=200) + parser.add_argument("--row-limit", type=int, default=20) + parser.add_argument("--export-trace-dir", type=str, default="") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + 
"--use-start-offsets", + action="store_true", + help="Enable non-zero kv_start_idx to emulate sliding-window decode", + ) + args = parser.parse_args() + + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required for this benchmark") + + torch.manual_seed(args.seed) + device = torch.device("cuda") + + ( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) = _make_batch( + max_reqs=args.max_reqs, + max_ctx=args.max_ctx, + batch_size=args.batch_size, + use_start_offsets=args.use_start_offsets, + device=device, + seed=args.seed, + ) + + print("=== create_kv_indices profiler benchmark ===") + print( + f"batch_size={args.batch_size}, max_reqs={args.max_reqs}, max_ctx={args.max_ctx}, " + f"use_start_offsets={args.use_start_offsets}" + ) + print(f"warmup={args.warmup}, iters={args.iters}, row_limit={args.row_limit}") + + trace_dir = args.export_trace_dir.strip() + kernel_trace = f"{trace_dir}/create_kv_indices_trace.json" if trace_dir else None + torch_trace = f"{trace_dir}/torch_gather_trace.json" if trace_dir else None + + def _run_kernel_once(): + create_kv_indices( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) + + def _run_torch_once(): + # Torch reference implementation on device: gather per-sequence ranges + # from req_to_token into a flat buffer. + out = [] + for i in range(args.batch_size): + req = req_pool_indices[i].item() + start = kv_start_idx[i].item() if args.use_start_offsets else 0 + L = page_kernel_lens[i].item() + row = req_to_token[req, start : start + L] + out.append(row) + torch.cat(out, out=kv_indices) + + kernel_avg_us = _profile( + "create_kv_indices", + _run_kernel_once, + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=kernel_trace, + ) + + torch_avg_us = _profile( + "torch_reference", + _run_torch_once, + warmup=args.warmup, + iters=args.iters, + row_limit=args.row_limit, + trace_path=torch_trace, + ) + + speedup = torch_avg_us / max(kernel_avg_us, 1e-12) + print(f"\nSpeedup: {speedup:.3f}x") + + +if __name__ == "__main__": + main() diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh new file mode 100644 index 000000000..0b9e4c888 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/create_kv_indices.cuh @@ -0,0 +1,282 @@ +// High-performance CUDA kernel to build FlashInfer KV index arrays from +// pymllm's ReqToTokenPool mapping table. +// +// This is the CUDA-C equivalent of the Triton kernel +// `_create_kv_indices_triton` previously defined in +// `pymllm/layers/attention/flashinfer_backend.py`. +// +// Motivation +// ---------- +// FlashInfer's paged KV attention API expects a *flat* buffer of KV indices +// (`kv_indices`) together with a prefix-sum pointer array (`kv_indptr`). +// +// * `kv_indices` is a 1-D int32 array that stores, for every token of every +// sequence in a batch, the corresponding *slot index* in the KV cache. +// * `kv_indptr` (length = batch_size + 1) stores prefix sums over the +// per-sequence token counts. 
For sequence `i` we have tokens in: +// +// kv_indices[kv_indptr[i] : kv_indptr[i + 1]] +// +// In pymllm, the mapping from (request_slot, position_in_sequence) to KV slot +// index is stored in a 2-D tensor `req_to_token` owned by `ReqToTokenPool`: +// +// req_to_token[req_slot, position] -> kv_index (int32) +// +// For each batch we also know: +// * which request slots we are serving: `req_pool_indices[bs]` +// * how many tokens to use from each sequence: `page_kernel_lens[bs]` +// * the starting position inside each sequence: `kv_start_idx[bs]` (optional, +// used for sliding-window / partial-context attention) +// +// This kernel converts that 2-D layout into the flat `(kv_indptr, kv_indices)` +// layout in a single, highly parallel CUDA pass: +// +// For each sequence i in the batch: +// - let req = req_pool_indices[i] +// - let len = page_kernel_lens[i] +// - let start = kv_start_idx[i] (or 0 if not provided) +// - let offset = kv_indptr[i] +// - for j in [0, len): +// kv_indices[offset + j] = req_to_token[req, start + j] +// +// Requirements / invariants +// ------------------------- +// * `req_to_token` is int32 (aligned with sglang). +// * All tensors must reside on the same CUDA device. +// * The kernel is designed for extremely high throughput: +// - a block is assigned per sequence (batch element), +// - threads cooperate within the block to copy the token range with +// coalesced loads/stores. +// * Shape and dtype checks are performed at runtime via mllm_kernel's +// TensorMatcher utilities, so misuse is caught with clear error messages. +// +// Integration +// ----------- +// The exported entry point is `CreateKvIndicesKernel::run(...)`. The Python +// wrapper in `mllm_kernel/cuda/jit/create_kv_indices.py` JIT-compiles this +// kernel and exposes a `create_kv_indices(...)` function which is then called +// by `pymllm.layers.attention.flashinfer_backend`. + +#pragma once + +#include // TensorMatcher, SymbolicSize, SymbolicDevice, SymbolicDType +#include // div_ceil, RuntimeCheck, Panic +#include // LaunchKernel + +#include +#include + +#include + +namespace { + +// --------------------------------------------------------------------------- +// Parameter block passed to the CUDA kernel +// --------------------------------------------------------------------------- +// +// We keep this struct trivially-copyable so it can be passed via +// `__grid_constant__` if desired. Each field is carefully documented to make +// the data flow explicit. + +struct CreateKvIndicesParams { + // Pointer to ReqToTokenPool mapping table: + // req_to_token[req_slot, position] -> kv_index (int32) + // shape: [max_reqs, max_context_len] + const int32_t* __restrict__ req_to_token; + + // Request slots participating in this batch. + // shape: [batch_size] + const int32_t* __restrict__ req_pool_indices; + + // Number of tokens to copy for each sequence in the batch. + // shape: [batch_size] + const int32_t* __restrict__ page_kernel_lens; + + // Prefix sums over per-sequence token counts. + // kv_indptr[i] is the starting offset in kv_indices for sequence i. + // shape: [batch_size + 1] + const int32_t* __restrict__ kv_indptr; + + // Optional starting position inside each request's sequence. When nullptr, + // we assume start = 0 for all sequences. When non-null, shape is + // [batch_size]. + const int32_t* __restrict__ kv_start_idx; + + // Output flat KV index buffer (int32). Length must be at least + // kv_indptr[batch_size]. 
+ int32_t* __restrict__ kv_indices; + + // Stride of the first dimension of req_to_token, i.e. the number of + // positions per request (max_context_len). + int32_t req_to_token_stride; + + // Number of sequences in the batch. + uint32_t batch_size; + + // Whether kv_start_idx is valid (1) or should be ignored (0). + uint32_t has_kv_start; +}; + +// We use a fixed block size chosen to balance occupancy and per-sequence +// parallelism. Each block is mapped to a single sequence and threads within +// the block cooperate to copy its token range. +constexpr int kBlockSize = 256; + +// --------------------------------------------------------------------------- +// Core CUDA kernel +// --------------------------------------------------------------------------- +// +// Grid mapping: +// * blockIdx.x -> sequence index `i` in [0, batch_size) +// * threadIdx.x -> intra-sequence worker; threads stride over the token +// range [0, len) with step `blockDim.x`. +// +// This design has several advantages: +// * No inter-block synchronisation is required. +// * Memory accesses are fully coalesced because each thread block walks a +// contiguous segment of the `req_to_token` and `kv_indices` arrays. +// * It handles variable-length sequences naturally; sequences with more +// tokens simply iterate more in the inner loop. + +__global__ void create_kv_indices_kernel(const CreateKvIndicesParams params) { + const uint32_t seq_id = blockIdx.x; // which sequence in the batch + if (seq_id >= params.batch_size) { return; } + + // Resolve the request slot for this sequence. + const int32_t req_slot = params.req_pool_indices[seq_id]; + + // Compute the output range [out_offset, out_offset + len) in kv_indices. + const int32_t out_offset = params.kv_indptr[seq_id]; + const int32_t len = params.page_kernel_lens[seq_id]; + + // Compute the starting position inside the original sequence. + int32_t start = 0; + if (params.has_kv_start && params.kv_start_idx != nullptr) { start = params.kv_start_idx[seq_id]; } + + // Base pointers for this sequence. + const int32_t* __restrict__ row = params.req_to_token + static_cast(req_slot) * params.req_to_token_stride; + int32_t* __restrict__ out = params.kv_indices + out_offset; + + // Each thread in the block handles a strided subset of [0, len). + for (int32_t t = threadIdx.x; t < len; t += blockDim.x) { + // Guard against out-of-bounds reads if (start + t) exceeds the + // configured context length. Under normal conditions upstream + // invariants guarantee `start + len <= req_to_token_stride`, but + // this check makes the kernel robust against misconfigured inputs + // and prevents rare segmentation faults observed during testing. + const int32_t pos = start + t; + if (pos < 0 || pos >= params.req_to_token_stride) { continue; } + + out[t] = row[pos]; + } +} + +// --------------------------------------------------------------------------- +// Host-side launcher used by the JIT wrapper +// --------------------------------------------------------------------------- +// +// `CreateKvIndicesKernel::run(...)` is the C++ entry point that will be bound +// to a TVM FFI function and called from Python via the JIT utility. It is +// responsible for: +// 1. Validating tensor shapes / dtypes / devices. +// 2. Extracting symbolic sizes and strides. +// 3. Building the parameter block. +// 4. Launching the CUDA kernel using mllm_kernel::host::LaunchKernel. 
+ +struct CreateKvIndicesKernel { + static void run(tvm::ffi::TensorView req_to_token, tvm::ffi::TensorView req_pool_indices, + tvm::ffi::TensorView page_kernel_lens, tvm::ffi::TensorView kv_indptr, tvm::ffi::TensorView kv_start_idx, + tvm::ffi::TensorView kv_indices) { + using namespace mllm_kernel::host; + + // --------------------------------------------------------------------- + // 1. Validate input tensors + // --------------------------------------------------------------------- + // req_to_token: [max_reqs, max_context_len], int32, CUDA + SymbolicSize MaxReqs{"max_reqs"}; + SymbolicSize MaxCtx{"max_context_len"}; + SymbolicSize ReqStride{"req_stride"}; + SymbolicDType req_dtype; + SymbolicDevice device; + + (void)TensorMatcher({MaxReqs, MaxCtx}) + .with_strides({ReqStride, 1}) + .with_dtype(req_dtype) + .with_device(device) + .verify(req_to_token); + + // req_pool_indices: [B], int32, CUDA + SymbolicSize B{"batch_size"}; + SymbolicSize ReqPoolStride{"req_pool_stride"}; + (void)TensorMatcher({B}).with_strides({ReqPoolStride}).with_dtype().with_device(device).verify(req_pool_indices); + + // page_kernel_lens: [B], int32, same device + SymbolicSize PageStride{"page_stride"}; + (void)TensorMatcher({B}).with_strides({PageStride}).with_dtype().with_device(device).verify(page_kernel_lens); + + // kv_indptr: [Nind], int32, same device (we later require Nind >= B + 1) + SymbolicSize Nind{"indptr_len"}; + (void)TensorMatcher({Nind}).with_dtype().with_device(device).verify(kv_indptr); + + // kv_start_idx: either [B] or [0]; int32, same device + SymbolicSize StartLen{"start_len"}; + SymbolicSize StartStride{"start_stride"}; + (void)TensorMatcher({StartLen}).with_strides({StartStride}).with_dtype().with_device(device).verify(kv_start_idx); + + // kv_indices: [Nidx], int32, same device + SymbolicSize Nidx{"num_indices"}; + (void)TensorMatcher({Nidx}).with_dtype().with_device(device).verify(kv_indices); + + // Extract concrete sizes. + const int64_t batch_size = B.unwrap(); + const int64_t indptr_len = Nind.unwrap(); + const int64_t req_stride = ReqStride.unwrap(); + + // Basic consistency checks. + RuntimeCheck(batch_size > 0, "batch_size must be positive, got ", batch_size); + RuntimeCheck(indptr_len >= batch_size + 1, "kv_indptr length (", indptr_len, ") must be at least batch_size+1 (", + batch_size + 1, ")"); + + // NOTE: We intentionally do NOT read kv_indptr[batch_size] on the host to + // validate that kv_indices is large enough. kv_indptr resides in device + // memory and dereferencing it from host code would be an illegal memory + // access (segfault). Callers are responsible for ensuring that + // kv_indices.numel() >= kv_indptr[batch_size]. + + // kv_start_idx is optional; when StartLen == 0 we treat it as absent. + RuntimeCheck(StartLen.unwrap() == 0 || StartLen.unwrap() == batch_size, + "kv_start_idx must have length 0 or batch_size; got ", StartLen.unwrap(), " vs batch_size=", batch_size); + + const bool has_kv_start = (StartLen.unwrap() == batch_size); + + // --------------------------------------------------------------------- + // 2. Build parameter block + // --------------------------------------------------------------------- + CreateKvIndicesParams params{ + .req_to_token = static_cast(req_to_token.data_ptr()), + .req_pool_indices = static_cast(req_pool_indices.data_ptr()), + .page_kernel_lens = static_cast(page_kernel_lens.data_ptr()), + .kv_indptr = static_cast(kv_indptr.data_ptr()), + .kv_start_idx = has_kv_start ? 
static_cast(kv_start_idx.data_ptr()) : nullptr, + .kv_indices = static_cast(kv_indices.data_ptr()), + .req_to_token_stride = static_cast(req_stride), + .batch_size = static_cast(batch_size), + .has_kv_start = has_kv_start ? 1u : 0u, + }; + + const DLDevice dl_device = device.unwrap(); + + // --------------------------------------------------------------------- + // 3. Launch the CUDA kernel + // --------------------------------------------------------------------- + // We launch one block per sequence so that each sequence can be processed + // independently with fully coalesced memory accesses. The per-thread + // inner loop runs over the token range [0, len) with stride = blockDim.x. + + const int grid_size = static_cast(batch_size); + + LaunchKernel(grid_size, kBlockSize, dl_device)(create_kv_indices_kernel, params); + } +}; + +} // namespace diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/vocab_embedding.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/vocab_embedding.cuh new file mode 100644 index 000000000..e69de29bb diff --git a/mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py b/mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py new file mode 100644 index 000000000..565686a40 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/create_kv_indices.py @@ -0,0 +1,118 @@ +"""High-performance CUDA JIT wrapper for create_kv_indices. + +This module exposes a single function: + + create_kv_indices(req_to_token, req_pool_indices, + page_kernel_lens, kv_indptr, + kv_start_idx, kv_indices) + +which is a Python binding around the C++/CUDA kernel defined in +`mllm_kernel/cuda/csrc/create_kv_indices.cuh`. + +The kernel transforms pymllm's 2-D ReqToTokenPool mapping table into the flat +`(kv_indptr, kv_indices)` layout expected by FlashInfer's paged KV attention +wrappers. It is carefully written for maximum throughput and is intended to +replace the Triton implementation `_create_kv_indices_triton` in +`pymllm.layers.attention.flashinfer_backend`. +""" + +from __future__ import annotations + +import torch + +from mllm_kernel.jit_utils import cache_once, jit + + +@cache_once +def _make_create_kv_indices_kernel(): + """JIT-compile the CUDA kernel and return a callable wrapper. + + The JIT system will: + * locate `create_kv_indices.cuh` under the mllm-kernel CUDA csrc tree, + * compile it into a TVM FFI module, + * expose `CreateKvIndicesKernel::run` as `compiled_module.create_kv_indices`. + """ + + @jit( + args=[], + device="cuda", + cuda_files=["create_kv_indices.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("create_kv_indices", "CreateKvIndicesKernel::run"), + ], + func_name="create_kv_indices", + ) + def _kernel( + compiled_module, + req_to_token: torch.Tensor, + req_pool_indices: torch.Tensor, + page_kernel_lens: torch.Tensor, + kv_indptr: torch.Tensor, + kv_start_idx: torch.Tensor, + kv_indices: torch.Tensor, + ) -> None: + compiled_module.create_kv_indices( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) + + return _kernel + + +def create_kv_indices( + req_to_token: torch.Tensor, + req_pool_indices: torch.Tensor, + page_kernel_lens: torch.Tensor, + kv_indptr: torch.Tensor, + kv_start_idx: torch.Tensor | None, + kv_indices: torch.Tensor, +) -> None: + """Fill a flat KV-index buffer from the ReqToTokenPool mapping. + + This is a thin Python wrapper that forwards to the JIT-compiled CUDA + kernel. All tensors must be placed on the same CUDA device. 
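+
+    A minimal decode-path sketch (illustrative only; ``bs`` and ``seq_lens``
+    here stand for the batch size and the per-sequence KV lengths)::
+
+        kv_indptr = torch.zeros(bs + 1, dtype=torch.int32, device="cuda")
+        kv_indptr[1:] = torch.cumsum(seq_lens, dim=0)
+        kv_indices = torch.empty(
+            int(kv_indptr[-1].item()), dtype=torch.int32, device="cuda"
+        )
+        create_kv_indices(
+            req_to_token, req_pool_indices, seq_lens, kv_indptr, None, kv_indices
+        )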
+ + Args + ---- + req_to_token: + Mapping tensor from ReqToTokenPool, shape + ``[max_reqs, max_context_len]``, dtype ``torch.int32``. + req_pool_indices: + Request slots participating in this batch, shape ``[batch_size]``, + dtype ``torch.int32``. + page_kernel_lens: + Per-sequence token counts (how many tokens to attend), shape + ``[batch_size]``, dtype ``torch.int32``. + kv_indptr: + Prefix sums over per-sequence token counts, shape ``[batch_size + 1]``, + dtype ``torch.int32``. ``kv_indptr[i]`` is the starting offset in + ``kv_indices`` for sequence ``i``. + kv_start_idx: + Optional starting positions inside each sequence, shape + ``[batch_size]`` or ``[0]``, dtype ``torch.int32``. When + ``None``, the kernel assumes 0 for all sequences. + kv_indices: + Output flat KV-index buffer, shape ``[N]``, dtype ``torch.int32``. + ``N`` must be at least ``kv_indptr[batch_size]``. + """ + if kv_start_idx is None: + # Use an empty tensor to signal "no start offsets". The C++ launcher + # treats length==0 as "no kv_start" and will pass a nullptr into the + # parameter block, which is slightly cheaper than materialising a + # full zero tensor on every call. + kv_start_idx = req_pool_indices.new_empty(0, dtype=torch.int32) + + kernel = _make_create_kv_indices_kernel() + kernel( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) diff --git a/mllm-kernel/pyproject.toml b/mllm-kernel/pyproject.toml index 77340b29a..13147f068 100644 --- a/mllm-kernel/pyproject.toml +++ b/mllm-kernel/pyproject.toml @@ -55,7 +55,7 @@ logging.level = "INFO" # Wheel configuration - include the Python package wheel.packages = ["mllm_kernel"] -wheel.install-dir = "mllm_kernel" +wheel.install-dir = "" # Install directories for cmake targets wheel.cmake = true diff --git a/mllm-kernel/tests/test_create_kv_indices.py b/mllm-kernel/tests/test_create_kv_indices.py new file mode 100644 index 000000000..e8bf770a3 --- /dev/null +++ b/mllm-kernel/tests/test_create_kv_indices.py @@ -0,0 +1,191 @@ +from __future__ import annotations + +import pytest +import torch + +from mllm_kernel.cuda.jit.create_kv_indices import create_kv_indices + + +def _make_batch( + *, + max_reqs: int, + max_ctx: int, + batch_size: int, + use_start_offsets: bool, + seed: int = 0, +): + """Construct a random-but-bounded test batch for create_kv_indices. + + The constraints ensure that for every sequence i: + 0 <= kv_start_idx[i] + 0 < page_kernel_lens[i] + kv_start_idx[i] + page_kernel_lens[i] <= max_ctx + so the kernel never reads beyond the ReqToTokenPool row. + """ + # Use a CUDA generator for randperm (which requires matching device) + # and a separate CPU generator for randint (which only accepts CPU). + g_cuda = torch.Generator(device="cuda").manual_seed(seed) + g_cpu = torch.Generator(device="cpu").manual_seed(seed) + + device = "cuda" + # req_to_token[req_slot, position] -> kv_index (here we simply use a + # monotonically increasing pattern so correctness is easy to check). + req_to_token = torch.arange( + max_reqs * max_ctx, dtype=torch.int32, device=device + ).reshape(max_reqs, max_ctx) + + # Sample distinct request slots for the batch. + assert batch_size <= max_reqs + req_pool_indices = torch.randperm(max_reqs, generator=g_cuda, device=device)[ + :batch_size + ].to(torch.int32) + + # For each sequence choose a valid (start, length) pair. 
+ page_kernel_lens_list = [] + kv_start_idx_list = [] + for _ in range(batch_size): + # ensure at least 1 token per sequence + L = int(torch.randint(1, max_ctx, (1,), generator=g_cpu).item()) + if use_start_offsets: + start_max = max_ctx - L + start = int(torch.randint(0, max(start_max, 1), (1,), generator=g_cpu).item()) + else: + start = 0 + page_kernel_lens_list.append(L) + kv_start_idx_list.append(start) + + page_kernel_lens = torch.tensor( + page_kernel_lens_list, dtype=torch.int32, device=device + ) + kv_start_idx = torch.tensor(kv_start_idx_list, dtype=torch.int32, device=device) + + # Build kv_indptr prefix sums. + kv_indptr = torch.empty(batch_size + 1, dtype=torch.int32, device=device) + kv_indptr[0] = 0 + kv_indptr[1:] = torch.cumsum(page_kernel_lens, dim=0) + + kv_indices = torch.empty( + int(kv_indptr[-1].item()), dtype=torch.int32, device=device + ) + + return ( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required") +@pytest.mark.parametrize("use_start_offsets", [False, True]) +@pytest.mark.parametrize( + "batch_size,max_reqs,max_ctx", + [ + (1, 4, 16), # minimal batch + (4, 8, 64), # small batch + (32, 64, 512), # medium batch, longer context + (128, 256, 2048), # larger batch, stress inner loop + ], +) +def test_create_kv_indices_matches_reference( + use_start_offsets: bool, + batch_size: int, + max_reqs: int, + max_ctx: int, +): + """create_kv_indices must match a naive PyTorch reference implementation. + + The reference is computed on CPU using explicit loops over + (request_slot, start, length); the CUDA kernel must produce identical + flat kv_indices for the same inputs. + """ + ( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx, + kv_indices, + ) = _make_batch( + max_reqs=max_reqs, + max_ctx=max_ctx, + batch_size=batch_size, + use_start_offsets=use_start_offsets, + seed=2026, + ) + + # Call CUDA kernel (kv_start_idx can be None to exercise that path). + create_kv_indices( + req_to_token, + req_pool_indices, + page_kernel_lens, + kv_indptr, + kv_start_idx if use_start_offsets else None, + kv_indices, + ) + torch.cuda.synchronize() + + # Naive reference on CPU. 
+ req_to_token_cpu = req_to_token.cpu() + req_pool_indices_cpu = req_pool_indices.cpu().to(torch.long) + page_kernel_lens_cpu = page_kernel_lens.cpu() + kv_start_idx_cpu = kv_start_idx.cpu() + + ref_segments = [] + for i in range(batch_size): + req = req_pool_indices_cpu[i].item() + start = kv_start_idx_cpu[i].item() if use_start_offsets else 0 + L = page_kernel_lens_cpu[i].item() + row = req_to_token_cpu[req, start : start + L] + ref_segments.append(row) + ref = torch.cat(ref_segments, dim=0) + + assert kv_indices.shape == ref.shape + assert torch.equal(kv_indices.cpu(), ref) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required") +def test_single_token_per_sequence(): + """Each sequence has exactly 1 token — exercises the minimal-work path.""" + device = "cuda" + bs = 8 + max_ctx = 32 + req_to_token = torch.arange(bs * max_ctx, dtype=torch.int32, device=device).reshape(bs, max_ctx) + req_pool_indices = torch.arange(bs, dtype=torch.int32, device=device) + page_kernel_lens = torch.ones(bs, dtype=torch.int32, device=device) + kv_indptr = torch.arange(bs + 1, dtype=torch.int32, device=device) + kv_indices = torch.empty(bs, dtype=torch.int32, device=device) + + create_kv_indices(req_to_token, req_pool_indices, page_kernel_lens, kv_indptr, None, kv_indices) + torch.cuda.synchronize() + + # Each sequence contributes req_to_token[i, 0]. + expected = req_to_token[:, 0] + assert torch.equal(kv_indices, expected) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required") +def test_oversized_output_buffer(): + """kv_indices buffer is larger than needed (prefill path uses +256 padding).""" + device = "cuda" + bs = 4 + max_ctx = 64 + req_to_token = torch.arange(bs * max_ctx, dtype=torch.int32, device=device).reshape(bs, max_ctx) + req_pool_indices = torch.arange(bs, dtype=torch.int32, device=device) + page_kernel_lens = torch.full((bs,), 10, dtype=torch.int32, device=device) + kv_indptr = torch.arange(0, bs * 10 + 1, 10, dtype=torch.int32, device=device) + # Allocate with extra padding, like the prefill path does. + kv_indices = torch.full((bs * 10 + 256,), -1, dtype=torch.int32, device=device) + + create_kv_indices(req_to_token, req_pool_indices, page_kernel_lens, kv_indptr, None, kv_indices) + torch.cuda.synchronize() + + # First bs*10 entries should match; padding should remain -1. + ref_segments = [] + for i in range(bs): + ref_segments.append(req_to_token[i, :10]) + ref = torch.cat(ref_segments, dim=0) + assert torch.equal(kv_indices[:bs * 10], ref) + assert torch.all(kv_indices[bs * 10:] == -1) diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 9e399d62d..f6a2090fc 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -79,6 +79,39 @@ class ServerConfig: # Feature switches # --------------------------------------------------------------------- # enable_shared_queue: bool = False # Use shared memory queue for fast IPC + + # CUDA IPC transport for multimodal GPU tensors. + # Requires enable_shared_queue=True to take effect. + # + # Three transport modes (mutually exclusive for GPU tensors): + # + # "default" + # GPU tensors are moved to CPU first (GPU→CPU copy), then placed in + # POSIX shared memory via share_memory_(). Safe but adds a device copy. + # + # "cuda_ipc" + # GPU tensors stay on GPU. 
Each tensor is wrapped in a + # TransportProxyTensor whose __getstate__ calls storage._share_cuda_() + # to obtain an IPC handle; the receiver reconstructs via + # UntypedStorage._new_shared_cuda(*handle). Simple, but the underlying + # GPU allocation is never freed until the sender process exits + # (PyTorch limitation) -- can leak GPU memory in long-running services. + # + # "cuda_ipc_pool" [recommended for production] + # GPU tensors are copied into a pre-allocated fixed-size GPU workspace + # (MmItemMemoryPool). Each outgoing tensor occupies a "chunk" of the + # pool; the chunk's IPC handle is sent via CudaIpcTensorTransportProxy. + # After the receiver finishes copying data it increments a shared-memory + # sync flag; a background recycler thread in the sender watches these + # flags and returns chunks to the available pool. No GPU memory is leaked. + tensor_transport_mode: str = "default" # one of: default, cuda_ipc, cuda_ipc_pool + + # Size of the pre-allocated CUDA IPC memory pool in MB. + # Only used when tensor_transport_mode == "cuda_ipc_pool". + cuda_ipc_pool_size_mb: int = 512 + + # How often (seconds) the pool recycler thread wakes up. + cuda_ipc_recycle_interval: float = 0.1 # enable_lora: bool = False # max_loaded_loras: Optional[int] = None # max_loras_per_batch: int = 8 @@ -102,6 +135,18 @@ def __post_init__(self) -> None: self._validate() def _validate(self) -> None: + valid_modes = {"default", "cuda_ipc", "cuda_ipc_pool"} + if self.tensor_transport_mode not in valid_modes: + raise ValueError( + f"`tensor_transport_mode` must be one of {valid_modes}, " + f"got {self.tensor_transport_mode!r}." + ) + if self.tensor_transport_mode != "default" and not self.enable_shared_queue: + raise ValueError( + "`tensor_transport_mode` != 'default' requires `enable_shared_queue=True`." + ) + if self.cuda_ipc_pool_size_mb <= 0: + raise ValueError("`cuda_ipc_pool_size_mb` must be > 0.") if self.port <= 0 or self.port > 65535: raise ValueError("`port` must be in range [1, 65535].") if self.max_prefill_tokens is not None and self.max_prefill_tokens <= 0: diff --git a/pymllm/engine/__init__.py b/pymllm/engine/__init__.py index e69de29bb..50f2b7249 100644 --- a/pymllm/engine/__init__.py +++ b/pymllm/engine/__init__.py @@ -0,0 +1,8 @@ +"""Engine module for pymllm.""" + +from pymllm.engine.forward_batch import ForwardBatch, ForwardMode + +__all__ = [ + "ForwardBatch", + "ForwardMode", +] diff --git a/pymllm/engine/forward_batch.py b/pymllm/engine/forward_batch.py index e69de29bb..ebb715ff4 100644 --- a/pymllm/engine/forward_batch.py +++ b/pymllm/engine/forward_batch.py @@ -0,0 +1,182 @@ +"""ForwardMode and ForwardBatch for pymllm. + +Simplified forward-batch abstraction: no speculative decoding, no +encoder-decoder support, and no distributed-attention complexity (DP/TP +head splitting is handled at the layer level by the model code, not here). 
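+
+A decode-step batch can be constructed roughly like this (illustrative only;
+the tensor names are placeholders, all assumed to live on the target device)::
+
+    batch = ForwardBatch(
+        forward_mode=ForwardMode.DECODE,
+        batch_size=bs,
+        input_ids=next_token_ids,            # [bs]
+        req_pool_indices=req_pool_indices,   # [bs] int32
+        seq_lens=seq_lens,                   # [bs] int32
+        out_cache_loc=out_cache_loc,         # [bs] int64
+        seq_lens_sum=int(seq_lens.sum().item()),
+        positions=(seq_lens - 1).to(torch.int64),
+    )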
+ +Typical data flow +----------------- + ModelRunner builds a ForwardBatch + ↓ + attn_backend.init_forward_metadata(forward_batch) + ↓ + model.forward(input_ids, positions, forward_batch) + ↓ + RadixAttention.forward(q, k, v, forward_batch) + ↓ + forward_batch.attn_backend.forward(q, k, v, layer, forward_batch) +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import IntEnum, auto +from typing import TYPE_CHECKING, List, Optional + +import torch + +if TYPE_CHECKING: + from pymllm.layers.attention.attention_backend import AttentionBackend + from pymllm.mem_cache.memory_pool import KVPool, ReqToTokenPool + + +# --------------------------------------------------------------------------- +# ForwardMode +# --------------------------------------------------------------------------- + + +class ForwardMode(IntEnum): + """Describes what kind of forward pass is being performed. + + Covers standard prefill / decode inference without speculative decoding. + """ + + # Prefill / extend: process new tokens. The KV cache of the prefix (if + # any) is already populated (e.g. shared system-prompt via radix cache). + EXTEND = auto() + + # Decode: generate exactly one new token per sequence. + DECODE = auto() + + # Mixed: a chunked-prefill batch that contains both extend and decode + # sequences simultaneously. + MIXED = auto() + + # Idle: no sequences to process (used with data-parallel workers when some + # ranks have no allocated sequences). + IDLE = auto() + + # ---- helpers ---- + + def is_extend(self) -> bool: + """True for EXTEND or MIXED (i.e. any prefill-style pass).""" + return self in (ForwardMode.EXTEND, ForwardMode.MIXED) + + def is_prefill(self) -> bool: + """Alias for ``is_extend()``.""" + return self.is_extend() + + def is_decode(self) -> bool: + return self == ForwardMode.DECODE + + def is_mixed(self) -> bool: + return self == ForwardMode.MIXED + + def is_idle(self) -> bool: + return self == ForwardMode.IDLE + + def is_decode_or_idle(self) -> bool: + return self == ForwardMode.DECODE or self == ForwardMode.IDLE + + +# --------------------------------------------------------------------------- +# ForwardBatch +# --------------------------------------------------------------------------- + + +@dataclass +class ForwardBatch: + """All tensors required by a single forward pass through the model. + + Parameters + ---------- + forward_mode + The kind of pass being performed (EXTEND / DECODE / MIXED / IDLE). + batch_size + Number of sequences in the batch. + input_ids + Token ids for every position in the batch, shape ``[num_tokens]``. + For decode, ``num_tokens == batch_size``; for extend, + ``num_tokens == extend_num_tokens``. + req_pool_indices + Index of each sequence in ``ReqToTokenPool``, shape ``[batch_size]`` + (int32 or int64, on the target device). + seq_lens + Total (prefix + new) length of each sequence, shape ``[batch_size]`` + (int32). + out_cache_loc + KV-pool slot that each *output* token is written to, shape + ``[num_tokens]`` (int64). + seq_lens_sum + Python ``int`` equal to ``seq_lens.sum()``. Cached to avoid repeated + device-to-host syncs. + seq_lens_cpu + CPU copy of ``seq_lens`` (optional; used by some attention backends + for plan computation without a device sync). + positions + Token position for each input token, shape ``[num_tokens]`` + (int32 or int64). + extend_num_tokens + Total number of new (non-prefix) tokens across the batch. Only set + during EXTEND / MIXED passes. 
+ extend_seq_lens + Number of *new* tokens for each sequence, shape ``[batch_size]`` + (int32). Only set during EXTEND / MIXED. + extend_prefix_lens + Length of the already-cached prefix for each sequence, + shape ``[batch_size]`` (int32). Only set during EXTEND / MIXED. + extend_start_loc + Cumulative start offset of each sequence in the flattened extend + token stream, shape ``[batch_size]`` (int32). + extend_prefix_lens_cpu + CPU list mirror of ``extend_prefix_lens``. + extend_seq_lens_cpu + CPU list mirror of ``extend_seq_lens``. + return_logprob + Whether to compute per-token log-probabilities. + top_logprobs_nums + Number of top log-probs to return per sequence (None or list of ints). + req_to_token_pool + Reference to the ``ReqToTokenPool`` (set by the model runner). + token_to_kv_pool + Reference to the ``KVPool`` (set by the model runner). + attn_backend + The attention backend to use (set by the model runner before calling + ``model.forward``). + """ + + # ---- required fields (positional) ---- + forward_mode: ForwardMode + batch_size: int + input_ids: torch.Tensor # [num_tokens] + req_pool_indices: torch.Tensor # [batch_size] int32/int64 + seq_lens: torch.Tensor # [batch_size] int32 + out_cache_loc: torch.Tensor # [num_tokens] int64 + seq_lens_sum: int # python int + + # ---- optional metadata ---- + + # CPU mirror of seq_lens + seq_lens_cpu: Optional[torch.Tensor] = None + + # Position encoding – shape [num_tokens], int32 or int64 + positions: Optional[torch.Tensor] = None + + # ---- extend / prefill specific ---- + extend_num_tokens: Optional[int] = None + extend_seq_lens: Optional[torch.Tensor] = None # [batch_size] int32 + extend_prefix_lens: Optional[torch.Tensor] = None # [batch_size] int32 + extend_start_loc: Optional[torch.Tensor] = None # [batch_size] int32 + extend_prefix_lens_cpu: Optional[List[int]] = None + extend_seq_lens_cpu: Optional[List[int]] = None + + # ---- logprob options ---- + return_logprob: bool = False + top_logprobs_nums: Optional[List[int]] = None + + # ---- memory pools (set by model runner) ---- + req_to_token_pool: Optional["ReqToTokenPool"] = None + token_to_kv_pool: Optional["KVPool"] = None + + # ---- attention backend (set by model runner) ---- + attn_backend: Optional["AttentionBackend"] = None diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index 2200d7f33..2ba04e1c1 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -26,7 +26,6 @@ ReqState, RequestResponseProcess, ) -from pymllm.orchestrator.shared_memory_queue import TensorQueue from pymllm.orchestrator.tokenizer_process import run_tokenizer_process from pymllm.orchestrator.scheduler_process import run_scheduler_process from pymllm.orchestrator.model_runner_process import run_model_runner_process @@ -80,13 +79,30 @@ def _launch_processes(self) -> None: # Config dict for the tokenizer subprocess (must be picklable). cfg = get_global_config() enable_shared_queue = cfg.server.enable_shared_queue - - # Create shared queue if enabled + transport_mode: str = ( + cfg.server.tensor_transport_mode + ) # "default" | "cuda_ipc" | "cuda_ipc_pool" + + # Create shared queue if enabled. + # Note: the MmItemMemoryPool (for "cuda_ipc_pool") is created *inside* + # the tokenizer subprocess after CUDA is initialised. The queue here + # is constructed without a pool; TokenizerProcess._ensure_pool() will + # swap in a pool-aware queue at runtime. shared_queue = None if enable_shared_queue: - # TODO: WCH init CUDA IPC things. 
- shared_queue = TensorQueue(maxsize=1000) # Configurable max size - logger.info("Shared memory queue enabled for fast IPC") + from pymllm.orchestrator.shared_memory_queue import TensorQueue as _TQ + + # Construct with the configured transport mode. The pool is not + # supplied here; it will be lazily initialised inside the subprocess. + shared_queue = _TQ( + maxsize=1000, + transport_mode=transport_mode, + pool=None, # pool initialised lazily inside TokenizerProcess + ) + logger.info( + "Shared memory queue enabled for fast IPC (transport_mode=%s)", + transport_mode, + ) tokenizer_cfg: Dict[str, Any] = { "tokenizer_path": str(cfg.server.tokenizer_path), @@ -95,6 +111,9 @@ def _launch_processes(self) -> None: "context_length": cfg.server.context_length, "hf_config": cfg.model.hf_config, "enable_shared_queue": enable_shared_queue, + "tensor_transport_mode": transport_mode, + "cuda_ipc_pool_size_mb": cfg.server.cuda_ipc_pool_size_mb, + "cuda_ipc_recycle_interval": cfg.server.cuda_ipc_recycle_interval, } # Tokenizer @@ -124,6 +143,7 @@ def _launch_processes(self) -> None: scheduler_writer, shared_queue, # Pass shared queue enable_shared_queue, # Pass flag + transport_mode, # Pass tensor transport mode ), daemon=True, ) diff --git a/pymllm/layers/attention/__init__.py b/pymllm/layers/attention/__init__.py index e69de29bb..5d0dbf076 100644 --- a/pymllm/layers/attention/__init__.py +++ b/pymllm/layers/attention/__init__.py @@ -0,0 +1,25 @@ +"""Attention layers and backends for pymllm.""" + +from pymllm.layers.attention.attention_backend import AttentionBackend +from pymllm.layers.attention.flashinfer_backend import ( + DecodeMetadata, + FlashInferAttnBackend, + PrefillMetadata, + WrapperDispatch, + should_use_tensor_core, +) +from pymllm.layers.attention.radix_attention import AttentionType, RadixAttention + +__all__ = [ + # Base + "AttentionBackend", + # RadixAttention + "AttentionType", + "RadixAttention", + # FlashInfer backend + "FlashInferAttnBackend", + "DecodeMetadata", + "PrefillMetadata", + "WrapperDispatch", + "should_use_tensor_core", +] diff --git a/pymllm/layers/attention/attention_backend.py b/pymllm/layers/attention/attention_backend.py index e69de29bb..07e2f6a17 100644 --- a/pymllm/layers/attention/attention_backend.py +++ b/pymllm/layers/attention/attention_backend.py @@ -0,0 +1,143 @@ +"""Abstract base class for pymllm attention backends. + +Every concrete backend (FlashInfer, Triton, torch-native, …) must implement +at minimum: + + * ``init_forward_metadata`` – called once per batch before the model forward. + * ``forward_extend`` – prefill / extend attention. + * ``forward_decode`` – single-token decode attention. + +The public ``forward`` method dispatches to the correct variant based on +``forward_batch.forward_mode``. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Optional + +import torch + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch, ForwardMode + from pymllm.layers.attention.radix_attention import RadixAttention + + +class AttentionBackend(ABC): + """Abstract base class for attention backends. + + All concrete backends inherit from this class and implement the abstract + methods below. 
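+
+    A minimal concrete backend only has to implement the three abstract
+    methods; the class name below is purely illustrative::
+
+        class NaiveTorchBackend(AttentionBackend):
+            def init_forward_metadata(self, forward_batch):
+                pass  # nothing to plan ahead of the forward pass
+
+            def forward_extend(self, q, k, v, layer, forward_batch,
+                               save_kv_cache=True, **kwargs):
+                ...
+
+            def forward_decode(self, q, k, v, layer, forward_batch,
+                               save_kv_cache=True, **kwargs):
+                ...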
+ """ + + # ------------------------------------------------------------------ + # Core interface – must be implemented by every backend + # ------------------------------------------------------------------ + + @abstractmethod + def init_forward_metadata(self, forward_batch: "ForwardBatch") -> None: + """Prepare per-batch metadata before the model's attention layers run. + + For FlashInfer this plans the KV-index arrays and calls + ``wrapper.begin_forward``; for Triton / torch-native this is a no-op. + Must be called once per batch *before* ``model.forward``. + """ + raise NotImplementedError + + @abstractmethod + def forward_decode( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Run attention for a decode step (one new token per sequence).""" + raise NotImplementedError + + @abstractmethod + def forward_extend( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Run attention for a prefill / extend step.""" + raise NotImplementedError + + # ------------------------------------------------------------------ + # Dispatch – shared logic; do not override in normal backends + # ------------------------------------------------------------------ + + def forward( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Dispatch to ``forward_decode`` or ``forward_extend`` based on mode. + + For IDLE batches a zero-filled output tensor is returned without any + compute. + """ + if forward_batch.forward_mode.is_idle(): + # Return empty output without computation. + return q.new_empty(q.shape[0], layer.tp_q_head_num * layer.v_head_dim) + elif forward_batch.forward_mode.is_decode(): + return self.forward_decode( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + else: + return self.forward_extend( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + + # ------------------------------------------------------------------ + # Optional CUDA-graph interface + # ------------------------------------------------------------------ + + def get_cuda_graph_seq_len_fill_value(self) -> int: + """Fill value used to pad ``seq_lens`` tensors for CUDA-graph capture. + + Most backends use ``1`` (not ``0``) to avoid division-by-zero in + attention kernels. 
+ """ + raise NotImplementedError + + def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int) -> None: + """Allocate shared CUDA-graph state (buffers reused across captures).""" + raise NotImplementedError + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + num_tokens: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + forward_mode: "ForwardMode", + ) -> None: + """Set up per-batch metadata for capturing a CUDA graph.""" + raise NotImplementedError + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_sum: int, + forward_mode: "ForwardMode", + seq_lens_cpu: Optional[torch.Tensor], + ) -> None: + """Update metadata when replaying a captured CUDA graph.""" + raise NotImplementedError diff --git a/pymllm/layers/attention/flashinfer_backend.py b/pymllm/layers/attention/flashinfer_backend.py index e69de29bb..479fb5cec 100644 --- a/pymllm/layers/attention/flashinfer_backend.py +++ b/pymllm/layers/attention/flashinfer_backend.py @@ -0,0 +1,964 @@ +"""FlashInfer attention backend for pymllm. + + * No model-runner object -- constructor takes explicit scalar / tensor params. + * No tensor-parallelism head splitting (handled at the model layer level). + * No speculative decoding support. + * ``KVPool`` API: + - ``get_kv_buffer(layer_id)`` returns ``(k_buf, v_buf)`` each shaped + ``[buf_len, num_heads, head_dim]``. + - ``set_kv_buffer(layer_id, indices, k, v)`` -- no scale arguments. + +Supports: + * Single-wrapper mode (full context, no sliding window) + * Sliding-window mode (two wrappers: window + full) + * CUDA-graph capture / replay for decode and target-verify passes. +""" + +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass +from enum import Enum, auto +from typing import List, Optional, Union + +import torch + +from pymllm.engine.forward_batch import ForwardBatch, ForwardMode +from pymllm.layers.attention.attention_backend import AttentionBackend +from mllm_kernel.cuda.jit.create_kv_indices import create_kv_indices + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Optional FlashInfer import +# --------------------------------------------------------------------------- + +_flashinfer_available = False +try: + from flashinfer import ( + BatchDecodeWithPagedKVCacheWrapper, + BatchPrefillWithPagedKVCacheWrapper, + BatchPrefillWithRaggedKVCacheWrapper, + ) + + try: + from flashinfer import fast_decode_plan + from functools import partial as _partial + + _has_fast_decode_plan = True + except ImportError: + _has_fast_decode_plan = False + + from flashinfer.cascade import merge_state + + _flashinfer_available = True +except ImportError: + logger.warning( + "flashinfer is not installed; FlashInferAttnBackend will raise " + "NotImplementedError if used." + ) + +# --------------------------------------------------------------------------- +# Global workspace buffer (shared across all FlashInfer wrapper instances) +# --------------------------------------------------------------------------- + +_global_workspace_buffer: Optional[torch.Tensor] = None + +# Default workspace size (128 MB); can be overridden via environment variable. 
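+# For example, PYMLLM_FLASHINFER_WORKSPACE_SIZE=268435456 requests a 256 MB workspace.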
+_DEFAULT_WORKSPACE_BYTES = int( + os.environ.get("PYMLLM_FLASHINFER_WORKSPACE_SIZE", 128 * 1024 * 1024) +) + +# --------------------------------------------------------------------------- +# Enums / dataclasses +# --------------------------------------------------------------------------- + + +class WrapperDispatch(Enum): + """Indicates which wrapper to use for a given attention layer.""" + + SLIDING_WINDOW = auto() + CROSS_ATTENTION = auto() + + +@dataclass +class DecodeMetadata: + """Per-batch metadata for a decode step.""" + + decode_wrappers: "List[BatchDecodeWithPagedKVCacheWrapper]" + + +@dataclass +class PrefillMetadata: + """Per-batch metadata for a prefill / extend step.""" + + prefill_wrappers: "List[BatchPrefillWithPagedKVCacheWrapper]" + use_ragged: bool + extend_no_prefix: bool + + +# --------------------------------------------------------------------------- +# CUDA kernel – build the flat kv_indices array for FlashInfer +# --------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- +# Helper – choose whether to use tensor cores for decode +# --------------------------------------------------------------------------- + + +def should_use_tensor_core( + kv_cache_dtype: torch.dtype, + num_attention_heads: int, + num_kv_heads: int, +) -> bool: + """Return whether FlashInfer decode should use tensor cores. + + For FP8 we always use tensor cores. For fp16 / bf16 we use them when + the GQA group size (num_attention_heads / num_kv_heads) is ≥ 4, which + fuses the head group with the token dimension in the MMA instruction. + """ + env_override = os.environ.get("PYMLLM_FLASHINFER_USE_TENSOR_CORE") + if env_override is not None: + return env_override.lower() == "true" + + try: + from flashinfer.decode import _grouped_size_compiled_for_decode_kernels + + return not _grouped_size_compiled_for_decode_kernels( + num_attention_heads, num_kv_heads + ) + except (ImportError, AttributeError): + pass + + gqa_group_size = num_attention_heads // num_kv_heads + if kv_cache_dtype in (torch.float8_e4m3fn, torch.float8_e5m2): + return True + if kv_cache_dtype in (torch.float16, torch.half, torch.bfloat16): + return gqa_group_size >= 4 + return False + + +# --------------------------------------------------------------------------- +# FlashInferAttnBackend +# --------------------------------------------------------------------------- + + +class FlashInferAttnBackend(AttentionBackend): + """FlashInfer-based attention backend for pymllm. + + This class does not depend on a ``ModelRunner`` object. Instead it takes + all required configuration explicitly so that it can be constructed + independently of any particular model runner. + + Parameters + ---------- + num_heads + Number of query heads per device (after any TP sharding). + num_kv_heads + Number of KV heads per device. + head_dim + Per-head dimension for Q and K. + kv_cache_dtype + ``torch.dtype`` of the KV cache (e.g. ``torch.float16``). + q_dtype + ``torch.dtype`` of the query tensor. + max_context_len + Maximum sequence length the model supports. + req_to_token + The ``[max_reqs, max_context_len]`` int32 tensor from + ``ReqToTokenPool.req_to_token``. + device + Target device (e.g. ``torch.device("cuda")``) + max_req_pool_size + Maximum number of concurrent requests (= ``ReqToTokenPool.size``). + Used to pre-allocate ``kv_indptr`` / ``kv_last_page_len`` buffers. 
+ sliding_window_size + When not ``None``, enables sliding-window attention mode which + allocates two wrapper sets (window + full context). + skip_prefill + When ``True``, skip creating prefill wrappers (for backends that only + perform decode, e.g. multi-step draft backends). + kv_indptr_buf + Optional pre-allocated ``kv_indptr`` buffer. Used when sharing + buffers across multiple backend instances (e.g. multi-step draft). + kv_last_page_len_buf + Optional pre-allocated ``kv_last_page_len`` buffer. + init_new_workspace + When ``True`` allocate a fresh workspace buffer instead of reusing the + global one. + """ + + def __init__( + self, + num_heads: int, + num_kv_heads: int, + head_dim: int, + kv_cache_dtype: torch.dtype, + q_dtype: torch.dtype, + max_context_len: int, + req_to_token: torch.Tensor, + device: torch.device, + max_req_pool_size: int, + sliding_window_size: Optional[int] = None, + skip_prefill: bool = False, + kv_indptr_buf: Optional[torch.Tensor] = None, + kv_last_page_len_buf: Optional[torch.Tensor] = None, + init_new_workspace: bool = False, + ): + if not _flashinfer_available: + raise RuntimeError( + "flashinfer is required for FlashInferAttnBackend but is not " + "installed. Run: pip install flashinfer-python" + ) + + super().__init__() + + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + self.kv_cache_dtype = kv_cache_dtype + self.q_dtype = q_dtype + self.max_context_len = max_context_len + self.req_to_token = req_to_token + self.device = device + self.skip_prefill = skip_prefill + + # Tensor-core preference for decode + self.decode_use_tensor_cores = should_use_tensor_core( + kv_cache_dtype, num_heads, num_kv_heads + ) + + # Sliding-window / cross-attention wrapper dispatch + if sliding_window_size is not None: + self.num_wrappers = 2 + self.dispatch_reason: Optional[WrapperDispatch] = ( + WrapperDispatch.SLIDING_WINDOW + ) + self.sliding_window_size: Optional[int] = sliding_window_size + else: + self.num_wrappers = 1 + self.dispatch_reason = None + self.sliding_window_size = None + + # ------------------------------------------------------------------ + # Workspace buffer + # ------------------------------------------------------------------ + global _global_workspace_buffer + if _global_workspace_buffer is None: + _global_workspace_buffer = torch.empty( + _DEFAULT_WORKSPACE_BYTES, + dtype=torch.uint8, + device=device, + ) + if init_new_workspace: + self.workspace_buffer = torch.empty( + _DEFAULT_WORKSPACE_BYTES, + dtype=torch.uint8, + device=device, + ) + else: + self.workspace_buffer = _global_workspace_buffer + + # ------------------------------------------------------------------ + # kv_indptr [num_wrappers × (max_req_pool_size + 1)] + # kv_last_page_len [max_req_pool_size] + # ------------------------------------------------------------------ + if kv_indptr_buf is None: + self.kv_indptr: List[torch.Tensor] = [ + torch.zeros((max_req_pool_size + 1,), dtype=torch.int32, device=device) + for _ in range(self.num_wrappers) + ] + else: + assert self.num_wrappers == 1 + self.kv_indptr = [kv_indptr_buf] + + if kv_last_page_len_buf is None: + self.kv_last_page_len = torch.ones( + (max_req_pool_size,), dtype=torch.int32, device=device + ) + else: + assert self.num_wrappers == 1 + self.kv_last_page_len = kv_last_page_len_buf + + # qo_indptr – only needed for prefill + if not skip_prefill: + self.qo_indptr: List[torch.Tensor] = [ + torch.zeros((max_req_pool_size + 1,), dtype=torch.int32, device=device) + for _ in 
range(self.num_wrappers) + ] + + # ------------------------------------------------------------------ + # Create FlashInfer wrappers + # ------------------------------------------------------------------ + self.prefill_wrapper_ragged: Optional[ + "BatchPrefillWithRaggedKVCacheWrapper" + ] = None + self.prefill_wrappers_paged: List["BatchPrefillWithPagedKVCacheWrapper"] = [] + self.decode_wrappers: List["BatchDecodeWithPagedKVCacheWrapper"] = [] + + if not skip_prefill: + self.prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper( + self.workspace_buffer, "NHD" + ) + + for _ in range(self.num_wrappers): + if not skip_prefill: + self.prefill_wrappers_paged.append( + BatchPrefillWithPagedKVCacheWrapper(self.workspace_buffer, "NHD") + ) + self.decode_wrappers.append( + BatchDecodeWithPagedKVCacheWrapper( + self.workspace_buffer, + "NHD", + use_tensor_cores=self.decode_use_tensor_cores, + ) + ) + + # ------------------------------------------------------------------ + # Indices updaters + # ------------------------------------------------------------------ + if not skip_prefill: + self.indices_updater_prefill = _FlashInferIndicesUpdaterPrefill(self) + self.indices_updater_decode = _FlashInferIndicesUpdaterDecode(self) + + # Per-batch metadata set by init_forward_metadata + self.forward_metadata: Optional[Union[DecodeMetadata, PrefillMetadata]] = None + + # CUDA-graph metadata stores + self.decode_cuda_graph_metadata: dict = {} + self.prefill_cuda_graph_metadata: dict = {} + + # ------------------------------------------------------------------ + # init_forward_metadata + # ------------------------------------------------------------------ + + def init_forward_metadata(self, forward_batch: ForwardBatch) -> None: + """Prepare FlashInfer wrappers for the current batch. + + Must be called once per batch before the model's ``forward`` method. 
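+
+        For decode / idle batches this plans the decode wrappers and stores
+        ``DecodeMetadata``; for prefill / extend batches it plans the paged
+        (and, when no request has a cached prefix, the ragged) prefill
+        wrappers and stores ``PrefillMetadata``.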
+ """ + if forward_batch.forward_mode.is_decode_or_idle(): + self.indices_updater_decode.update( + forward_batch.req_pool_indices, + forward_batch.seq_lens, + forward_batch.seq_lens_cpu, + forward_batch.seq_lens_sum, + decode_wrappers=self.decode_wrappers, + ) + self.forward_metadata = DecodeMetadata(self.decode_wrappers) + else: + # Extend / prefill + prefix_lens = forward_batch.extend_prefix_lens + extend_no_prefix = ( + forward_batch.extend_prefix_lens_cpu is not None + and not any(forward_batch.extend_prefix_lens_cpu) + ) + use_ragged = extend_no_prefix + + self.indices_updater_prefill.update( + forward_batch.req_pool_indices, + forward_batch.seq_lens, + forward_batch.seq_lens_cpu, + forward_batch.seq_lens_sum, + prefix_lens=prefix_lens, + prefill_wrappers=self.prefill_wrappers_paged, + use_ragged=use_ragged, + ) + self.forward_metadata = PrefillMetadata( + self.prefill_wrappers_paged, + use_ragged=use_ragged, + extend_no_prefix=extend_no_prefix, + ) + + # ------------------------------------------------------------------ + # forward_extend + # ------------------------------------------------------------------ + + def forward_extend( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", # noqa: F821 + forward_batch: ForwardBatch, + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + from pymllm.layers.attention.radix_attention import RadixAttention + + assert isinstance(layer, RadixAttention) + meta: PrefillMetadata = self.forward_metadata + + prefill_wrapper_paged = meta.prefill_wrappers[self._get_wrapper_idx(layer)] + cache_loc = forward_batch.out_cache_loc + + # Write K/V into the pool + if k is not None: + assert v is not None + if save_kv_cache: + forward_batch.token_to_kv_pool.set_kv_buffer( + layer.layer_id, cache_loc, k, v + ) + + q_3d = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim) + + if not meta.use_ragged: + # Paged-only path: uses the full KV cache (prefix + extend). + k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer( + layer.layer_id + ) + # Reshape to [buf_len, page_size=1, num_heads, head_dim] for FlashInfer. + paged_kv = (k_cache.unsqueeze(1), v_cache.unsqueeze(1)) + + o = prefill_wrapper_paged.forward( + q_3d, + paged_kv, + causal=not layer.is_cross_attention, + sm_scale=layer.scaling, + window_left=layer.sliding_window_size, + logits_soft_cap=layer.logit_cap if layer.logit_cap > 0 else None, + ) + else: + # Ragged path: query attends only to the new (ragged) K/V; + # prefix K/V is in the paged pool. + if k is None: + # Fallback: load K/V from the pool. + k_buf, v_buf = forward_batch.token_to_kv_pool.get_kv_buffer( + layer.layer_id + ) + k = k_buf + v = v_buf + + k_3d = k.view(-1, layer.tp_k_head_num, layer.head_dim) + v_3d = v.view(-1, layer.tp_v_head_num, layer.v_head_dim) + + if meta.extend_no_prefix: + # Pure prefill – no prefix at all. + o = self.prefill_wrapper_ragged.forward( + q_3d, + k_3d, + v_3d, + causal=True, + sm_scale=layer.scaling, + logits_soft_cap=(layer.logit_cap if layer.logit_cap > 0 else None), + ) + else: + # Extend with prefix: merge ragged (new) and paged (prefix). 
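+                # Illustrative: o1/s1 are the attention output and log-sum-exp
+                # over the new (ragged) K/V only, while o2/s2 below cover the
+                # cached prefix pages; merge_state combines the two partial
+                # results into the exact full-context attention output.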
+ o1, s1 = self.prefill_wrapper_ragged.forward_return_lse( + q_3d, + k_3d, + v_3d, + causal=True, + sm_scale=layer.scaling, + logits_soft_cap=(layer.logit_cap if layer.logit_cap > 0 else None), + ) + + k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer( + layer.layer_id + ) + paged_kv = (k_cache.unsqueeze(1), v_cache.unsqueeze(1)) + o2, s2 = prefill_wrapper_paged.forward_return_lse( + q_3d, + paged_kv, + causal=False, + sm_scale=layer.scaling, + logits_soft_cap=(layer.logit_cap if layer.logit_cap > 0 else None), + ) + + o, _ = merge_state(o1, s1, o2, s2) + + return o.view(-1, layer.tp_q_head_num * layer.v_head_dim) + + # ------------------------------------------------------------------ + # forward_decode + # ------------------------------------------------------------------ + + def forward_decode( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", # noqa: F821 + forward_batch: ForwardBatch, + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + from pymllm.layers.attention.radix_attention import RadixAttention + + assert isinstance(layer, RadixAttention) + meta: DecodeMetadata = self.forward_metadata + + decode_wrapper = meta.decode_wrappers[self._get_wrapper_idx(layer)] + cache_loc = forward_batch.out_cache_loc + + if k is not None: + assert v is not None + if save_kv_cache: + forward_batch.token_to_kv_pool.set_kv_buffer( + layer.layer_id, cache_loc, k, v + ) + + k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id) + paged_kv = (k_cache.unsqueeze(1), v_cache.unsqueeze(1)) + + o = decode_wrapper.forward( + q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim), + paged_kv, + sm_scale=layer.scaling, + logits_soft_cap=layer.logit_cap if layer.logit_cap > 0 else None, + ) + + return o.view(-1, layer.tp_q_head_num * layer.v_head_dim) + + # ------------------------------------------------------------------ + # CUDA-graph support + # ------------------------------------------------------------------ + + def get_cuda_graph_seq_len_fill_value(self) -> int: + return 1 + + def init_cuda_graph_state( + self, + max_bs: int, + max_num_tokens: int, + kv_indices_buf: Optional[torch.Tensor] = None, + ) -> None: + """Allocate CUDA-graph shared state buffers.""" + if kv_indices_buf is None: + cuda_graph_kv_indices = torch.zeros( + (max_num_tokens * self.max_context_len,), + dtype=torch.int32, + device=self.device, + ) + else: + cuda_graph_kv_indices = kv_indices_buf + + self.cuda_graph_kv_indices = [cuda_graph_kv_indices] + [ + cuda_graph_kv_indices.clone() for _ in range(self.num_wrappers - 1) + ] + + if not self.skip_prefill: + self.cuda_graph_custom_mask = torch.zeros( + (max_num_tokens * self.max_context_len,), + dtype=torch.uint8, + device=self.device, + ) + self.cuda_graph_qk_indptr = [x.clone() for x in self.kv_indptr] + self.cuda_graph_qo_indptr = [x.clone() for x in self.kv_indptr] + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + num_tokens: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + forward_mode: ForwardMode, + ) -> None: + """Set up metadata for CUDA-graph capture of a decode step.""" + if not forward_mode.is_decode_or_idle(): + raise ValueError( + "CUDA-graph capture is only supported for decode / idle modes." 
+ ) + + decode_wrappers = [] + for i in range(self.num_wrappers): + decode_wrappers.append( + BatchDecodeWithPagedKVCacheWrapper( + self.workspace_buffer, + "NHD", + use_cuda_graph=True, + use_tensor_cores=self.decode_use_tensor_cores, + paged_kv_indptr_buffer=self.kv_indptr[i][: num_tokens + 1], + paged_kv_indices_buffer=self.cuda_graph_kv_indices[i], + paged_kv_last_page_len_buffer=self.kv_last_page_len[:num_tokens], + ) + ) + + seq_lens_sum = seq_lens.sum().item() + self.indices_updater_decode.update( + req_pool_indices, + seq_lens, + seq_lens.cpu(), + seq_lens_sum, + decode_wrappers=decode_wrappers, + ) + self.decode_cuda_graph_metadata[bs] = decode_wrappers + self.forward_metadata = DecodeMetadata(decode_wrappers) + + if _has_fast_decode_plan: + for i in range(self.num_wrappers): + decode_wrappers[i].begin_forward = _partial( + fast_decode_plan, decode_wrappers[i] + ) + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_sum: int, + forward_mode: ForwardMode, + seq_lens_cpu: Optional[torch.Tensor], + ) -> None: + """Update metadata when replaying a CUDA graph for decode.""" + if not forward_mode.is_decode_or_idle(): + raise ValueError( + "CUDA-graph replay is only supported for decode / idle modes." + ) + + self.indices_updater_decode.update( + req_pool_indices[:bs], + seq_lens[:bs], + seq_lens_cpu[:bs] if seq_lens_cpu is not None else None, + seq_lens_sum, + decode_wrappers=self.decode_cuda_graph_metadata[bs], + ) + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _get_wrapper_idx(self, layer) -> int: + """Return the wrapper index for the given attention layer.""" + if self.num_wrappers == 1: + return 0 + if self.dispatch_reason == WrapperDispatch.SLIDING_WINDOW: + # Wrapper 0 → sliding window attention. + # Wrapper 1 → full-context attention. + return int(layer.sliding_window_size == -1) + raise ValueError(f"Unknown dispatch reason: {self.dispatch_reason}") + + +# --------------------------------------------------------------------------- +# _FlashInferIndicesUpdaterDecode +# --------------------------------------------------------------------------- + + +class _FlashInferIndicesUpdaterDecode: + """Populates ``kv_indptr`` / ``kv_indices`` and calls + ``wrapper.begin_forward`` before every decode step. + """ + + def __init__(self, backend: FlashInferAttnBackend): + self.num_qo_heads = backend.num_heads + self.num_kv_heads = backend.num_kv_heads + self.head_dim = backend.head_dim + self.data_type = backend.kv_cache_dtype + self.q_data_type = backend.q_dtype + self.sliding_window_size = backend.sliding_window_size + self.backend = backend + + self.kv_indptr = backend.kv_indptr + self.kv_last_page_len = backend.kv_last_page_len + self.req_to_token = backend.req_to_token + + def update( + self, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor], + seq_lens_sum: int, + decode_wrappers: "List[BatchDecodeWithPagedKVCacheWrapper]", + kv_start_idx: Optional[torch.Tensor] = None, + ) -> None: + if self.backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW: + self._update_sliding_window( + req_pool_indices, + seq_lens, + seq_lens_cpu, + seq_lens_sum, + decode_wrappers, + ) + else: + # Single-wrapper: full-context decode. Build kv_indptr/kv_indices + # and call FlashInfer's plan function via the CUDA kernel. 
+ bs = len(req_pool_indices) + kv_indptr = self.kv_indptr[0] + + # Fill kv_indptr: prefix sums of paged_kernel_lens. + kv_indptr[1 : bs + 1] = torch.cumsum(seq_lens, dim=0) + kv_indptr_sliced = kv_indptr[: bs + 1] + + if seq_lens_cpu is not None: + seq_lens_sum = int(seq_lens_cpu.sum().item()) + else: + seq_lens_sum = int(seq_lens.sum().item()) + + # Allocate KV indices buffer. + if decode_wrappers and decode_wrappers[0].is_cuda_graph_enabled: + kv_indices = decode_wrappers[0]._paged_kv_indices_buf + else: + kv_indices = torch.empty( + seq_lens_sum, dtype=torch.int32, device=self.req_to_token.device + ) + + # Use high-performance CUDA kernel to populate kv_indices. + create_kv_indices( + self.req_to_token, + req_pool_indices.to(torch.int32), + seq_lens.to(torch.int32), + kv_indptr_sliced, + None, + kv_indices, + ) + + decode_wrappers = decode_wrappers or self.decode_wrappers + decode_wrappers[0].begin_forward( + kv_indptr_sliced, + kv_indices, + self.kv_last_page_len[:bs], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + 1, + data_type=self.data_type, + q_data_type=self.q_data_type, + non_blocking=True, + ) + + def _update_sliding_window( + self, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor], + seq_lens_sum: int, + decode_wrappers: "List[BatchDecodeWithPagedKVCacheWrapper]", + ) -> None: + assert self.sliding_window_size is not None + for wrapper_id in range(2): + if wrapper_id == 0: + # Sliding-window attention: clamp to window size + 1 + paged_kernel_lens = torch.clamp( + seq_lens, max=self.sliding_window_size + 1 + ) + paged_kernel_lens_sum = int(paged_kernel_lens.sum().item()) + kv_start_idx = seq_lens - paged_kernel_lens + seq_lens_cpu_tmp = ( + torch.clamp(seq_lens_cpu, max=self.sliding_window_size + 1) + if seq_lens_cpu is not None + else None + ) + else: + # Full-context attention + paged_kernel_lens = seq_lens + paged_kernel_lens_sum = seq_lens_sum + kv_start_idx = None + seq_lens_cpu_tmp = seq_lens_cpu + + bs = len(req_pool_indices) + kv_indptr = self.kv_indptr[wrapper_id] + kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0) + kv_indptr_sliced = kv_indptr[: bs + 1] + + if decode_wrappers and decode_wrappers[wrapper_id].is_cuda_graph_enabled: + kv_indices = decode_wrappers[wrapper_id]._paged_kv_indices_buf + else: + kv_indices = torch.empty( + paged_kernel_lens_sum, + dtype=torch.int32, + device=self.req_to_token.device, + ) + + # High-performance CUDA kernel populates kv_indices from req_to_token. 
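+            # Illustrative (wrapper 0, sliding_window_size=1024): a request with
+            # seq_len=4096 gets paged_kernel_lens=1025 and kv_start_idx=3071, so
+            # only the most recent window (+1) tokens are gathered into kv_indices.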
+ create_kv_indices( + self.req_to_token, + req_pool_indices.to(torch.int32), + paged_kernel_lens.to(torch.int32), + kv_indptr_sliced, + kv_start_idx.to(torch.int32) if kv_start_idx is not None else None, + kv_indices, + ) + + decode_wrappers[wrapper_id].begin_forward( + kv_indptr_sliced, + kv_indices, + self.kv_last_page_len[:bs], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + 1, + data_type=self.data_type, + q_data_type=self.q_data_type, + non_blocking=True, + ) + + +# --------------------------------------------------------------------------- +# _FlashInferIndicesUpdaterPrefill +# --------------------------------------------------------------------------- + + +class _FlashInferIndicesUpdaterPrefill: + """Populates indices and calls ``wrapper.begin_forward`` before extend.""" + + def __init__(self, backend: FlashInferAttnBackend): + self.num_qo_heads = backend.num_heads + self.num_kv_heads = backend.num_kv_heads + self.head_dim = backend.head_dim + self.data_type = backend.kv_cache_dtype + self.q_data_type = backend.q_dtype + self.sliding_window_size = backend.sliding_window_size + self.backend = backend + + self.kv_indptr = backend.kv_indptr + self.kv_last_page_len = backend.kv_last_page_len + self.qo_indptr = backend.qo_indptr + self.req_to_token = backend.req_to_token + self.prefill_wrapper_ragged = backend.prefill_wrapper_ragged + + def update( + self, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor], + seq_lens_sum: int, + prefix_lens: Optional[torch.Tensor], + prefill_wrappers: "List[BatchPrefillWithPagedKVCacheWrapper]", + use_ragged: bool, + ) -> None: + if self.backend.dispatch_reason == WrapperDispatch.SLIDING_WINDOW: + self._update_sliding_window( + req_pool_indices, + seq_lens, + seq_lens_cpu, + seq_lens_sum, + prefix_lens, + prefill_wrappers, + use_ragged, + ) + else: + if use_ragged: + paged_kernel_lens = prefix_lens + paged_kernel_lens_sum = paged_kernel_lens.sum().item() + else: + paged_kernel_lens = seq_lens + paged_kernel_lens_sum = seq_lens_sum + + self._call_begin_forward( + self.prefill_wrapper_ragged, + prefill_wrappers[0], + req_pool_indices, + paged_kernel_lens, + paged_kernel_lens_sum, + seq_lens, + prefix_lens, + kv_start_idx=None, + kv_indptr=self.kv_indptr[0], + qo_indptr=self.qo_indptr[0], + use_ragged=use_ragged, + ) + + def _update_sliding_window( + self, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_cpu: Optional[torch.Tensor], + seq_lens_sum: int, + prefix_lens: Optional[torch.Tensor], + prefill_wrappers: "List[BatchPrefillWithPagedKVCacheWrapper]", + use_ragged: bool, + ) -> None: + assert self.sliding_window_size is not None + for wrapper_id in range(2): + if wrapper_id == 0: + # Sliding-window portion uses a limited context window. + extend_lens = seq_lens - prefix_lens + paged_kernel_lens = torch.minimum( + seq_lens, + torch.tensor(self.sliding_window_size, device=seq_lens.device) + + extend_lens, + ) + paged_kernel_lens_sum = int(paged_kernel_lens.sum().item()) + kv_start_idx = seq_lens - paged_kernel_lens + else: + # Full-context portion. 
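+                # Wrapper 1 attends over the whole cached sequence: no length
+                # clamping and no kv_start_idx offset into req_to_token.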
+ paged_kernel_lens = seq_lens + paged_kernel_lens_sum = seq_lens_sum + kv_start_idx = None + + kv_indptr = self.kv_indptr[wrapper_id] + qo_indptr = self.qo_indptr[wrapper_id] + + self._call_begin_forward( + self.prefill_wrapper_ragged, + prefill_wrappers[wrapper_id], + req_pool_indices, + paged_kernel_lens, + paged_kernel_lens_sum, + seq_lens, + prefix_lens, + kv_start_idx=kv_start_idx, + kv_indptr=kv_indptr, + qo_indptr=qo_indptr, + use_ragged=use_ragged, + ) + + def _call_begin_forward( + self, + wrapper_ragged: "BatchPrefillWithRaggedKVCacheWrapper", + wrapper_paged: "BatchPrefillWithPagedKVCacheWrapper", + req_pool_indices: torch.Tensor, + paged_kernel_lens: torch.Tensor, + paged_kernel_lens_sum: int, + seq_lens: torch.Tensor, + prefix_lens: Optional[torch.Tensor], + kv_start_idx: Optional[torch.Tensor], + kv_indptr: torch.Tensor, + qo_indptr: torch.Tensor, + use_ragged: bool, + ) -> None: + bs = len(seq_lens) + + # Build kv_indptr and kv_indices using the CUDA kernel. + kv_indptr_sliced = kv_indptr[: bs + 1] + kv_indptr_sliced[1:] = torch.cumsum(paged_kernel_lens, dim=0) + + kv_indices = torch.empty( + paged_kernel_lens_sum + 256, + dtype=torch.int32, + device=req_pool_indices.device, + ) + + create_kv_indices( + self.req_to_token, + req_pool_indices.to(torch.int32), + paged_kernel_lens.to(torch.int32), + kv_indptr_sliced, + kv_start_idx.to(torch.int32) if kv_start_idx is not None else None, + kv_indices, + ) + + # Build qo_indptr (number of new tokens per sequence). + if prefix_lens is not None: + extend_lens = seq_lens - prefix_lens + else: + extend_lens = seq_lens + qo_indptr_sliced = qo_indptr[: bs + 1] + qo_indptr_sliced[1:] = torch.cumsum(extend_lens, dim=0) + + # Plan the ragged wrapper (new tokens only). + if use_ragged: + wrapper_ragged.begin_forward( + qo_indptr_sliced, + qo_indptr_sliced, + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + q_data_type=self.q_data_type, + ) + + # Plan the paged wrapper (cached prefix tokens). + wrapper_paged.begin_forward( + qo_indptr_sliced, + kv_indptr_sliced, + kv_indices, + self.kv_last_page_len[:bs], + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + 1, + q_data_type=self.q_data_type, + kv_data_type=self.data_type, + non_blocking=True, + ) diff --git a/pymllm/layers/attention/radix_attention.py b/pymllm/layers/attention/radix_attention.py index e69de29bb..114130dbf 100644 --- a/pymllm/layers/attention/radix_attention.py +++ b/pymllm/layers/attention/radix_attention.py @@ -0,0 +1,171 @@ +"""RadixAttention -- the attention layer used by pymllm models. + +This module is kept small intentionally: all heavy computation is delegated +to the pluggable ``AttentionBackend`` that is attached to the ``ForwardBatch``. +""" + +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, Optional + +import torch +from torch import nn + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch + + +# --------------------------------------------------------------------------- +# AttentionType +# --------------------------------------------------------------------------- + + +class AttentionType(Enum): + """Attention variant used by a :class:`RadixAttention` layer. + + Uses string values so that ``torch.compile`` can treat them as constants. + """ + + # Standard causal self-attention in a decoder layer. + DECODER = "decoder" + + # Bidirectional self-attention for image tokens inside a decoder + # (e.g. VLM visual encoder embedded in the language model). 
+ DECODER_BIDIRECTIONAL = "decoder_bidirectional" + + # Full bidirectional self-attention in an encoder-only model. + ENCODER_ONLY = "encoder_only" + + +# --------------------------------------------------------------------------- +# RadixAttention +# --------------------------------------------------------------------------- + + +class RadixAttention(nn.Module): + """Attention layer that delegates computation to a pluggable backend. + + Each transformer attention layer in a pymllm model creates exactly one + ``RadixAttention`` with a unique ``layer_id``. During the forward pass + the layer looks up the correct KV buffer via ``layer_id`` and calls the + backend attached to the current :class:`~pymllm.engine.forward_batch.ForwardBatch`. + + Parameters + ---------- + num_heads + Number of query attention heads (after any tensor-parallelism + sharding; pass the full count if not using TP). + head_dim + Per-head dimension for query and key projections. + scaling + Softmax pre-scale, typically ``1 / sqrt(head_dim)``. + num_kv_heads + Number of key / value heads (supports GQA / MQA). + layer_id + Zero-based index of this layer within the model. Used to index into + ``KVPool.k_buffer`` / ``v_buffer``. + logit_cap + If > 0, attention logits are soft-capped to this value via a ``tanh`` + gate (used by Gemma2 / Gemma3 style models). Set to ``0.0`` to + disable. + v_head_dim + Per-head dimension of the value projection. Defaults to ``head_dim`` + (i.e. standard square QKV). + sliding_window_size + Sliding-window attention span. ``-1`` means full context (no window). + is_cross_attention + ``True`` for cross-attention layers in encoder-decoder models. + attn_type + One of :class:`AttentionType`. + """ + + def __init__( + self, + num_heads: int, + head_dim: int, + scaling: float, + num_kv_heads: int, + layer_id: int, + logit_cap: float = 0.0, + v_head_dim: int = -1, + sliding_window_size: int = -1, + is_cross_attention: bool = False, + attn_type: AttentionType = AttentionType.DECODER, + ): + super().__init__() + + self.tp_q_head_num: int = num_heads + self.tp_k_head_num: int = num_kv_heads + self.tp_v_head_num: int = num_kv_heads + + self.head_dim: int = head_dim + self.qk_head_dim: int = head_dim + self.v_head_dim: int = v_head_dim if v_head_dim != -1 else head_dim + + self.scaling: float = scaling + self.layer_id: int = layer_id + self.logit_cap: float = logit_cap + self.sliding_window_size: int = ( + sliding_window_size if sliding_window_size is not None else -1 + ) + self.is_cross_attention: bool = is_cross_attention + self.attn_type: AttentionType = attn_type + + # ------------------------------------------------------------------ + # forward + # ------------------------------------------------------------------ + + def forward( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Run attention for one batch. + + Parameters + ---------- + q + Query tensor, shape ``[num_tokens, tp_q_head_num * head_dim]`` + (or already reshaped to ``[num_tokens, tp_q_head_num, head_dim]``). + k + Key tensor, same leading dimension as ``q``, shape + ``[num_tokens, tp_k_head_num * qk_head_dim]``. + Pass ``None`` for cross-layer KV sharing (``v`` must also be + ``None`` in this case). + v + Value tensor, shape + ``[num_tokens, tp_v_head_num * v_head_dim]``. + forward_batch + Batch metadata and references to memory pools / backend. 
+ save_kv_cache + When ``False``, skip writing K/V into the pool (useful for draft + models in speculative decoding). + **kwargs + Passed through to the backend (e.g. ``q_rope``, ``k_rope``). + """ + if k is not None: + assert v is not None, "k and v must both be provided or both be None" + k = k.view(-1, self.tp_k_head_num, self.qk_head_dim) + v = v.view(-1, self.tp_v_head_num, self.v_head_dim) + + return forward_batch.attn_backend.forward( + q, k, v, self, forward_batch, save_kv_cache, **kwargs + ) + + def extra_repr(self) -> str: + return ( + f"layer_id={self.layer_id}, " + f"q_heads={self.tp_q_head_num}, " + f"kv_heads={self.tp_k_head_num}, " + f"head_dim={self.head_dim}, " + f"v_head_dim={self.v_head_dim}, " + f"scaling={self.scaling:.4f}, " + f"logit_cap={self.logit_cap}, " + f"sliding_window={self.sliding_window_size}, " + f"attn_type={self.attn_type.value}" + ) diff --git a/pymllm/layers/sampling.py b/pymllm/layers/sampling.py new file mode 100644 index 000000000..e69de29bb diff --git a/pymllm/mem_cache/memory_pool.py b/pymllm/mem_cache/memory_pool.py index 0721fd71d..f9c176a94 100644 --- a/pymllm/mem_cache/memory_pool.py +++ b/pymllm/mem_cache/memory_pool.py @@ -6,7 +6,7 @@ TokenToKVPoolAllocator manages a free-list of integer indices KVPool holds the actual GPU K/V tensors -All indices are **int64** tensors on the target device. Slot 0 in the KV +All indices are **int32** tensors on the target device. Slot 0 in the KV buffers is reserved as a padding / dummy-output slot and is never allocated. """ @@ -210,7 +210,7 @@ class TokenToKVPoolAllocator: allocator = TokenToKVPoolAllocator(size=4096, device="cuda") # --- basic alloc / free --- - indices = allocator.alloc(128) # 128 free slot indices (int64) + indices = allocator.alloc(128) # 128 free slot indices (int32) allocator.free(indices[:64]) # return 64 slots # --- batch free (amortised) --- @@ -251,14 +251,14 @@ def clear(self) -> None: """Reset the allocator so that all slots ``[1, size]`` are free. The first slot is reserved for padding.""" if self.page_size == 1: self.free_slots = torch.arange( - 1, self.size + 1, dtype=torch.int64, device=self.device + 1, self.size + 1, dtype=torch.int32, device=self.device ) else: num_pages = self.size // self.page_size self.free_slots = torch.arange( - 1, num_pages + 1, dtype=torch.int64, device=self.device + 1, num_pages + 1, dtype=torch.int32, device=self.device ) - self.release_slots = torch.empty((0,), dtype=torch.int64, device=self.device) + self.release_slots = torch.empty((0,), dtype=torch.int32, device=self.device) self._is_not_in_free_group = True self._free_group: List[torch.Tensor] = [] @@ -273,7 +273,7 @@ def merge_and_sort_free(self) -> None: self.free_slots = torch.cat((self.free_slots, self.release_slots)) if self.need_sort: self.free_slots, _ = torch.sort(self.free_slots) - self.release_slots = torch.empty((0,), dtype=torch.int64, device=self.device) + self.release_slots = torch.empty((0,), dtype=torch.int32, device=self.device) def free_group_begin(self) -> None: """Start collecting ``free()`` calls; actual release is deferred to ``free_group_end``.""" @@ -290,7 +290,7 @@ def free_group_end(self) -> None: def alloc(self, need_size: int) -> Optional[torch.Tensor]: """Allocate *need_size* token indices. - Returns a 1-D ``int64`` tensor on success, or ``None`` if the pool is + Returns a 1-D ``int32`` tensor on success, or ``None`` if the pool is exhausted. 
""" if self.page_size == 1: @@ -380,7 +380,7 @@ def __init__( self.device = torch.device(device) self.req_to_token = torch.zeros( - (max_reqs, max_context_len), dtype=torch.int64, device=self.device + (max_reqs, max_context_len), dtype=torch.int32, device=self.device ) self._free_slots: List[int] = list(range(max_reqs)) diff --git a/pymllm/orchestrator/cuda_ipc_transport.py b/pymllm/orchestrator/cuda_ipc_transport.py index 7052f0e8f..938132c8b 100644 --- a/pymllm/orchestrator/cuda_ipc_transport.py +++ b/pymllm/orchestrator/cuda_ipc_transport.py @@ -1,373 +1,648 @@ """ -CUDA IPC Transport for zero-copy tensor sharing between processes. +CUDA IPC Transport for zero-copy GPU tensor sharing between processes. -This module implements CUDA IPC with workspace buffer management -to avoid PyTorch's memory leak issue when sharing IPC handles. +## Background -1. Create a workspace buffer on GPU (pre-allocated memory pool) -2. Copy tensor data to a chunk in the workspace -3. Get CUDA IPC handle for the chunk -4. Send handle + metadata (shape, dtype, offset) to another process -5. Reconstruct tensor in target process from IPC handle -6. Copy to local tensor and mark chunk as reusable +When sharing CUDA tensors between processes, there are two fundamentally different paths: -Key Problem Solved: - PyTorch never releases tensors whose IPC handles are shared until process ends. - Solution: Use a fixed-size workspace buffer and recycle chunks. +1. **CPU shared memory path** (``enable_shared_queue=True, enable_cuda_ipc=False``): + GPU tensors are moved to CPU / POSIX shared memory via ``tensor.share_memory_()``. + This is safe but incurs a GPU→CPU copy which is expensive for large vision features. + +2. **CUDA IPC path** (``enable_cuda_ipc=True``): + GPU tensors stay on GPU. PyTorch's ``storage._share_cuda_()`` yields a serialisable + IPC handle; the receiver calls ``UntypedStorage._new_shared_cuda(*handle)`` to map + the same physical GPU memory without any copy. + +These two paths are **mutually exclusive for GPU tensors**. ``enable_cuda_ipc`` takes +priority; when active the CPU-copy step in ``TensorQueue._make_tensors_shareable`` is +skipped. + +## CUDA IPC memory-leak problem and its fix + +PyTorch never releases the GPU allocation backing an IPC-exported tensor until the +*sending* process exits. If we export raw model tensors we permanently leak GPU memory. + +**Solution** (pool-based recycling via ``MmItemMemoryPool``): + +* Allocate a single, fixed-size GPU workspace (``MmItemMemoryPool``). +* For each outgoing GPU tensor, copy it into a chunk of the workspace and export the + *chunk* via IPC (the workspace is never freed; its chunks are recycled). +* After the receiving process has finished with the data it writes a sync flag + (``ShmSyncBuffer``) to signal that the chunk may be reused. +* A background recycler thread in the sender walks ``occupied_chunks`` and returns + chunks whose sync flag has been incremented back to ``available_chunks``. + +## Transport modes + +``TensorTransportMode``: +* ``"default"`` – CPU/shared-memory path; no CUDA IPC. +* ``"cuda_ipc"`` – Simple CUDA IPC: wraps GPU tensors in ``TransportProxyTensor`` + (a ``torch.Tensor`` subclass whose ``__getstate__``/``__setstate__`` use + ``_share_cuda_``). Suitable for single-process-group scenarios; incurs the + PyTorch memory-leak noted above. +* ``"cuda_ipc_pool"`` – Pool-based CUDA IPC: copies GPU tensors into a pre-allocated + ``MmItemMemoryPool`` and wraps the slice in ``CudaIpcTensorTransportProxy``. 
+ The pool is recycled, so there is no memory leak. """ +from __future__ import annotations + +import fcntl import logging -import struct -import uuid -from dataclasses import dataclass -from multiprocessing import Queue -from multiprocessing.shared_memory import SharedMemory -from typing import Any, Dict, List, Optional, Tuple +import threading +import time +from multiprocessing import shared_memory +from typing import Any, Dict, List, Literal, Optional, Tuple +import numpy as np import torch -import torch.cuda as cuda logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- +# Type alias for transport mode +# --------------------------------------------------------------------------- + +TensorTransportMode = Literal["default", "cuda_ipc", "cuda_ipc_pool"] -@dataclass -class MemoryChunk: - """Represents a chunk in the workspace buffer.""" - offset: int # Offset in bytes from workspace start - size: int # Size in bytes - in_use: bool # Whether the chunk is currently occupied - sync_shm_name: Optional[str] = None # Shared memory name for sync flag +# --------------------------------------------------------------------------- +# ShmSyncBuffer – a tiny POSIX shared memory float used as a sync counter +# --------------------------------------------------------------------------- -class WorkspaceBuffer: - """GPU memory pool for storing multimodal tensors temporarily. +class ShmSyncBuffer: + """A single float32 in POSIX shared memory used as a sync counter. - This prevents the PyTorch IPC handle memory leak by using a fixed-size - pre-allocated buffer and recycling chunks. + The sender resets it to 0 before exporting a chunk. The receiver + increments it (atomically under a file lock) once it has finished copying + data out of the chunk. When the value reaches the number of consumers + (``tp_size``) the sender recycles the chunk. """ - def __init__(self, size_gb: float = 4.0, device: int = 0): - """Initialize workspace buffer. 
+ def __init__(self, byte_size: int = 4) -> None: + self.buffer = shared_memory.SharedMemory(create=True, size=byte_size) + self._arr = np.ndarray(1, dtype=np.float32, buffer=self.buffer.buf) + self._arr *= 0 # initialise to 0 + self.meta_data: Dict[str, Any] = { + "handle": self.buffer.name, + "shape": self._arr.shape, + "dtype": str(self._arr.dtype), + } + + # ------------------------------------------------------------------ + # Helpers consumed by the *receiver* side + # ------------------------------------------------------------------ + + @staticmethod + def open( + meta_data: Dict[str, Any], + ) -> Tuple[shared_memory.SharedMemory, np.ndarray]: + """Open an existing ShmSyncBuffer from the metadata dict.""" + shm = shared_memory.SharedMemory(name=meta_data["handle"]) + arr = np.ndarray(meta_data["shape"], dtype=meta_data["dtype"], buffer=shm.buf) + return shm, arr + + def __del__(self) -> None: + try: + self.buffer.close() + self.buffer.unlink() + except Exception: + pass - Args: - size_gb: Total size of workspace in GB - device: CUDA device ID - """ - self.device = device - self.total_size = int(size_gb * 1024 * 1024 * 1024) # Convert GB to bytes - # Allocate workspace on GPU - with torch.cuda.device(device): - self.workspace = torch.empty( - self.total_size // 4, # Divide by 4 because we use float32 - dtype=torch.float32, - device=f"cuda:{device}", - ) +# Lock file used to serialise writes to sync flags across processes +_SHM_LOCK_FILE = "/tmp/pymllm_shm_wr_lock.lock" - # Initialize chunk management - self.chunks: List[MemoryChunk] = [ - MemoryChunk(offset=0, size=self.total_size, in_use=False) - ] - # Container for reusable sync buffers - self.sync_buffer_pool: List[str] = [] +def _increment_sync_flag(meta_data: Dict[str, Any]) -> None: + """Increment the sync flag by 1 under a process-level file lock.""" + shm, arr = ShmSyncBuffer.open(meta_data) + try: + open(_SHM_LOCK_FILE, "a").close() # ensure file exists + with open(_SHM_LOCK_FILE, "w+") as f: + fcntl.flock(f, fcntl.LOCK_EX) + arr += 1.0 + fcntl.flock(f, fcntl.LOCK_UN) + finally: + shm.close() - logger.info( - f"WorkspaceBuffer initialized: {size_gb}GB on cuda:{device}, " - f"ptr={self.workspace.data_ptr():#x}" - ) - def allocate(self, size_bytes: int) -> Optional[Tuple[int, str]]: - """Allocate a chunk from the workspace. 
+# --------------------------------------------------------------------------- +# MmItemMemoryChunk +# --------------------------------------------------------------------------- - Args: - size_bytes: Required size in bytes - Returns: - Tuple of (offset, sync_shm_name) if successful, None if no space - """ - # Find a free chunk that's large enough - for i, chunk in enumerate(self.chunks): - if not chunk.in_use and chunk.size >= size_bytes: - # Mark chunk as in use - chunk.in_use = True - - # Get or create sync buffer - if self.sync_buffer_pool: - sync_shm_name = self.sync_buffer_pool.pop() - # Reset sync flag to 0 (not ready) - self._reset_sync_buffer(sync_shm_name) - else: - sync_shm_name = self._create_sync_buffer() - - chunk.sync_shm_name = sync_shm_name - - # If chunk is larger than needed, split it - if chunk.size > size_bytes: - # Create a new free chunk for the remaining space - new_chunk = MemoryChunk( - offset=chunk.offset + size_bytes, - size=chunk.size - size_bytes, - in_use=False, - ) - chunk.size = size_bytes - self.chunks.insert(i + 1, new_chunk) +class MmItemMemoryChunk: + """A contiguous slice of the ``MmItemMemoryPool`` workspace tensor.""" - logger.debug( - f"Allocated chunk: offset={chunk.offset}, size={size_bytes}, " - f"sync_shm={sync_shm_name}" - ) - return chunk.offset, sync_shm_name + def __init__(self, area: Tuple[int, int], sync_flag: ShmSyncBuffer) -> None: + self.area = area + self.sync_flag = sync_flag - logger.warning(f"WorkspaceBuffer: No space for {size_bytes} bytes") - return None + @property + def mem_size(self) -> int: + return self.area[1] - self.area[0] - def release(self, offset: int) -> None: - """Release a chunk back to the pool. + @property + def start(self) -> int: + return self.area[0] - Args: - offset: Offset of the chunk to release - """ - for i, chunk in enumerate(self.chunks): - if chunk.offset == offset and chunk.in_use: - chunk.in_use = False + @property + def end(self) -> int: + return self.area[1] - # Return sync buffer to pool - if chunk.sync_shm_name: - self.sync_buffer_pool.append(chunk.sync_shm_name) - chunk.sync_shm_name = None + def try_to_recycle(self, num_consumers: int = 1) -> bool: + """Return True if all consumers have finished and the chunk can be reused.""" + val = float(self.sync_flag._arr.item()) + logger.debug( + "[try_to_recycle] area=%s flag=%.0f consumers=%d", + self.area, + val, + num_consumers, + ) + if val >= float(num_consumers): + self.sync_flag._arr *= 0.0 # reset for next use + return True + return False - # Try to merge with adjacent free chunks - self._merge_chunks() - logger.debug(f"Released chunk: offset={offset}") - return +# --------------------------------------------------------------------------- +# MmItemMemoryPool – pre-allocated GPU workspace to avoid IPC memory leaks +# --------------------------------------------------------------------------- - logger.warning(f"Attempted to release unknown chunk at offset {offset}") - def _merge_chunks(self) -> None: - """Merge adjacent free chunks to reduce fragmentation.""" - i = 0 - while i < len(self.chunks) - 1: - current = self.chunks[i] - next_chunk = self.chunks[i + 1] +class MmItemMemoryPool: + """Pre-allocated GPU memory pool for CUDA IPC tensor transport. - if not current.in_use and not next_chunk.in_use: - # Merge chunks - current.size += next_chunk.size + Chunks are allocated from a contiguous ``torch.int8`` tensor on GPU. + A background thread periodically recycles chunks whose sync flags show + that all consumers have finished reading. 
- # Keep first chunk's sync buffer, return second to pool - if next_chunk.sync_shm_name: - self.sync_buffer_pool.append(next_chunk.sync_shm_name) + Args: + memory_size: Pool size in **bytes**. + recycle_interval: How often (seconds) the recycler thread runs. + num_consumers: Number of consumer processes (tp_size). Each consumer + must increment the sync flag once before a chunk is recycled. + device: CUDA device index. + """ - self.chunks.pop(i + 1) - else: - i += 1 - - def _create_sync_buffer(self) -> str: - """Create a new shared memory sync buffer (8 bytes, initialized to 0).""" - shm_name = f"pymllm_sync_{uuid.uuid4().hex[:12]}" - shm = SharedMemory(name=shm_name, create=True, size=8) - # Initialize to 0 (not ready) - shm.buf[:8] = struct.pack("Q", 0) - shm.close() - logger.debug(f"Created sync buffer: {shm_name}") - return shm_name + def __init__( + self, + memory_size: int, + recycle_interval: float = 0.1, + num_consumers: int = 1, + device: int = 0, + ) -> None: + self.num_consumers = num_consumers + self._recycle_interval = recycle_interval + self._lock = threading.Lock() + self._stop = False - def _reset_sync_buffer(self, shm_name: str) -> None: - """Reset sync buffer to 0 (not ready).""" - try: - shm = SharedMemory(name=shm_name, create=False) - shm.buf[:8] = struct.pack("Q", 0) - shm.close() - except Exception as e: - logger.warning(f"Failed to reset sync buffer {shm_name}: {e}") - - def copy_tensor_to_workspace(self, tensor: torch.Tensor, offset: int) -> None: - """Copy tensor data to workspace at given offset. - - Args: - tensor: Source tensor (must be on same CUDA device) - offset: Byte offset in workspace - """ - if not tensor.is_cuda or tensor.device.index != self.device: - raise ValueError(f"Tensor must be on cuda:{self.device}") + with torch.cuda.device(device): + self.memory_pool: torch.Tensor = torch.empty( + memory_size, dtype=torch.int8, device=f"cuda:{device}" + ).contiguous() + + init_chunk = MmItemMemoryChunk((0, memory_size), self._new_sync_buffer()) + self.available_chunks: List[MmItemMemoryChunk] = [init_chunk] + self.occupied_chunks: List[MmItemMemoryChunk] = [] + # Pool of reusable ShmSyncBuffer objects (returned from recycled chunks) + self._sync_pool: List[ShmSyncBuffer] = [] + + self._recycler = threading.Thread( + target=self._recycle_loop, + name="MmItemMemoryPoolRecycler", + daemon=True, + ) + self._recycler.start() - size_bytes = tensor.numel() * tensor.element_size() + logger.info( + "MmItemMemoryPool: %d MB on cuda:%d, recycle_interval=%.2fs", + memory_size // (1024 * 1024), + device, + recycle_interval, + ) - # Get view of workspace at offset - offset_elements = offset // 4 # Workspace is float32 - num_elements = (size_bytes + 3) // 4 # Round up + # ------------------------------------------------------------------ + # Sync buffer management + # ------------------------------------------------------------------ + + def _new_sync_buffer(self) -> ShmSyncBuffer: + if self._sync_pool: + return self._sync_pool.pop() + return ShmSyncBuffer() + + def _return_sync_buffer(self, buf: ShmSyncBuffer) -> None: + buf._arr *= 0.0 # reset counter + self._sync_pool.append(buf) + + # ------------------------------------------------------------------ + # Allocation + # ------------------------------------------------------------------ + + def _get_available_chunk(self, src: torch.Tensor) -> Optional[MmItemMemoryChunk]: + """Best-fit allocation: find the smallest available chunk >= src size.""" + needed = src.numel() * src.element_size() + best: 
Optional[MmItemMemoryChunk] = None + for chunk in self.available_chunks: + if chunk.mem_size >= needed: + if best is None or chunk.mem_size < best.mem_size: + best = chunk + if best is None: + return None - workspace_view = self.workspace[ - offset_elements : offset_elements + num_elements - ] + # Split the selected chunk + occupied_area = (best.start, best.start + needed) + occupied = MmItemMemoryChunk(occupied_area, best.sync_flag) + self.occupied_chunks.append(occupied) + self.available_chunks.remove(best) - # Copy tensor data (flatten and cast to float32 view) - tensor_flat = tensor.flatten().view(torch.uint8) - workspace_flat = workspace_view.view(torch.uint8)[: tensor_flat.numel()] - workspace_flat.copy_(tensor_flat) + remainder = (occupied.end, best.end) + if remainder[0] < remainder[1]: + split = MmItemMemoryChunk(remainder, self._new_sync_buffer()) + self.available_chunks.append(split) - logger.debug(f"Copied tensor {tensor.shape} to workspace offset {offset}") + return occupied - def get_ipc_handle(self) -> bytes: - """Get CUDA IPC handle for the workspace buffer. + def get_slice_with_flag( + self, src: torch.Tensor + ) -> Tuple[Optional[Dict[str, Any]], Optional[torch.Tensor]]: + """Allocate a pool slice for *src* and return ``(sync_flag_meta, slice_tensor)``. - Returns: - CUDA IPC handle as bytes + Thread-safe. Returns ``(None, None)`` if the pool is full. """ - # Get IPC handle using torch.cuda API - # Note: This requires CUDA-capable device with IPC support - handle = cuda.cudart().cudaIpcGetMemHandle(self.workspace.data_ptr()) - return bytes(handle) - - def cleanup(self) -> None: - """Cleanup all sync buffers.""" - all_shm_names = set() - for chunk in self.chunks: - if chunk.sync_shm_name: - all_shm_names.add(chunk.sync_shm_name) - all_shm_names.update(self.sync_buffer_pool) - - for shm_name in all_shm_names: + with self._lock: + chunk = self._get_available_chunk(src) + if chunk is None: + logger.warning( + "MmItemMemoryPool full (%d occupied, %d available); " + "falling back to CPU transport", + len(self.occupied_chunks), + len(self.available_chunks), + ) + return None, None + pool_slice = self.memory_pool[chunk.start : chunk.end] + return chunk.sync_flag.meta_data, pool_slice + + # ------------------------------------------------------------------ + # Recycling + # ------------------------------------------------------------------ + + def _recycle_loop(self) -> None: + while not self._stop: try: - shm = SharedMemory(name=shm_name, create=False) - shm.close() - shm.unlink() - except FileNotFoundError: - pass - except Exception as e: - logger.warning(f"Failed to cleanup sync buffer {shm_name}: {e}") + with self._lock: + self._recycle_chunks() + self._merge_chunks() + except Exception as exc: + logger.warning( + "MmItemMemoryPool recycler error: %s", exc, exc_info=True + ) + time.sleep(self._recycle_interval) + + def _recycle_chunks(self) -> None: + new_occupied: List[MmItemMemoryChunk] = [] + for chunk in self.occupied_chunks: + if chunk.try_to_recycle(self.num_consumers): + self._return_sync_buffer(chunk.sync_flag) + chunk.sync_flag = self._new_sync_buffer() + self.available_chunks.append(chunk) + else: + new_occupied.append(chunk) + self.occupied_chunks = new_occupied + + def _merge_chunks(self) -> None: + """Coalesce adjacent free chunks to reduce fragmentation.""" + merged: List[MmItemMemoryChunk] = [] + for chunk in sorted(self.available_chunks, key=lambda c: c.start): + if merged and merged[-1].end == chunk.start: + prev = merged.pop() + 
self._return_sync_buffer(chunk.sync_flag) + merged.append( + MmItemMemoryChunk((prev.start, chunk.end), prev.sync_flag) + ) + else: + merged.append(chunk) + self.available_chunks = merged + + def shutdown(self) -> None: + self._stop = True + if self._recycler.is_alive(): + self._recycler.join(timeout=2.0) + - logger.info("WorkspaceBuffer cleaned up") +# --------------------------------------------------------------------------- +# CudaIpcTensorTransportProxy – pool-based CUDA IPC proxy object +# --------------------------------------------------------------------------- -@dataclass -class TensorMetadata: - """Metadata for reconstructing a tensor from CUDA IPC handle.""" +class CudaIpcTensorTransportProxy: + """Proxy that carries a CUDA IPC handle for a pool-slice tensor. - shape: Tuple[int, ...] - dtype: torch.dtype - offset: int # Byte offset in workspace - size_bytes: int - sync_shm_name: str # Shared memory name for sync flag + The *sender* process: + 1. Copies the source tensor into a ``MmItemMemoryPool`` slice (int8 view). + 2. Wraps the slice in this proxy, which captures the CUDA IPC handle via + ``storage._share_cuda_()``. + 3. Sends the proxy through ``multiprocessing.Queue`` (pickle). + The *receiver* process: + 1. Calls :meth:`reconstruct_on_device` to map the IPC memory and copy it + into a fresh local tensor. + 2. The copy increments the sync flag, allowing the sender's recycler to + reclaim the pool slice. -class CudaIPCTransport: - """Transport for sharing CUDA tensors via IPC handles.""" + Fallback: if ``_share_cuda_()`` fails (e.g. TP ranks), ``tensor_data`` holds + the raw tensor (which will be pickled the normal way, incurring serialization cost). + """ def __init__( self, - workspace_size_gb: float = 4.0, - device: int = 0, - ): - """Initialize CUDA IPC transport. + data: torch.Tensor, + info_data: torch.Tensor, + sync_buffer_meta: Dict[str, Any], + ) -> None: + if not isinstance(data, torch.Tensor) or not isinstance( + info_data, torch.Tensor + ): + raise TypeError( + f"data and info_data must be torch.Tensors, got {type(data)}, {type(info_data)}" + ) - Args: - workspace_size_gb: Size of workspace buffer in GB - device: CUDA device ID - """ - self.device = device - self.workspace = WorkspaceBuffer(workspace_size_gb, device) - self.ipc_handle = self.workspace.get_ipc_handle() - self.queue: Queue = Queue() + self.sync_data_meta = sync_buffer_meta + self._state = self._build_state(data, info_data) + self._reconstructed: Optional[torch.Tensor] = None + self._shm: Optional[shared_memory.SharedMemory] = None - def send_tensor(self, rid: str, tensor: torch.Tensor) -> bool: - """Send a tensor via CUDA IPC. 
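+    # Illustrative round trip (a sketch; assumes a multiprocessing.Queue ``q``
+    # and an MmItemMemoryPool ``pool`` owned by the sending process):
+    #   sender:   meta, dst = pool.get_slice_with_flag(src)
+    #             dst.copy_(src.contiguous().view(torch.int8).view(-1))
+    #             q.put(CudaIpcTensorTransportProxy(dst, src, meta))
+    #   receiver: tensor = q.get().reconstruct_on_device()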
+ def _build_state( + self, data: torch.Tensor, info_data: torch.Tensor + ) -> Dict[str, Any]: + try: + storage = data.untyped_storage() + handle = storage._share_cuda_() + return { + "ipc_handle": { + "handle": handle, + "shape": data.shape, + "dtype": data.dtype, + "stride": data.stride(), + "device_index": data.device.index, + "storage_offset": data.storage_offset(), + "target_shape": info_data.shape, + "target_dtype": info_data.dtype, + }, + "tensor_data": None, + } + except Exception as exc: + logger.warning( + "CudaIpcTensorTransportProxy: _share_cuda_() failed (%s); " + "falling back to direct tensor.", + exc, + ) + return {"ipc_handle": None, "tensor_data": data} - Args: - rid: Request ID - tensor: Tensor to send (must be on CUDA) + def reconstruct_on_device(self, device_index: Optional[int] = None) -> torch.Tensor: + """Map IPC memory and copy into a new local tensor. - Returns: - True if sent via CUDA IPC, False if fallback needed + This **must** be called from the *receiver* process. After the copy + the sync flag is incremented so the sender can recycle the pool chunk. """ - if not tensor.is_cuda: - logger.debug(f"Tensor for {rid} not on CUDA, skipping IPC") - return False - - size_bytes = tensor.numel() * tensor.element_size() - - # Try to allocate from workspace - result = self.workspace.allocate(size_bytes) - if result is None: - logger.warning( - f"WorkspaceBuffer full, falling back to shared queue for {rid}" + if self._reconstructed is not None: + return self._reconstructed + + state = self._state + if state["ipc_handle"] is not None: + h = state["ipc_handle"] + source_device = torch.device(f"cuda:{h['device_index']}") + target_device = ( + source_device + if device_index is None + else torch.device(f"cuda:{device_index}") ) - return False + with torch.cuda.device(source_device): + storage = torch.UntypedStorage._new_shared_cuda(*h["handle"]) + slice_tensor = torch.empty( + 0, dtype=h["dtype"], device=source_device + ).set_( + storage, + storage_offset=h["storage_offset"], + size=h["shape"], + stride=h["stride"], + ) - offset, sync_shm_name = result + result = torch.empty( + h["target_shape"], dtype=h["target_dtype"], device=target_device + ).contiguous() + result.view(torch.int8).view(-1).copy_(slice_tensor) - # Copy tensor to workspace - self.workspace.copy_tensor_to_workspace(tensor, offset) + # Signal sender that the chunk can be recycled + _increment_sync_flag(self.sync_data_meta) + elif state["tensor_data"] is not None: + result = state["tensor_data"] + if device_index is not None: + result = result.to(f"cuda:{device_index}", non_blocking=True) + else: + raise RuntimeError("CudaIpcTensorTransportProxy: invalid state") - # Create metadata - metadata = TensorMetadata( - shape=tuple(tensor.shape), - dtype=tensor.dtype, - offset=offset, - size_bytes=size_bytes, - sync_shm_name=sync_shm_name, - ) + self._reconstructed = result + return result - # Send metadata through queue - self.queue.put((rid, metadata, self.ipc_handle)) - logger.debug(f"Sent tensor {tensor.shape} for {rid} via CUDA IPC") - return True +# --------------------------------------------------------------------------- +# TransportProxyTensor – simple CUDA IPC via torch.Tensor subclass + pickle +# --------------------------------------------------------------------------- - def receive_tensor( - self, timeout: float = 0.0001 - ) -> Optional[Tuple[str, torch.Tensor]]: - """Receive a tensor via CUDA IPC. 
- Args: - timeout: Timeout for queue.get +class TransportProxyTensor(torch.Tensor): + """A ``torch.Tensor`` subclass whose pickle uses CUDA IPC handles. - Returns: - Tuple of (rid, tensor) or None if queue empty - """ - try: - rid, metadata, ipc_handle = self.queue.get(timeout=timeout) - except Exception: - return None + When ``transport_mode == "cuda_ipc"`` and the tensor is on CUDA, + ``__getstate__`` exports the tensor via ``storage._share_cuda_()`` instead + of serialising the raw data. ``__setstate__`` reconstructs it in the + receiving process via ``UntypedStorage._new_shared_cuda``. - # Open IPC memory handle - # Note: This creates a tensor view into the remote process's workspace - with torch.cuda.device(self.device): - # Reconstruct tensor from IPC handle - # This is a view into remote memory, we need to copy it locally + Caveat: The underlying GPU allocation is never freed until the *sender* + process exits (PyTorch limitation). Prefer ``"cuda_ipc_pool"`` mode for + long-running services to avoid GPU memory leaks. - # For now, use a simpler approach: signal to copy later - # In production, you'd use cuda.cudart().cudaIpcOpenMemHandle + When the tensor is on CPU or ``transport_mode == "default"``, the tensor + is serialised normally (pickle of raw data). + """ + @staticmethod + def __new__( + cls, + data: torch.Tensor, + transport_mode: TensorTransportMode = "default", + ) -> "TransportProxyTensor": + if not isinstance(data, torch.Tensor): + raise TypeError(f"data must be a torch.Tensor, got {type(data)}") + instance = data.as_subclass(cls) + instance._transport_mode = transport_mode + return instance + + def __getstate__(self) -> Dict[str, Any]: + state: Dict[str, Any] = { + "transport_mode": self._transport_mode, + "tensor_data": None, + "ipc_extra": None, + } + if self._transport_mode == "cuda_ipc" and self.is_cuda: + try: + storage = self.untyped_storage() + handle = storage._share_cuda_() + state["ipc_extra"] = { + "handle": handle, + "shape": self.shape, + "dtype": self.dtype, + "stride": self.stride(), + "device_index": self.device.index, + "storage_offset": self.storage_offset(), + } + except Exception as exc: + logger.warning( + "TransportProxyTensor: _share_cuda_() failed (%s); falling back.", + exc, + ) + state["transport_mode"] = "default" + state["tensor_data"] = self.as_subclass(torch.Tensor) + else: + state["transport_mode"] = "default" + state["tensor_data"] = self.as_subclass(torch.Tensor) + return state + + def __setstate__(self, state: Dict[str, Any]) -> None: + self._transport_mode = state["transport_mode"] + if state["transport_mode"] == "cuda_ipc" and state["ipc_extra"] is not None: + h = state["ipc_extra"] + target = torch.device(f"cuda:{h['device_index']}") + try: + with torch.cuda.device(target): + storage = torch.UntypedStorage._new_shared_cuda(*h["handle"]) + reconstructed = torch.empty( + 0, dtype=h["dtype"], device=target + ).set_( + storage, + storage_offset=h["storage_offset"], + size=h["shape"], + stride=h["stride"], + ) + self.set_(reconstructed) + except Exception as exc: + logger.error("TransportProxyTensor: failed to open IPC handle: %s", exc) + raise + elif state["tensor_data"] is not None: + self.set_(state["tensor_data"]) + else: + raise RuntimeError("TransportProxyTensor: invalid state – no tensor data") + + @property + def transport_mode(self) -> TensorTransportMode: + return getattr(self, "_transport_mode", "default") + + +# --------------------------------------------------------------------------- +# Helpers: wrap / unwrap mm_inputs 
dicts +# --------------------------------------------------------------------------- + + +def wrap_mm_inputs_for_ipc( + mm_inputs: Optional[Dict[str, Any]], + transport_mode: TensorTransportMode, + pool: Optional["MmItemMemoryPool"] = None, +) -> Optional[Dict[str, Any]]: + """Recursively wrap CUDA tensors in *mm_inputs* for IPC transport. + + Args: + mm_inputs: Nested dict/list of tensors and other data. + transport_mode: One of ``"default"``, ``"cuda_ipc"``, ``"cuda_ipc_pool"``. + pool: Required when ``transport_mode == "cuda_ipc_pool"``. + + Returns: + A new data structure with CUDA tensors replaced by IPC proxies. + CPU tensors are left unchanged (they will be shared via ``share_memory_()`` + or normal pickling downstream). + """ + if mm_inputs is None: + return None + return _wrap_recursive(mm_inputs, transport_mode, pool) + + +def _wrap_recursive( + data: Any, + transport_mode: TensorTransportMode, + pool: Optional["MmItemMemoryPool"], +) -> Any: + if isinstance(data, torch.Tensor) and data.is_cuda: + return _wrap_cuda_tensor(data, transport_mode, pool) + elif isinstance(data, dict): + return {k: _wrap_recursive(v, transport_mode, pool) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + wrapped = [_wrap_recursive(item, transport_mode, pool) for item in data] + return type(data)(wrapped) + else: + return data + + +def _wrap_cuda_tensor( + tensor: torch.Tensor, + transport_mode: TensorTransportMode, + pool: Optional["MmItemMemoryPool"], +) -> Any: + if transport_mode == "cuda_ipc": + return TransportProxyTensor(tensor, transport_mode="cuda_ipc") + + if transport_mode == "cuda_ipc_pool": + if pool is None: + raise ValueError("pool must be provided for transport_mode='cuda_ipc_pool'") + sync_meta, pool_slice = pool.get_slice_with_flag(tensor) + if pool_slice is not None: + # Copy tensor bytes into the pool slice + pool_slice.copy_(tensor.view(torch.int8).view(-1), non_blocking=True) + return CudaIpcTensorTransportProxy( + data=pool_slice, + info_data=tensor, + sync_buffer_meta=sync_meta, + ) + else: + # Pool full – fall back to simple IPC (with potential memory leak) logger.warning( - "CUDA IPC receive not fully implemented - requires cudaIpcOpenMemHandle" + "Pool full; falling back to simple CUDA IPC (potential memory leak)" ) - # TODO: Implement actual IPC handle opening + return TransportProxyTensor(tensor, transport_mode="cuda_ipc") - # Create local tensor and signal copy completion - tensor = torch.empty( - metadata.shape, dtype=metadata.dtype, device=f"cuda:{self.device}" - ) + # "default" – move to CPU shared memory (handled by share_memory_() downstream) + return tensor - # Mark chunk as ready for reuse by setting sync flag - self._mark_chunk_reusable(metadata.sync_shm_name) - return rid, tensor +def unwrap_mm_inputs_from_ipc( + mm_inputs: Optional[Dict[str, Any]], + device_index: Optional[int] = None, +) -> Optional[Dict[str, Any]]: + """Recursively reconstruct tensors from IPC proxy objects. 
- def _mark_chunk_reusable(self, sync_shm_name: str) -> None: - """Mark a chunk as reusable by setting sync flag to 1.""" - try: - shm = SharedMemory(name=sync_shm_name, create=False) - shm.buf[:8] = struct.pack("Q", 1) # Set to 1 (ready for reuse) - shm.close() - logger.debug(f"Marked chunk reusable: {sync_shm_name}") - except Exception as e: - logger.error(f"Failed to mark chunk reusable {sync_shm_name}: {e}") - - def cleanup(self) -> None: - """Cleanup resources.""" - self.workspace.cleanup() - self.queue.close() + Call this in the *receiver* process after getting data from the queue. + + Args: + mm_inputs: Data structure possibly containing IPC proxy objects. + device_index: If not None, move reconstructed tensors to this device. + """ + if mm_inputs is None: + return None + return _unwrap_recursive(mm_inputs, device_index) + + +def _unwrap_recursive(data: Any, device_index: Optional[int]) -> Any: + if isinstance(data, CudaIpcTensorTransportProxy): + return data.reconstruct_on_device(device_index) + elif isinstance(data, TransportProxyTensor): + # Already reconstructed during unpickling; just return as plain tensor + return data.as_subclass(torch.Tensor) + elif isinstance(data, dict): + return {k: _unwrap_recursive(v, device_index) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + result = [_unwrap_recursive(item, device_index) for item in data] + return type(data)(result) + else: + return data diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 64ea55b0d..8f2d9a958 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -9,6 +9,10 @@ 1. Legacy ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj 2. Shared queue fast path: Read rid from shared queue and metadata from shared memory +When the shared queue fast path is active the scheduler also handles CUDA IPC +tensor reconstruction via +:func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc`. 
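End to end, the fast-path hand-off between the two processes is roughly the
following (editorial sketch; ``rid``, ``shm_name`` and ``mm_inputs`` originate
in the tokenizer process)::

    # tokenizer process: TensorQueue.put() wraps CUDA tensors into IPC proxies
    shared_queue.put(rid, shm_name, mm_inputs)

    # scheduler process: proxies must be unwrapped explicitly after get()
    rid, shm_name, wrapped = shared_queue.get(timeout=0.0001)
    mm_inputs = unwrap_mm_inputs_from_ipc(wrapped)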
+ The main ``event_loop`` scheduler flow:: while True: @@ -31,6 +35,10 @@ import zmq from pymllm.engine.io_struct import TokenizedGenerateReqInput +from pymllm.orchestrator.cuda_ipc_transport import ( + TensorTransportMode, + unwrap_mm_inputs_from_ipc, +) from pymllm.orchestrator.ipc_utils import create_zmq_socket from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue @@ -48,6 +56,7 @@ def __init__( send_to_detokenizer_addr: str, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, + tensor_transport_mode: TensorTransportMode = "default", ): # ZMQ addresses self._recv_from_tokenizer_addr = recv_from_tokenizer_addr @@ -58,6 +67,7 @@ def __init__( # Shared queue configuration self._shared_queue = shared_queue self._enable_shared_queue = enable_shared_queue + self._tensor_transport_mode = tensor_transport_mode # ZMQ runtime objects (initialised in init_sockets) self._zmq_ctx: Optional[zmq.Context] = None @@ -111,8 +121,9 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite scheduling loop.""" logger.info( - "SchedulerProcess event loop started (shared_queue=%s)", + "SchedulerProcess event loop started (shared_queue=%s, transport=%s)", self._enable_shared_queue, + self._tensor_transport_mode, ) while True: self.recv_requests() @@ -163,10 +174,21 @@ def _recv_from_zmq(self) -> None: self._waiting_queue.append(msg) def _recv_from_shared_queue(self) -> None: - """Receive requests via shared memory + shared queue fast path.""" + """Receive requests via shared memory + shared queue fast path. + + After reading a ``(rid, shm_name, mm_inputs)`` tuple from the queue: + 1. The tokenized metadata is read from the POSIX shared memory segment. + 2. If CUDA IPC is enabled, ``mm_inputs`` may contain + :class:`~pymllm.orchestrator.cuda_ipc_transport.CudaIpcTensorTransportProxy` + or :class:`~pymllm.orchestrator.cuda_ipc_transport.TransportProxyTensor` + objects that are reconstructed by calling + :func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc`. + This step also increments sync flags so the sender can recycle pool chunks. + 3. A full ``TokenizedGenerateReqInput`` is assembled and appended to + ``_waiting_queue``. 
+ """ while True: try: - # Non-blocking get from shared queue rid, shm_name, mm_inputs = self._shared_queue.get(timeout=0.0001) # Read metadata from shared memory (and unlink immediately) @@ -174,12 +196,16 @@ def _recv_from_shared_queue(self) -> None: shm_name, unlink=True ) - # Reconstruct the full TokenizedGenerateReqInput with mm_inputs + # Reconstruct GPU tensors from CUDA IPC handles (if any) + if self._tensor_transport_mode in ("cuda_ipc", "cuda_ipc_pool"): + mm_inputs = unwrap_mm_inputs_from_ipc(mm_inputs) + + # Reassemble the full request full_request = TokenizedGenerateReqInput( rid=metadata.rid, input_text=metadata.input_text, input_ids=metadata.input_ids, - mm_inputs=mm_inputs, # Restored from shared queue + mm_inputs=mm_inputs, sampling_params=metadata.sampling_params, stream=metadata.stream, return_logprob=metadata.return_logprob, @@ -190,18 +216,18 @@ def _recv_from_shared_queue(self) -> None: ) self._waiting_queue.append(full_request) - logger.debug(f"Received request {rid} from shared queue") + logger.debug("Received request %s from shared queue", rid) except stdlib_queue.Empty: - # No more requests available break - except Exception as e: - logger.error(f"Error receiving from shared queue: {e}", exc_info=True) - # Try to cleanup shared memory if possible + except Exception as exc: + logger.error( + "Error receiving from shared queue: %s", exc, exc_info=True + ) try: if "shm_name" in locals(): SharedMemoryManager.cleanup(shm_name) - except: + except Exception: pass break @@ -310,6 +336,7 @@ def run_scheduler_process( pipe_writer: Connection, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, + tensor_transport_mode: TensorTransportMode = "default", ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" proc = SchedulerProcess( @@ -319,6 +346,7 @@ def run_scheduler_process( send_to_detokenizer_addr, shared_queue=shared_queue, enable_shared_queue=enable_shared_queue, + tensor_transport_mode=tensor_transport_mode, ) proc.init_sockets() diff --git a/pymllm/orchestrator/shared_memory_queue.py b/pymllm/orchestrator/shared_memory_queue.py index 3d26ebf14..2f006bdc0 100644 --- a/pymllm/orchestrator/shared_memory_queue.py +++ b/pymllm/orchestrator/shared_memory_queue.py @@ -1,32 +1,75 @@ """ Shared memory and queue utilities for fast IPC between tokenizer and scheduler. -This module implements shared-queue fast path to avoid expensive -ZMQ serialization of large multimodal tensors. +This module implements the shared-queue fast path to avoid expensive ZMQ +serialization of large multimodal tensors. -Design: - - Metadata lane: Small tokenized objects stored in shared memory keyed by rid - - Tensor lane: Large tensors made shareable via share_memory_() and passed by handle +## Design + +- **Metadata lane**: Small tokenized objects are written to a POSIX shared memory + segment keyed by the request ID (``rid``). The scheduler reads and immediately + unlinks the segment. + +- **Tensor lane**: Large tensors can be transported in one of three modes, + controlled by ``TensorTransportMode`` (passed at queue construction time): + + * ``"default"`` – CPU tensors only. GPU tensors are moved to POSIX shared + memory via ``tensor.share_memory_()`` (or left on CPU if already there). + This is the original behaviour and requires no CUDA support. + + * ``"cuda_ipc"`` – GPU tensors stay on GPU and are wrapped in + :class:`~pymllm.orchestrator.cuda_ipc_transport.TransportProxyTensor`. 
On the + receiver side the proxy's ``__setstate__`` automatically reconstructs the + tensor from the CUDA IPC handle during unpickling. CPU tensors are handled as + in ``"default"`` mode. **Caveat**: GPU memory is not freed until the sender + process exits (PyTorch limitation). Prefer ``"cuda_ipc_pool"`` for services. + + * ``"cuda_ipc_pool"`` – GPU tensors are copied into a pre-allocated + :class:`~pymllm.orchestrator.cuda_ipc_transport.MmItemMemoryPool` workspace and + wrapped in :class:`~pymllm.orchestrator.cuda_ipc_transport.CudaIpcTensorTransportProxy`. + After the receiver copies the data it increments a sync flag and the sender's + recycler thread returns the chunk to the pool. This avoids GPU memory leaks. + CPU tensors are handled as in ``"default"`` mode. + +## Key relationship with CUDA IPC + +``"default"`` and ``"cuda_ipc*"`` modes are **mutually exclusive for GPU tensors**: + +- In ``"default"`` mode, GPU tensors that need to cross process boundaries must + first be moved to CPU (``share_memory_()``). This incurs a GPU→CPU copy. +- In ``"cuda_ipc*"`` modes, GPU tensors are shared as-is via CUDA IPC handles; + no copy to CPU is needed. + +CPU tensors are always handled via ``share_memory_()`` regardless of the mode. """ +from __future__ import annotations + import logging import pickle import uuid from multiprocessing import Queue from multiprocessing.shared_memory import SharedMemory -from typing import Any, Dict, Optional +from typing import Any, Dict, Literal, Optional import torch +from pymllm.orchestrator.cuda_ipc_transport import ( + MmItemMemoryPool, + TensorTransportMode, + unwrap_mm_inputs_from_ipc, + wrap_mm_inputs_for_ipc, +) + logger = logging.getLogger(__name__) class SharedMemoryManager: """Manages shared memory segments for passing metadata between processes. - Each tokenized request's metadata is written to a unique shared memory segment - keyed by its request ID (rid). The scheduler reads and immediately unlinks the - segment to prevent memory leaks. + Each tokenized request's metadata is written to a unique shared memory + segment keyed by its request ID (rid). The scheduler reads and immediately + unlinks the segment to prevent memory leaks. 
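A minimal metadata round-trip (editorial sketch; ``rid`` and ``metadata`` are
assumed to come from the tokenizer)::

    shm_name = SharedMemoryManager.write_metadata(rid, metadata)          # producer
    # shm_name travels to the scheduler inside the shared-queue tuple
    metadata = SharedMemoryManager.read_metadata(shm_name, unlink=True)   # consumer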
""" @staticmethod @@ -40,23 +83,17 @@ def write_metadata(rid: str, metadata: Any) -> str: Returns: str: The shared memory segment name """ - # Serialize the metadata data = pickle.dumps(metadata) size = len(data) - - # Create unique shared memory segment name shm_name = f"pymllm_meta_{rid}_{uuid.uuid4().hex[:8]}" - try: - # Create shared memory segment shm = SharedMemory(name=shm_name, create=True, size=size) - # Write data shm.buf[:size] = data shm.close() - logger.debug(f"Wrote {size} bytes to shared memory {shm_name}") + logger.debug("Wrote %d bytes to shared memory %s", size, shm_name) return shm_name - except Exception as e: - logger.error(f"Failed to write metadata to shared memory: {e}") + except Exception as exc: + logger.error("Failed to write metadata to shared memory: %s", exc) raise @staticmethod @@ -71,25 +108,21 @@ def read_metadata(shm_name: str, unlink: bool = True) -> Any: The deserialized metadata object """ try: - # Open existing shared memory segment shm = SharedMemory(name=shm_name, create=False) - # Read and deserialize data data = bytes(shm.buf[:]) metadata = pickle.loads(data) shm.close() - - # Unlink to free memory immediately if unlink: try: shm.unlink() - logger.debug(f"Read and unlinked shared memory {shm_name}") + logger.debug("Read and unlinked shared memory %s", shm_name) except FileNotFoundError: - # Already unlinked, ignore pass - return metadata - except Exception as e: - logger.error(f"Failed to read metadata from shared memory {shm_name}: {e}") + except Exception as exc: + logger.error( + "Failed to read metadata from shared memory %s: %s", shm_name, exc + ) raise @staticmethod @@ -99,85 +132,137 @@ def cleanup(shm_name: str) -> None: shm = SharedMemory(name=shm_name, create=False) shm.close() shm.unlink() - logger.debug(f"Cleaned up shared memory {shm_name}") + logger.debug("Cleaned up shared memory %s", shm_name) except FileNotFoundError: - pass # Already cleaned up - except Exception as e: - logger.warning(f"Failed to cleanup shared memory {shm_name}: {e}") + pass + except Exception as exc: + logger.warning("Failed to cleanup shared memory %s: %s", shm_name, exc) class TensorQueue: - """Queue for passing large tensors between processes using shared memory. + """Queue for passing large tensors between processes. - Tensors are made shareable via .share_memory_() and passed through a - multiprocessing.Queue by handle (metadata only, not the actual data). - """ + Depending on ``transport_mode``, GPU tensors are either moved to CPU shared + memory (``"default"``) or kept on GPU and shared via CUDA IPC handles + (``"cuda_ipc"`` / ``"cuda_ipc_pool"``). - def __init__(self, maxsize: int = 0): - """Initialize the tensor queue. + Args: + maxsize: Maximum queue size (0 for unlimited). + transport_mode: Controls how GPU tensors are transported. + pool: Required when ``transport_mode == "cuda_ipc_pool"``. + """ - Args: - maxsize: Maximum queue size (0 for unlimited) - """ + def __init__( + self, + maxsize: int = 0, + transport_mode: TensorTransportMode = "default", + pool: Optional[MmItemMemoryPool] = None, + ) -> None: + # pool is allowed to be None at construction time for "cuda_ipc_pool" mode + # because the pool is initialised lazily inside the sender subprocess. + # The pool reference is injected later via _pool attribute assignment. 
self._queue: Queue = Queue(maxsize=maxsize) + self._transport_mode = transport_mode + self._pool = pool + + # ------------------------------------------------------------------ + # Producer side + # ------------------------------------------------------------------ - def put(self, rid: str, shm_name: str, mm_inputs: Optional[Dict[str, Any]]) -> None: - """Put a request with multimodal inputs into the queue. + def put( + self, + rid: str, + shm_name: str, + mm_inputs: Optional[Dict[str, Any]], + ) -> None: + """Put a request into the queue. + + GPU tensors inside *mm_inputs* are wrapped according to + ``transport_mode`` before being placed into the underlying + ``multiprocessing.Queue``. Args: - rid: Request ID - shm_name: Shared memory segment name for metadata - mm_inputs: Multimodal inputs dict (can contain torch tensors) + rid: Request ID. + shm_name: Shared memory segment name for the tokenized metadata. + mm_inputs: Multimodal inputs dict (may contain CUDA tensors). """ - # Make tensors shareable if present if mm_inputs is not None: - mm_inputs = self._make_tensors_shareable(mm_inputs) + if self._transport_mode in ("cuda_ipc", "cuda_ipc_pool"): + if self._transport_mode == "cuda_ipc_pool" and self._pool is None: + # Pool not yet initialised (race condition or CUDA unavailable); + # fall back to simple CUDA IPC for this message. + effective_mode = "cuda_ipc" + else: + effective_mode = self._transport_mode + # Wrap CUDA tensors in IPC proxies (stays on GPU, no copy to CPU) + mm_inputs = wrap_mm_inputs_for_ipc( + mm_inputs, + transport_mode=effective_mode, + pool=self._pool, + ) + # CPU tensors within mm_inputs are still shared via share_memory_() + mm_inputs = self._share_cpu_tensors(mm_inputs) + else: + # "default": move all tensors to CPU shared memory + mm_inputs = self._make_tensors_shareable(mm_inputs) self._queue.put((rid, shm_name, mm_inputs)) - logger.debug(f"Put request {rid} into tensor queue (shm={shm_name})") + logger.debug("Put request %s into tensor queue (shm=%s)", rid, shm_name) + + # ------------------------------------------------------------------ + # Consumer side + # ------------------------------------------------------------------ def get( self, timeout: Optional[float] = None ) -> tuple[str, str, Optional[Dict[str, Any]]]: """Get a request from the queue. + GPU tensors wrapped as IPC proxies are **not** automatically + reconstructed here – the caller (scheduler) must call + :func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc` + after retrieval. + Args: - timeout: Timeout in seconds (None for blocking indefinitely) + timeout: Timeout in seconds (None for blocking). Returns: - Tuple of (rid, shm_name, mm_inputs) + Tuple of ``(rid, shm_name, mm_inputs)``. 
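Example (editorial sketch; ``rid``, ``shm_name`` and ``mm_inputs`` are assumed,
and both processes hold a reference to the same queue object)::

    q = TensorQueue(transport_mode="cuda_ipc")
    q.put(rid, shm_name, mm_inputs)                  # producer: CUDA tensors -> proxies
    rid, shm_name, wrapped = q.get(timeout=1.0)      # consumer
    mm_inputs = unwrap_mm_inputs_from_ipc(wrapped)   # reconstruct on the consumer side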
""" rid, shm_name, mm_inputs = self._queue.get(timeout=timeout) - logger.debug(f"Got request {rid} from tensor queue (shm={shm_name})") + logger.debug("Got request %s from tensor queue (shm=%s)", rid, shm_name) return rid, shm_name, mm_inputs + # ------------------------------------------------------------------ + # Queue introspection + # ------------------------------------------------------------------ + def empty(self) -> bool: - """Check if the queue is empty.""" return self._queue.empty() def qsize(self) -> int: - """Return the approximate size of the queue.""" try: return self._queue.qsize() except NotImplementedError: - return 0 # Some platforms don't support qsize + return 0 def close(self) -> None: - """Close the queue.""" self._queue.close() + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + @staticmethod def _make_tensors_shareable(data: Any) -> Any: - """Recursively make all torch tensors in a data structure shareable. + """Recursively move all tensors (CPU and CUDA) to POSIX shared memory. - Args: - data: Nested dict/list/tensor structure - - Returns: - The same structure with tensors made shareable via share_memory_() + GPU tensors are first moved to CPU (incurring a device copy), then + placed in shared memory. This is the ``"default"`` path. """ if isinstance(data, torch.Tensor): - # Make tensor shareable across processes + if data.is_cuda: + data = data.cpu() if not data.is_shared(): data = data.share_memory_() return data @@ -188,3 +273,20 @@ def _make_tensors_shareable(data: Any) -> Any: return type(data)(result) else: return data + + @staticmethod + def _share_cpu_tensors(data: Any) -> Any: + """Recursively place CPU tensors in shared memory (GPU tensors are already + wrapped as IPC proxies and must not be touched here). + """ + if isinstance(data, torch.Tensor) and not data.is_cuda: + if not data.is_shared(): + data = data.share_memory_() + return data + elif isinstance(data, dict): + return {k: TensorQueue._share_cpu_tensors(v) for k, v in data.items()} + elif isinstance(data, (list, tuple)): + result = [TensorQueue._share_cpu_tensors(item) for item in data] + return type(data)(result) + else: + return data diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 43db5ba00..587a7c1ea 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -4,9 +4,26 @@ Receives raw requests from RequestResponseProcess via ZMQ, tokenizes them, and forwards the tokenized payloads to the SchedulerProcess. -Supports two modes: - 1. Legacy ZMQ path: Send TokenizedGenerateReqInput via ZMQ send_pyobj - 2. Shared queue fast path: Write metadata to shared memory and put rid in shared queue +Supports two transport modes (controlled by ``enable_shared_queue`` and +``tensor_transport_mode`` in the tokenizer config): + +1. **Legacy ZMQ path** (``enable_shared_queue=False``): + Tokenized objects are sent directly via ``ZMQ send_pyobj`` (pickle). This + is simple but slow for large multimodal tensors. + +2. **Shared queue fast path** (``enable_shared_queue=True``): + Metadata is written to POSIX shared memory and the queue carries a + lightweight ``(rid, shm_name, mm_inputs)`` tuple. 
The GPU tensors inside + ``mm_inputs`` are transported differently depending on ``tensor_transport_mode``: + + * ``"default"`` – GPU tensors are moved to CPU first (GPU→CPU copy), + then placed in POSIX shared memory. + * ``"cuda_ipc"`` – GPU tensors stay on GPU; they are wrapped in a + :class:`TransportProxyTensor` whose pickle uses CUDA IPC handles. + Simple but may leak GPU memory. + * ``"cuda_ipc_pool"`` – GPU tensors are copied into a pre-allocated + :class:`MmItemMemoryPool` workspace and shared via pool-chunk IPC + handles. Chunks are recycled; no GPU memory is leaked. """ import logging @@ -17,6 +34,7 @@ from transformers import AutoProcessor, AutoTokenizer from pymllm.engine.io_struct import TokenizedGenerateReqInput +from pymllm.orchestrator.cuda_ipc_transport import MmItemMemoryPool, TensorTransportMode from pymllm.orchestrator.ipc_utils import create_zmq_socket from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue @@ -40,16 +58,22 @@ def __init__( Serialisable dict built by the parent process (``Engine``) before spawning. Required keys: - * ``tokenizer_path`` – str, path to the tokenizer directory. - * ``tokenizer_mode`` – ``"auto" | "slow" | "fast"``. - * ``trust_remote_code`` – bool. - * ``context_length`` – Optional[int], explicit cap; inferred from - ``hf_config`` when ``None``. - * ``hf_config`` – Optional HuggingFace PretrainedConfig - (pickled by multiprocessing); used only to infer ``context_length``. - * ``enable_shared_queue`` – bool, whether to use shared memory fast path. + * ``tokenizer_path`` – str, path to the tokenizer directory. + * ``tokenizer_mode`` – ``"auto" | "slow" | "fast"``. + * ``trust_remote_code`` – bool. + * ``context_length`` – Optional[int], explicit cap; inferred + from ``hf_config`` when ``None``. + * ``hf_config`` – Optional HuggingFace PretrainedConfig. + * ``enable_shared_queue`` – bool, whether to use shared memory fast path. + * ``tensor_transport_mode`` – ``"default" | "cuda_ipc" | "cuda_ipc_pool"``. + * ``cuda_ipc_pool_size_mb`` – int, pool size in MB (cuda_ipc_pool only). + * ``cuda_ipc_recycle_interval`` – float, recycler sleep interval (s). + shared_queue: - Optional TensorQueue for shared memory fast path communication. + Optional :class:`TensorQueue` for the shared memory fast path. + When *transport_mode* is ``"cuda_ipc_pool"`` this queue should have + been constructed with a ``MmItemMemoryPool``; the ``TokenizerProcess`` + initialises its own pool in that case. """ self._recv_from_rr_addr = recv_from_rr_addr self._send_to_scheduler_addr = send_to_scheduler_addr @@ -57,6 +81,21 @@ def __init__( self._enable_shared_queue = tokenizer_cfg.get("enable_shared_queue", False) self._shared_queue = shared_queue + # Tensor transport configuration + self._transport_mode: TensorTransportMode = tokenizer_cfg.get( + "tensor_transport_mode", "default" + ) + # Pool for cuda_ipc_pool mode – will be initialised lazily when the + # process first encounters a CUDA tensor. + self._ipc_pool: Optional[MmItemMemoryPool] = None + if self._transport_mode == "cuda_ipc_pool": + # The pool must be created inside the subprocess (after fork/spawn) + # because it allocates CUDA memory. We defer to _ensure_pool(). 
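For reference, a configuration that enables the pooled transport might look
like this (editorial sketch; values are illustrative and the model path is a
placeholder)::

    tokenizer_cfg = {
        "tokenizer_path": "/path/to/model",
        "tokenizer_mode": "auto",
        "trust_remote_code": True,
        "context_length": None,            # inferred from hf_config when None
        "hf_config": None,
        "enable_shared_queue": True,
        "tensor_transport_mode": "cuda_ipc_pool",
        "cuda_ipc_pool_size_mb": 512,
        "cuda_ipc_recycle_interval": 0.1,
    }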
+ pool_mb: int = int(tokenizer_cfg.get("cuda_ipc_pool_size_mb", 512)) + recycle: float = float(tokenizer_cfg.get("cuda_ipc_recycle_interval", 0.1)) + self._ipc_pool_size_mb = pool_mb + self._ipc_recycle_interval = recycle + self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_rr: Optional[zmq.Socket] = None self._send_to_scheduler: Optional[zmq.Socket] = None @@ -89,8 +128,9 @@ def init_sockets(self) -> None: def event_loop(self) -> None: """Infinite loop: recv raw request -> tokenize -> send to scheduler.""" logger.info( - "TokenizerProcess event loop started (shared_queue=%s)", + "TokenizerProcess event loop started (shared_queue=%s, transport=%s)", self._enable_shared_queue, + self._transport_mode, ) while True: raw_request: Dict[str, Any] = self._recv_from_rr.recv_pyobj() @@ -108,12 +148,19 @@ def _send_via_shared_queue( ) -> None: """Send tokenized request via shared memory + shared queue fast path. - Args: - tokenized: Either TokenizedGenerateReqInput dataclass or abort dict + GPU tensors inside ``mm_inputs`` are handled according to + ``self._transport_mode``: + + * ``"default"`` – moved to CPU via ``share_memory_()`` by ``TensorQueue``. + * ``"cuda_ipc"`` – wrapped in :class:`TransportProxyTensor` (stays on GPU). + * ``"cuda_ipc_pool"`` – copied into the :class:`MmItemMemoryPool` workspace and + wrapped in :class:`CudaIpcTensorTransportProxy`. + + Abort sentinel messages are forwarded via ZMQ (they are lightweight dicts). """ # Handle abort sentinel if isinstance(tokenized, dict) and tokenized.get("abort"): - # Fallback to ZMQ for abort messages + # Fallback to ZMQ for abort messages (no tensor payload) self._send_to_scheduler.send_pyobj(tokenized) return @@ -121,10 +168,14 @@ def _send_via_shared_queue( f"Expected TokenizedGenerateReqInput, got {type(tokenized)}" ) + # Lazily initialise the CUDA IPC pool (must happen inside the subprocess) + if self._transport_mode == "cuda_ipc_pool": + self._ensure_pool() + rid = tokenized.rid mm_inputs = tokenized.mm_inputs - # Create a lightweight metadata object (without mm_inputs) + # Create lightweight metadata object (mm_inputs sent separately via queue) metadata = TokenizedGenerateReqInput( rid=tokenized.rid, input_text=tokenized.input_text, @@ -143,9 +194,73 @@ def _send_via_shared_queue( shm_name = SharedMemoryManager.write_metadata(rid, metadata) # Put (rid, shm_name, mm_inputs) into shared queue + # TensorQueue.put() handles wrapping mm_inputs based on transport_mode self._shared_queue.put(rid, shm_name, mm_inputs) - logger.debug(f"Sent request {rid} via shared queue (shm={shm_name})") + logger.debug( + "Sent request %s via shared queue (shm=%s, transport=%s)", + rid, + shm_name, + self._transport_mode, + ) + + # ------------------------------------------------------------------ + # CUDA IPC pool initialisation (deferred to subprocess) + # ------------------------------------------------------------------ + + def _ensure_pool(self) -> None: + """Lazily create the MmItemMemoryPool inside the subprocess. + + This is deferred because CUDA context creation must happen after + ``torch.multiprocessing.Process`` has started (post-fork/spawn). + Once the pool is created we update the shared queue's transport config + in-place so the same underlying ``multiprocessing.Queue`` object is reused + (both processes already hold a reference to it). 
+ """ + if self._ipc_pool is not None: + return + try: + import torch + + if not torch.cuda.is_available(): + logger.warning( + "CUDA not available; falling back to transport_mode='default'" + ) + self._transport_mode = "default" + if self._shared_queue is not None: + self._shared_queue._transport_mode = "default" + return + + pool_bytes = self._ipc_pool_size_mb * 1024 * 1024 + device = torch.cuda.current_device() + self._ipc_pool = MmItemMemoryPool( + memory_size=pool_bytes, + recycle_interval=self._ipc_recycle_interval, + device=device, + ) + # Update the shared queue's config in-place. + # Both processes share the same multiprocessing.Queue object, so we + # just update the wrapper's transport metadata; the underlying queue + # pipe is unchanged. + if self._shared_queue is not None: + self._shared_queue._transport_mode = self._transport_mode + self._shared_queue._pool = self._ipc_pool + + logger.info( + "MmItemMemoryPool initialised: %d MB on cuda:%d", + self._ipc_pool_size_mb, + device, + ) + except Exception as exc: + logger.error( + "Failed to initialise MmItemMemoryPool: %s; " + "falling back to transport_mode='default'", + exc, + exc_info=True, + ) + self._transport_mode = "default" + if self._shared_queue is not None: + self._shared_queue._transport_mode = "default" # ------------------------------------------------------------------ # Tokenization and multimodal preprocessing @@ -352,6 +467,8 @@ def _collect_mm_inputs( return mm def shutdown(self) -> None: + if self._ipc_pool is not None: + self._ipc_pool.shutdown() if self._recv_from_rr is not None: self._recv_from_rr.close() if self._send_to_scheduler is not None: From f10363c757a471884f4620daa09dd8eea798872c Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 2 Mar 2026 06:55:36 +0000 Subject: [PATCH 32/42] feat(sampling): add sampling module with FlashInfer acceleration and PyTorch fallback - Introduce pymllm.layers.sampling with diverse sampling functions - Implement softmax with temperature scaling and FlashInfer support - Add category sampling from probabilities and logits with optional determinism - Support top-p (nucleus), top-k, and min-p sampling methods - Provide combined top-k + top-p sampling from logits and probabilities - Add probability renormalization for top-p and top-k thresholds - Implement top-k masking for logits to filter out lower probabilities - Include chain speculative sampling for accelerated sequence generation - Provide pure-PyTorch fallback implementations for all methods - Update pymllm.layers.__init__.py to export new sampling functions - Rename pymllm.executor.eager_runner.py to model_runner.py for clarity --- .../{eager_runner.py => model_runner.py} | 0 pymllm/layers/__init__.py | 26 + pymllm/layers/sampling.py | 767 ++++++++++++++++++ 3 files changed, 793 insertions(+) rename pymllm/executor/{eager_runner.py => model_runner.py} (100%) diff --git a/pymllm/executor/eager_runner.py b/pymllm/executor/model_runner.py similarity index 100% rename from pymllm/executor/eager_runner.py rename to pymllm/executor/model_runner.py diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index fd9a070ea..97cfb9211 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -13,6 +13,20 @@ apply_rope_pos_ids, apply_rope_with_cos_sin_cache, ) +from pymllm.layers.sampling import ( + chain_speculative_sampling, + min_p_sampling_from_probs, + sampling_from_logits, + sampling_from_probs, + softmax, + top_k_mask_logits, + top_k_renorm_probs, + 
top_k_sampling_from_probs, + top_k_top_p_sampling_from_logits, + top_k_top_p_sampling_from_probs, + top_p_renorm_probs, + top_p_sampling_from_probs, +) from pymllm.layers.utils import set_weight_attrs __all__ = [ @@ -32,4 +46,16 @@ "apply_rope_pos_ids", "apply_llama31_rope_pos_ids", "apply_rope_with_cos_sin_cache", + "softmax", + "sampling_from_probs", + "sampling_from_logits", + "top_p_sampling_from_probs", + "top_k_sampling_from_probs", + "min_p_sampling_from_probs", + "top_k_top_p_sampling_from_logits", + "top_k_top_p_sampling_from_probs", + "top_p_renorm_probs", + "top_k_renorm_probs", + "top_k_mask_logits", + "chain_speculative_sampling", ] diff --git a/pymllm/layers/sampling.py b/pymllm/layers/sampling.py index e69de29bb..ff84879cf 100644 --- a/pymllm/layers/sampling.py +++ b/pymllm/layers/sampling.py @@ -0,0 +1,767 @@ +"""Sampling operations with FlashInfer acceleration and PyTorch fallback. + +This module wraps all flashinfer.sampling APIs and provides pure-PyTorch +fallback implementations so that the rest of the codebase can import from +here without worrying about whether FlashInfer is installed. +""" + +from __future__ import annotations + +import logging +from typing import Optional, Tuple, Union + +import torch + +logger = logging.getLogger(__name__) + +try: + import flashinfer.sampling as _fi_sampling + + _HAS_FLASHINFER = True +except ImportError: + _HAS_FLASHINFER = False + logger.warning("flashinfer not found, falling back to PyTorch sampling kernels") + + +# --------------------------------------------------------------------------- +# Helper utilities (torch fallback) +# --------------------------------------------------------------------------- + + +def _resolve_indices( + data: torch.Tensor, indices: Optional[torch.Tensor] +) -> torch.Tensor: + """If *indices* is given, gather rows from *data* accordingly.""" + if indices is None: + return data + return data[indices.long()] + + +def _to_scalar_or_tensor( + value: Union[torch.Tensor, float, int], + batch_size: int, + device: torch.device, +) -> torch.Tensor: + """Broadcast a scalar or per-batch tensor to shape ``(batch_size,)``.""" + if isinstance(value, (int, float)): + return torch.full((batch_size,), value, device=device, dtype=torch.float32) + return value.to(device=device, dtype=torch.float32) + + +# --------------------------------------------------------------------------- +# softmax +# --------------------------------------------------------------------------- + + +def softmax( + logits: torch.Tensor, + temperature: Optional[Union[torch.Tensor, float]] = None, + enable_pdl: Optional[bool] = None, +) -> torch.Tensor: + """Safe softmax with optional temperature scaling. + + Parameters + ---------- + logits : torch.Tensor + Shape ``(batch_size, num_classes)``. + temperature : Optional[Union[torch.Tensor, float]] + Scalar or per-request ``(batch_size,)`` temperature. + enable_pdl : Optional[bool] + FlashInfer PDL flag (ignored in fallback). + + Returns + ------- + torch.Tensor + Probabilities with the same shape as *logits*. 
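Example of the temperature semantics (editorial sketch)::

    logits = torch.tensor([[2.0, 1.0, 0.0]], device="cuda")
    probs = softmax(logits, temperature=0.5)
    # identical to torch.softmax(logits / 0.5, dim=-1): lower temperature
    # sharpens the distribution, higher temperature flattens it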
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.softmax( + logits, temperature=temperature, enable_pdl=enable_pdl + ) + + if temperature is not None: + if isinstance(temperature, (int, float)): + logits = logits / temperature + else: + logits = logits / temperature.unsqueeze(-1) + return torch.softmax(logits, dim=-1) + + +# --------------------------------------------------------------------------- +# sampling_from_probs +# --------------------------------------------------------------------------- + + +def sampling_from_probs( + probs: torch.Tensor, + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Category sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)`` or ``(unique_batch_size, num_classes)`` + when *indices* is provided. + indices : Optional[torch.Tensor] + Maps each output to a row in *probs*. + deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. + """ + if _HAS_FLASHINFER: + return _fi_sampling.sampling_from_probs( + probs, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices) + samples = torch.multinomial(p.float(), num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# sampling_from_logits +# --------------------------------------------------------------------------- + + +def sampling_from_logits( + logits: torch.Tensor, + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Category sampling from logits (applies softmax internally). + + Parameters + ---------- + logits : torch.Tensor + ``(batch_size, num_classes)``. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. + """ + if _HAS_FLASHINFER: + return _fi_sampling.sampling_from_logits( + logits, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + probs = torch.softmax(logits.float(), dim=-1) + return sampling_from_probs( + probs, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + ) + + +# --------------------------------------------------------------------------- +# top_p_sampling_from_probs +# --------------------------------------------------------------------------- + + +def top_p_sampling_from_probs( + probs: torch.Tensor, + top_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-p (nucleus) sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_p : Union[torch.Tensor, float] + Top-p threshold. 
+ indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. + """ + if _HAS_FLASHINFER: + return _fi_sampling.top_p_sampling_from_probs( + probs, + top_p, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + renormed = _torch_top_p_renorm_probs(p, top_p) + samples = torch.multinomial(renormed, num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# top_k_sampling_from_probs +# --------------------------------------------------------------------------- + + +def top_k_sampling_from_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-k sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + Top-k threshold. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. + """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_sampling_from_probs( + probs, + top_k, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + renormed = _torch_top_k_renorm_probs(p, top_k) + samples = torch.multinomial(renormed, num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# min_p_sampling_from_probs +# --------------------------------------------------------------------------- + + +def min_p_sampling_from_probs( + probs: torch.Tensor, + min_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Min-p sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + min_p : Union[torch.Tensor, float] + Min-p threshold. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
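Worked example of the thresholding rule (editorial sketch; numbers chosen for
clarity)::

    probs = torch.tensor([[0.6, 0.3, 0.1]], device="cuda")
    # max_prob = 0.6, threshold = 0.5 * 0.6 = 0.3: the first two tokens
    # survive and are renormalised to [2/3, 1/3] before the draw
    token = min_p_sampling_from_probs(probs, min_p=0.5)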
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.min_p_sampling_from_probs( + probs, + min_p, + indices=indices, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + batch_size = p.shape[0] + min_p_t = _to_scalar_or_tensor(min_p, batch_size, p.device) + # min-p: keep tokens whose probability >= min_p * max_prob + max_probs = p.max(dim=-1, keepdim=True).values # (B,1) + threshold = min_p_t.unsqueeze(-1) * max_probs # (B,1) + mask = p < threshold + filtered = p.clone() + filtered[mask] = 0.0 + # renormalize + sums = filtered.sum(dim=-1, keepdim=True) + sums = sums.clamp(min=1e-8) + filtered = filtered / sums + samples = torch.multinomial(filtered, num_samples=1, generator=generator).squeeze( + -1 + ) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# top_k_top_p_sampling_from_logits +# --------------------------------------------------------------------------- + + +def top_k_top_p_sampling_from_logits( + logits: torch.Tensor, + top_k: Union[torch.Tensor, int], + top_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + filter_apply_order: str = "top_k_first", + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-k + top-p sampling from pre-softmax logits. + + Parameters + ---------- + logits : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + top_p : Union[torch.Tensor, float] + filter_apply_order : str + ``"top_k_first"`` or ``"joint"``. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. + """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_top_p_sampling_from_logits( + logits, + top_k, + top_p, + indices=indices, + filter_apply_order=filter_apply_order, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + probs = torch.softmax(logits.float(), dim=-1) + return top_k_top_p_sampling_from_probs( + probs, + top_k, + top_p, + indices=indices, + filter_apply_order=filter_apply_order, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + ) + + +# --------------------------------------------------------------------------- +# top_k_top_p_sampling_from_probs +# --------------------------------------------------------------------------- + + +def top_k_top_p_sampling_from_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], + top_p: Union[torch.Tensor, float], + indices: Optional[torch.Tensor] = None, + filter_apply_order: str = "top_k_first", + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + check_nan: bool = False, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> torch.Tensor: + """Top-k + top-p sampling from probabilities. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + top_p : Union[torch.Tensor, float] + filter_apply_order : str + ``"top_k_first"`` or ``"joint"``. + indices, deterministic, generator, check_nan, seed, offset + See FlashInfer docs. + + Returns + ------- + torch.Tensor + Sampled token ids, shape ``(batch_size,)``. 
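Typical use on model outputs (editorial sketch; random logits stand in for
real model logits)::

    logits = torch.randn(4, 32000, device="cuda")
    probs = torch.softmax(logits.float(), dim=-1)
    # keep the 50 most likely tokens, then apply the 0.9 nucleus cut within them
    tokens = top_k_top_p_sampling_from_probs(probs, top_k=50, top_p=0.9,
                                             filter_apply_order="top_k_first")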
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_top_p_sampling_from_probs( + probs, + top_k, + top_p, + indices=indices, + filter_apply_order=filter_apply_order, + deterministic=deterministic, + generator=generator, + check_nan=check_nan, + seed=seed, + offset=offset, + ) + + p = _resolve_indices(probs, indices).float() + if filter_apply_order == "top_k_first": + p = _torch_top_k_renorm_probs(p, top_k) + p = _torch_top_p_renorm_probs(p, top_p) + else: + # joint: apply both filters simultaneously + p = _torch_top_k_renorm_probs(p, top_k) + p = _torch_top_p_renorm_probs(p, top_p) + samples = torch.multinomial(p, num_samples=1, generator=generator).squeeze(-1) + return samples.to(torch.int32) + + +# --------------------------------------------------------------------------- +# top_p_renorm_probs +# --------------------------------------------------------------------------- + + +def top_p_renorm_probs( + probs: torch.Tensor, + top_p: Union[torch.Tensor, float], +) -> torch.Tensor: + """Renormalize probabilities by top-p thresholding. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_p : Union[torch.Tensor, float] + Top-p threshold in ``(0, 1)``. + + Returns + ------- + torch.Tensor + Renormalized probabilities. + """ + if _HAS_FLASHINFER: + return _fi_sampling.top_p_renorm_probs(probs, top_p) + + return _torch_top_p_renorm_probs(probs.float(), top_p).to(probs.dtype) + + +def _torch_top_p_renorm_probs( + probs: torch.Tensor, + top_p: Union[torch.Tensor, float], +) -> torch.Tensor: + """Pure-torch top-p renormalization (operates on float32).""" + sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True) + cumsum = torch.cumsum(sorted_probs, dim=-1) + + if isinstance(top_p, (int, float)): + mask = cumsum - sorted_probs > top_p + else: + top_p_t = top_p.unsqueeze(-1) + mask = cumsum - sorted_probs > top_p_t + + sorted_probs[mask] = 0.0 + # scatter back + result = torch.zeros_like(probs) + result.scatter_(1, sorted_indices, sorted_probs) + # renormalize + sums = result.sum(dim=-1, keepdim=True).clamp(min=1e-8) + return result / sums + + +# --------------------------------------------------------------------------- +# top_k_renorm_probs +# --------------------------------------------------------------------------- + + +def top_k_renorm_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], +) -> torch.Tensor: + """Renormalize probabilities by top-k thresholding. + + Parameters + ---------- + probs : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + Top-k threshold. + + Returns + ------- + torch.Tensor + Renormalized probabilities. 
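Worked example (editorial sketch; the values below are what the pure-PyTorch
fallback produces, and FlashInfer matches them on CUDA)::

    probs = torch.tensor([[0.5, 0.3, 0.2]])
    top_k_renorm_probs(probs, top_k=2)
    # -> tensor([[0.6250, 0.3750, 0.0000]])   (0.5/0.8, 0.3/0.8, masked)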
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_renorm_probs(probs, top_k) + + return _torch_top_k_renorm_probs(probs.float(), top_k).to(probs.dtype) + + +def _torch_top_k_renorm_probs( + probs: torch.Tensor, + top_k: Union[torch.Tensor, int], +) -> torch.Tensor: + """Pure-torch top-k renormalization (operates on float32).""" + if isinstance(top_k, int): + # uniform top_k across batch + topk_vals, _ = torch.topk(probs, top_k, dim=-1) + threshold = topk_vals[:, -1:] # (B, 1) + else: + # per-request top_k: use sorting + sorted_probs, _ = torch.sort(probs, dim=-1, descending=True) + # gather the k-th value for each row + k_indices = (top_k.long() - 1).unsqueeze(-1) # (B, 1) + threshold = sorted_probs.gather(1, k_indices) # (B, 1) + + mask = probs < threshold + filtered = probs.clone() + filtered[mask] = 0.0 + sums = filtered.sum(dim=-1, keepdim=True).clamp(min=1e-8) + return filtered / sums + + +# --------------------------------------------------------------------------- +# top_k_mask_logits +# --------------------------------------------------------------------------- + + +def top_k_mask_logits( + logits: torch.Tensor, + top_k: Union[torch.Tensor, int], +) -> torch.Tensor: + """Mask logits by top-k thresholding (set non-top-k to -inf). + + Parameters + ---------- + logits : torch.Tensor + ``(batch_size, num_classes)``. + top_k : Union[torch.Tensor, int] + Top-k threshold. + + Returns + ------- + torch.Tensor + Masked logits with the same shape and dtype. + """ + if _HAS_FLASHINFER: + return _fi_sampling.top_k_mask_logits(logits, top_k) + + if isinstance(top_k, int): + topk_vals, _ = torch.topk(logits, top_k, dim=-1) + threshold = topk_vals[:, -1:] + else: + sorted_logits, _ = torch.sort(logits, dim=-1, descending=True) + k_indices = (top_k.long() - 1).unsqueeze(-1) + threshold = sorted_logits.gather(1, k_indices) + + mask = logits < threshold + result = logits.clone() + result[mask] = float("-inf") + return result + + +# --------------------------------------------------------------------------- +# chain_speculative_sampling +# --------------------------------------------------------------------------- + + +def chain_speculative_sampling( + draft_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + target_probs: torch.Tensor, + maybe_output_accepted_token_num: Optional[torch.Tensor] = None, + maybe_output_emitted_draft_token_num: Optional[torch.Tensor] = None, + deterministic: bool = True, + generator: Optional[torch.Generator] = None, + seed: Optional[int] = None, + offset: Optional[int] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Speculative sampling for sequence generation. + + Parameters + ---------- + draft_probs : torch.Tensor + ``(batch_size, num_speculate_tokens, vocab_size)``. + draft_token_ids : torch.Tensor + ``(batch_size, num_speculate_tokens)``. + target_probs : torch.Tensor + ``(batch_size, num_speculate_tokens + 1, vocab_size)``. + maybe_output_accepted_token_num : Optional[torch.Tensor] + If provided, accepted counts are added in-place. + maybe_output_emitted_draft_token_num : Optional[torch.Tensor] + If provided, emitted counts are added in-place. + deterministic, generator, seed, offset + See FlashInfer docs. + + Returns + ------- + output_token_ids : torch.Tensor + ``(batch_size, num_speculate_tokens + 1)``, rejected slots padded with -1. + output_accepted_token_num : torch.Tensor + ``(batch_size,)``. + output_emitted_draft_token_num : torch.Tensor + ``(batch_size,)``. 
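Shape-level sketch (editorial example; uniform random distributions stand in
for real draft/target model outputs)::

    B, K, V = 2, 4, 32000                     # batch, speculative steps, vocab
    draft_probs = torch.rand(B, K, V).softmax(-1)
    draft_tokens = torch.randint(0, V, (B, K), dtype=torch.int32)
    target_probs = torch.rand(B, K + 1, V).softmax(-1)
    out_ids, accepted, emitted = chain_speculative_sampling(
        draft_probs, draft_tokens, target_probs
    )
    # out_ids has shape (B, K + 1); slots after the first rejection stay -1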
+ """ + if _HAS_FLASHINFER: + return _fi_sampling.chain_speculative_sampling( + draft_probs, + draft_token_ids, + target_probs, + maybe_output_accepted_token_num=maybe_output_accepted_token_num, + maybe_output_emitted_draft_token_num=maybe_output_emitted_draft_token_num, + deterministic=deterministic, + generator=generator, + seed=seed, + offset=offset, + ) + + return _torch_chain_speculative_sampling( + draft_probs, + draft_token_ids, + target_probs, + maybe_output_accepted_token_num, + maybe_output_emitted_draft_token_num, + generator, + ) + + +def _torch_chain_speculative_sampling( + draft_probs: torch.Tensor, + draft_token_ids: torch.Tensor, + target_probs: torch.Tensor, + maybe_output_accepted_token_num: Optional[torch.Tensor], + maybe_output_emitted_draft_token_num: Optional[torch.Tensor], + generator: Optional[torch.Generator], +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Pure-torch chain speculative sampling. + + Implements the rejection-sampling algorithm from + "Accelerating Large Language Model Decoding with Speculative Sampling" + (Leviathan et al., 2023). + """ + batch_size, num_spec, vocab_size = draft_probs.shape + device = draft_probs.device + + output_ids = torch.full( + (batch_size, num_spec + 1), -1, dtype=torch.int32, device=device + ) + accepted_count = torch.zeros(batch_size, dtype=torch.int32, device=device) + emitted_count = torch.zeros(batch_size, dtype=torch.int32, device=device) + + for b in range(batch_size): + all_accepted = True + for t in range(num_spec): + draft_tok = draft_token_ids[b, t].item() + p_draft = draft_probs[b, t, draft_tok].item() + p_target = target_probs[b, t, draft_tok].item() + + # independent acceptance check (for the metric) + if p_target >= p_draft: + accepted_count[b] += 1 + else: + r = torch.rand(1, generator=generator, device=device).item() + if r < p_target / max(p_draft, 1e-10): + accepted_count[b] += 1 + + # sequential chain: accept / reject + if all_accepted: + r = torch.rand(1, generator=generator, device=device).item() + if r < min(1.0, p_target / max(p_draft, 1e-10)): + output_ids[b, t] = draft_tok + emitted_count[b] += 1 + else: + # reject: sample from max(0, p_target - p_draft) + diff = target_probs[b, t].float() - draft_probs[b, t].float() + diff = torch.clamp(diff, min=0.0) + dsum = diff.sum() + if dsum > 1e-8: + diff = diff / dsum + else: + diff = target_probs[b, t].float() + diff = diff / diff.sum().clamp(min=1e-8) + resampled = torch.multinomial( + diff.unsqueeze(0), num_samples=1, generator=generator + ).item() + output_ids[b, t] = resampled + emitted_count[b] += 1 + all_accepted = False + + # bonus token (sampled from target at position after last emitted) + if all_accepted: + pos = num_spec + bonus_probs = target_probs[b, pos].float() + bonus_probs = bonus_probs / bonus_probs.sum().clamp(min=1e-8) + bonus = torch.multinomial( + bonus_probs.unsqueeze(0), num_samples=1, generator=generator + ).item() + output_ids[b, num_spec] = bonus + + if maybe_output_accepted_token_num is not None: + maybe_output_accepted_token_num.add_(accepted_count) + if maybe_output_emitted_draft_token_num is not None: + maybe_output_emitted_draft_token_num.add_(emitted_count) + + return output_ids, accepted_count, emitted_count + + +# --------------------------------------------------------------------------- +# Aliases (FlashInfer also exposes these) +# --------------------------------------------------------------------------- +top_p_renorm_prob = top_p_renorm_probs +top_k_renorm_prob = top_k_renorm_probs From 
c366ffcd1698fbfa843621415118b52046614636 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Mon, 9 Mar 2026 07:48:45 +0000 Subject: [PATCH 33/42] feat(cuda): add fused GDN decode and RMSNorm+SiLU gating kernels for attention - Implement fused GDN decode kernel performing gating, L2 normalization, delta update, and output computation in a single CUDA kernel optimized for SM80+ architectures - Add fused RMSNorm with optional SiLU gating kernel for Qwen3.5 GDN attention - Provide JIT Python wrappers for both kernels to enable easy integration - Extend CUDA JIT module imports to include new gdn_decode kernel - Update global and server configs with new backend options and default values - Enhance argument parsing in global_config to support literal choice constraints - Add fields for multimodal M-RoPE and vision inputs in ForwardBatch for decoding - Implement EOS token ID extraction and max output tokens normalization in launcher module --- .gitignore | 1 + .../mllm_kernel/cuda/csrc/gdn_decode.cuh | 432 ++++++ .../mllm_kernel/cuda/csrc/rms_norm_gated.cuh | 212 +++ mllm-kernel/mllm_kernel/cuda/jit/__init__.py | 3 +- .../mllm_kernel/cuda/jit/gdn_decode.py | 114 ++ .../mllm_kernel/cuda/jit/rms_norm_gated.py | 87 ++ pymllm/configs/global_config.py | 31 +- pymllm/configs/server_config.py | 23 +- pymllm/engine/forward_batch.py | 9 + pymllm/engine/launch.py | 206 ++- pymllm/executor/__init__.py | 10 + pymllm/executor/cuda_graph_runner.py | 590 ++++++++ pymllm/executor/model_runner.py | 1198 +++++++++++++++ pymllm/layers/__init__.py | 4 + pymllm/layers/attention/__init__.py | 8 + pymllm/layers/attention/attention_backend.py | 22 + pymllm/layers/attention/gdn_backend.py | 660 ++++++++ pymllm/layers/attention/hybrid_backend.py | 184 +++ .../attention/radix_linear_attention.py | 116 ++ pymllm/layers/gated_delta_net.py | 168 +++ pymllm/layers/rms_norm.py | 24 +- pymllm/layers/rms_norm_gated.py | 154 ++ pymllm/layers/rope.py | 147 +- pymllm/layers/sampling.py | 9 + pymllm/mem_cache/memory_pool.py | 159 ++ pymllm/mem_cache/radix_cache.py | 40 +- pymllm/models/__init__.py | 62 + pymllm/models/qwen3_5.py | 530 +++++++ pymllm/models/qwen3_vl.py | 1329 +++++++++++++++++ pymllm/orchestrator/async_disk_io_process.py | 84 -- pymllm/orchestrator/detokenizer_process.py | 116 +- pymllm/orchestrator/ipc_utils.py | 22 + pymllm/orchestrator/model_runner_process.py | 1007 +++++++++++-- .../orchestrator/request_response_process.py | 23 +- pymllm/orchestrator/scheduler_process.py | 820 +++++++++- pymllm/orchestrator/tokenizer_process.py | 4 +- pymllm/parsers/__init__.py | 10 + pymllm/parsers/reasoning_parser.py | 212 +++ pymllm/parsers/tool_call_parser.py | 433 ++++++ pymllm/server/launch.py | 923 +++++++++++- 40 files changed, 9810 insertions(+), 376 deletions(-) create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py create mode 100644 pymllm/layers/attention/gdn_backend.py create mode 100644 pymllm/layers/attention/hybrid_backend.py create mode 100644 pymllm/layers/attention/radix_linear_attention.py create mode 100644 pymllm/layers/gated_delta_net.py create mode 100644 pymllm/layers/rms_norm_gated.py create mode 100644 pymllm/models/qwen3_5.py create mode 100644 pymllm/models/qwen3_vl.py delete mode 100644 pymllm/orchestrator/async_disk_io_process.py create mode 100644 
pymllm/parsers/__init__.py create mode 100644 pymllm/parsers/reasoning_parser.py create mode 100644 pymllm/parsers/tool_call_parser.py diff --git a/.gitignore b/.gitignore index cdafc2707..7f14b37ec 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ .cache/ .tmp/ compile_commands.json +settings.local.json # MLLM Team Specific tasks/mllmteam* diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh new file mode 100644 index 000000000..4c2833c06 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gdn_decode.cuh @@ -0,0 +1,432 @@ +// Fused GDN (Gated Delta Net) decode kernel for linear attention. +// +// Performs a single-token recurrent update per request: +// g = -exp(A_log) * softplus(a + dt_bias) +// beta = sigmoid(b) +// q = L2norm(q) * scale +// k = L2norm(k) +// state *= exp(g) (decay) +// v_delta = v - state @ k (delta rule) +// v_delta *= beta (gated update) +// state += v_delta outer k (state update) +// output = state @ q (readout) +// +// Works on SM80+ (Ampere, Jetson Orin, Hopper, ...). +// Matches the algorithm of sglang's fused_sigmoid_gating_delta_rule_update. +// +// Grid : (NV, bs * HV) where NV = ceil(V / BV) +// Block: BLOCK_K threads (one thread per K-dimension element) +// +// Each thread owns BV state elements at its K position. +// Two cross-thread reductions (over K) compute delta and output dot products. + +#pragma once + +#include +#include + +#include +#include + +#include +#include +#include + +#include + +namespace GDNDecodeKernel { + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +inline constexpr int BV = 32; // V-dimension tile size + +// --------------------------------------------------------------------------- +// Warp-level reduction +// --------------------------------------------------------------------------- + +__device__ __forceinline__ float warp_reduce_sum(float val) { + #pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + val += __shfl_xor_sync(0xffffffff, val, offset); + } + return val; +} + +// --------------------------------------------------------------------------- +// Type conversion helpers +// --------------------------------------------------------------------------- + +template +__device__ __forceinline__ float to_float(T val); + +template <> +__device__ __forceinline__ float to_float<__half>(__half val) { + return __half2float(val); +} + +template <> +__device__ __forceinline__ float to_float<__nv_bfloat16>(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template <> +__device__ __forceinline__ float to_float(float val) { + return val; +} + +template +__device__ __forceinline__ T from_float(float val); + +template <> +__device__ __forceinline__ __half from_float<__half>(float val) { + return __float2half(val); +} + +template <> +__device__ __forceinline__ __nv_bfloat16 from_float<__nv_bfloat16>(float val) { + return __float2bfloat16(val); +} + +template <> +__device__ __forceinline__ float from_float(float val) { + return val; +} + +// --------------------------------------------------------------------------- +// Block-level scalar reduction (sum across all threads → broadcast result) +// --------------------------------------------------------------------------- + +// Reduces a scalar across all threads in the block. +// Returns the sum in ALL threads (via shared memory broadcast). 
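+// Two stages: each warp reduces its lanes with shuffle instructions, lane 0
+// of every warp writes its partial sum to smem, and the first warp then sums
+// the per-warp partials and broadcasts the total through smem[0].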
+// smem must have at least (blockDim.x / 32) floats. +__device__ __forceinline__ float block_reduce_sum(float val, float* smem) { + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; + const int num_warps = blockDim.x / 32; + + val = warp_reduce_sum(val); + if (lane_id == 0) smem[warp_id] = val; + __syncthreads(); + + // First warp reduces across warps + if (warp_id == 0) { + float v = (lane_id < num_warps) ? smem[lane_id] : 0.0f; + v = warp_reduce_sum(v); + if (lane_id == 0) smem[0] = v; + } + __syncthreads(); + return smem[0]; +} + +// --------------------------------------------------------------------------- +// Block-level vector reduction: BV independent sums across all K threads +// --------------------------------------------------------------------------- + +// Each thread contributes partial[0..BV-1]. After this call, the results +// are written to out[0..BV-1] and are valid in all threads. +// reduce_buf must have at least BV * num_warps floats. +// broadcast_buf must have at least BV floats. +__device__ __forceinline__ void block_reduce_bv( + float partial[BV], + float* reduce_buf, // [num_warps * BV] + float* broadcast_buf, // [BV] + float out[BV] +) { + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; + const int num_warps = blockDim.x / 32; + + // Intra-warp reduction for each bv + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + float val = warp_reduce_sum(partial[bv]); + if (lane_id == 0) { + reduce_buf[warp_id * BV + bv] = val; + } + } + __syncthreads(); + + // Inter-warp reduction: threads 0..BV-1 each reduce one bv + if (threadIdx.x < BV) { + float sum = 0.0f; + #pragma unroll 8 + for (int w = 0; w < num_warps; w++) { + sum += reduce_buf[w * BV + threadIdx.x]; + } + broadcast_buf[threadIdx.x] = sum; + } + __syncthreads(); + + // Broadcast to all threads + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + out[bv] = broadcast_buf[bv]; + } +} + +// --------------------------------------------------------------------------- +// Main GDN decode kernel +// --------------------------------------------------------------------------- + +template +__global__ void gdn_decode_kernel( + const T* __restrict__ q_ptr, // [bs, H, K] + const T* __restrict__ k_ptr, // [bs, H, K] + const T* __restrict__ v_ptr, // [bs, HV, V] + const T* __restrict__ a_ptr, // [bs, HV] + const T* __restrict__ b_ptr, // [bs, HV] + const float* __restrict__ A_log_ptr, // [HV] + const float* __restrict__ dt_bias_ptr, // [HV] + float* __restrict__ state_pool, // [pool_size, HV, V, K] + const int64_t* __restrict__ cache_indices, // [bs] + T* __restrict__ output_ptr, // [bs, HV, V] + const int bs, + const int H, // num_k_heads + const int HV, // num_v_heads + const int K, // head_k_dim + const int V, // head_v_dim + const float scale // K^-0.5 +) { + // Block indices + const int bv_block = blockIdx.x; // V-tile index + const int batch_head = blockIdx.y; // batch * HV + const int i_n = batch_head / HV; // batch index + const int i_hv = batch_head % HV; // value head index + const int i_h = i_hv * H / HV; // key head index (GQA mapping) + const int k_idx = threadIdx.x; // K-dimension index + const int v_start = bv_block * BV; // V-dimension start + + if (i_n >= bs) return; + + // Shared memory layout (declared dynamically) + extern __shared__ float smem[]; + const int num_warps = BLOCK_K / 32; + float* sq = smem; // [BLOCK_K] + float* sk = smem + BLOCK_K; // [BLOCK_K] + float* sv_broadcast = smem + 2 * BLOCK_K; // [BV] + float* warp_buf = smem + 2 * 
BLOCK_K + BV; // [num_warps] + float* reduce_buf = smem + 2 * BLOCK_K + BV + num_warps; // [BV * num_warps] + + // ===== 1. Load gating parameters and compute decay + beta ===== + // All threads load the same scalars (cheap, avoids shared memory) + const float A_log_val = A_log_ptr[i_hv]; + const float dt_bias_val = dt_bias_ptr[i_hv]; + const float a_val = to_float(a_ptr[i_n * HV + i_hv]); + const float b_val = to_float(b_ptr[i_n * HV + i_hv]); + + const float x = a_val + dt_bias_val; + // softplus with numerical stability: softplus(x) = log(1+exp(x)), or x for x>20 + const float softplus_x = (x <= 20.0f) ? logf(1.0f + expf(x)) : x; + const float g = -expf(A_log_val) * softplus_x; + const float decay = expf(g); + const float beta = 1.0f / (1.0f + expf(-b_val)); + + // ===== 2. Load q, k and compute L2 norms ===== + float q_val = 0.0f, k_val = 0.0f; + if (k_idx < K) { + q_val = to_float(q_ptr[i_n * H * K + i_h * K + k_idx]); + k_val = to_float(k_ptr[i_n * H * K + i_h * K + k_idx]); + } + + // L2 norm: reduce q*q and k*k across block + float q_sq_sum = block_reduce_sum(q_val * q_val, warp_buf); + float k_sq_sum = block_reduce_sum(k_val * k_val, warp_buf); + + float q_norm = rsqrtf(q_sq_sum + 1e-6f); + float k_norm = rsqrtf(k_sq_sum + 1e-6f); + + // Store normalized q (scaled) and k in shared memory + if (k_idx < K) { + sq[k_idx] = q_val * q_norm * scale; + sk[k_idx] = k_val * k_norm; + } else { + sq[k_idx] = 0.0f; + sk[k_idx] = 0.0f; + } + __syncthreads(); + + // ===== 3. Load state elements for this thread ===== + const int64_t pool_idx = cache_indices[i_n]; + // state_pool layout: [pool_size, HV, V, K] + const int64_t state_base = pool_idx * HV * V * K + i_hv * V * K; + + float state[BV]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + const int v_idx = v_start + bv; + if (v_idx < V && k_idx < K) { + state[bv] = state_pool[state_base + (int64_t)v_idx * K + k_idx]; + } else { + state[bv] = 0.0f; + } + } + + // ===== 4. Decay: state *= exp(g) ===== + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + state[bv] *= decay; + } + + // ===== 5. Delta: v_delta[bv] = v[bv] - sum_k(state[bv,k] * k_norm[k]) ===== + float partial_delta[BV]; + const float my_k = sk[k_idx]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + partial_delta[bv] = state[bv] * my_k; + } + + float delta[BV]; + block_reduce_bv(partial_delta, reduce_buf, sv_broadcast, delta); + + // Compute v_delta = (v - delta) * beta and broadcast to all threads. + // Threads 0..BV-1 each load one v element, compute v_delta, write to smem. + if (k_idx < BV) { + const int my_v_idx = v_start + k_idx; + float my_v = (my_v_idx < V) + ? to_float(v_ptr[i_n * HV * V + i_hv * V + my_v_idx]) + : 0.0f; + sv_broadcast[k_idx] = (my_v - delta[k_idx]) * beta; + } + __syncthreads(); + + float v_delta[BV]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + v_delta[bv] = sv_broadcast[bv]; + } + + // ===== 6. State update: state[bv,k] += v_delta[bv] * k_norm[k] ===== + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + state[bv] += v_delta[bv] * my_k; + } + + // ===== 7. Output: o[bv] = sum_k(state[bv,k] * q_norm_scaled[k]) ===== + float partial_out[BV]; + const float my_q = sq[k_idx]; + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + partial_out[bv] = state[bv] * my_q; + } + + float out_vals[BV]; + block_reduce_bv(partial_out, reduce_buf, sv_broadcast, out_vals); + + // ===== 8. 
Store output ===== + // output layout: [bs, HV, V] + if (k_idx < BV) { + const int v_idx = v_start + k_idx; + if (v_idx < V) { + output_ptr[i_n * HV * V + i_hv * V + v_idx] = from_float(out_vals[k_idx]); + } + } + + // ===== 9. Store state back to pool ===== + #pragma unroll + for (int bv = 0; bv < BV; bv++) { + const int v_idx = v_start + bv; + if (v_idx < V && k_idx < K) { + state_pool[state_base + (int64_t)v_idx * K + k_idx] = state[bv]; + } + } +} + +// --------------------------------------------------------------------------- +// Launch wrapper (called via TVM FFI) +// --------------------------------------------------------------------------- + +void run( + tvm::ffi::TensorView q, // [bs, H, K] + tvm::ffi::TensorView k, // [bs, H, K] + tvm::ffi::TensorView v, // [bs, HV, V] + tvm::ffi::TensorView a, // [bs, HV] + tvm::ffi::TensorView b, // [bs, HV] + tvm::ffi::TensorView A_log, // [HV] + tvm::ffi::TensorView dt_bias, // [HV] + tvm::ffi::TensorView state_pool, // [pool_size, HV, V, K] + tvm::ffi::TensorView cache_indices, // [bs] + tvm::ffi::TensorView output // [bs, HV, V] +) { + using namespace mllm_kernel::host; + + // --- Extract dimensions --- + auto BS = SymbolicSize{"bs"}; + auto H_ = SymbolicSize{"H"}; + auto HV_ = SymbolicSize{"HV"}; + auto K_ = SymbolicSize{"K"}; + auto V_ = SymbolicSize{"V"}; + auto PS = SymbolicSize{"pool_size"}; + auto dtype = SymbolicDType{}; + auto device = SymbolicDevice{}; + device.set_options(); + dtype.set_options(); + + (void)TensorMatcher({BS, H_, K_}).with_dtype(dtype).with_device(device).verify(q); + (void)TensorMatcher({BS, H_, K_}).with_dtype(dtype).with_device(device).verify(k); + (void)TensorMatcher({BS, HV_, V_}).with_dtype(dtype).with_device(device).verify(v); + (void)TensorMatcher({BS, HV_}).with_dtype(dtype).with_device(device).verify(a); + (void)TensorMatcher({BS, HV_}).with_dtype(dtype).with_device(device).verify(b); + (void)TensorMatcher({HV_}).with_dtype().with_device(device).verify(A_log); + (void)TensorMatcher({HV_}).with_dtype().with_device(device).verify(dt_bias); + (void)TensorMatcher({PS, HV_, V_, K_}).with_dtype().with_device(device).verify(state_pool); + (void)TensorMatcher({BS}).with_device(device).verify(cache_indices); + (void)TensorMatcher({BS, HV_, V_}).with_dtype(dtype).with_device(device).verify(output); + + const int bs = static_cast(BS.unwrap()); + const int H = static_cast(H_.unwrap()); + const int HV = static_cast(HV_.unwrap()); + const int K = static_cast(K_.unwrap()); + const int V = static_cast(V_.unwrap()); + const float scale = 1.0f / sqrtf(static_cast(K)); + + // Block size = K (rounded up to warp multiple, max 1024) + int block_k = ((K + 31) / 32) * 32; + if (block_k > 1024) block_k = 1024; + const int num_warps = block_k / 32; + + // Grid + const int NV = (V + BV - 1) / BV; + dim3 grid(NV, bs * HV); + dim3 block(block_k); + + // Dynamic shared memory: sq[block_k] + sk[block_k] + sv[BV] + warp_buf[nw] + reduce[BV*nw] + const size_t smem_bytes = (2 * block_k + BV + num_warps + BV * num_warps) * sizeof(float); + + const DLDevice dl_device = device.unwrap(); + + // Typed launch helper + #define LAUNCH_GDN_DECODE(CType, BKVAL) \ + LaunchKernel(grid, block, dl_device, smem_bytes)( \ + gdn_decode_kernel, \ + static_cast(q.data_ptr()), \ + static_cast(k.data_ptr()), \ + static_cast(v.data_ptr()), \ + static_cast(a.data_ptr()), \ + static_cast(b.data_ptr()), \ + static_cast(A_log.data_ptr()), \ + static_cast(dt_bias.data_ptr()), \ + static_cast(state_pool.data_ptr()), \ + static_cast(cache_indices.data_ptr()), \ + 
static_cast(output.data_ptr()), \ + bs, H, HV, K, V, scale \ + ) + + // Dispatch based on dtype and block size + if (dtype.is_type()) { + if (block_k == 64) { LAUNCH_GDN_DECODE(__nv_bfloat16, 64); } + else if (block_k == 128) { LAUNCH_GDN_DECODE(__nv_bfloat16, 128); } + else if (block_k == 256) { LAUNCH_GDN_DECODE(__nv_bfloat16, 256); } + else { LAUNCH_GDN_DECODE(__nv_bfloat16, 256); } + } else { + if (block_k == 64) { LAUNCH_GDN_DECODE(__half, 64); } + else if (block_k == 128) { LAUNCH_GDN_DECODE(__half, 128); } + else if (block_k == 256) { LAUNCH_GDN_DECODE(__half, 256); } + else { LAUNCH_GDN_DECODE(__half, 256); } + } + + #undef LAUNCH_GDN_DECODE +} + +} // namespace GDNDecodeKernel diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh new file mode 100644 index 000000000..b61246029 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/rms_norm_gated.cuh @@ -0,0 +1,212 @@ +// Fused RMSNorm with optional SiLU gating for Qwen3.5 GDN attention. +// +// Computes: output = rmsnorm(x, weight, eps) * silu(z) (if z provided) +// output = rmsnorm(x, weight, eps) (if z is null) +// +// Where: rmsnorm(x) = x / sqrt(mean(x^2) + eps) * weight +// silu(z) = z * sigmoid(z) +// +// This kernel fuses both operations into a single pass over the data, +// maximizing memory bandwidth utilization. Each block processes one row +// (one token position). +// +// Supported dtypes: float16, bfloat16 (accumulation in float32). + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace RMSNormGatedKernel { + +// --------------------------------------------------------------------------- +// Warp-level reduction +// --------------------------------------------------------------------------- + +__device__ __forceinline__ float warp_reduce_sum(float val) { + #pragma unroll + for (int offset = 16; offset > 0; offset >>= 1) { + val += __shfl_xor_sync(0xffffffff, val, offset); + } + return val; +} + +// --------------------------------------------------------------------------- +// Type conversion helpers +// --------------------------------------------------------------------------- + +template +__device__ __forceinline__ float to_float(T val); + +template <> +__device__ __forceinline__ float to_float(half val) { + return __half2float(val); +} + +template <> +__device__ __forceinline__ float to_float<__nv_bfloat16>(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template <> +__device__ __forceinline__ float to_float(float val) { + return val; +} + +template +__device__ __forceinline__ T from_float(float val); + +template <> +__device__ __forceinline__ half from_float(float val) { + return __float2half(val); +} + +template <> +__device__ __forceinline__ __nv_bfloat16 from_float<__nv_bfloat16>(float val) { + return __float2bfloat16(val); +} + +template <> +__device__ __forceinline__ float from_float(float val) { + return val; +} + +// --------------------------------------------------------------------------- +// Main kernel +// --------------------------------------------------------------------------- + +template +__global__ void rms_norm_gated_kernel( + T* __restrict__ output, // [M, N] + const T* __restrict__ input, // [M, N] + const T* __restrict__ weight, // [N] + const T* __restrict__ gate, // [M, N] or nullptr + const int M, // number of rows + const int N, // number of columns (hidden_size) + const float eps +) { + const int row = blockIdx.x; + if (row >= M) return; + + const int 
tid = threadIdx.x; + const T* x_row = input + row * N; + T* out_row = output + row * N; + const T* z_row = (gate != nullptr) ? gate + row * N : nullptr; + + // --- Pass 1: compute sum of squares --- + float sum_sq = 0.0f; + for (int col = tid; col < N; col += BLOCK_SIZE) { + float val = to_float(x_row[col]); + sum_sq += val * val; + } + + // Block-level reduction + __shared__ float shared_sum[32]; // one per warp + int warp_id = tid / 32; + int lane_id = tid % 32; + + sum_sq = warp_reduce_sum(sum_sq); + if (lane_id == 0) { + shared_sum[warp_id] = sum_sq; + } + __syncthreads(); + + // Final reduction in first warp + if (warp_id == 0) { + float val = (lane_id < (BLOCK_SIZE / 32)) ? shared_sum[lane_id] : 0.0f; + val = warp_reduce_sum(val); + if (lane_id == 0) { + shared_sum[0] = val; + } + } + __syncthreads(); + + float rms = rsqrtf(shared_sum[0] / (float)N + eps); + + // --- Pass 2: normalize, scale by weight, optionally gate with silu(z) --- + for (int col = tid; col < N; col += BLOCK_SIZE) { + float val = to_float(x_row[col]); + float w = to_float(weight[col]); + + float normed = val * rms * w; + + if (z_row != nullptr) { + float z = to_float(z_row[col]); + // silu(z) = z * sigmoid(z) + float silu_z = z / (1.0f + expf(-z)); + normed *= silu_z; + } + + out_row[col] = from_float(normed); + } +} + +// --------------------------------------------------------------------------- +// Launch wrapper (called via TVM FFI) +// --------------------------------------------------------------------------- + +void run( + tvm::ffi::TensorView output, + tvm::ffi::TensorView input, + tvm::ffi::TensorView weight, + tvm::ffi::TensorView gate, // empty tensor (numel==0) means no gate + double eps +) { + using namespace mllm_kernel::host; + + auto M = SymbolicSize{"M"}; + auto N = SymbolicSize{"N"}; + auto dtype = SymbolicDType{}; + auto device = SymbolicDevice{}; + device.set_options(); + dtype.set_options(); + + (void)TensorMatcher({M, N}).with_dtype(dtype).with_device(device).verify(input); + (void)TensorMatcher({M, N}).with_dtype(dtype).with_device(device).verify(output); + (void)TensorMatcher({N}).with_dtype(dtype).with_device(device).verify(weight); + + const int rows = static_cast(M.unwrap()); + const int cols = static_cast(N.unwrap()); + const bool has_gate = (gate.numel() > 0); + + constexpr int BLOCK_SIZE = 256; + + if (dtype.is_type()) { + LaunchKernel(rows, BLOCK_SIZE, device.unwrap())( + rms_norm_gated_kernel, + static_cast(output.data_ptr()), + static_cast(input.data_ptr()), + static_cast(weight.data_ptr()), + has_gate ? static_cast(gate.data_ptr()) : nullptr, + rows, cols, static_cast(eps) + ); + } else if (dtype.is_type()) { + LaunchKernel(rows, BLOCK_SIZE, device.unwrap())( + rms_norm_gated_kernel<__nv_bfloat16, BLOCK_SIZE>, + static_cast<__nv_bfloat16*>(output.data_ptr()), + static_cast(input.data_ptr()), + static_cast(weight.data_ptr()), + has_gate ? static_cast(gate.data_ptr()) : nullptr, + rows, cols, static_cast(eps) + ); + } else { + LaunchKernel(rows, BLOCK_SIZE, device.unwrap())( + rms_norm_gated_kernel, + static_cast(output.data_ptr()), + static_cast(input.data_ptr()), + static_cast(weight.data_ptr()), + has_gate ? 
static_cast(gate.data_ptr()) : nullptr, + rows, cols, static_cast(eps) + ); + } +} + +} // namespace RMSNormGatedKernel diff --git a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py index 202ff3b36..cc4ab667a 100644 --- a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py +++ b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py @@ -1,4 +1,5 @@ from .add_constant import add_constant +from .gdn_decode import gdn_decode from .store_cache import can_use_store_cache, store_cache -__all__ = ["add_constant", "can_use_store_cache", "store_cache"] +__all__ = ["add_constant", "can_use_store_cache", "gdn_decode", "store_cache"] diff --git a/mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py b/mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py new file mode 100644 index 000000000..53aaeaab3 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/gdn_decode.py @@ -0,0 +1,114 @@ +"""Fused GDN decode CUDA JIT kernel. + +Performs a single-token GDN (Gated Delta Net) recurrent update per request, +fusing gating + L2 normalization + delta rule + output computation into +one kernel. Works on SM80+ (Ampere, Jetson Orin, Hopper, ...). + +Usage:: + + from mllm_kernel.cuda.jit.gdn_decode import gdn_decode + + output = gdn_decode(q, k, v, a, b, A_log, dt_bias, state_pool, cache_indices) +""" + +from __future__ import annotations + +import torch + +from mllm_kernel.jit_utils import cache_once, jit + + +@cache_once +def _make_gdn_decode_kernel(): + """JIT-compile the fused GDN decode CUDA kernel.""" + + @jit( + args=[], + device="cuda", + cuda_files=["gdn_decode.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("gdn_decode", "GDNDecodeKernel::run"), + ], + func_name="gdn_decode", + ) + def _kernel( + compiled_module, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + A_log: torch.Tensor, + dt_bias: torch.Tensor, + state_pool: torch.Tensor, + cache_indices: torch.Tensor, + output: torch.Tensor, + ) -> None: + compiled_module.gdn_decode( + q, k, v, a, b, A_log, dt_bias, state_pool, cache_indices, output + ) + + return _kernel + + +def gdn_decode( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + A_log: torch.Tensor, + dt_bias: torch.Tensor, + state_pool: torch.Tensor, + cache_indices: torch.Tensor, +) -> torch.Tensor: + """Fused GDN decode: gating + L2 norm + delta rule + output. + + Parameters + ---------- + q : torch.Tensor + Query tensor, shape ``(bs, num_k_heads, head_k_dim)``, bf16/fp16. + k : torch.Tensor + Key tensor, shape ``(bs, num_k_heads, head_k_dim)``, bf16/fp16. + v : torch.Tensor + Value tensor, shape ``(bs, num_v_heads, head_v_dim)``, bf16/fp16. + a : torch.Tensor + Decay gate input, shape ``(bs, num_v_heads)``, bf16/fp16. + b : torch.Tensor + Update gate input, shape ``(bs, num_v_heads)``, bf16/fp16. + A_log : torch.Tensor + Log-space decay parameter, shape ``(num_v_heads,)``, float32. + dt_bias : torch.Tensor + Bias for decay gate, shape ``(num_v_heads,)``, float32. + state_pool : torch.Tensor + Pooled recurrent state, shape ``(pool_size, num_v_heads, head_v_dim, head_k_dim)``, + float32. Modified in-place. + cache_indices : torch.Tensor + Pool indices per request, shape ``(bs,)``, int64. + + Returns + ------- + torch.Tensor + Output tensor, shape ``(bs, num_v_heads, head_v_dim)``, same dtype as v. 
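+
+    Examples
+    --------
+    A minimal sketch with hypothetical Qwen3.5-like shapes (16 key heads,
+    32 value heads, head dims 128, a pool of 64 slots); requires a CUDA device::
+
+        >>> bs, H, HV, K, V = 4, 16, 32, 128, 128
+        >>> q = torch.randn(bs, H, K, dtype=torch.bfloat16, device="cuda")
+        >>> k = torch.randn(bs, H, K, dtype=torch.bfloat16, device="cuda")
+        >>> v = torch.randn(bs, HV, V, dtype=torch.bfloat16, device="cuda")
+        >>> a = torch.randn(bs, HV, dtype=torch.bfloat16, device="cuda")
+        >>> b = torch.randn(bs, HV, dtype=torch.bfloat16, device="cuda")
+        >>> A_log = torch.zeros(HV, dtype=torch.float32, device="cuda")
+        >>> dt_bias = torch.zeros(HV, dtype=torch.float32, device="cuda")
+        >>> state_pool = torch.zeros(64, HV, V, K, dtype=torch.float32, device="cuda")
+        >>> cache_indices = torch.arange(bs, dtype=torch.int64, device="cuda")
+        >>> out = gdn_decode(q, k, v, a, b, A_log, dt_bias, state_pool, cache_indices)
+        >>> out.shape
+        torch.Size([4, 32, 128])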
+ """ + bs = q.shape[0] + num_v_heads = v.shape[1] + head_v_dim = v.shape[2] + + output = torch.empty(bs, num_v_heads, head_v_dim, dtype=v.dtype, device=v.device) + + kernel = _make_gdn_decode_kernel() + kernel( + q.contiguous(), + k.contiguous(), + v.contiguous(), + a.contiguous(), + b.contiguous(), + A_log.contiguous(), + dt_bias.contiguous(), + state_pool, + cache_indices.to(torch.int64).contiguous(), + output, + ) + return output diff --git a/mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py b/mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py new file mode 100644 index 000000000..d7906a383 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/rms_norm_gated.py @@ -0,0 +1,87 @@ +"""Fused RMSNorm + SiLU gating CUDA JIT kernel for Qwen3.5 GDN attention. + +Computes ``rmsnorm(x, weight, eps) * silu(z)`` in a single fused pass. + +Usage:: + + from mllm_kernel.cuda.jit.rms_norm_gated import rms_norm_gated + + output = rms_norm_gated(x, weight, z=gate, eps=1e-6) +""" + +from __future__ import annotations + +import torch + +from mllm_kernel.jit_utils import cache_once, jit + + +@cache_once +def _make_rms_norm_gated_kernel(): + """JIT-compile the fused RMSNorm+gating CUDA kernel.""" + + @jit( + args=[], + device="cuda", + cuda_files=["rms_norm_gated.cuh"], + cpp_wrappers=[], + cuda_wrappers=[ + ("rms_norm_gated", "RMSNormGatedKernel::run"), + ], + func_name="rms_norm_gated", + ) + def _kernel( + compiled_module, + output: torch.Tensor, + input: torch.Tensor, + weight: torch.Tensor, + gate: torch.Tensor, + eps: float, + ) -> None: + compiled_module.rms_norm_gated(output, input, weight, gate, eps) + + return _kernel + + +def rms_norm_gated( + x: torch.Tensor, + weight: torch.Tensor, + z: torch.Tensor | None = None, + eps: float = 1e-6, +) -> torch.Tensor: + """Fused RMSNorm with optional SiLU gating. + + Parameters + ---------- + x : torch.Tensor + Input tensor, shape ``(M, N)`` or ``(..., N)``. + weight : torch.Tensor + Normalization weight, shape ``(N,)``. + z : torch.Tensor or None + Optional gating tensor, same shape as ``x``. + If provided: ``output = rmsnorm(x) * silu(z)`` + eps : float + Epsilon for numerical stability. + + Returns + ------- + torch.Tensor + Output with same shape and dtype as ``x``. + """ + x_shape = x.shape + x_2d = x.reshape(-1, x.shape[-1]) + + if z is not None: + z_2d = z.reshape(-1, z.shape[-1]) + if z_2d.stride(-1) != 1: + z_2d = z_2d.contiguous() + else: + z_2d = x.new_empty(0) # empty tensor signals "no gate" to the kernel + + if x_2d.stride(-1) != 1: + x_2d = x_2d.contiguous() + + output = torch.empty_like(x_2d) + kernel = _make_rms_norm_gated_kernel() + kernel(output, x_2d, weight.contiguous(), z_2d, eps) + return output.reshape(x_shape) diff --git a/pymllm/configs/global_config.py b/pymllm/configs/global_config.py index 1761697b1..711de3cd1 100644 --- a/pymllm/configs/global_config.py +++ b/pymllm/configs/global_config.py @@ -127,6 +127,16 @@ def _converter_for_annotation(annotation: Any) -> Optional[Callable[[str], Any]] return None +def _choices_for_annotation(annotation: Any) -> Optional[list]: + """Extract allowed values from a ``Literal`` annotation, if applicable.""" + + inner, _ = _unwrap_optional(annotation) + origin = get_origin(inner) + if origin is Literal: + return list(get_args(inner)) + return None + + def _is_bool_annotation(annotation: Any) -> bool: """Return ``True`` if annotation represents a bool/Optional[bool] field.""" @@ -225,16 +235,27 @@ def make_args( # Skip non-scalar or runtime-only fields (e.g. arbitrary objects). 
continue - section_group.add_argument( - option, + choices = _choices_for_annotation(annotation) + kwargs: dict[str, Any] = dict( dest=dest, type=converter, default=argparse.SUPPRESS, - help=( + ) + if choices is not None: + kwargs["choices"] = choices + choices_str = ", ".join(str(c) for c in choices) + kwargs["help"] = ( + f"{section_name}.{dc_field.name} " + f"{{choices: {choices_str}}} " + f"(default: {_format_default_for_help(default_value)})." + ) + else: + kwargs["help"] = ( f"{section_name}.{dc_field.name} (default: " f"{_format_default_for_help(default_value)})." - ), - ) + ) + + section_group.add_argument(option, **kwargs) return parser diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index f6a2090fc..8727f7c13 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -40,19 +40,13 @@ class ServerConfig: max_queued_requests: Optional[int] = None max_total_tokens: Optional[int] = None chunked_prefill_size: Optional[int] = None - max_prefill_tokens: int = None + max_prefill_tokens: Optional[int] = None schedule_policy: Literal["auto", "fcfs"] = "fcfs" schedule_conservativeness: float = 1.0 sleep_on_idle: bool = False stream_interval: int = 1 stream_output: bool = True - # --------------------------------------------------------------------- # - # Threads - # --------------------------------------------------------------------- # - enable_disk_io_async: bool = False - disk_io_async_thread_count: int = 1 - # --------------------------------------------------------------------- # # Device # --------------------------------------------------------------------- # @@ -62,23 +56,34 @@ class ServerConfig: # Backend / acceleration # --------------------------------------------------------------------- # attention_backend: Literal["auto", "flashinfer"] = "auto" + gdn_decode_backend: Literal["auto", "flashinfer", "mllm_kernel", "pytorch"] = "auto" sampling_backend: Optional[str] = None disable_cuda_graph: bool = False - enable_torch_compile: bool = True + enable_torch_compile: bool = False torch_compile_max_bs: int = 32 random_seed: Optional[int] = 42 + # --------------------------------------------------------------------- # + # Output parsers (reasoning / tool calls) + # --------------------------------------------------------------------- # + reasoning_parser: Optional[str] = None # e.g. "deepseek-r1", "qwen3" + tool_call_parser: Optional[str] = None # e.g. "qwen25", "llama3", "hermes" + # --------------------------------------------------------------------- # # Logging and observability # --------------------------------------------------------------------- # log_level: Literal["debug", "info", "warning", "error", "critical"] = "info" enable_metrics: bool = False show_time_cost: bool = False + # Log prefill/decode throughput stats every N decode batches (0 = disabled) + decode_log_interval: int = 40 # --------------------------------------------------------------------- # # Feature switches # --------------------------------------------------------------------- # enable_shared_queue: bool = False # Use shared memory queue for fast IPC + disable_radix_cache: bool = False # Disable radix-tree prefix caching + radix_cache_page_size: int = 1 # Number of tokens per KV-pool page in RadixCache # CUDA IPC transport for multimodal GPU tensors. # Requires enable_shared_queue=True to take effect. 
@@ -161,5 +166,7 @@ def _validate(self) -> None: raise ValueError("`max_running_requests` must be > 0 when set.") if self.max_queued_requests is not None and self.max_queued_requests < 0: raise ValueError("`max_queued_requests` must be >= 0 when set.") + if self.radix_cache_page_size < 1: + raise ValueError("`radix_cache_page_size` must be >= 1.") if self.schedule_conservativeness <= 0: raise ValueError("`schedule_conservativeness` must be > 0.") diff --git a/pymllm/engine/forward_batch.py b/pymllm/engine/forward_batch.py index ebb715ff4..428da7b66 100644 --- a/pymllm/engine/forward_batch.py +++ b/pymllm/engine/forward_batch.py @@ -180,3 +180,12 @@ class ForwardBatch: # ---- attention backend (set by model runner) ---- attn_backend: Optional["AttentionBackend"] = None + + # ---- multimodal M-RoPE ---- + # Per-request position delta for M-RoPE decode steps. + # Set by the model during prefill; consumed during decode to offset positions. + mrope_position_deltas: Optional[torch.Tensor] = None # [batch_size] int64 + + # ---- multimodal vision inputs (extend / prefill only) ---- + pixel_values: Optional[torch.Tensor] = None + image_grid_thw: Optional[torch.Tensor] = None diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index 2ba04e1c1..e5214511f 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -28,12 +28,102 @@ ) from pymllm.orchestrator.tokenizer_process import run_tokenizer_process from pymllm.orchestrator.scheduler_process import run_scheduler_process -from pymllm.orchestrator.model_runner_process import run_model_runner_process from pymllm.orchestrator.detokenizer_process import run_detokenizer_process -from pymllm.orchestrator.async_disk_io_process import run_async_disk_io_process logger = logging.getLogger(__name__) +# Standard HuggingFace config fields that indicate max output tokens, +# checked in priority order. +_MAX_NEW_TOKENS_FIELDS = ( + "max_new_tokens", + "max_tokens", + "max_completion_tokens", +) + + +def _normalize_eos_raw(raw) -> List[int]: + """Normalize a raw eos_token_id value (int, list, or None) to a list.""" + if raw is None: + return [] + if isinstance(raw, int): + return [raw] + if isinstance(raw, (list, tuple)): + return [x for x in raw if isinstance(x, int)] + return [] + + +def _get_eos_token_ids(hf_config, model_path=None) -> List[int]: + """Extract EOS token ID(s) from a HuggingFace model config. + + Searches in priority order: + 1. ``hf_config.eos_token_id`` (top-level, standard models) + 2. ``hf_config.text_config.eos_token_id`` (VL / multimodal models) + 3. ``generation_config.json`` (many models store EOS here) + 4. ``tokenizer_config.json`` via AutoTokenizer (last resort) + """ + if hf_config is None: + return [] + + # 1. Top-level config + ids = _normalize_eos_raw(getattr(hf_config, "eos_token_id", None)) + if ids: + return ids + + # 2. Nested text_config (VL / multimodal models like Qwen3-VL) + text_config = getattr(hf_config, "text_config", None) + if text_config is not None: + ids = _normalize_eos_raw(getattr(text_config, "eos_token_id", None)) + if ids: + return ids + + # 3. generation_config.json (lightweight, just reads a JSON file) + if model_path is not None: + try: + from transformers import GenerationConfig + + gen_cfg = GenerationConfig.from_pretrained(str(model_path)) + ids = _normalize_eos_raw(getattr(gen_cfg, "eos_token_id", None)) + if ids: + logger.info("EOS token IDs from generation_config.json: %s", ids) + return ids + except Exception: + pass + + # 4. 
Tokenizer (last resort) + if model_path is not None: + try: + from transformers import AutoTokenizer + + tok = AutoTokenizer.from_pretrained(str(model_path), trust_remote_code=True) + if tok.eos_token_id is not None: + ids = [tok.eos_token_id] + logger.info("EOS token ID from tokenizer: %s", ids) + return ids + except Exception: + pass + + return [] + + +def _get_model_default_max_new_tokens(hf_config) -> Optional[int]: + """Extract max output token limit from a HuggingFace model config. + + Checks standard fields in priority order. Returns ``None`` when the + config does not specify any recognised output-length field. + """ + if hf_config is None: + return None + for field_name in _MAX_NEW_TOKENS_FIELDS: + value = getattr(hf_config, field_name, None) + if value is not None and isinstance(value, int) and value > 0: + logger.info( + "Using model config %s=%d as default max_new_tokens", + field_name, + value, + ) + return value + return None + class Engine: def __init__(self): @@ -59,20 +149,12 @@ def _launch_processes(self) -> None: addr_tokenizer_to_scheduler: str = make_ipc_address( "tokenizer_to_scheduler", uid ) - addr_scheduler_to_model_runner: str = make_ipc_address( - "scheduler_to_model_runner", uid - ) - addr_model_runner_to_scheduler: str = make_ipc_address( - "model_runner_to_scheduler", uid - ) addr_scheduler_to_detokenizer: str = make_ipc_address( "scheduler_to_detokenizer", uid ) addr_detokenizer_to_request_response: str = make_ipc_address( "detokenizer_to_request_response", uid ) - addr_scheduler_to_disk_io: str = make_ipc_address("scheduler_to_disk_io", uid) - # Record all subprocesses procs_and_readers: List[tuple] = [] @@ -114,6 +196,7 @@ def _launch_processes(self) -> None: "tensor_transport_mode": transport_mode, "cuda_ipc_pool_size_mb": cfg.server.cuda_ipc_pool_size_mb, "cuda_ipc_recycle_interval": cfg.server.cuda_ipc_recycle_interval, + "log_level": cfg.server.log_level, } # Tokenizer @@ -131,39 +214,44 @@ def _launch_processes(self) -> None: ) procs_and_readers.append((tokenizer_proc, tokenizer_reader, "tokenizer")) - # Scheduler + # Determine default max_new_tokens from model config (if available) + model_max_new_tokens = _get_model_default_max_new_tokens( + cfg.model.hf_config + ) + scheduler_kwargs = {} + if model_max_new_tokens is not None: + scheduler_kwargs["default_max_new_tokens"] = model_max_new_tokens + + # Extract EOS token ID(s) from model config + eos_token_ids = _get_eos_token_ids(cfg.model.hf_config, model_path=cfg.server.model_path) + if eos_token_ids: + scheduler_kwargs["eos_token_ids"] = eos_token_ids + logger.info("EOS token IDs for scheduler: %s", eos_token_ids) + + # Model runner config — passed to the scheduler process which now + # owns the model runner in-process (sglang-style architecture). 
+ scheduler_kwargs["server_config"] = cfg.server + scheduler_kwargs["model_config"] = cfg.model + scheduler_kwargs["gpu_id"] = cfg.server.base_gpu_id + + # Scheduler (+ in-process model runner) scheduler_reader, scheduler_writer = mp.Pipe(duplex=False) scheduler_proc = mp.Process( target=run_scheduler_process, args=( addr_tokenizer_to_scheduler, - addr_scheduler_to_model_runner, - addr_model_runner_to_scheduler, addr_scheduler_to_detokenizer, scheduler_writer, shared_queue, # Pass shared queue enable_shared_queue, # Pass flag transport_mode, # Pass tensor transport mode + cfg.server.log_level, # Pass log level ), + kwargs=scheduler_kwargs, daemon=True, ) procs_and_readers.append((scheduler_proc, scheduler_reader, "scheduler")) - # Model Runner - model_runner_reader, model_runner_writer = mp.Pipe(duplex=False) - model_runner_proc = mp.Process( - target=run_model_runner_process, - args=( - addr_scheduler_to_model_runner, - addr_model_runner_to_scheduler, - model_runner_writer, - ), - daemon=True, - ) - procs_and_readers.append( - (model_runner_proc, model_runner_reader, "model_runner") - ) - # Detokenizer detokenizer_reader, detokenizer_writer = mp.Pipe(duplex=False) detokenizer_proc = mp.Process( @@ -172,21 +260,12 @@ def _launch_processes(self) -> None: addr_scheduler_to_detokenizer, addr_detokenizer_to_request_response, detokenizer_writer, + tokenizer_cfg, ), daemon=True, ) procs_and_readers.append((detokenizer_proc, detokenizer_reader, "detokenizer")) - # Async Disk I/O - if get_global_config().server.enable_disk_io_async: - disk_io_reader, disk_io_writer = mp.Pipe(duplex=False) - disk_io_proc = mp.Process( - target=run_async_disk_io_process, - args=(addr_scheduler_to_disk_io, disk_io_writer), - daemon=True, - ) - procs_and_readers.append((disk_io_proc, disk_io_reader, "async_disk_io")) - # Start all subprocesses for proc, _, name in procs_and_readers: proc.start() @@ -203,20 +282,15 @@ def _launch_processes(self) -> None: raise RuntimeError(f"{name} process failed to initialise: {msg}") logger.info("%s process ready", name) - # RR Process is current main process + # RR Process is current main process — only bind ZMQ sockets here. + # Background tasks are started lazily by listen() on the first + # add_request(), so they always run on the correct event loop. 
self._rr_process = RequestResponseProcess( send_to_tokenizer_addr=addr_request_response_to_tokenizer, recv_from_detokenizer_addr=addr_detokenizer_to_request_response, ) - - try: - self._loop = asyncio.get_running_loop() - except RuntimeError: - self._loop = asyncio.new_event_loop() - asyncio.set_event_loop(self._loop) - - self._rr_process.start(self._loop) - logger.info("RequestResponseProcess started in main process") + self._rr_process.start() + logger.info("RequestResponseProcess sockets bound") # Print colorful gradient ASCII art banner if HAS_BANNER_LIBS: @@ -296,7 +370,12 @@ async def _run() -> Union[Dict[str, Any], List[Dict[str, Any]]]: ) return list(outputs) - return self._loop.run_until_complete(_run()) + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop.run_until_complete(_run()) async def generate_async( self, @@ -354,10 +433,15 @@ async def generate_async( else: yield await self._wait_for_final_result(single_rid, state) finally: - self._rr_process.remove_state(single_rid) + if not state.finished: + logger.info("Aborting request %s (client disconnected)", single_rid) + await self._rr_process.abort_request(single_rid) + else: + self._rr_process.remove_state(single_rid) else: rids_list: List[str] = rid if isinstance(rid, list) else [rid] # type: ignore[assignment] states: List[ReqState] = result # type: ignore[assignment] + _bg_tasks: List[asyncio.Task] = [] try: if stream: # Merge streams from all sub-requests using an asyncio queue. @@ -368,18 +452,18 @@ async def _forward(r: str, s: ReqState) -> None: await queue.put(chunk) await queue.put(None) # sentinel - tasks = [ + _bg_tasks = [ asyncio.create_task(_forward(r, s)) for r, s in zip(rids_list, states) ] done_count = 0 - while done_count < len(tasks): + while done_count < len(_bg_tasks): item = await queue.get() if item is None: done_count += 1 else: yield item - await asyncio.gather(*tasks) + await asyncio.gather(*_bg_tasks) else: for coro in asyncio.as_completed( [ @@ -389,8 +473,14 @@ async def _forward(r: str, s: ReqState) -> None: ): yield await coro finally: - for r in rids_list: - self._rr_process.remove_state(r) + for t in _bg_tasks: + t.cancel() + for r, s in zip(rids_list, states): + if not s.finished: + logger.info("Aborting request %s (client disconnected)", r) + await self._rr_process.abort_request(r) + else: + self._rr_process.remove_state(r) @staticmethod async def _wait_for_final_result(rid: str, state: ReqState) -> Dict[str, Any]: @@ -443,7 +533,11 @@ def shutdown(self) -> None: """Terminate all subprocesses.""" if self._rr_process is not None: try: - self._loop.run_until_complete(self._rr_process.shutdown()) + loop = asyncio.get_event_loop() + if loop.is_running(): + loop.create_task(self._rr_process.shutdown()) + else: + loop.run_until_complete(self._rr_process.shutdown()) except Exception: pass for proc in self._subprocesses: diff --git a/pymllm/executor/__init__.py b/pymllm/executor/__init__.py index e69de29bb..b513b8705 100644 --- a/pymllm/executor/__init__.py +++ b/pymllm/executor/__init__.py @@ -0,0 +1,10 @@ +"""Executor module: model loading, forward pass, and sampling.""" + +from pymllm.executor.cuda_graph_runner import CudaGraphRunner +from pymllm.executor.model_runner import LogitsProcessorOutput, ModelRunner + +__all__ = [ + "CudaGraphRunner", + "LogitsProcessorOutput", + "ModelRunner", +] diff --git a/pymllm/executor/cuda_graph_runner.py b/pymllm/executor/cuda_graph_runner.py index e69de29bb..fe4fb0e92 
100644 --- a/pymllm/executor/cuda_graph_runner.py +++ b/pymllm/executor/cuda_graph_runner.py @@ -0,0 +1,590 @@ +"""CUDA-graph accelerated forward pass for decode steps. + +Captures CUDA graphs for a set of discrete batch sizes so that the decode +forward pass can be replayed without CPU-side kernel-launch overhead. + +Simplified from sglang's ``CudaGraphRunner`` for pymllm's single-GPU +architecture. Handles: + +* Pre-allocated input buffers (avoids per-step allocations) +* CUDA-graph capture for each batch size +* Optional ``torch.compile`` integration +* Graph replay with padding to the nearest captured batch size + +Typical lifecycle:: + + runner = CudaGraphRunner(model_runner) # captures all batch sizes + + # --- inside the inference loop --- + if runner.can_run(forward_batch): + logits_output = runner.replay(forward_batch) + else: + logits_output = model_runner.forward(forward_batch) + +Integration with :class:`~pymllm.executor.model_runner.ModelRunner` +------------------------------------------------------------------- +The ``ModelRunner`` owns the ``CudaGraphRunner`` and delegates decode +batches to it when the batch size is within the captured range. The +``CudaGraphRunner`` calls ``attn_backend.init_forward_metadata_*_cuda_graph`` +directly (bypassing the normal ``init_forward_metadata`` path) so that +FlashInfer's per-batch planning is recorded inside the graph. +""" + +from __future__ import annotations + +import bisect +import gc +import logging +import time +from contextlib import contextmanager +from dataclasses import dataclass +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union + +import torch + +from pymllm.engine.forward_batch import ForwardBatch, ForwardMode +from pymllm.executor.model_runner import LogitsProcessorOutput + +if TYPE_CHECKING: + from pymllm.executor.model_runner import ModelRunner + from pymllm.layers.attention.attention_backend import AttentionBackend + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Global CUDA-graph memory pool (shared across all CudaGraphRunner instances) +# --------------------------------------------------------------------------- + +_global_graph_memory_pool: Optional[tuple] = None + + +def get_global_graph_memory_pool() -> Optional[tuple]: + """Return the shared CUDA graph memory pool handle.""" + return _global_graph_memory_pool + + +def set_global_graph_memory_pool(pool: tuple) -> None: + """Set the shared CUDA graph memory pool handle.""" + global _global_graph_memory_pool + _global_graph_memory_pool = pool + + +# --------------------------------------------------------------------------- +# Context managers +# --------------------------------------------------------------------------- + +# Flag indicating whether we are currently capturing a CUDA graph. +_is_capture_mode: bool = False + + +def is_capture_mode() -> bool: + """Return ``True`` if a CUDA-graph capture is in progress.""" + return _is_capture_mode + + +@contextmanager +def model_capture_mode(): + """Context manager that sets the global capture-mode flag.""" + global _is_capture_mode + _is_capture_mode = True + try: + yield + finally: + _is_capture_mode = False + + +@contextmanager +def freeze_gc(): + """Freeze the garbage collector during CUDA-graph capture. + + GC activity during capture can interfere with the recorded stream + ordering. This context manager collects garbage before capture, + freezes all surviving objects, and unfreezes + re-collects afterwards. 
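+    (``gc.freeze()`` moves every object currently tracked by the collector into
+    a permanent generation that later collections skip, so no GC work touches
+    them while the graph is being recorded.)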
+ """ + gc.collect() + gc.freeze() + try: + yield + finally: + gc.unfreeze() + gc.collect() + + +# --------------------------------------------------------------------------- +# Pre-allocated input buffers +# --------------------------------------------------------------------------- + + +@dataclass +class _InputBuffers: + """Pre-allocated GPU tensors used as CUDA-graph inputs. + + During graph capture these buffers are used as-is. During replay the + real batch data is copied into the first ``batch_size`` rows while the + remaining padding rows retain their fill values. + """ + + input_ids: torch.Tensor # [max_bs] int64 + req_pool_indices: torch.Tensor # [max_bs] int32 + seq_lens: torch.Tensor # [max_bs] int32 + seq_lens_cpu: torch.Tensor # [max_bs] int32 (CPU) + out_cache_loc: torch.Tensor # [max_bs] int64 + positions: torch.Tensor # [max_bs] int64 + mrope_position_deltas: torch.Tensor # [max_bs] int64 + + @classmethod + def create( + cls, + *, + device: torch.device, + max_bs: int, + seq_len_fill_value: int, + ) -> "_InputBuffers": + """Allocate all buffers for the given maximum batch size.""" + with torch.device(device): + input_ids = torch.zeros((max_bs,), dtype=torch.int64) + req_pool_indices = torch.zeros((max_bs,), dtype=torch.int32) + seq_lens = torch.full((max_bs,), seq_len_fill_value, dtype=torch.int32) + out_cache_loc = torch.zeros((max_bs,), dtype=torch.int64) + positions = torch.zeros((max_bs,), dtype=torch.int64) + mrope_position_deltas = torch.zeros((max_bs,), dtype=torch.int64) + + # seq_lens_cpu must be a real CPU tensor. + seq_lens_cpu = torch.full( + (max_bs,), + seq_len_fill_value, + dtype=torch.int32, + device="cpu", + ) + + return cls( + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + seq_lens_cpu=seq_lens_cpu, + out_cache_loc=out_cache_loc, + positions=positions, + mrope_position_deltas=mrope_position_deltas, + ) + + def populate( + self, + forward_batch: ForwardBatch, + padded_bs: int, + seq_len_fill_value: int, + ) -> None: + """Copy real batch data into the pre-allocated buffers. + + Any padding slots (``[real_bs : padded_bs]``) are filled with safe + defaults so that the captured graph does not access invalid memory. + """ + real_bs = forward_batch.batch_size + + # Reset padding slots when the padded size exceeds the real size. + if padded_bs != real_bs: + self.seq_lens.fill_(seq_len_fill_value) + self.out_cache_loc.zero_() + self.mrope_position_deltas.zero_() + + self.input_ids[:real_bs].copy_(forward_batch.input_ids) + self.req_pool_indices[:real_bs].copy_(forward_batch.req_pool_indices) + self.seq_lens[:real_bs].copy_(forward_batch.seq_lens) + self.out_cache_loc[:real_bs].copy_(forward_batch.out_cache_loc) + self.positions[:real_bs].copy_(forward_batch.positions) + + # Copy M-RoPE position deltas (used by Qwen3-VL for multimodal). + if forward_batch.mrope_position_deltas is not None: + self.mrope_position_deltas[:real_bs].copy_( + forward_batch.mrope_position_deltas + ) + else: + self.mrope_position_deltas[:real_bs].zero_() + + if forward_batch.seq_lens_cpu is not None: + if padded_bs != real_bs: + self.seq_lens_cpu.fill_(seq_len_fill_value) + self.seq_lens_cpu[:real_bs].copy_(forward_batch.seq_lens_cpu) + + +# --------------------------------------------------------------------------- +# Batch-size schedule +# --------------------------------------------------------------------------- + + +def _default_capture_batch_sizes(max_bs: int) -> List[int]: + """Return a list of batch sizes to capture. 
+ + Uses the same schedule as sglang (non-speculative):: + + [1, 2, 4, 8, 12, 16, 24, 32, 40, …, 256, 272, 288, …, 512, 544, …] + + Capped at *max_bs*. + """ + bs_list = ( + [1, 2, 4, 8, 12] + + list(range(16, 257, 8)) + + list(range(272, 512, 16)) + + list(range(512, max_bs + 1, 32)) + ) + bs_list = sorted(set(bs for bs in bs_list if bs <= max_bs)) + if not bs_list: + bs_list = [1] + return bs_list + + +# --------------------------------------------------------------------------- +# CudaGraphRunner +# --------------------------------------------------------------------------- + + +class CudaGraphRunner: + """Captures and replays CUDA graphs for decode-step forward passes. + + This class is the pymllm equivalent of sglang's ``CudaGraphRunner``, + stripped of distributed, speculative-decoding, LoRA, mamba, TBO, and + piecewise-graph complexities. + + Parameters + ---------- + model_runner + The owning :class:`~pymllm.executor.model_runner.ModelRunner`. + Must have been fully initialised before the ``CudaGraphRunner`` + is constructed. + """ + + def __init__(self, model_runner: "ModelRunner"): + self.model_runner = model_runner + self.device = model_runner.device + + self.graphs: Dict[int, torch.cuda.CUDAGraph] = {} + self.output_buffers: Dict[int, LogitsProcessorOutput] = {} + + self.enable_torch_compile: bool = ( + model_runner.server_config.enable_torch_compile + ) + self.torch_compile_max_bs: int = model_runner.server_config.torch_compile_max_bs + + # ----------------------------------------------------------- + # Batch-size schedule + # ----------------------------------------------------------- + max_bs = model_runner.max_running_requests + self.capture_bs: List[int] = _default_capture_batch_sizes(max_bs) + self.compile_bs: List[int] = ( + [bs for bs in self.capture_bs if bs <= self.torch_compile_max_bs] + if self.enable_torch_compile + else [] + ) + self.max_bs: int = max(self.capture_bs) + + logger.info("CUDA graph capture batch sizes: %s", self.capture_bs) + + # ----------------------------------------------------------- + # Attention-backend CUDA-graph state + # ----------------------------------------------------------- + self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs, self.max_bs) + + # Fill value for padded seq_lens so attention kernels don't div-by-0. + self.seq_len_fill_value: int = ( + self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value() + ) + + # ----------------------------------------------------------- + # Pre-allocated input buffers + # ----------------------------------------------------------- + self.buffers: _InputBuffers = _InputBuffers.create( + device=torch.device(self.device), + max_bs=self.max_bs, + seq_len_fill_value=self.seq_len_fill_value, + ) + + # ----------------------------------------------------------- + # Optional torch.compile config + # ----------------------------------------------------------- + if self.enable_torch_compile: + _set_torch_compile_config() + + # ----------------------------------------------------------- + # Capture all batch sizes + # ----------------------------------------------------------- + try: + with model_capture_mode(): + self.capture() + except RuntimeError as exc: + raise RuntimeError( + f"CUDA graph capture failed: {exc}\n" + "Possible fixes:\n" + " 1. Reduce --server.mem_fraction_static (e.g. 0.7)\n" + " 2. Reduce --server.max_running_requests\n" + " 3. 
Disable CUDA graph with --server.disable_cuda_graph\n" + ) from exc + + # ------------------------------------------------------------------ + # Capability check + # ------------------------------------------------------------------ + + def can_run(self, forward_batch: ForwardBatch) -> bool: + """Return ``True`` if the batch can be run via CUDA graph replay. + + The batch must be a decode (or idle) batch whose size does not + exceed the largest captured batch size. + """ + return ( + forward_batch.forward_mode.is_decode_or_idle() + and forward_batch.batch_size <= self.max_bs + ) + + # ------------------------------------------------------------------ + # Capture + # ------------------------------------------------------------------ + + def capture(self) -> None: + """Capture CUDA graphs for every batch size in ``capture_bs``. + + Iterates in reverse order (largest first) so that the GPU memory + pool allocated for the largest graph is reused by smaller ones. + """ + tic = time.perf_counter() + before_mem = _get_avail_mem(self.device) + logger.info("CUDA graph capture begin. avail mem=%.2f GB", before_mem) + + with freeze_gc(): + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + for bs in reversed(self.capture_bs): + forward_fn = self._get_forward_fn(bs) + graph, output = self._capture_one_batch_size(bs, forward_fn, stream) + self.graphs[bs] = graph + self.output_buffers[bs] = output + + after_mem = _get_avail_mem(self.device) + logger.info( + "CUDA graph capture end. elapsed=%.2f s, mem usage=%.2f GB, " + "avail mem=%.2f GB", + time.perf_counter() - tic, + before_mem - after_mem, + after_mem, + ) + + def _get_forward_fn(self, bs: int) -> Callable: + """Return the forward callable for the given batch size. + + When ``torch.compile`` is enabled and *bs* is within the compile + threshold, the model's forward method is wrapped with + ``torch.compile``. + """ + model_forward = self.model_runner.model.forward + if self.enable_torch_compile and bs in self.compile_bs: + return torch.compile( + torch.no_grad()(model_forward), + mode="max-autotune-no-cudagraphs", + ) + return model_forward + + def _capture_one_batch_size( + self, + bs: int, + forward: Callable, + stream: torch.cuda.Stream, + ) -> tuple: + """Capture a single CUDA graph for batch size *bs*. + + Steps: + 1. Build a ``ForwardBatch`` from the pre-allocated buffers. + 2. Tell the attention backend to plan for CUDA-graph capture. + 3. Run the forward pass twice for warmup. + 4. Capture the third run into a ``CUDAGraph``. + + Returns ``(graph, output_buffers)``. + """ + buffers = self.buffers + + # Slice pre-allocated buffers to the capture size. + input_ids = buffers.input_ids[:bs] + req_pool_indices = buffers.req_pool_indices[:bs] + seq_lens = buffers.seq_lens[:bs] + seq_lens_cpu = buffers.seq_lens_cpu[:bs] + out_cache_loc = buffers.out_cache_loc[:bs] + positions = buffers.positions[:bs] + mrope_position_deltas = buffers.mrope_position_deltas[:bs] + + # Build ForwardBatch (DECODE mode). + # mrope_position_deltas is set to the static buffer (initially zeros) + # so that the graph captures the ``positions + deltas`` path. During + # replay the buffer is updated with real delta values. 
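The capture loop above walks ``capture_bs`` largest-first and hands every ``torch.cuda.graph`` the same pool handle, so the memory reserved for the biggest graph is reused by the smaller ones. A stripped-down sketch of that pattern (toy model, hypothetical batch sizes)::

    import torch

    model = torch.nn.Linear(32, 32).cuda()
    pool = torch.cuda.graph_pool_handle()            # one pool shared by every graph
    stream = torch.cuda.Stream()
    graphs, inputs, outputs = {}, {}, {}

    with torch.cuda.stream(stream):
        for bs in reversed([1, 2, 4, 8]):            # largest first: later graphs reuse its memory
            x = torch.zeros(bs, 32, device="cuda")
            for _ in range(2):                       # eager warm-up runs
                torch.cuda.synchronize()
                model(x)
            g = torch.cuda.CUDAGraph()
            with torch.cuda.graph(g, pool=pool, stream=stream):
                y = model(x)
            graphs[bs], inputs[bs], outputs[bs] = g, x, y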
+ forward_batch = ForwardBatch( + forward_mode=ForwardMode.DECODE, + batch_size=bs, + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + out_cache_loc=out_cache_loc, + seq_lens_sum=int(seq_lens.sum().item()), + seq_lens_cpu=seq_lens_cpu, + positions=positions, + return_logprob=False, + req_to_token_pool=self.model_runner.req_to_token_pool, + token_to_kv_pool=self.model_runner.token_to_kv_pool, + attn_backend=self.model_runner.attn_backend, + mrope_position_deltas=mrope_position_deltas, + ) + + # Tell the attention backend to set up CUDA-graph-aware metadata. + self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph( + bs=bs, + num_tokens=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + forward_mode=ForwardMode.DECODE, + ) + + # The single forward-pass function to be captured. + def run_once(): + return forward( + input_ids, + forward_batch.positions, + forward_batch, + ) + + # Warmup (2 eager runs to stabilise cudnn / autotuner / etc.). + for _ in range(2): + torch.cuda.synchronize() + run_once() + + # ----- Capture ----- + global _global_graph_memory_pool + if _global_graph_memory_pool is None: + _global_graph_memory_pool = torch.cuda.graph_pool_handle() + + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph( + graph, + pool=_global_graph_memory_pool, + stream=stream, + ): + output = run_once() + + return graph, output + + # ------------------------------------------------------------------ + # Replay + # ------------------------------------------------------------------ + + def replay( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Replay a captured CUDA graph for the given decode batch. + + The batch is padded to the nearest captured size, inputs are copied + into the pre-allocated buffers, the graph is replayed, and the + output is sliced back to the real batch size. + + Parameters + ---------- + forward_batch + The decode batch from the scheduler. + + Returns + ------- + LogitsProcessorOutput + The logits for the real (un-padded) sequences. + """ + real_bs = forward_batch.batch_size + + # Find the smallest captured bs >= real_bs. + idx = bisect.bisect_left(self.capture_bs, real_bs) + padded_bs = self.capture_bs[idx] + + # Copy real data into the static buffers. + self.buffers.populate( + forward_batch, + padded_bs=padded_bs, + seq_len_fill_value=self.seq_len_fill_value, + ) + + # Update the attention backend for replay. + seq_lens_sum = ( + forward_batch.seq_lens_sum + (padded_bs - real_bs) * self.seq_len_fill_value + ) + self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph( + bs=padded_bs, + req_pool_indices=self.buffers.req_pool_indices[:padded_bs], + seq_lens=self.buffers.seq_lens[:padded_bs], + seq_lens_sum=seq_lens_sum, + forward_mode=ForwardMode.DECODE, + seq_lens_cpu=self.buffers.seq_lens_cpu[:padded_bs], + ) + + # Replay the graph. + self.graphs[padded_bs].replay() + + # Retrieve output and slice to real batch size. + output = self.output_buffers[padded_bs] + + if isinstance(output, LogitsProcessorOutput): + return LogitsProcessorOutput( + next_token_logits=output.next_token_logits[:real_bs], + hidden_states=( + output.hidden_states[:real_bs] + if output.hidden_states is not None + else None + ), + ) + elif isinstance(output, torch.Tensor): + # Raw tensor output: assume [padded_bs, vocab_size]. + return LogitsProcessorOutput( + next_token_logits=output[:real_bs], + ) + else: + # HuggingFace-style output with .logits attribute. 
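Stepping back to the padded-size lookup at the top of ``replay``: because ``capture_bs`` is sorted, finding the smallest captured size that fits the real batch reduces to a single ``bisect`` call. For example::

    import bisect

    capture_bs = [1, 2, 4, 8, 12, 16, 24, 32]    # sorted capture schedule
    real_bs = 5
    padded_bs = capture_bs[bisect.bisect_left(capture_bs, real_bs)]
    assert padded_bs == 8                        # smallest captured size >= 5
    # An exact match maps to itself: bisect_left(capture_bs, 4) -> index 2 -> 4.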
+ if hasattr(output, "logits"): + logits = output.logits + if logits.dim() == 3: + return LogitsProcessorOutput( + next_token_logits=logits[:real_bs, -1, :], + ) + return LogitsProcessorOutput( + next_token_logits=logits[:real_bs], + ) + raise TypeError(f"Unexpected CUDA graph output type: {type(output)}") + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + """Release all captured CUDA graphs and associated buffers.""" + for graph in self.graphs.values(): + del graph + self.graphs.clear() + self.output_buffers.clear() + logger.info("CudaGraphRunner shutdown complete.") + + +# --------------------------------------------------------------------------- +# Utility helpers +# --------------------------------------------------------------------------- + + +def _get_avail_mem(device: str) -> float: + """Return available GPU memory in GB.""" + if device != "cuda" or not torch.cuda.is_available(): + return 0.0 + free, _ = torch.cuda.mem_get_info() + return free / (1 << 30) + + +def _set_torch_compile_config() -> None: + """Set dynamo / inductor configs for optimal CUDA-graph + compile.""" + try: + import torch._dynamo.config + import torch._inductor.config + + torch._inductor.config.coordinate_descent_tuning = True + torch._inductor.config.triton.unique_kernel_names = True + torch._inductor.config.fx_graph_cache = True + torch._dynamo.config.accumulated_cache_size_limit = 1024 + if hasattr(torch._dynamo.config, "cache_size_limit"): + torch._dynamo.config.cache_size_limit = 1024 + except ImportError: + logger.warning("torch._dynamo / torch._inductor not available.") diff --git a/pymllm/executor/model_runner.py b/pymllm/executor/model_runner.py index e69de29bb..6d6f33fea 100644 --- a/pymllm/executor/model_runner.py +++ b/pymllm/executor/model_runner.py @@ -0,0 +1,1198 @@ +"""ModelRunner runs the forward passes of the models. + +Simplified from sglang's ``ModelRunner`` for pymllm's single-GPU inference +architecture. Handles: + +* Model loading (HuggingFace checkpoint via ``transformers``) +* KV-cache memory pool initialisation +* Attention backend setup (FlashInfer) +* Forward pass dispatch (extend / decode / idle) +* Token sampling from logits + +Typical lifecycle:: + + runner = ModelRunner(server_config, model_config) + runner.initialize() + + # --- inside the inference loop --- + forward_batch = runner.prepare_forward_batch_decode(...) 
+ logits_output = runner.forward(forward_batch) + next_token_ids = runner.sample(logits_output, forward_batch) + +Typical data flow +----------------- + SchedulerProcess builds a batch dict + ↓ + ModelRunnerProcess calls ModelRunner.forward(forward_batch) + ↓ + attn_backend.init_forward_metadata(forward_batch) + ↓ + model.forward(input_ids, positions, forward_batch) + ↓ + ModelRunner.sample(logits_output, forward_batch) + ↓ + next_token_ids returned to scheduler +""" + +from __future__ import annotations + +import gc +import logging +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union + +import torch +from torch import nn + +from pymllm.configs import get_global_config +from pymllm.engine.forward_batch import ForwardBatch, ForwardMode +from pymllm.mem_cache.memory_pool import ( + GDNPool, + KVPool, + ReqToTokenPool, + TokenToKVPoolAllocator, + make_full_attention_net_mem_pool, + make_req_to_token_pool, +) + +if TYPE_CHECKING: + from pymllm.configs.model_config import ModelConfig + from pymllm.configs.server_config import ServerConfig + from pymllm.executor.cuda_graph_runner import CudaGraphRunner + from pymllm.layers.attention.attention_backend import AttentionBackend + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Utility: GPU memory query +# --------------------------------------------------------------------------- + + +def get_available_gpu_memory(device: str = "cuda", gpu_id: int = 0) -> float: + """Return available GPU memory in GB.""" + if device != "cuda" or not torch.cuda.is_available(): + return 0.0 + torch.cuda.set_device(gpu_id) + free, _ = torch.cuda.mem_get_info(gpu_id) + return free / (1 << 30) + + +def get_total_gpu_memory(device: str = "cuda", gpu_id: int = 0) -> float: + """Return total GPU memory in GB.""" + if device != "cuda" or not torch.cuda.is_available(): + return 0.0 + torch.cuda.set_device(gpu_id) + _, total = torch.cuda.mem_get_info(gpu_id) + return total / (1 << 30) + + +# --------------------------------------------------------------------------- +# LogitsProcessorOutput +# --------------------------------------------------------------------------- + + +@dataclass +class LogitsProcessorOutput: + """Container for output logits produced by the model's forward pass. + + Attributes + ---------- + next_token_logits + Raw logits for the last token of each sequence in the batch, + shape ``[batch_size, vocab_size]``. + hidden_states + Optional hidden states from the model (e.g. for speculative decoding + or auxiliary loss computation). + """ + + next_token_logits: torch.Tensor # [batch_size, vocab_size] + hidden_states: Optional[torch.Tensor] = None + + +# --------------------------------------------------------------------------- +# ModelRunner +# --------------------------------------------------------------------------- + + +class ModelRunner: + """Runs the forward passes of the models. + + This is the core execution component that owns the model, memory pools, + and attention backend. It is used by + :class:`~pymllm.orchestrator.model_runner_process.ModelRunnerProcess` to + execute batches dispatched by the scheduler. + + Parameters + ---------- + server_config + Server runtime configuration. Falls back to the global singleton + when ``None``. + model_config + Model configuration (wraps a HuggingFace ``PretrainedConfig``). + Falls back to the global singleton when ``None``. + gpu_id + GPU device index to use. 
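The module-level helpers above are thin wrappers around ``torch.cuda.mem_get_info``. A hedged sketch of how such a free-memory reading typically becomes a static KV-cache budget (0.85 is just the default ``mem_fraction_static`` used later in this file, not a fixed constant)::

    import torch

    if torch.cuda.is_available():
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        free_gb = free_bytes / (1 << 30)
        budget_bytes = int(free_gb * 0.85 * (1 << 30))   # share of free memory reserved for KV cache
        print(f"free={free_gb:.2f} GB, KV budget={budget_bytes / (1 << 30):.2f} GB")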
+ """ + + def __init__( + self, + server_config: Optional["ServerConfig"] = None, + model_config: Optional["ModelConfig"] = None, + gpu_id: int = 0, + ): + cfg = get_global_config() + self.server_config = server_config or cfg.server + self.model_config = model_config or cfg.model + + self.gpu_id = gpu_id + self.device: str = "cuda" if torch.cuda.is_available() else "cpu" + self.dtype: torch.dtype = self._resolve_dtype() + + # Set by initialize() + self.model: Optional[nn.Module] = None + self.req_to_token_pool: Optional[ReqToTokenPool] = None + self.token_to_kv_pool: Optional[KVPool] = None + self.token_to_kv_pool_allocator: Optional[TokenToKVPoolAllocator] = None + self.gdn_pool: Optional[GDNPool] = None + self.attn_backend: Optional["AttentionBackend"] = None + self.graph_runner: Optional["CudaGraphRunner"] = None + + # Memory configuration + self.max_total_num_tokens: int = 0 + self.max_running_requests: int = 0 + + # Model metadata (populated after loading) + self.num_hidden_layers: int = 0 + self.num_attention_heads: int = 0 + self.num_kv_heads: int = 0 + self.head_dim: int = 0 + self.hidden_size: int = 0 + self.vocab_size: int = 0 + self.context_len: int = 0 + + # KV cache dtype -- same as model dtype by default; may differ for + # quantised KV caches in the future. + self.kv_cache_dtype: torch.dtype = self.dtype + + # Forward pass counter (monotonically increasing). + self.forward_pass_id: int = 0 + + # ------------------------------------------------------------------ + # Initialisation + # ------------------------------------------------------------------ + + def initialize(self) -> None: + """Full initialisation: set device, load model, init memory + backend. + + Call this once before any forward pass. + """ + tic = time.perf_counter() + logger.info("ModelRunner initialisation begin.") + + # Set device + if self.device == "cuda": + torch.cuda.set_device(self.gpu_id) + + # Set default dtype + torch.set_default_dtype(self.dtype) + + # Load the model + self.load_model() + + # Extract model metadata from hf_config + self._extract_model_metadata() + + # Resolve KV-cache dtype + self._configure_kv_cache_dtype() + + # Initialise memory pools + self.init_memory_pool() + + # Initialise attention backend + self.init_attention_backend() + + # Warm up cuBLAS + if self.device == "cuda": + self._init_cublas() + + # Capture CUDA graphs (must be after model + pools + backend) + self.init_cuda_graphs() + + elapsed = time.perf_counter() - tic + logger.info( + "ModelRunner initialisation complete. 
elapsed=%.2f s, " + "device=%s, dtype=%s, kv_dtype=%s, max_tokens=%d, max_reqs=%d", + elapsed, + self.device, + self.dtype, + self.kv_cache_dtype, + self.max_total_num_tokens, + self.max_running_requests, + ) + + # ------------------------------------------------------------------ + # Dtype resolution + # ------------------------------------------------------------------ + + def _resolve_dtype(self) -> torch.dtype: + """Resolve the model dtype from configuration.""" + dtype_str = self.server_config.dtype + if dtype_str == "auto": + if torch.cuda.is_available(): + if torch.cuda.get_device_capability()[0] >= 8: + return torch.bfloat16 + return torch.float16 + return torch.float32 + dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, + } + result = dtype_map.get(dtype_str) + if result is None: + raise ValueError(f"Unsupported dtype: {dtype_str!r}") + return result + + def _configure_kv_cache_dtype(self) -> None: + """Determine the dtype used for KV-cache storage. + + The global ``QuantizationConfig.kv_cache_dtype`` can override the + model dtype (e.g. ``fp8_e4m3`` for quantised KV caches). When set + to ``"auto"`` the model dtype is used as-is. + """ + cfg = get_global_config() + kv_dtype_str = cfg.quantization.kv_cache_dtype + + if kv_dtype_str == "auto": + self.kv_cache_dtype = self.dtype + return + + kv_dtype_map = { + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "fp8_e4m3": torch.float8_e4m3fn, + "fp8_e5m2": torch.float8_e5m2, + } + resolved = kv_dtype_map.get(kv_dtype_str) + if resolved is None: + logger.warning( + "Unrecognised kv_cache_dtype %r, falling back to model dtype.", + kv_dtype_str, + ) + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = resolved + + logger.info("KV-cache dtype: %s", self.kv_cache_dtype) + + # ------------------------------------------------------------------ + # Model metadata + # ------------------------------------------------------------------ + + def _extract_model_metadata(self) -> None: + """Extract key model parameters from the HuggingFace config.""" + hf_config = self.model_config.hf_config + if hf_config is None: + raise RuntimeError( + "HuggingFace config not loaded. " + "Make sure model_config.hf_config is set before calling " + "initialize()." + ) + + # Handle text_config for multimodal models + text_config = getattr(hf_config, "text_config", hf_config) + + self.num_hidden_layers = getattr(text_config, "num_hidden_layers", 0) + self.num_attention_heads = getattr(text_config, "num_attention_heads", 0) + self.num_kv_heads = getattr( + text_config, + "num_key_value_heads", + self.num_attention_heads, + ) + self.head_dim = getattr( + text_config, + "head_dim", + getattr(text_config, "hidden_size", 0) // max(self.num_attention_heads, 1), + ) + self.hidden_size = getattr(text_config, "hidden_size", 0) + self.vocab_size = getattr(text_config, "vocab_size", 0) + + # V-head dim may differ from K-head dim (e.g. 
MLA) + self.v_head_dim: int = getattr(text_config, "v_head_dim", self.head_dim) + + # Context length + self.context_len = self.server_config.context_length or getattr( + text_config, "max_position_embeddings", 4096 + ) + + # Hybrid model metadata (GDN layers) + self.num_gdn_layers: int = getattr(self.model, "num_gdn_layers", 0) + self.full_attn_layer_ids: set = getattr(self.model, "full_attn_layer_ids", set()) + + logger.info( + "Model metadata: layers=%d, q_heads=%d, kv_heads=%d, " + "head_dim=%d, v_head_dim=%d, hidden=%d, vocab=%d, ctx_len=%d" + + (", gdn_layers=%d" if self.num_gdn_layers > 0 else ""), + self.num_hidden_layers, + self.num_attention_heads, + self.num_kv_heads, + self.head_dim, + self.v_head_dim, + self.hidden_size, + self.vocab_size, + self.context_len, + *([self.num_gdn_layers] if self.num_gdn_layers > 0 else []), + ) + + # ------------------------------------------------------------------ + # Model loading + # ------------------------------------------------------------------ + + def load_model(self) -> None: + """Load the model from a HuggingFace checkpoint. + + First checks the pymllm model registry for a custom implementation + that uses ``RadixAttention``. If found, instantiates it with the + HuggingFace config and loads weights via ``load_weights()``. + Otherwise falls back to ``AutoModelForCausalLM.from_pretrained``. + """ + tic = time.perf_counter() + model_path = self.server_config.model_path + + if model_path is None: + raise RuntimeError("server_config.model_path is not set.") + + before_mem = get_available_gpu_memory(self.device, self.gpu_id) + logger.info( + "Load model begin. path=%s, avail mem=%.2f GB", + model_path, + before_mem, + ) + + # Look up the architecture in the pymllm model registry + from pymllm.models import _MODEL_REGISTRY, get_model_class + + hf_config = self.model_config.hf_config + architectures = [] + if hf_config is not None: + architectures = getattr(hf_config, "architectures", None) or [] + + if not architectures: + supported = ", ".join(sorted(_MODEL_REGISTRY.keys())) + raise RuntimeError( + f"Cannot determine model architecture from config. " + f"Supported architectures: {supported}" + ) + + architecture = architectures[0] + model_cls = get_model_class(architecture) + if model_cls is None: + supported = ", ".join(sorted(_MODEL_REGISTRY.keys())) + raise RuntimeError( + f"Architecture {architecture!r} is not supported by pymllm. " + f"Supported architectures: {supported}" + ) + + logger.info("Using pymllm model class: %s", model_cls.__name__) + device_str = f"cuda:{self.gpu_id}" if self.device == "cuda" else self.device + # Use set_default_dtype so parameters created without explicit dtype + # get the target dtype, while parameters with explicit dtype=torch.float32 + # (e.g. A_log, dt_bias in GDN layers) stay in float32. + old_dtype = torch.get_default_dtype() + torch.set_default_dtype(self.dtype) + try: + with torch.device(device_str): + self.model = model_cls(hf_config) + finally: + torch.set_default_dtype(old_dtype) + self.model.load_weights(self._iter_weights(model_path)) + self.model.eval() + + after_mem = get_available_gpu_memory(self.device, self.gpu_id) + weight_mem = before_mem - after_mem + logger.info( + "Load model end. 
elapsed=%.2f s, type=%s, " + "weight_mem=%.2f GB, avail mem=%.2f GB", + time.perf_counter() - tic, + type(self.model).__name__, + weight_mem, + after_mem, + ) + + @staticmethod + def _iter_weights(model_path) -> "Generator[Tuple[str, torch.Tensor], None, None]": + """Yield ``(name, tensor)`` pairs from safetensors or ``.bin`` files. + + Prefers safetensors when available; falls back to PyTorch ``.bin`` + files otherwise. + """ + import glob as _glob + from pathlib import Path + + model_path = Path(model_path) + + # Prefer safetensors + st_files = sorted(_glob.glob(str(model_path / "*.safetensors"))) + if st_files: + from safetensors.torch import load_file + + for fpath in st_files: + state_dict = load_file(fpath) + yield from state_dict.items() + del state_dict + return + + # Fallback: PyTorch .bin files + bin_files = sorted(_glob.glob(str(model_path / "*.bin"))) + for fpath in bin_files: + state_dict = torch.load(fpath, map_location="cpu", weights_only=True) + yield from state_dict.items() + del state_dict + + # ------------------------------------------------------------------ + # Memory pool initialisation + # ------------------------------------------------------------------ + + def init_memory_pool(self) -> None: + """Initialise KV-cache memory pools and request-to-token mapping. + + 1. Profiles available GPU memory to determine the maximum number of + KV-cache token slots (``max_total_num_tokens``). + 2. Derives ``max_running_requests`` from config or heuristic. + 3. Creates :class:`~pymllm.mem_cache.memory_pool.ReqToTokenPool`, + :class:`~pymllm.mem_cache.memory_pool.KVPool`, and + :class:`~pymllm.mem_cache.memory_pool.TokenToKVPoolAllocator`. + """ + logger.info("Initialising memory pools...") + + # Determine max number of tokens in KV cache + self.max_total_num_tokens = self._profile_max_num_tokens() + + # Determine max running requests + max_reqs = self.server_config.max_running_requests + if max_reqs is None: + max_reqs = min( + max( + int(self.max_total_num_tokens / self.context_len * 512), + 2048, + ), + 4096, + ) + self.max_running_requests = max_reqs + + if self.max_total_num_tokens <= 0: + raise RuntimeError( + "Not enough memory for KV cache. " + "Try reducing context_length or using a smaller model." + ) + + # Create ReqToTokenPool + self.req_to_token_pool = make_req_to_token_pool( + max_reqs=self.max_running_requests, + max_context_len=self.context_len + 4, # small padding + device=self.device, + ) + + # Create KVPool + TokenToKVPoolAllocator + # Note: layer_num uses num_hidden_layers even for hybrid models + # because the KV pool is indexed by global layer_id. GDN layers' + # KV slots are allocated but unused (they use GDNPool instead). 
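Plugging representative numbers into the ``max_running_requests`` fallback heuristic above (values are purely illustrative)::

    max_total_num_tokens = 200_000      # KV slots from memory profiling (example value)
    context_len = 32_768

    max_reqs = min(max(int(max_total_num_tokens / context_len * 512), 2048), 4096)
    # 200_000 / 32_768 * 512 ≈ 3125, which already lies inside the [2048, 4096] clamp.
    assert max_reqs == 3125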
+ self.token_to_kv_pool, self.token_to_kv_pool_allocator = ( + make_full_attention_net_mem_pool( + size=self.max_total_num_tokens, + layer_num=self.num_hidden_layers, + k_head_num=self.num_kv_heads, + k_head_dim=self.head_dim, + v_head_num=self.num_kv_heads, + v_head_dim=self.v_head_dim, + device=self.device, + dtype=self.kv_cache_dtype, + ) + ) + + # Create GDNPool if hybrid model with GDN layers + if self.num_gdn_layers > 0: + hf_config = self.model_config.hf_config + text_config = getattr(hf_config, "text_config", hf_config) + gdn_num_k_heads = getattr(text_config, "linear_num_key_heads", 16) + gdn_num_v_heads = getattr(text_config, "linear_num_value_heads", 32) + gdn_head_k_dim = getattr(text_config, "linear_key_head_dim", 128) + gdn_head_v_dim = getattr(text_config, "linear_value_head_dim", 128) + gdn_conv_kernel = getattr(text_config, "linear_conv_kernel_dim", 4) + gdn_conv_dim = gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + + self.gdn_pool = GDNPool( + max_reqs=self.max_running_requests, + num_gdn_layers=self.num_gdn_layers, + num_v_heads=gdn_num_v_heads, + head_k_dim=gdn_head_k_dim, + head_v_dim=gdn_head_v_dim, + conv_dim=gdn_conv_dim, + conv_kernel_size=gdn_conv_kernel, + device=self.device, + dtype=self.dtype, + max_track_slots=self.max_running_requests, + ) + + logger.info( + "Memory pool initialised: max_tokens=%d, max_reqs=%d, kv_pool=%.2f GB" + + (", gdn_pool=%.2f GB" if self.gdn_pool is not None else ""), + self.max_total_num_tokens, + self.max_running_requests, + self.token_to_kv_pool._mem_bytes() / (1 << 30), + *([self.gdn_pool.mem_bytes() / (1 << 30)] if self.gdn_pool is not None else []), + ) + + def _profile_max_num_tokens(self) -> int: + """Profile available memory to determine maximum KV-cache tokens. + + If ``server_config.max_total_tokens`` is explicitly set that value + is used directly. Otherwise a memory-fraction-based heuristic + similar to sglang's ``profile_max_num_token`` is applied. + """ + # If user explicitly set max_total_tokens, use that. + if self.server_config.max_total_tokens is not None: + return self.server_config.max_total_tokens + + if self.device != "cuda": + # For CPU, use a conservative default. + return 4096 + + available_gb = get_available_gpu_memory(self.device, self.gpu_id) + + # Determine memory fraction for static allocation (KV cache). + mem_fraction = self.server_config.mem_fraction_static + if mem_fraction is None: + mem_fraction = 0.85 # default: use 85% of remaining memory + + # Calculate per-token KV cache size in bytes. 
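For intuition about the per-token cost computed below, here are representative numbers for a 7B-class config (28 layers, 4 KV heads, 128-dim K and V heads, bf16 cache; example values only)::

    num_kv_heads, head_dim, v_head_dim = 4, 128, 128
    num_layers, elem_bytes = 28, 2                      # bf16 KV cache

    cell_size = num_kv_heads * (head_dim + v_head_dim) * num_layers * elem_bytes
    assert cell_size == 57_344                          # 56 KiB of KV cache per token slot
    tokens_per_gib = (1 << 30) // cell_size             # ~18.7k token slots per GiB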
+ kv_element_size = torch.tensor([], dtype=self.kv_cache_dtype).element_size() + cell_size = ( + self.num_kv_heads + * (self.head_dim + self.v_head_dim) # K + V + * self.num_hidden_layers + * kv_element_size + ) + + if cell_size == 0: + logger.warning( + "cell_size is 0 (model metadata may be incomplete); " + "using default max_total_num_tokens=4096" + ) + return 4096 + + rest_memory_bytes = int(available_gb * mem_fraction * (1 << 30)) + + # Reserve memory for GDN pool if hybrid model + if self.num_gdn_layers > 0: + hf_config = self.model_config.hf_config + text_config = getattr(hf_config, "text_config", hf_config) + gdn_num_k_heads = getattr(text_config, "linear_num_key_heads", 16) + gdn_num_v_heads = getattr(text_config, "linear_num_value_heads", 32) + gdn_head_k_dim = getattr(text_config, "linear_key_head_dim", 128) + gdn_head_v_dim = getattr(text_config, "linear_value_head_dim", 128) + gdn_conv_kernel = getattr(text_config, "linear_conv_kernel_dim", 4) + gdn_conv_dim = gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + + # Estimate GDN pool memory for max_running_requests + # Track slots add max_reqs_est extra slots for prefix cache snapshots + max_reqs_est = min( + max(int(rest_memory_bytes / cell_size / self.context_len * 512), 2048), + 4096, + ) if self.server_config.max_running_requests is None else self.server_config.max_running_requests + pool_size = max_reqs_est + 1 + max_reqs_est # +track_slots + recurrent_bytes = ( + self.num_gdn_layers * pool_size * gdn_num_v_heads + * gdn_head_v_dim * gdn_head_k_dim * 4 # float32 + ) + dtype_size = torch.tensor([], dtype=self.dtype).element_size() + conv_bytes = ( + self.num_gdn_layers * pool_size * gdn_conv_dim + * (gdn_conv_kernel - 1) * dtype_size + ) + gdn_pool_bytes = recurrent_bytes + conv_bytes + rest_memory_bytes -= gdn_pool_bytes + logger.info( + "GDN pool memory reservation: %.2f GB", + gdn_pool_bytes / (1 << 30), + ) + + max_num_tokens = rest_memory_bytes // cell_size + + logger.info( + "Memory profiling: avail=%.2f GB, fraction=%.2f, " + "cell_size=%d bytes, max_tokens=%d", + available_gb, + mem_fraction, + cell_size, + max_num_tokens, + ) + + return max(max_num_tokens, 1) # at least 1 + + # ------------------------------------------------------------------ + # Attention backend + # ------------------------------------------------------------------ + + def init_attention_backend(self) -> None: + """Initialise the attention backend. + + Creates a :class:`FlashInferAttnBackend` for standard models, or a + :class:`HybridAttnBackend` (FlashInfer + GDN) for hybrid models. 
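Back-of-the-envelope numbers for the GDN-pool reservation above, using the default hyper-parameters the ``getattr`` fallbacks assume (16 key heads, 32 value heads, 128-dim heads, kernel size 4, bf16 activations, fp32 recurrent state)::

    k_heads, v_heads, k_dim, v_dim, kernel = 16, 32, 128, 128, 4
    fp32, bf16 = 4, 2

    conv_dim = k_heads * k_dim * 2 + v_heads * v_dim            # 8192
    recurrent_per_slot_layer = v_heads * v_dim * k_dim * fp32   # 2_097_152 bytes (2 MiB)
    conv_per_slot_layer = conv_dim * (kernel - 1) * bf16        # 49_152 bytes (48 KiB)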
+ """ + from pymllm.layers.attention.flashinfer_backend import FlashInferAttnBackend + + logger.info("Initialising attention backend...") + + flash_backend = FlashInferAttnBackend( + num_heads=self.num_attention_heads, + num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + kv_cache_dtype=self.kv_cache_dtype, + q_dtype=self.dtype, + max_context_len=self.context_len, + req_to_token=self.req_to_token_pool.req_to_token, + device=torch.device(self.device), + max_req_pool_size=self.req_to_token_pool.size, + ) + + if self.gdn_pool is not None: + from pymllm.layers.attention.gdn_backend import GDNAttnBackend + from pymllm.layers.attention.hybrid_backend import HybridAttnBackend + + gdn_backend = GDNAttnBackend( + gdn_pool=self.gdn_pool, + device=torch.device(self.device), + ) + self.attn_backend = HybridAttnBackend( + full_attn_backend=flash_backend, + gdn_backend=gdn_backend, + full_attn_layer_ids=self.full_attn_layer_ids, + ) + else: + self.attn_backend = flash_backend + + logger.info( + "Attention backend: %s", + type(self.attn_backend).__name__, + ) + + # ------------------------------------------------------------------ + # Warmup + # ------------------------------------------------------------------ + + def _init_cublas(self) -> None: + """Run a small matmul to initialise cuBLAS. + + Without this, the first real matmul may incur a significant + initialisation overhead. + """ + dtype = torch.float16 + device = "cuda" + a = torch.ones((16, 16), dtype=dtype, device=device) + b = torch.ones((16, 16), dtype=dtype, device=device) + _ = a @ b + + # ------------------------------------------------------------------ + # CUDA graph capture + # ------------------------------------------------------------------ + + def init_cuda_graphs(self) -> None: + """Capture CUDA graphs for decode-step acceleration. + + Skipped when: + * The device is not CUDA. + * ``server_config.disable_cuda_graph`` is ``True``. + * The model is not a generation model. + """ + self.graph_runner = None + + if self.device != "cuda": + return + if self.server_config.disable_cuda_graph: + logger.info("CUDA graphs disabled by config.") + return + if not self.is_generation: + return + + from pymllm.executor.cuda_graph_runner import CudaGraphRunner + + tic = time.perf_counter() + before_mem = get_available_gpu_memory(self.device, self.gpu_id) + logger.info("Capturing CUDA graphs... avail mem=%.2f GB", before_mem) + + self.graph_runner = CudaGraphRunner(self) + + after_mem = get_available_gpu_memory(self.device, self.gpu_id) + logger.info( + "CUDA graph capture complete. elapsed=%.2f s, " + "mem usage=%.2f GB, avail mem=%.2f GB", + time.perf_counter() - tic, + before_mem - after_mem, + after_mem, + ) + + # ------------------------------------------------------------------ + # ForwardBatch construction + # ------------------------------------------------------------------ + + def prepare_forward_batch_extend( + self, + input_ids: torch.Tensor, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + extend_seq_lens: torch.Tensor, + extend_prefix_lens: torch.Tensor, + out_cache_loc: torch.Tensor, + return_logprob: bool = False, + top_logprobs_nums: Optional[List[int]] = None, + ) -> ForwardBatch: + """Build a :class:`ForwardBatch` for an extend (prefill) pass. + + Parameters + ---------- + input_ids + Token IDs for all new tokens, shape ``[total_new_tokens]``. + req_pool_indices + Index of each request in ``ReqToTokenPool``, + shape ``[batch_size]``. 
+ seq_lens + Total (prefix + new) length of each sequence, + shape ``[batch_size]``. + extend_seq_lens + Number of new tokens per sequence, shape ``[batch_size]``. + extend_prefix_lens + Cached prefix length per sequence, shape ``[batch_size]``. + out_cache_loc + KV-pool slot indices for each new token, + shape ``[total_new_tokens]``. + return_logprob + Whether to return per-token log-probabilities. + top_logprobs_nums + Number of top log-probs per sequence. + """ + batch_size = req_pool_indices.shape[0] + seq_lens_sum = int(seq_lens.sum().item()) + extend_num_tokens = int(extend_seq_lens.sum().item()) + + # Compute positions for each token + positions = _compute_positions(extend_seq_lens, extend_prefix_lens) + + # Compute extend_start_loc (exclusive cumsum of extend_seq_lens) + extend_start_loc = torch.zeros( + batch_size, dtype=torch.int32, device=self.device + ) + if batch_size > 1: + extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0).to( + torch.int32 + ) + + return ForwardBatch( + forward_mode=ForwardMode.EXTEND, + batch_size=batch_size, + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + out_cache_loc=out_cache_loc, + seq_lens_sum=seq_lens_sum, + seq_lens_cpu=seq_lens.cpu(), + positions=positions, + extend_num_tokens=extend_num_tokens, + extend_seq_lens=extend_seq_lens, + extend_prefix_lens=extend_prefix_lens, + extend_start_loc=extend_start_loc, + extend_prefix_lens_cpu=extend_prefix_lens.tolist(), + extend_seq_lens_cpu=extend_seq_lens.tolist(), + return_logprob=return_logprob, + top_logprobs_nums=top_logprobs_nums, + req_to_token_pool=self.req_to_token_pool, + token_to_kv_pool=self.token_to_kv_pool, + attn_backend=self.attn_backend, + ) + + def prepare_forward_batch_decode( + self, + input_ids: torch.Tensor, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + out_cache_loc: torch.Tensor, + return_logprob: bool = False, + top_logprobs_nums: Optional[List[int]] = None, + mrope_position_deltas: Optional[torch.Tensor] = None, + ) -> ForwardBatch: + """Build a :class:`ForwardBatch` for a decode step. + + Parameters + ---------- + input_ids + Token IDs (one per sequence), shape ``[batch_size]``. + req_pool_indices + Index of each request in ``ReqToTokenPool``, + shape ``[batch_size]``. + seq_lens + Total sequence length of each request, shape ``[batch_size]``. + out_cache_loc + KV-pool slot for each sequence's new token, + shape ``[batch_size]``. + return_logprob + Whether to return per-token log-probabilities. + top_logprobs_nums + Number of top log-probs per sequence. + mrope_position_deltas + Per-request M-RoPE position deltas, shape ``[batch_size]`` (int64). + Used by multimodal models (e.g. Qwen3-VL) to offset decode-step + positions by the spatial extent of prefill images. 
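A tiny worked example of the decode-step inputs this method derives. The delta is applied to the rotary positions inside the model (the ``positions + deltas`` path mentioned in the CUDA-graph capture comments above); only the arithmetic is shown here::

    import torch

    seq_lens = torch.tensor([7, 12])
    positions = (seq_lens - 1).to(torch.int64)        # token index being decoded: [6, 11]

    # For a request whose prefill used M-RoPE, the per-request delta shifts the
    # rotary position the model actually applies, e.g.:
    mrope_position_deltas = torch.tensor([-3, 0])
    effective = positions + mrope_position_deltas     # [3, 11]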
+ """ + batch_size = req_pool_indices.shape[0] + seq_lens_sum = int(seq_lens.sum().item()) + + # For decode, positions = seq_lens - 1 (the new token position) + positions = (seq_lens - 1).to(torch.int64) + + return ForwardBatch( + forward_mode=ForwardMode.DECODE, + batch_size=batch_size, + input_ids=input_ids, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + out_cache_loc=out_cache_loc, + seq_lens_sum=seq_lens_sum, + seq_lens_cpu=seq_lens.cpu(), + positions=positions, + return_logprob=return_logprob, + top_logprobs_nums=top_logprobs_nums, + req_to_token_pool=self.req_to_token_pool, + token_to_kv_pool=self.token_to_kv_pool, + attn_backend=self.attn_backend, + mrope_position_deltas=mrope_position_deltas, + ) + + # ------------------------------------------------------------------ + # Forward pass + # ------------------------------------------------------------------ + + def forward( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Run a forward pass through the model. + + Dispatches to the appropriate method based on the batch's + :attr:`~pymllm.engine.forward_batch.ForwardMode`. For decode + batches, automatically uses CUDA-graph replay when a captured + graph is available. + + Parameters + ---------- + forward_batch + The prepared batch (from ``prepare_forward_batch_*``). + + Returns + ------- + LogitsProcessorOutput + Contains ``next_token_logits`` of shape + ``[batch_size, vocab_size]``. + """ + self.forward_pass_id += 1 + + if forward_batch.forward_mode.is_idle(): + return self._forward_idle(forward_batch) + + # Try CUDA graph replay for decode batches. + if ( + forward_batch.forward_mode.is_decode() + and self.graph_runner is not None + and self.graph_runner.can_run(forward_batch) + ): + return self.graph_runner.replay(forward_batch) + + if forward_batch.forward_mode.is_decode(): + return self.forward_decode(forward_batch) + elif forward_batch.forward_mode.is_extend(): + return self.forward_extend(forward_batch) + else: + raise ValueError(f"Unsupported forward mode: {forward_batch.forward_mode}") + + def forward_decode( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Run a decode forward pass (one new token per sequence). + + Calls ``attn_backend.init_forward_metadata`` followed by + ``model.forward``. + """ + self.attn_backend.init_forward_metadata(forward_batch) + model_output = self.model.forward( + forward_batch.input_ids, + forward_batch.positions, + forward_batch, + ) + return self._process_logits(model_output, forward_batch) + + def forward_extend( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Run an extend (prefill) forward pass. + + Calls ``attn_backend.init_forward_metadata`` followed by + ``model.forward``. 
+ """ + self.attn_backend.init_forward_metadata(forward_batch) + model_output = self.model.forward( + forward_batch.input_ids, + forward_batch.positions, + forward_batch, + ) + return self._process_logits(model_output, forward_batch) + + def _forward_idle( + self, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Return empty logits for an idle batch (no sequences to process).""" + return LogitsProcessorOutput( + next_token_logits=torch.empty( + (0, self.vocab_size), + dtype=self.dtype, + device=self.device, + ), + ) + + # ------------------------------------------------------------------ + # Logits post-processing + # ------------------------------------------------------------------ + + def _process_logits( + self, + model_output: Any, + forward_batch: ForwardBatch, + ) -> LogitsProcessorOutput: + """Extract last-token logits from model output. + + Handles: + * A :class:`LogitsProcessorOutput` returned by custom model + implementations. + * A ``CausalLMOutput`` (from HuggingFace ``transformers``) with a + ``.logits`` attribute. + * A raw ``torch.Tensor`` of logits. + """ + if isinstance(model_output, LogitsProcessorOutput): + return model_output + + # Standard HuggingFace output + if hasattr(model_output, "logits"): + logits = model_output.logits + elif isinstance(model_output, torch.Tensor): + logits = model_output + else: + raise TypeError( + f"Unexpected model output type: {type(model_output)}. " + "Expected torch.Tensor or an object with .logits attribute." + ) + + # --- Decode: logits is [bs, 1, vocab] or [bs, vocab] --- + if forward_batch.forward_mode.is_decode(): + if logits.dim() == 3: + next_token_logits = logits[:, -1, :] + else: + next_token_logits = logits + else: + # --- Extend: pick the last token of each sequence --- + next_token_logits = self._gather_last_token_logits(logits, forward_batch) + + return LogitsProcessorOutput(next_token_logits=next_token_logits) + + def _gather_last_token_logits( + self, + logits: torch.Tensor, + forward_batch: ForwardBatch, + ) -> torch.Tensor: + """Gather the logits of the last token in each sequence for extend. + + During extend, the model processes all tokens but we only need the + logits at the last position of each sequence for next-token sampling. + """ + if logits.dim() == 3: + # [batch_size, seq_len, vocab_size] from standard HF model + return logits[:, -1, :] + + # Flat layout [total_tokens, vocab_size] + if ( + forward_batch.extend_start_loc is not None + and forward_batch.extend_seq_lens is not None + ): + last_indices = ( + forward_batch.extend_start_loc + forward_batch.extend_seq_lens - 1 + ).long() + return logits[last_indices] + + # Fallback: last row + return logits[-1:, :] + + # ------------------------------------------------------------------ + # Sampling + # ------------------------------------------------------------------ + + def sample( + self, + logits_output: LogitsProcessorOutput, + forward_batch: ForwardBatch, + temperatures: Optional[torch.Tensor] = None, + top_ps: Optional[torch.Tensor] = None, + top_ks: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Sample next-token IDs from logits. + + Supports per-request temperature, top-p, and top-k. + + Parameters + ---------- + logits_output + The logits from :meth:`forward`. + forward_batch + The current forward batch. + temperatures + Per-request temperature, shape ``[batch_size]``. + top_ps + Per-request top-p, shape ``[batch_size]``. + top_ks + Per-request top-k, shape ``[batch_size]``. 
+ + Returns + ------- + torch.Tensor + Next-token IDs, shape ``[batch_size]``, dtype ``int32``. + """ + from pymllm.layers.sampling import ( + sampling_from_probs, + softmax, + top_k_top_p_sampling_from_probs, + ) + + logits = logits_output.next_token_logits + + if logits.numel() == 0: + return torch.empty(0, dtype=torch.int32, device=self.device) + + # Greedy path: temperature=0 (or all zeros) → argmax, no sampling. + if temperatures is not None: + all_greedy = bool((temperatures < 1e-6).all()) + else: + all_greedy = False + + if all_greedy: + return logits.argmax(dim=-1).to(torch.int32) + + # Stochastic path: apply temperature then sample. + if temperatures is not None: + probs = softmax(logits, temperature=temperatures) + else: + probs = torch.softmax(logits.float(), dim=-1) + + # Apply top-k / top-p sampling if specified + has_top_k = top_ks is not None + has_top_p = top_ps is not None + + if has_top_k or has_top_p: + k = top_ks if has_top_k else logits.shape[-1] + p = top_ps if has_top_p else 1.0 + next_token_ids = top_k_top_p_sampling_from_probs(probs, k, p) + else: + next_token_ids = sampling_from_probs(probs) + + return next_token_ids + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def shutdown(self) -> None: + """Release model and memory resources.""" + logger.info("ModelRunner shutting down...") + + if self.graph_runner is not None: + self.graph_runner.shutdown() + self.graph_runner = None + if self.model is not None: + del self.model + self.model = None + if self.token_to_kv_pool is not None: + del self.token_to_kv_pool + self.token_to_kv_pool = None + if self.token_to_kv_pool_allocator is not None: + del self.token_to_kv_pool_allocator + self.token_to_kv_pool_allocator = None + if self.gdn_pool is not None: + del self.gdn_pool + self.gdn_pool = None + if self.req_to_token_pool is not None: + del self.req_to_token_pool + self.req_to_token_pool = None + self.attn_backend = None + + if self.device == "cuda": + torch.cuda.empty_cache() + gc.collect() + + logger.info("ModelRunner shutdown complete.") + + # ------------------------------------------------------------------ + # Properties + # ------------------------------------------------------------------ + + @property + def is_generation(self) -> bool: + """True if the model is a generation (causal-LM) model.""" + return True + + @property + def sliding_window_size(self) -> Optional[int]: + """Sliding-window attention span, or ``None`` for full context.""" + hf_config = self.model_config.hf_config + if hf_config is None: + return None + text_config = getattr(hf_config, "text_config", hf_config) + return getattr(text_config, "sliding_window", None) + + +# --------------------------------------------------------------------------- +# Utility functions +# --------------------------------------------------------------------------- + + +def _compute_positions( + extend_seq_lens: torch.Tensor, + extend_prefix_lens: torch.Tensor, +) -> torch.Tensor: + """Compute per-token positions for an extend batch. + + For each sequence, positions are + ``[prefix_len, prefix_len+1, ..., prefix_len+seq_len-1]``. + The result is a flat 1-D tensor of shape ``[sum(extend_seq_lens)]``. 
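For example, with two sequences whose cached prefixes are 4 and 0 tokens long::

    import torch

    extend_seq_lens = torch.tensor([2, 3])
    extend_prefix_lens = torch.tensor([4, 0])
    # Sequence 0 continues at positions [4, 5]; sequence 1 starts fresh at [0, 1, 2],
    # so the flat result is:
    expected = torch.tensor([4, 5, 0, 1, 2], dtype=torch.int64)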
+ """ + device = extend_seq_lens.device + batch_size = extend_seq_lens.shape[0] + total_tokens = int(extend_seq_lens.sum().item()) + + if total_tokens == 0: + return torch.empty(0, dtype=torch.int64, device=device) + + positions = torch.empty(total_tokens, dtype=torch.int64, device=device) + offset = 0 + for i in range(batch_size): + seq_len = int(extend_seq_lens[i].item()) + prefix_len = int(extend_prefix_lens[i].item()) + if seq_len > 0: + positions[offset : offset + seq_len] = torch.arange( + prefix_len, + prefix_len + seq_len, + dtype=torch.int64, + device=device, + ) + offset += seq_len + + return positions diff --git a/pymllm/layers/__init__.py b/pymllm/layers/__init__.py index 97cfb9211..2ecb13965 100644 --- a/pymllm/layers/__init__.py +++ b/pymllm/layers/__init__.py @@ -6,9 +6,12 @@ from pymllm.layers.linear import ColumnParallelLinear, Linear, RowParallelLinear from pymllm.layers.mlp import MLP, ParallelMLP from pymllm.layers.rms_norm import GemmaRMSNorm, RMSNorm +from pymllm.layers.rms_norm_gated import RMSNormGated +from pymllm.layers.gated_delta_net import GatedDeltaNet from pymllm.layers.rope import ( apply_llama31_rope, apply_llama31_rope_pos_ids, + apply_mrope, apply_rope, apply_rope_pos_ids, apply_rope_with_cos_sin_cache, @@ -41,6 +44,7 @@ "LayerNorm", "RMSNorm", "GemmaRMSNorm", + "apply_mrope", "apply_rope", "apply_llama31_rope", "apply_rope_pos_ids", diff --git a/pymllm/layers/attention/__init__.py b/pymllm/layers/attention/__init__.py index 5d0dbf076..ae187975d 100644 --- a/pymllm/layers/attention/__init__.py +++ b/pymllm/layers/attention/__init__.py @@ -8,7 +8,10 @@ WrapperDispatch, should_use_tensor_core, ) +from pymllm.layers.attention.gdn_backend import GDNAttnBackend +from pymllm.layers.attention.hybrid_backend import HybridAttnBackend from pymllm.layers.attention.radix_attention import AttentionType, RadixAttention +from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention __all__ = [ # Base @@ -16,10 +19,15 @@ # RadixAttention "AttentionType", "RadixAttention", + # RadixLinearAttention (GDN) + "RadixLinearAttention", # FlashInfer backend "FlashInferAttnBackend", "DecodeMetadata", "PrefillMetadata", "WrapperDispatch", "should_use_tensor_core", + # GDN + Hybrid backends + "GDNAttnBackend", + "HybridAttnBackend", ] diff --git a/pymllm/layers/attention/attention_backend.py b/pymllm/layers/attention/attention_backend.py index 07e2f6a17..fe168c2d2 100644 --- a/pymllm/layers/attention/attention_backend.py +++ b/pymllm/layers/attention/attention_backend.py @@ -103,6 +103,28 @@ def forward( q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs ) + # ------------------------------------------------------------------ + # GDN linear-attention interface (used by HybridAttnBackend) + # ------------------------------------------------------------------ + + def forward_gdn( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Run GDN linear-attention for one layer. + + Only implemented by backends that support hybrid (full + GDN) + architectures. The default raises ``NotImplementedError``. + """ + raise NotImplementedError( + f"{type(self).__name__} does not support GDN linear attention. " + "Use HybridAttnBackend for hybrid full+GDN models." 
+ ) + # ------------------------------------------------------------------ # Optional CUDA-graph interface # ------------------------------------------------------------------ diff --git a/pymllm/layers/attention/gdn_backend.py b/pymllm/layers/attention/gdn_backend.py new file mode 100644 index 000000000..2b6e27b48 --- /dev/null +++ b/pymllm/layers/attention/gdn_backend.py @@ -0,0 +1,660 @@ +"""GDN attention backend -- pooled-state GDN computation for hybrid models. + +Performs GDN (Gated Delta Net) linear-attention using externalized state +stored in a :class:`~pymllm.mem_cache.memory_pool.GDNPool`. Supports +both extend (prefill) and decode paths with FlashInfer kernels. + +This backend is not used directly; it is wrapped by +:class:`~pymllm.layers.attention.hybrid_backend.HybridAttnBackend`. +""" + +from __future__ import annotations + +import logging +import os +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Optional, Tuple + +import torch +import torch.nn.functional as F + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch + from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention + from pymllm.mem_cache.memory_pool import GDNPool + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Server config: gdn_decode_backend override +# --------------------------------------------------------------------------- + + +def _get_gdn_decode_backend_override() -> str: + """Read ``server.gdn_decode_backend`` from GlobalConfig. + + Returns one of: ``"auto"``, ``"flashinfer"``, ``"mllm_kernel"``, ``"pytorch"``. + """ + try: + from pymllm.configs import get_global_config + return get_global_config().server.gdn_decode_backend + except Exception: + return "auto" + + +# --------------------------------------------------------------------------- +# mllm-kernel GDN decode (lazy import, SM80+) +# --------------------------------------------------------------------------- + +_mllm_gdn_decode = None + + +def _get_mllm_gdn_decode(): + """Lazy import for mllm-kernel fused GDN decode CUDA kernel.""" + global _mllm_gdn_decode + if _mllm_gdn_decode is None: + try: + from mllm_kernel.cuda.jit.gdn_decode import gdn_decode + + _mllm_gdn_decode = gdn_decode + logger.info("GDNAttnBackend: [probe] mllm-kernel GDN decode available (SM80+)") + except (ImportError, RuntimeError) as e: + logger.info("GDNAttnBackend: [probe] mllm-kernel GDN decode not available: %s", e) + _mllm_gdn_decode = False + return _mllm_gdn_decode if _mllm_gdn_decode is not False else None + + +# --------------------------------------------------------------------------- +# FlashInfer GDN kernel (lazy import) +# --------------------------------------------------------------------------- + +_flashinfer_available: Optional[bool] = None +_fi_chunk_gated_delta_rule = None +_fi_gated_delta_rule_decode = None + + +def _get_flashinfer_gdn(): + """Lazy import for FlashInfer GDN kernels (prefill + decode).""" + global _flashinfer_available, _fi_chunk_gated_delta_rule, _fi_gated_delta_rule_decode + if _flashinfer_available is None: + try: + os.environ.setdefault("FLASHINFER_DISABLE_VERSION_CHECK", "1") + _flashinfer_available = ( + torch.cuda.is_available() + and torch.cuda.get_device_capability()[0] >= 9 + ) + if not _flashinfer_available: + logger.info( + "GDNAttnBackend: [probe] FlashInfer GDN not available (requires SM90+, " + "current SM%d%d)", *torch.cuda.get_device_capability() + ) + return 
_flashinfer_available, None, None + + from flashinfer.gdn_prefill import chunk_gated_delta_rule + _fi_chunk_gated_delta_rule = chunk_gated_delta_rule + + try: + from flashinfer.gdn_decode import gated_delta_rule_decode_pretranspose + _fi_gated_delta_rule_decode = gated_delta_rule_decode_pretranspose + logger.info("GDNAttnBackend: [probe] FlashInfer GDN available (prefill + decode)") + except ImportError: + logger.info( + "GDNAttnBackend: [probe] FlashInfer GDN partially available " + "(prefill only, decode not found)" + ) + except (ImportError, RuntimeError) as e: + logger.info( + "GDNAttnBackend: [probe] FlashInfer GDN not available: %s", e + ) + _flashinfer_available = False + return _flashinfer_available, _fi_chunk_gated_delta_rule, _fi_gated_delta_rule_decode + + +# --------------------------------------------------------------------------- +# GDN gating computation +# --------------------------------------------------------------------------- + + +def _gdn_gating( + a: torch.Tensor, + b: torch.Tensor, + A_log: torch.Tensor, + dt_bias: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute GDN gating factors. + + Returns + ------- + g : log-space decay factor: -exp(A_log) * softplus(a + dt_bias) + beta : update gate: sigmoid(b) + """ + g = -torch.exp(A_log) * F.softplus(a + dt_bias) + beta = torch.sigmoid(b) + return g, beta + + +# --------------------------------------------------------------------------- +# Forward metadata +# --------------------------------------------------------------------------- + + +@dataclass +class GDNForwardMetadata: + """Per-batch metadata for GDN backend.""" + + cache_indices: torch.Tensor # [batch_size] = req_pool_indices + cu_seqlens: Optional[torch.Tensor] = None # extend only + + +# --------------------------------------------------------------------------- +# GDNAttnBackend +# --------------------------------------------------------------------------- + + +class GDNAttnBackend: + """GDN linear-attention backend using pooled states. + + Handles both extend (prefill) and decode paths for GDN layers. + Uses FlashInfer kernels when available (SM90+), with PyTorch fallback. + + Parameters + ---------- + gdn_pool + Pre-allocated :class:`~pymllm.mem_cache.memory_pool.GDNPool`. + device + Target device. 
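The gating helper above maps the raw decay and gate projections to per-head factors; note that the decay multiplier ``exp(g)`` always lands in (0, 1], so the recurrent state can only shrink or persist between steps. A standalone restatement with toy shapes (same formulas as ``_gdn_gating``, dimensions are illustrative)::

    import torch
    import torch.nn.functional as F

    num_v_heads = 4
    a = torch.randn(2, num_v_heads)          # raw decay projection
    b = torch.randn(2, num_v_heads)          # raw gate projection
    A_log = torch.zeros(num_v_heads)         # learned log decay rate
    dt_bias = torch.zeros(num_v_heads)

    g = -torch.exp(A_log) * F.softplus(a + dt_bias)   # g <= 0
    beta = torch.sigmoid(b)                           # write gate in (0, 1)
    decay = torch.exp(g)                              # in (0, 1], scales the recurrent state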
+ """ + + def __init__(self, gdn_pool: "GDNPool", device: torch.device): + self.gdn_pool = gdn_pool + self.device = device + self.forward_metadata: Optional[GDNForwardMetadata] = None + + # Pre-check FlashInfer availability + self._use_flashinfer, _, _ = _get_flashinfer_gdn() + + # One-shot flags to log the selected backend on first actual forward call + self._decode_backend_logged = False + self._extend_backend_logged = False + + def init_forward_metadata(self, forward_batch: "ForwardBatch") -> None: + """Prepare GDN metadata from the current forward batch.""" + cache_indices = forward_batch.req_pool_indices.to(torch.int64) + + cu_seqlens = None + if forward_batch.forward_mode.is_extend(): + # Build cu_seqlens from extend_seq_lens + if forward_batch.extend_seq_lens is not None: + seq_lens = forward_batch.extend_seq_lens.to(torch.int64) + cu_seqlens = torch.zeros( + len(seq_lens) + 1, + dtype=torch.int64, + device=self.device, + ) + torch.cumsum(seq_lens, dim=0, out=cu_seqlens[1:]) + + self.forward_metadata = GDNForwardMetadata( + cache_indices=cache_indices, + cu_seqlens=cu_seqlens, + ) + + # ------------------------------------------------------------------ + # CUDA-graph interface + # ------------------------------------------------------------------ + + def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int) -> None: + """Allocate CUDA-graph state for GDN backend. + + The GDN pool buffers are already pre-allocated at fixed addresses, + so we only need to allocate the metadata tensor. + """ + self._cuda_graph_cache_indices = torch.zeros( + (max_bs,), dtype=torch.int64, device=self.device + ) + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + ) -> None: + """Set up GDN metadata for CUDA-graph capture (decode only).""" + self._cuda_graph_cache_indices[:bs].copy_( + req_pool_indices[:bs].to(torch.int64) + ) + self.forward_metadata = GDNForwardMetadata( + cache_indices=self._cuda_graph_cache_indices[:bs], + ) + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + ) -> None: + """Update GDN metadata for CUDA-graph replay (decode only).""" + self._cuda_graph_cache_indices[:bs].copy_( + req_pool_indices[:bs].to(torch.int64) + ) + self.forward_metadata = GDNForwardMetadata( + cache_indices=self._cuda_graph_cache_indices[:bs], + ) + + # ------------------------------------------------------------------ + # Forward: decode + # ------------------------------------------------------------------ + + def forward_decode( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """GDN decode: one new token per request. + + Steps: + 1. Gather conv_state from pool → [bs, conv_dim, K-1] + 2. Conv1d update: shift + weighted sum for 1 new token + 3. Scatter updated conv_state back to pool + 4. SiLU → split q,k,v + 5. 
FlashInfer gated_delta_rule_decode (or PyTorch fallback) + """ + metadata = self.forward_metadata + cache_indices = metadata.cache_indices + gdn_idx = layer.gdn_layer_idx + bs = mixed_qkv.shape[0] + + recurrent_buf, conv_buf = self.gdn_pool.get_layer_state(gdn_idx) + conv_weight = layer.conv_weight # [conv_dim, kernel_size] + K = conv_weight.shape[1] + + # --- Conv1d decode: single-token update --- + conv_state = conv_buf[cache_indices] # [bs, conv_dim, K-1] + x = mixed_qkv.unsqueeze(-1) # [bs, conv_dim, 1] + + new_conv_state = torch.cat([conv_state[:, :, 1:], x], dim=-1) + full_window = torch.cat([conv_state, x], dim=-1) # [bs, conv_dim, K] + conv_out = (full_window * conv_weight.unsqueeze(0)).sum(dim=-1) + + conv_buf[cache_indices] = new_conv_state + + # --- SiLU activation --- + conv_out = F.silu(conv_out) + + # --- Split q, k, v --- + key_dim = layer.num_k_heads * layer.head_k_dim + value_dim = layer.num_v_heads * layer.head_v_dim + q, k, v = conv_out.split([key_dim, key_dim, value_dim], dim=-1) + q = q.view(bs, layer.num_k_heads, layer.head_k_dim) + k = k.view(bs, layer.num_k_heads, layer.head_k_dim) + v = v.view(bs, layer.num_v_heads, layer.head_v_dim) + + # --- Recurrent update --- + # Priority (when "auto"): FlashInfer SM90+ > mllm-kernel SM80+ > PyTorch + # Can be overridden via --server.gdn_decode_backend + backend = _get_gdn_decode_backend_override() + use_fi, _, fi_decode = _get_flashinfer_gdn() + mllm_gdn = _get_mllm_gdn_decode() + + use_flashinfer = ( + (backend in ("auto", "flashinfer")) + and use_fi and fi_decode is not None + and mixed_qkv.is_cuda + ) + use_mllm = ( + (backend in ("auto", "mllm_kernel")) + and not (backend == "auto" and use_flashinfer) + and mllm_gdn is not None + and mixed_qkv.is_cuda + ) + + if backend == "flashinfer" and not use_flashinfer: + logger.warning("GDNAttnBackend: gdn_decode_backend='flashinfer' requested but unavailable, falling back") + if backend == "mllm_kernel" and mllm_gdn is None: + logger.warning("GDNAttnBackend: gdn_decode_backend='mllm_kernel' requested but unavailable, falling back") + + if not self._decode_backend_logged: + if use_flashinfer: + selected = "flashinfer" + elif use_mllm: + selected = "mllm_kernel" + else: + selected = "pytorch" + logger.info( + "GDNAttnBackend: [decode] using backend=%s (config=%s)", selected, backend + ) + self._decode_backend_logged = True + + if use_flashinfer: + # FlashInfer decode (SM90+) + query_fi = q.unsqueeze(1) + key_fi = k.unsqueeze(1) + value_fi = v.unsqueeze(1) + a_fi = a.unsqueeze(1) + b_fi = b.unsqueeze(1) + + state_batch = recurrent_buf[cache_indices] + + output_fi, new_state = fi_decode( + q=query_fi, k=key_fi, v=value_fi, + state=state_batch, + A_log=layer.A_log.detach(), + a=a_fi, dt_bias=layer.dt_bias.detach(), b=b_fi, + scale=None, output=None, use_qk_l2norm=True, + ) + + recurrent_buf[cache_indices] = new_state + output = output_fi.squeeze(1) + + elif use_mllm: + # mllm-kernel fused CUDA decode (SM80+) + output = mllm_gdn( + q, k, v, a, b, + layer.A_log, layer.dt_bias, + recurrent_buf, cache_indices, + ) + + else: + # PyTorch fallback + g, beta = _gdn_gating(a, b, layer.A_log, layer.dt_bias) + output = self._decode_pytorch_fallback( + q, k, v, g, beta, recurrent_buf, cache_indices, layer + ) + + return output.reshape(bs, value_dim) + + def _decode_pytorch_fallback( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + recurrent_buf: torch.Tensor, + cache_indices: torch.Tensor, + layer: "RadixLinearAttention", + ) -> 
torch.Tensor: + """Pure PyTorch decode fallback for GDN with delta rule and L2 norm. + + Matches the sglang Triton kernel (fused_sigmoid_gating_delta_rule_update): + state *= exp(g) # decay + v_delta = v - state @ k # delta rule + v_delta *= beta # gating + state += v_delta outer k # state update + output = state @ q # readout + """ + bs = q.shape[0] + num_v_heads = layer.num_v_heads + num_k_heads = layer.num_k_heads + + # GQA: expand k/q heads to match v heads + if num_k_heads != num_v_heads: + repeats = num_v_heads // num_k_heads + q = q.repeat_interleave(repeats, dim=1) + k = k.repeat_interleave(repeats, dim=1) + + # All computation in float32 (state is float32, avoids dtype mismatch) + orig_dtype = q.dtype + q = q.float() + k = k.float() + v = v.float() + + # L2 normalize q and k per-head (matching use_qk_l2norm_in_kernel=True) + q = q / (q.norm(dim=-1, keepdim=True) + 1e-6) + k = k / (k.norm(dim=-1, keepdim=True) + 1e-6) + + decay = torch.exp(g.float()) # [bs, num_v_heads] + beta_f = beta.float() # [bs, num_v_heads] + + outputs = [] + for i in range(bs): + idx = cache_indices[i] + state = recurrent_buf[idx] # [H, V, K] float32 + + # Decay + state = state * decay[i].unsqueeze(-1).unsqueeze(-1) + + k_i = k[i] # [H, K] + v_i = v[i] # [H, V] + b_i = beta_f[i] # [H] + q_i = q[i] # [H, K] + + # Delta rule: v_delta = v - state @ k + v_delta = v_i - torch.bmm(state, k_i.unsqueeze(-1)).squeeze(-1) + v_delta = v_delta * b_i.unsqueeze(-1) # gating + + # State update: state += v_delta ⊗ k (outer product in [V, K] layout) + state = state + v_delta.unsqueeze(-1) * k_i.unsqueeze(-2) + recurrent_buf[idx] = state + + # Output: o = state @ q + o_t = torch.bmm(state, q_i.unsqueeze(-1)).squeeze(-1) # [H, V] + outputs.append(o_t) + + return torch.stack(outputs, dim=0).to(orig_dtype) # [bs, H, V] + + # ------------------------------------------------------------------ + # Forward: extend (prefill) + # ------------------------------------------------------------------ + + def forward_extend( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """GDN extend (prefill): multi-token per request. + + Steps: + 1. Gather conv_state from pool for each request + 2. Per-request causal conv1d + 3. Scatter new conv_state back to pool + 4. SiLU → split q,k,v → gating + 5. FlashInfer chunk_gated_delta_rule (or PyTorch fallback) + 6. 
Scatter final recurrent state back to pool + """ + metadata = self.forward_metadata + cache_indices = metadata.cache_indices + cu_seqlens = metadata.cu_seqlens + gdn_idx = layer.gdn_layer_idx + total_tokens = mixed_qkv.shape[0] + + recurrent_buf, conv_buf = self.gdn_pool.get_layer_state(gdn_idx) + conv_weight = layer.conv_weight # [conv_dim, kernel_size] + K = conv_weight.shape[1] + batch_size = cache_indices.shape[0] + + key_dim = layer.num_k_heads * layer.head_k_dim + value_dim = layer.num_v_heads * layer.head_v_dim + + # --- Per-request causal conv1d --- + conv_out = torch.empty_like(mixed_qkv) # [total_tokens, conv_dim] + + for i in range(batch_size): + start = int(cu_seqlens[i].item()) + end = int(cu_seqlens[i + 1].item()) + seq_len = end - start + if seq_len == 0: + continue + + idx = cache_indices[i] + x = mixed_qkv[start:end] # [seq_len, conv_dim] + prev_state = conv_buf[idx] # [conv_dim, K-1] + + # Pad with previous conv state + x_padded = torch.cat([prev_state.T, x], dim=0) # [K-1+seq_len, conv_dim] + + # Save new conv state (last K-1 tokens) + conv_buf[idx] = x_padded[-(K - 1):].T.clone() + + # Causal conv1d + out = torch.zeros(seq_len, x.shape[1], device=x.device, dtype=x.dtype) + for kk in range(K): + out += x_padded[kk: kk + seq_len] * conv_weight[:, kk] + conv_out[start:end] = out + + # --- SiLU activation --- + conv_out = F.silu(conv_out) + + # --- Split q, k, v --- + q, k, v = conv_out.split([key_dim, key_dim, value_dim], dim=-1) + q = q.view(total_tokens, layer.num_k_heads, layer.head_k_dim) + k = k.view(total_tokens, layer.num_k_heads, layer.head_k_dim) + v = v.view(total_tokens, layer.num_v_heads, layer.head_v_dim) + + # --- GDN gating --- + g, beta = _gdn_gating(a, b, layer.A_log, layer.dt_bias) + + # --- Recurrent computation --- + use_fi, fi_prefill, _ = _get_flashinfer_gdn() + use_fi_extend = use_fi and fi_prefill is not None and mixed_qkv.is_cuda + + if not self._extend_backend_logged: + logger.info( + "GDNAttnBackend: [extend] using backend=%s", + "flashinfer" if use_fi_extend else "pytorch", + ) + self._extend_backend_logged = True + + if use_fi_extend: + # Gather initial states for this batch + init_state = recurrent_buf[cache_indices].to(torch.float32) + # [batch_size, num_v_heads, head_v_dim, head_k_dim] + + alpha = torch.exp(g.to(torch.float32)) + beta_f32 = beta.to(torch.float32) + + # FlashInfer's use_qk_l2norm_in_kernel is silently ignored — + # the flag is declared in the Python wrapper but never forwarded + # to the CUDA kernel. Pre-normalize q and k here, matching + # sglang's approach (l2norm_fwd before calling with False). 
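+            # L2-normalise q/k along head_dim; the +1e-6 keeps zero-norm rows finite.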
+ q_fi = q / (q.norm(dim=-1, keepdim=True) + 1e-6) + k_fi = k / (k.norm(dim=-1, keepdim=True) + 1e-6) + + output, final_state = fi_prefill( + q=q_fi.contiguous(), + k=k_fi.contiguous(), + v=v.contiguous(), + g=alpha, + beta=beta_f32, + initial_state=init_state, + output_final_state=True, + cu_seqlens=cu_seqlens, + use_qk_l2norm_in_kernel=False, + ) + + # Scatter final states back to pool + recurrent_buf[cache_indices] = final_state.to(recurrent_buf.dtype) + else: + # PyTorch fallback: per-request sequential scan + output = self._extend_pytorch_fallback( + q, k, v, g, beta, recurrent_buf, cache_indices, cu_seqlens, layer + ) + + return output.reshape(total_tokens, value_dim) + + def _extend_pytorch_fallback( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + recurrent_buf: torch.Tensor, + cache_indices: torch.Tensor, + cu_seqlens: torch.Tensor, + layer: "RadixLinearAttention", + ) -> torch.Tensor: + """Pure PyTorch extend fallback for GDN with delta rule and L2 norm.""" + total_tokens = q.shape[0] + num_v_heads = layer.num_v_heads + num_k_heads = layer.num_k_heads + head_v_dim = layer.head_v_dim + batch_size = cache_indices.shape[0] + + # All computation in float32 + orig_dtype = q.dtype + q = q.float() + k = k.float() + v = v.float() + + # L2 normalize q and k per-head + q = q / (q.norm(dim=-1, keepdim=True) + 1e-6) + k = k / (k.norm(dim=-1, keepdim=True) + 1e-6) + + # GQA expansion + if num_k_heads != num_v_heads: + repeats = num_v_heads // num_k_heads + q = q.repeat_interleave(repeats, dim=1) + k = k.repeat_interleave(repeats, dim=1) + + output = torch.zeros( + total_tokens, num_v_heads, head_v_dim, + device=q.device, dtype=torch.float32, + ) + + for i in range(batch_size): + start = int(cu_seqlens[i].item()) + end = int(cu_seqlens[i + 1].item()) + seq_len = end - start + if seq_len == 0: + continue + + idx = cache_indices[i] + q_seq = q[start:end] + k_seq = k[start:end] + v_seq = v[start:end] + g_seq = g[start:end] + beta_seq = beta[start:end] + + decay = torch.exp(g_seq.float()) # [seq_len, H] + beta_f = beta_seq.float() # [seq_len, H] + state = recurrent_buf[idx].clone() # [H, V, K] float32 + + seq_outputs = [] + for t in range(seq_len): + # Decay + state = state * decay[t].unsqueeze(-1).unsqueeze(-1) + + k_t = k_seq[t] # [H, K] + v_t = v_seq[t] # [H, V] + b_t = beta_f[t] # [H] + q_t = q_seq[t] # [H, K] + + # Delta rule: v_delta = v - state @ k + v_delta = v_t - torch.bmm(state, k_t.unsqueeze(-1)).squeeze(-1) + v_delta = v_delta * b_t.unsqueeze(-1) + + # State update + state = state + v_delta.unsqueeze(-1) * k_t.unsqueeze(-2) + + # Output + o_t = torch.bmm(state, q_t.unsqueeze(-1)).squeeze(-1) + seq_outputs.append(o_t) + + recurrent_buf[idx] = state + output[start:end] = torch.stack(seq_outputs, dim=0) + + return output.to(orig_dtype) + + # ------------------------------------------------------------------ + # Dispatch entry point + # ------------------------------------------------------------------ + + def forward_gdn( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Route to decode or extend based on forward mode.""" + if forward_batch.forward_mode.is_decode(): + return self.forward_decode(layer, forward_batch, mixed_qkv, a, b) + else: + return self.forward_extend(layer, forward_batch, mixed_qkv, a, b) diff --git a/pymllm/layers/attention/hybrid_backend.py b/pymllm/layers/attention/hybrid_backend.py new file 
mode 100644 index 000000000..a5628259e --- /dev/null +++ b/pymllm/layers/attention/hybrid_backend.py @@ -0,0 +1,184 @@ +"""Hybrid attention backend -- FlashInfer + GDN for hybrid architectures. + +Wraps a :class:`FlashInferAttnBackend` (for full-attention layers) and a +:class:`GDNAttnBackend` (for GDN linear-attention layers). Dispatches +based on layer type: + +* ``RadixAttention`` calls → delegated to ``full_attn_backend`` +* ``RadixLinearAttention`` calls (via ``forward_gdn``) → delegated to ``gdn_backend`` + +CUDA-graph compatible: delegates all graph lifecycle methods to both +sub-backends. +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Optional, Set + +import torch + +from pymllm.layers.attention.attention_backend import AttentionBackend + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch, ForwardMode + from pymllm.layers.attention.flashinfer_backend import FlashInferAttnBackend + from pymllm.layers.attention.gdn_backend import GDNAttnBackend + from pymllm.layers.attention.radix_attention import RadixAttention + from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention + +logger = logging.getLogger(__name__) + + +class HybridAttnBackend(AttentionBackend): + """Composite attention backend for hybrid full-attention + GDN models. + + Parameters + ---------- + full_attn_backend + FlashInfer backend for standard transformer attention layers. + gdn_backend + GDN backend for linear-attention layers. + full_attn_layer_ids + Set of global layer IDs that use full attention (for logging). + """ + + def __init__( + self, + full_attn_backend: "FlashInferAttnBackend", + gdn_backend: "GDNAttnBackend", + full_attn_layer_ids: Set[int], + ): + self.full_attn_backend = full_attn_backend + self.gdn_backend = gdn_backend + self.full_attn_layer_ids = full_attn_layer_ids + + logger.info( + "HybridAttnBackend created: %d full-attn layers, " + "%d GDN layers", + len(full_attn_layer_ids), + gdn_backend.gdn_pool.num_gdn_layers, + ) + + # ------------------------------------------------------------------ + # Core interface: init_forward_metadata + # ------------------------------------------------------------------ + + def init_forward_metadata(self, forward_batch: "ForwardBatch") -> None: + """Initialize metadata for both sub-backends.""" + self.full_attn_backend.init_forward_metadata(forward_batch) + self.gdn_backend.init_forward_metadata(forward_batch) + + # ------------------------------------------------------------------ + # Full attention: forward_decode / forward_extend + # ------------------------------------------------------------------ + + def forward_decode( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Delegate full-attention decode to FlashInfer backend.""" + return self.full_attn_backend.forward_decode( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + + def forward_extend( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + layer: "RadixAttention", + forward_batch: "ForwardBatch", + save_kv_cache: bool = True, + **kwargs, + ) -> torch.Tensor: + """Delegate full-attention extend to FlashInfer backend.""" + return self.full_attn_backend.forward_extend( + q, k, v, layer, forward_batch, save_kv_cache=save_kv_cache, **kwargs + ) + + # 
------------------------------------------------------------------ + # GDN linear attention: forward_gdn + # ------------------------------------------------------------------ + + def forward_gdn( + self, + layer: "RadixLinearAttention", + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Delegate GDN computation to the GDN backend.""" + return self.gdn_backend.forward_gdn( + layer=layer, + forward_batch=forward_batch, + mixed_qkv=mixed_qkv, + a=a, + b=b, + ) + + # ------------------------------------------------------------------ + # CUDA-graph interface: delegate to both sub-backends + # ------------------------------------------------------------------ + + def get_cuda_graph_seq_len_fill_value(self) -> int: + """Delegate to the full-attention backend.""" + return self.full_attn_backend.get_cuda_graph_seq_len_fill_value() + + def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int) -> None: + """Allocate CUDA-graph state for both sub-backends.""" + self.full_attn_backend.init_cuda_graph_state(max_bs, max_num_tokens) + self.gdn_backend.init_cuda_graph_state(max_bs, max_num_tokens) + + def init_forward_metadata_capture_cuda_graph( + self, + bs: int, + num_tokens: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + forward_mode: "ForwardMode", + ) -> None: + """Set up metadata for CUDA-graph capture in both sub-backends.""" + self.full_attn_backend.init_forward_metadata_capture_cuda_graph( + bs=bs, + num_tokens=num_tokens, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + forward_mode=forward_mode, + ) + self.gdn_backend.init_forward_metadata_capture_cuda_graph( + bs=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + ) + + def init_forward_metadata_replay_cuda_graph( + self, + bs: int, + req_pool_indices: torch.Tensor, + seq_lens: torch.Tensor, + seq_lens_sum: int, + forward_mode: "ForwardMode", + seq_lens_cpu: Optional[torch.Tensor], + ) -> None: + """Update metadata for CUDA-graph replay in both sub-backends.""" + self.full_attn_backend.init_forward_metadata_replay_cuda_graph( + bs=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + seq_lens_sum=seq_lens_sum, + forward_mode=forward_mode, + seq_lens_cpu=seq_lens_cpu, + ) + self.gdn_backend.init_forward_metadata_replay_cuda_graph( + bs=bs, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens, + ) diff --git a/pymllm/layers/attention/radix_linear_attention.py b/pymllm/layers/attention/radix_linear_attention.py new file mode 100644 index 000000000..01993163d --- /dev/null +++ b/pymllm/layers/attention/radix_linear_attention.py @@ -0,0 +1,116 @@ +"""RadixLinearAttention -- GDN linear-attention layer for hybrid models. + +Analogous to :class:`RadixAttention` but for GDN (Gated Delta Net) layers. +Stores per-layer GDN parameters and delegates computation to the +:meth:`AttentionBackend.forward_gdn` method on the current +:class:`~pymllm.engine.forward_batch.ForwardBatch`. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import torch +from torch import nn + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch + + +class RadixLinearAttention(nn.Module): + """GDN linear-attention layer that delegates to the attention backend. + + Each GDN layer in a pymllm model creates one ``RadixLinearAttention`` + with a unique ``layer_id`` and ``gdn_layer_idx``. 
During forward, it + calls ``forward_batch.attn_backend.forward_gdn(...)`` which routes to + the appropriate GDN backend implementation. + + Parameters + ---------- + layer_id : int + Global zero-based layer index within the model. + gdn_layer_idx : int + Sequential zero-based index among GDN layers only (not global). + Used to index into :class:`~pymllm.mem_cache.memory_pool.GDNPool`. + num_k_heads : int + Number of key heads. + num_v_heads : int + Number of value heads. + head_k_dim : int + Per-head key dimension. + head_v_dim : int + Per-head value dimension. + conv_weight : nn.Parameter + Reference to the GDNConv1d weight parameter. + A_log : nn.Parameter + Log-space decay parameter. + dt_bias : nn.Parameter + Bias for the decay gate. + """ + + def __init__( + self, + layer_id: int, + gdn_layer_idx: int, + num_k_heads: int, + num_v_heads: int, + head_k_dim: int, + head_v_dim: int, + conv_weight: nn.Parameter, + A_log: nn.Parameter, + dt_bias: nn.Parameter, + ): + super().__init__() + self.layer_id = layer_id + self.gdn_layer_idx = gdn_layer_idx + self.num_k_heads = num_k_heads + self.num_v_heads = num_v_heads + self.head_k_dim = head_k_dim + self.head_v_dim = head_v_dim + # Store references to model parameters (not copies) + self.conv_weight = conv_weight + self.A_log = A_log + self.dt_bias = dt_bias + + def forward( + self, + forward_batch: "ForwardBatch", + mixed_qkv: torch.Tensor, + a: torch.Tensor, + b: torch.Tensor, + ) -> torch.Tensor: + """Delegate GDN computation to the attention backend. + + Parameters + ---------- + forward_batch + Batch metadata with ``attn_backend`` attached. + mixed_qkv + Concatenated Q/K/V projection output before conv1d. + a + Decay gate input, shape ``[num_tokens, num_v_heads]``. + b + Update gate input, shape ``[num_tokens, num_v_heads]``. + + Returns + ------- + torch.Tensor + GDN attention output, shape ``[num_tokens, num_v_heads * head_v_dim]``. + """ + return forward_batch.attn_backend.forward_gdn( + layer=self, + forward_batch=forward_batch, + mixed_qkv=mixed_qkv, + a=a, + b=b, + ) + + def extra_repr(self) -> str: + return ( + f"layer_id={self.layer_id}, " + f"gdn_layer_idx={self.gdn_layer_idx}, " + f"k_heads={self.num_k_heads}, " + f"v_heads={self.num_v_heads}, " + f"k_dim={self.head_k_dim}, " + f"v_dim={self.head_v_dim}" + ) diff --git a/pymllm/layers/gated_delta_net.py b/pymllm/layers/gated_delta_net.py new file mode 100644 index 000000000..3753734d9 --- /dev/null +++ b/pymllm/layers/gated_delta_net.py @@ -0,0 +1,168 @@ +"""Gated Delta Network (GDN) linear attention for Qwen3.5. + +This implements the linear attention mechanism used in Qwen3.5's hybrid +architecture. GDN alternates with standard full-attention layers. + +Core formulation (decode, per-head): + g_t = -exp(A_log) * softplus(a_t + dt_bias) + beta_t = sigmoid(b_t) + state_t = exp(g_t) * state_{t-1} + beta_t * (k_t outer v_t) + output_t = (q_t @ state_t) + +State is externalized into a :class:`~pymllm.mem_cache.memory_pool.GDNPool` +and computation is delegated to the attention backend via +:class:`~pymllm.layers.attention.radix_linear_attention.RadixLinearAttention`. 
+""" + +from __future__ import annotations + +import logging +from typing import Any, Optional + +import torch +import torch.nn as nn + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.linear import Linear +from pymllm.layers.utils import set_weight_attrs + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Conv1d weight holder +# --------------------------------------------------------------------------- + + +class GDNConv1d(nn.Module): + """Causal 1D convolution weight holder for GDN sequence mixing. + + The actual convolution computation is performed by the GDN backend + using pooled conv states. This module only holds the learnable weight. + """ + + def __init__(self, channels: int, kernel_size: int): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.weight = nn.Parameter(torch.empty(channels, kernel_size)) + + +# --------------------------------------------------------------------------- +# GatedDeltaNet — main GDN layer +# --------------------------------------------------------------------------- + + +class GatedDeltaNet(MllmBaseLayer): + """Gated Delta Network linear attention layer for Qwen3.5. + + State is externalized into a GDNPool and computation is delegated to + the attention backend via RadixLinearAttention. + + Parameters + ---------- + hidden_size : int + Model hidden dimension. + num_k_heads : int + Number of key heads. + num_v_heads : int + Number of value heads. + head_k_dim : int + Per-head key dimension. + head_v_dim : int + Per-head value dimension. + conv_kernel_size : int + Causal conv1d kernel width. + layer_id : int + Global layer index. + gdn_layer_idx : int + Sequential index among GDN layers (0-based). + rms_norm_eps : float + Epsilon for gated RMS normalization. 
+ """ + + def __init__( + self, + hidden_size: int, + num_k_heads: int = 16, + num_v_heads: int = 32, + head_k_dim: int = 128, + head_v_dim: int = 128, + conv_kernel_size: int = 4, + layer_id: int = 0, + gdn_layer_idx: int = 0, + rms_norm_eps: float = 1e-6, + ): + super().__init__() + self.hidden_size = hidden_size + self.num_k_heads = num_k_heads + self.num_v_heads = num_v_heads + self.head_k_dim = head_k_dim + self.head_v_dim = head_v_dim + self.key_dim = head_k_dim * num_k_heads + self.value_dim = head_v_dim * num_v_heads + self.conv_kernel_size = conv_kernel_size + self.layer_id = layer_id + self.gdn_layer_idx = gdn_layer_idx + + # Input projections + self.in_proj_qkv = Linear(hidden_size, self.key_dim * 2 + self.value_dim, bias=False) + self.in_proj_z = Linear(hidden_size, self.value_dim, bias=False) + self.in_proj_a = Linear(hidden_size, num_v_heads, bias=False) + self.in_proj_b = Linear(hidden_size, num_v_heads, bias=False) + + # Causal convolution (weight only — computation is in the backend) + self.conv1d = GDNConv1d(self.key_dim * 2 + self.value_dim, conv_kernel_size) + + # State parameters (must stay float32 for numerical stability) + self.A_log = nn.Parameter(torch.empty(num_v_heads, dtype=torch.float32)) + self.dt_bias = nn.Parameter(torch.ones(num_v_heads, dtype=torch.float32)) + set_weight_attrs(self.A_log, {"weight_loader": self.weight_loader}) + set_weight_attrs(self.dt_bias, {"weight_loader": self.weight_loader}) + + # Gated RMSNorm (mllm-kernel accelerated) + from pymllm.layers.rms_norm_gated import RMSNormGated + self.norm = RMSNormGated(head_v_dim, eps=rms_norm_eps, norm_before_gate=True) + + # Output projection + self.out_proj = Linear(self.value_dim, hidden_size, bias=False) + + # RadixLinearAttention — delegates to the attention backend + from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention + self.attn = RadixLinearAttention( + layer_id=layer_id, + gdn_layer_idx=gdn_layer_idx, + num_k_heads=num_k_heads, + num_v_heads=num_v_heads, + head_k_dim=head_k_dim, + head_v_dim=head_v_dim, + conv_weight=self.conv1d.weight, + A_log=self.A_log, + dt_bias=self.dt_bias, + ) + + def forward( + self, hidden_states: torch.Tensor, forward_batch: Any = None, + ) -> torch.Tensor: + seq_len, _ = hidden_states.shape + + # Input projections + mixed_qkv = self.in_proj_qkv(hidden_states) + z = self.in_proj_z(hidden_states) + a = self.in_proj_a(hidden_states) + b = self.in_proj_b(hidden_states) + + # Delegate to backend via RadixLinearAttention + # The backend handles: conv1d, SiLU, split, gating, recurrent update + attn_out = self.attn(forward_batch, mixed_qkv, a, b) + + # Gated norm + output projection + attn_out = attn_out.view(seq_len, self.num_v_heads, self.head_v_dim) + z = z.view(seq_len, self.num_v_heads, self.head_v_dim) + + attn_flat = attn_out.reshape(-1, self.head_v_dim) + z_flat = z.reshape(-1, self.head_v_dim) + normed = self.norm(attn_flat, z_flat) + normed = normed.view(seq_len, self.num_v_heads, self.head_v_dim) + normed = normed.reshape(seq_len, self.value_dim) + return self.out_proj(normed) diff --git a/pymllm/layers/rms_norm.py b/pymllm/layers/rms_norm.py index b55a0ea6c..b20b36f30 100644 --- a/pymllm/layers/rms_norm.py +++ b/pymllm/layers/rms_norm.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Optional, Tuple, Union + import torch import flashinfer from torch.nn import Parameter @@ -19,7 +21,15 @@ def __init__(self, hidden_size: int, eps: float = 1e-6): self.weight = Parameter(torch.empty(hidden_size)) 
set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if residual is not None: + flashinfer.norm.fused_add_rmsnorm(x, residual, self.weight.data, self.eps) + return x, residual + if x.shape[-1] != self.hidden_size: raise ValueError( f"Expected last dim == hidden_size ({self.hidden_size}), " @@ -47,7 +57,17 @@ def __init__(self, hidden_size: int, eps: float = 1e-6): self.weight = Parameter(torch.empty(hidden_size)) set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if residual is not None: + flashinfer.norm.gemma_fused_add_rmsnorm( + x, residual, self.weight.data, self.eps + ) + return x, residual + if x.shape[-1] != self.hidden_size: raise ValueError( f"Expected last dim == hidden_size ({self.hidden_size}), " diff --git a/pymllm/layers/rms_norm_gated.py b/pymllm/layers/rms_norm_gated.py new file mode 100644 index 000000000..caec9b88d --- /dev/null +++ b/pymllm/layers/rms_norm_gated.py @@ -0,0 +1,154 @@ +"""Gated RMSNorm layer for Qwen3.5 GDN attention. + +Computes ``rmsnorm(x, weight, eps) * silu(z)`` using a fused CUDA kernel +from mllm-kernel. Falls back to PyTorch when the kernel is unavailable. +""" + +from __future__ import annotations + +import logging +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter + +from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.utils import set_weight_attrs + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Try to load the mllm-kernel fused CUDA implementation +# --------------------------------------------------------------------------- +_HAS_MLLM_KERNEL_CUDA = False +try: + from mllm_kernel.cuda.jit.rms_norm_gated import ( + rms_norm_gated as _mllm_rms_norm_gated, + ) + + _HAS_MLLM_KERNEL_CUDA = True +except Exception: + _mllm_rms_norm_gated = None + + +# --------------------------------------------------------------------------- +# Pure-PyTorch fallback +# --------------------------------------------------------------------------- + + +def _rms_norm_gated_pytorch( + x: torch.Tensor, + weight: torch.Tensor, + z: Optional[torch.Tensor] = None, + eps: float = 1e-6, + norm_before_gate: bool = True, +) -> torch.Tensor: + """Pure-PyTorch reference implementation.""" + dtype = x.dtype + x_fp32 = x.float() + w_fp32 = weight.float() + z_fp32 = z.float() if z is not None else None + + if z_fp32 is not None and not norm_before_gate: + x_fp32 = x_fp32 * F.silu(z_fp32) + + variance = x_fp32.pow(2).mean(dim=-1, keepdim=True) + rstd = torch.rsqrt(variance + eps) + out = x_fp32 * rstd * w_fp32 + + if z_fp32 is not None and norm_before_gate: + out = out * F.silu(z_fp32) + + return out.to(dtype) + + +# --------------------------------------------------------------------------- +# Unified dispatch +# --------------------------------------------------------------------------- + + +def rms_norm_gated( + x: torch.Tensor, + weight: torch.Tensor, + z: Optional[torch.Tensor] = None, + eps: float = 1e-6, + norm_before_gate: bool = True, +) -> torch.Tensor: + """Compute (optionally 
gated) RMS normalization. + + Uses the fused mllm-kernel CUDA implementation when available, + otherwise falls back to a pure-PyTorch implementation. + """ + if _HAS_MLLM_KERNEL_CUDA and x.is_cuda: + return _mllm_rms_norm_gated(x, weight, z=z, eps=eps) + return _rms_norm_gated_pytorch( + x, weight, z=z, eps=eps, norm_before_gate=norm_before_gate, + ) + + +# --------------------------------------------------------------------------- +# nn.Module wrapper +# --------------------------------------------------------------------------- + + +class RMSNormGated(MllmBaseLayer): + """Gated RMS Normalization layer for Qwen3.5 GDN attention. + + Computes:: + + output = rmsnorm(x, weight) * silu(z) # z is not None + output = rmsnorm(x, weight) # z is None + + Uses a fused CUDA kernel from mllm-kernel for maximum throughput. + + Parameters + ---------- + hidden_size : int + Dimensionality of the input (and weight vector). + eps : float + Small constant for numerical stability. + norm_before_gate : bool + If ``True`` (default): ``rmsnorm(x) * silu(z)``. + If ``False``: ``rmsnorm(x * silu(z))``. + """ + + def __init__( + self, + hidden_size: int, + eps: float = 1e-6, + group_size: Optional[int] = None, + norm_before_gate: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + super().__init__() + self.hidden_size = hidden_size + self.eps = eps + self.norm_before_gate = norm_before_gate + + factory_kwargs = {} + if device is not None: + factory_kwargs["device"] = device + if dtype is not None: + factory_kwargs["dtype"] = dtype + + self.weight = Parameter(torch.ones(hidden_size, **factory_kwargs)) + set_weight_attrs(self.weight, {"weight_loader": self.weight_loader}) + + def forward( + self, + x: torch.Tensor, + z: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return rms_norm_gated( + x, self.weight, z=z, eps=self.eps, + norm_before_gate=self.norm_before_gate, + ) + + def extra_repr(self) -> str: + return ( + f"hidden_size={self.hidden_size}, eps={self.eps}, " + f"norm_before_gate={self.norm_before_gate}" + ) diff --git a/pymllm/layers/rope.py b/pymllm/layers/rope.py index 045774e93..94f89b20d 100644 --- a/pymllm/layers/rope.py +++ b/pymllm/layers/rope.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Optional, Tuple +from typing import List, Optional, Tuple import torch import flashinfer @@ -44,7 +44,10 @@ def apply_rope( """ if inplace: flashinfer.rope.apply_rope_inplace( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -53,7 +56,10 @@ def apply_rope( return None return flashinfer.rope.apply_rope( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -102,7 +108,10 @@ def apply_llama31_rope( """ if inplace: flashinfer.rope.apply_llama31_rope_inplace( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -114,7 +123,10 @@ def apply_llama31_rope( return None return flashinfer.rope.apply_llama31_rope( - q, k, indptr, offsets, + q, + k, + indptr, + offsets, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -156,7 +168,9 @@ def apply_rope_pos_ids( """ if inplace: flashinfer.rope.apply_rope_pos_ids_inplace( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -165,7 +179,9 @@ def apply_rope_pos_ids( return None return 
flashinfer.rope.apply_rope_pos_ids( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -208,7 +224,9 @@ def apply_llama31_rope_pos_ids( """ if inplace: flashinfer.rope.apply_llama31_rope_pos_ids_inplace( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -220,7 +238,9 @@ def apply_llama31_rope_pos_ids( return None return flashinfer.rope.apply_llama31_rope_pos_ids( - q, k, pos_ids, + q, + k, + pos_ids, rotary_dim=rotary_dim, interleave=interleave, rope_scale=rope_scale, @@ -265,12 +285,117 @@ def apply_rope_with_cos_sin_cache( """ if inplace: flashinfer.rope.apply_rope_with_cos_sin_cache_inplace( - positions, query, key, head_size, cos_sin_cache, + positions, + query, + key, + head_size, + cos_sin_cache, is_neox=is_neox, ) return None return flashinfer.rope.apply_rope_with_cos_sin_cache( - positions, query, key, head_size, cos_sin_cache, + positions, + query, + key, + head_size, + cos_sin_cache, is_neox=is_neox, ) + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotate the second half of the last dimension into the first half (neox-style).""" + half = x.shape[-1] // 2 + return torch.cat((-x[..., half:], x[..., :half]), dim=-1) + + +def apply_mrope( + q: torch.Tensor, + k: torch.Tensor, + positions: torch.Tensor, + cos_sin_cache: torch.Tensor, + mrope_section: List[int], + mrope_interleaved: bool = True, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Apply multi-dimensional rotary position embedding (M-RoPE). + + Used by Qwen3-VL which assigns independent (t, h, w) position indices to + each token. For text tokens all three indices are the same sequential + value; for image tokens they follow the spatial grid layout. + + Args: + q: Query tensor, shape ``(T, num_q_heads, head_dim)``. + k: Key tensor, shape ``(T, num_kv_heads, head_dim)``. + positions: 3-D position IDs, shape ``(3, T)`` — rows are + ``(temporal, height, width)`` position indices. + cos_sin_cache: Precomputed cache, shape ``(max_pos, head_dim)``. + The first ``head_dim // 2`` columns are cosine values and the + remaining columns are sine values, each for frequencies + ``0, 1, ..., head_dim // 2 - 1``. + mrope_section: Three integers ``[s_t, s_h, s_w]`` that partition + the ``head_dim // 2`` rotary frequency dimensions among the + temporal, height, and width components. + ``sum(mrope_section)`` must equal ``head_dim // 2``. + mrope_interleaved: When ``True`` (Qwen3-VL default), uses the + interleaved layout where frequency dimensions are cycled + ``(t, h, w, t, h, w, ...)`` rather than grouped consecutively. + + Returns: + ``(q_rope, k_rope)`` with the same shapes as the inputs. + """ + rotary_dim = cos_sin_cache.shape[-1] # = head_dim + half_dim = rotary_dim // 2 + + # Look up cos/sin for each of the 3 position dimensions. + # positions: [3, T] => cos_sin: [3, T, rotary_dim] + cos_sin = cos_sin_cache[positions] + cos = cos_sin[..., :half_dim] # [3, T, half_dim] + sin = cos_sin[..., half_dim:] # [3, T, half_dim] + + if mrope_interleaved: + # Interleaved layout (Qwen3-VL): within the first + # mrope_section[1]*3 frequency dims, indices cycle (t, h, w). + # Remaining dims (indices >= span) all use the temporal position. + # Matches SGLang's apply_interleaved_rope. 
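+        # Illustrative example: with mrope_section=[2, 1, 1] (half_dim = 4),
+        # span_h = span_w = 3, so frequency dims 0..3 draw their positions
+        # from (t, h, w, t) respectively.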
+ cos_merged = cos[0].clone() # start with temporal; shape [T, half_dim] + sin_merged = sin[0].clone() + span_h = mrope_section[1] * 3 + span_w = mrope_section[2] * 3 + cos_merged[..., 1:span_h:3] = cos[1, ..., 1:span_h:3] + cos_merged[..., 2:span_w:3] = cos[2, ..., 2:span_w:3] + sin_merged[..., 1:span_h:3] = sin[1, ..., 1:span_h:3] + sin_merged[..., 2:span_w:3] = sin[2, ..., 2:span_w:3] + else: + # Non-interleaved (Qwen2-VL style): consecutive frequency sections. + cos_sects = cos.split(mrope_section, dim=-1) # list of [T, s_i] + sin_sects = sin.split(mrope_section, dim=-1) + # Section i picks its cos/sin from positions[i] + cos_merged = torch.cat( + [cos_sects[i][i] for i in range(3)], dim=-1 + ) # [T, half_dim] + sin_merged = torch.cat( + [sin_sects[i][i] for i in range(3)], dim=-1 + ) # [T, half_dim] + + # Expand to full rotary_dim for the neox-style rotation formula: + # q_rot = q * cos_full + rotate_half(q) * sin_full + cos_full = cos_merged.repeat(1, 2) # [T, rotary_dim] + sin_full = sin_merged.repeat(1, 2) # [T, rotary_dim] + cos_4d = cos_full.unsqueeze(1) # [T, 1, rotary_dim] -- broadcasts over heads + sin_4d = sin_full.unsqueeze(1) + + q_rot = q[..., :rotary_dim] * cos_4d + _rotate_half(q[..., :rotary_dim]) * sin_4d + k_rot = k[..., :rotary_dim] * cos_4d + _rotate_half(k[..., :rotary_dim]) * sin_4d + + q_out = ( + torch.cat([q_rot, q[..., rotary_dim:]], dim=-1) + if rotary_dim < q.shape[-1] + else q_rot + ) + k_out = ( + torch.cat([k_rot, k[..., rotary_dim:]], dim=-1) + if rotary_dim < k.shape[-1] + else k_rot + ) + return q_out, k_out diff --git a/pymllm/layers/sampling.py b/pymllm/layers/sampling.py index ff84879cf..26c769ffd 100644 --- a/pymllm/layers/sampling.py +++ b/pymllm/layers/sampling.py @@ -74,6 +74,15 @@ def softmax( torch.Tensor Probabilities with the same shape as *logits*. """ + # Clamp temperature to avoid division by zero (temperature=0 → greedy). + # Replace 0 with 1 here; the caller (ModelRunner.sample) handles + # temperature=0 via argmax before reaching this path. + if temperature is not None: + if isinstance(temperature, torch.Tensor): + temperature = temperature.clamp(min=1e-6) + elif temperature < 1e-6: + temperature = 1.0 # effectively no scaling; caller uses argmax + if _HAS_FLASHINFER: return _fi_sampling.softmax( logits, temperature=temperature, enable_pdl=enable_pdl diff --git a/pymllm/mem_cache/memory_pool.py b/pymllm/mem_cache/memory_pool.py index f9c176a94..9c8ab2a99 100644 --- a/pymllm/mem_cache/memory_pool.py +++ b/pymllm/mem_cache/memory_pool.py @@ -83,6 +83,10 @@ def __init__( self.device = torch.device(device) self.dtype = dtype + # pin_memory only applies to CPU tensors + if self.device.type != "cpu": + pin_memory = False + buf_len = size + 1 # slot 0 is padding if buf_len % 8 != 0: @@ -472,6 +476,161 @@ def make_full_attention_net_mem_pool( return pool, allocator +class GDNPool: + """Pre-allocated memory pool for GDN recurrent and conv states. + + Indexed by ``req_pool_idx`` (same index space as :class:`ReqToTokenPool`). + Slot 0 is reserved as a padding / dummy slot and is never allocated. + + Layout:: + + recurrent_state[gdn_layer_idx, slot, num_v_heads, head_k_dim, head_v_dim] + float32 (FlashInfer requirement) + conv_state[gdn_layer_idx, slot, conv_dim, kernel_size - 1] + model dtype (bfloat16 / float16) + + Parameters + ---------- + max_reqs : int + Maximum number of concurrent requests (matches ``ReqToTokenPool.size``). + num_gdn_layers : int + Number of GDN (linear attention) layers in the model. 
+ num_v_heads : int + Number of value heads per GDN layer. + head_k_dim : int + Per-head key dimension. + head_v_dim : int + Per-head value dimension. + conv_dim : int + Total convolution input dimension (``key_dim * 2 + value_dim``). + conv_kernel_size : int + Causal conv1d kernel width (state stores ``kernel_size - 1`` columns). + device : str | torch.device + Target device. + dtype : torch.dtype + Storage dtype for conv_state (recurrent_state is always float32). + """ + + def __init__( + self, + max_reqs: int, + num_gdn_layers: int, + num_v_heads: int, + head_k_dim: int, + head_v_dim: int, + conv_dim: int, + conv_kernel_size: int, + device: Union[str, torch.device] = "cuda", + dtype: torch.dtype = torch.bfloat16, + max_track_slots: int = 0, + ): + self.max_reqs = max_reqs + self.num_gdn_layers = num_gdn_layers + self.num_v_heads = num_v_heads + self.head_k_dim = head_k_dim + self.head_v_dim = head_v_dim + self.conv_dim = conv_dim + self.conv_kernel_size = conv_kernel_size + self.device = torch.device(device) + self.dtype = dtype + self.max_track_slots = max_track_slots + + # Track slots live after the working slots: indices + # [max_reqs + 1, max_reqs + 1 + max_track_slots) + pool_size = max_reqs + 1 + max_track_slots # slot 0 is padding + + # Recurrent state: always float32 (FlashInfer requirement) + # Shape: [num_gdn_layers, pool_size, num_v_heads, head_v_dim, head_k_dim] + # Note: FlashInfer uses (V, K) layout for the state matrix + self.recurrent_state = torch.zeros( + (num_gdn_layers, pool_size, num_v_heads, head_v_dim, head_k_dim), + dtype=torch.float32, + device=self.device, + ) + + # Conv state: model dtype + # Shape: [num_gdn_layers, pool_size, conv_dim, kernel_size - 1] + self.conv_state = torch.zeros( + (num_gdn_layers, pool_size, conv_dim, conv_kernel_size - 1), + dtype=dtype, + device=self.device, + ) + + # Track-slot free list (indices into the pool starting after working slots) + self._track_slot_base = max_reqs + 1 + self._free_track_slots: List[int] = list( + range(self._track_slot_base, self._track_slot_base + max_track_slots) + ) + + logger.info( + "GDNPool allocated: %d GDN layers, %d working + %d track slots, " + "v_heads=%d, k_dim=%d, v_dim=%d, conv_dim=%d, kernel=%d, %.2f GB", + num_gdn_layers, + max_reqs, + max_track_slots, + num_v_heads, + head_k_dim, + head_v_dim, + conv_dim, + conv_kernel_size, + self.mem_bytes() / (1 << 30), + ) + + def get_layer_state( + self, gdn_layer_idx: int + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Return ``(recurrent_state, conv_state)`` for a specific GDN layer. + + Both are views into the pool tensors with shape: + - recurrent: ``[pool_size, num_v_heads, head_v_dim, head_k_dim]`` + - conv: ``[pool_size, conv_dim, kernel_size - 1]`` + """ + return ( + self.recurrent_state[gdn_layer_idx], + self.conv_state[gdn_layer_idx], + ) + + def reset_states(self, req_pool_indices: torch.Tensor) -> None: + """Zero-init GDN states for the given request pool indices. + + Called when new requests are allocated to ensure clean state. + """ + if req_pool_indices.numel() == 0: + return + # Zero both recurrent and conv states for all GDN layers + self.recurrent_state[:, req_pool_indices] = 0 + self.conv_state[:, req_pool_indices] = 0 + + # ------------------------------------------------------------------ + # Track-slot management (for prefix cache GDN state snapshots) + # ------------------------------------------------------------------ + + def alloc_track_slot(self) -> Optional[int]: + """Allocate a single track slot index. 
Returns ``None`` if exhausted.""" + if not self._free_track_slots: + return None + return self._free_track_slots.pop() + + def free_track_slot(self, slot: int) -> None: + """Return a track slot to the free list.""" + self._free_track_slots.append(slot) + + def copy_states(self, src_index: int, dst_index: int) -> None: + """Copy recurrent and conv states from *src_index* to *dst_index*. + + Works for any pool indices (working or track slots). + """ + self.recurrent_state[:, dst_index] = self.recurrent_state[:, src_index] + self.conv_state[:, dst_index] = self.conv_state[:, src_index] + + def mem_bytes(self) -> int: + """Total memory consumption in bytes.""" + return ( + self.recurrent_state.nelement() * self.recurrent_state.element_size() + + self.conv_state.nelement() * self.conv_state.element_size() + ) + + def make_req_to_token_pool( max_reqs: int, max_context_len: int, diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py index 997790ea5..441a8c097 100644 --- a/pymllm/mem_cache/radix_cache.py +++ b/pymllm/mem_cache/radix_cache.py @@ -18,7 +18,7 @@ import time from collections import defaultdict from dataclasses import dataclass -from typing import Any, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union import torch @@ -189,6 +189,7 @@ class InsertResult: """Returned by :meth:`RadixCache.insert`.""" prefix_len: int = 0 + last_node: Optional[TreeNode] = None @dataclass @@ -224,11 +225,13 @@ def __init__( sliding_window_size: Optional[int] = None, disable: bool = False, token_to_kv_pool_allocator: Any = None, + on_node_evict: Optional[Callable[[int], None]] = None, ): self.page_size = page_size self.sliding_window_size = sliding_window_size self.disable = disable self.pool = token_to_kv_pool_allocator + self.on_node_evict = on_node_evict if self.pool is not None and hasattr(self.pool, "device"): self.device = self.pool.device @@ -332,9 +335,10 @@ def insert( plen = self._insert_swa( self.root_node, key, value, prev_prefix_len, swa_evicted_seqlen ) + return InsertResult(prefix_len=plen) else: - plen = self._insert_normal(self.root_node, key, value) - return InsertResult(prefix_len=plen) + plen, last_node = self._insert_normal(self.root_node, key, value) + return InsertResult(prefix_len=plen, last_node=last_node) def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: """Evict up to *num_tokens* (full) and *swa_num_tokens* (SWA) tokens. @@ -589,30 +593,38 @@ def _match_swa(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode, int]: return values, best_node, best_count - def _insert_normal(self, node: TreeNode, key: RadixKey, value: torch.Tensor) -> int: + def _insert_normal( + self, node: TreeNode, key: RadixKey, value: torch.Tensor + ) -> Tuple[int, TreeNode]: + """Insert into non-SWA tree. Returns ``(prefix_len, last_node)``.""" now = time.monotonic() node.last_access_time = now if len(key) == 0: - return 0 + return 0, node total_prefix = 0 - while len(key) > 0: - ck = _child_key(key, self.page_size) - if ck not in node.children: - break + ck = _child_key(key, self.page_size) + while len(key) > 0 and ck in node.children: node = node.children[ck] node.last_access_time = now plen = _key_match(node.key, key, self.page_size) - if plen < len(node.key): - self._split_node(node.key, node, plen) total_prefix += plen key = key[plen:] value = value[plen:] + if plen < len(node.key): + # Partial match: split the node. 
``node`` must advance to + # the NEW parent so that any remaining key is added as a + # sibling of the tail, not a child of it. + node = self._split_node(node.key, node, plen) + if len(key) > 0: + ck = _child_key(key, self.page_size) + if len(key) > 0: - self._add_leaf(node, key, value) + new_leaf = self._add_leaf(node, key, value) + node = new_leaf - return total_prefix + return total_prefix, node def _insert_swa( self, @@ -730,6 +742,8 @@ def _delete_leaf(self, node: TreeNode) -> None: self._evictable_size -= len(node.key) if self.supports_swa and not node.swa_tombstone: self._swa_evictable_size -= len(node.key) + if self.on_node_evict is not None: + self.on_node_evict(node.id) def _tombstone_node(self, node: TreeNode) -> None: node.swa_tombstone = True diff --git a/pymllm/models/__init__.py b/pymllm/models/__init__.py index e69de29bb..7751b3091 100644 --- a/pymllm/models/__init__.py +++ b/pymllm/models/__init__.py @@ -0,0 +1,62 @@ +"""Model registry for pymllm. + +Maps HuggingFace ``config.architectures[0]`` strings to pymllm model classes. +Models are imported lazily via ``importlib`` so that heavy dependencies (torch, +numpy, etc.) are only loaded when a model is actually requested. +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Dict, Optional, Tuple, Type + +import torch.nn as nn + +logger = logging.getLogger(__name__) + +# (module_path, class_name) +_MODEL_REGISTRY: Dict[str, Tuple[str, str]] = { + "Qwen3VLForConditionalGeneration": ( + "pymllm.models.qwen3_vl", + "Qwen3VLForConditionalGeneration", + ), + # Qwen3.5 (hybrid attention: full + GDN linear) + "Qwen3_5ForCausalLM": ( + "pymllm.models.qwen3_5", + "Qwen3_5ForCausalLM", + ), + "Qwen3_5ForConditionalGeneration": ( + "pymllm.models.qwen3_5", + "Qwen3_5ForConditionalGeneration", + ), +} + + +def get_model_class(architecture: str) -> Optional[Type[nn.Module]]: + """Look up a pymllm model class by HuggingFace architecture string. + + Returns ``None`` if the architecture is not registered or cannot be + imported. The caller is responsible for raising an appropriate error. + """ + entry = _MODEL_REGISTRY.get(architecture) + if entry is None: + return None + + module_path, class_name = entry + try: + module = importlib.import_module(module_path) + cls = getattr(module, class_name) + logger.info( + "Resolved architecture %r -> %s.%s", architecture, module_path, class_name + ) + return cls + except (ImportError, AttributeError) as exc: + logger.warning( + "Failed to import %s.%s for architecture %r: %s", + module_path, + class_name, + architecture, + exc, + ) + return None diff --git a/pymllm/models/qwen3_5.py b/pymllm/models/qwen3_5.py new file mode 100644 index 000000000..ca4dbe2ea --- /dev/null +++ b/pymllm/models/qwen3_5.py @@ -0,0 +1,530 @@ +"""Inference-only Qwen3.5 model for pymllm. + +Implements the hybrid attention architecture: +- **Full attention layers** (standard transformer with RoPE + output gate) +- **GDN linear attention layers** (Gated Delta Network, O(n) complexity) + +Layers alternate: linear, attention, linear, attention, ... based on +``full_attention_interval`` in the config. + +Supports: +- Dense (non-MoE) variant +- Vision-Language (multimodal) via inheritance from Qwen3VL + +Adapted from sglang's ``qwen3_5.py``. 
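+
+For example, with ``full_attention_interval = 4``, layers 3, 7, 11, ... (0-based)
+use full attention and all remaining layers use GDN linear attention.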
+""" + +from __future__ import annotations + +import logging +import math +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from pymllm.layers.attention.radix_attention import RadixAttention +from pymllm.layers.embedding import VocabParallelEmbedding +from pymllm.layers.gated_delta_net import GatedDeltaNet +from pymllm.layers.linear import Linear +from pymllm.layers.mlp import MLP +from pymllm.layers.rms_norm import GemmaRMSNorm, RMSNorm +from pymllm.layers.rope import apply_rope_pos_ids +from pymllm.layers.utils import set_weight_attrs + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Config helpers +# --------------------------------------------------------------------------- + + +def _get_text_config(config): + """Extract the text sub-config from a multimodal config, or return as-is.""" + return getattr(config, "text_config", config) + + +def _get_layer_types(config) -> List[str]: + """Return per-layer type list: 'attention' or 'linear_attention'.""" + if hasattr(config, "layers_block_type"): + return config.layers_block_type + # Compute from full_attention_interval + interval = getattr(config, "full_attention_interval", 2) + n_layers = config.num_hidden_layers + types = [] + for i in range(n_layers): + if (i + 1) % interval == 0: + types.append("attention") + else: + types.append("linear_attention") + return types + + +# --------------------------------------------------------------------------- +# Full Attention Layer (with output gate + QK norm) +# --------------------------------------------------------------------------- + + +class Qwen3_5FullAttention(nn.Module): + """Standard multi-head attention with RoPE, QK-norm, and optional output gate.""" + + def __init__(self, config, layer_id: int): + super().__init__() + tc = _get_text_config(config) + self.hidden_size = tc.hidden_size + self.num_heads = tc.num_attention_heads + self.num_kv_heads = tc.num_key_value_heads + self.head_dim = getattr(tc, "head_dim", self.hidden_size // self.num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim ** -0.5 + self.layer_id = layer_id + + # Output gate: Qwen3.5 doubles the Q projection and uses half as a + # sigmoid gate on the attention output. 
+ self.attn_output_gate = getattr(tc, "attn_output_gate", True) + + if self.attn_output_gate: + q_proj_size = self.q_size * 2 # Q + gate + else: + q_proj_size = self.q_size + + self.q_proj = Linear(self.hidden_size, q_proj_size, bias=False) + self.k_proj = Linear(self.hidden_size, self.kv_size, bias=False) + self.v_proj = Linear(self.hidden_size, self.kv_size, bias=False) + self.o_proj = Linear(self.q_size, self.hidden_size, bias=False) + + # QK normalization + self.q_norm = GemmaRMSNorm(self.head_dim, eps=tc.rms_norm_eps) + self.k_norm = GemmaRMSNorm(self.head_dim, eps=tc.rms_norm_eps) + + # RoPE config + self.partial_rotary_factor = getattr(tc, "partial_rotary_factor", 1.0) + rope_config = getattr(tc, "rope_parameters", None) or getattr(tc, "rope_scaling", None) or {} + self.rope_theta = rope_config.get("rope_theta", getattr(tc, "rope_theta", 10000.0)) + self.rotary_dim = int(self.head_dim * self.partial_rotary_factor) + + # RadixAttention layer — delegates to the pluggable attention backend + self.attn = RadixAttention( + num_heads=self.num_heads, + head_dim=self.head_dim, + scaling=self.scaling, + num_kv_heads=self.num_kv_heads, + layer_id=layer_id, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + forward_batch: Any, + ) -> torch.Tensor: + seq_len = hidden_states.shape[0] + + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + if self.attn_output_gate: + # Split Q into actual Q and gate + q_gate = q.view(seq_len, self.num_heads, self.head_dim * 2) + q, gate = q_gate.chunk(2, dim=-1) + q = q.reshape(seq_len, -1) + gate = gate.reshape(seq_len, -1) + + # QK norm + q = self.q_norm(q.reshape(-1, self.head_dim)).view(seq_len, -1) + k = self.k_norm(k.reshape(-1, self.head_dim)).view(seq_len, -1) + + # RoPE (inplace; rotary_dim handles partial rotation) + q = q.view(seq_len, self.num_heads, self.head_dim) + k = k.view(seq_len, self.num_kv_heads, self.head_dim) + apply_rope_pos_ids( + q, k, positions, inplace=True, + rotary_dim=self.rotary_dim, rope_theta=self.rope_theta, + ) + q = q.reshape(seq_len, -1) + k = k.reshape(seq_len, -1) + + # Standard attention via RadixAttention → attn_backend + attn_output = self.attn(q, k, v, forward_batch) + + # Output gate + if self.attn_output_gate: + attn_output = attn_output * torch.sigmoid(gate) + + return self.o_proj(attn_output) + + +# --------------------------------------------------------------------------- +# Full Attention Decoder Layer +# --------------------------------------------------------------------------- + + +class Qwen3_5AttentionDecoderLayer(nn.Module): + """Decoder layer with full attention + MLP.""" + + def __init__(self, config, layer_id: int): + super().__init__() + tc = _get_text_config(config) + self.self_attn = Qwen3_5FullAttention(config, layer_id) + self.mlp = MLP( + hidden_size=tc.hidden_size, + intermediate_size=tc.intermediate_size, + activation=tc.hidden_act, + ) + self.input_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) + self.post_attention_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + forward_batch: Any, + ): + # Pre-norm + residual + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.self_attn(positions, hidden_states, 
forward_batch) + + # Post-attention norm + residual + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +# --------------------------------------------------------------------------- +# Linear Attention (GDN) Decoder Layer +# --------------------------------------------------------------------------- + + +class Qwen3_5LinearDecoderLayer(nn.Module): + """Decoder layer with GDN linear attention + MLP.""" + + def __init__(self, config, layer_id: int, gdn_layer_idx: int = 0): + super().__init__() + tc = _get_text_config(config) + self.linear_attn = GatedDeltaNet( + hidden_size=tc.hidden_size, + num_k_heads=getattr(tc, "linear_num_key_heads", 16), + num_v_heads=getattr(tc, "linear_num_value_heads", 32), + head_k_dim=getattr(tc, "linear_key_head_dim", 128), + head_v_dim=getattr(tc, "linear_value_head_dim", 128), + conv_kernel_size=getattr(tc, "linear_conv_kernel_dim", 4), + layer_id=layer_id, + gdn_layer_idx=gdn_layer_idx, + rms_norm_eps=tc.rms_norm_eps, + ) + self.mlp = MLP( + hidden_size=tc.hidden_size, + intermediate_size=tc.intermediate_size, + activation=tc.hidden_act, + ) + self.input_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) + self.post_attention_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + forward_batch: Any, + ): + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + hidden_states = self.linear_attn(hidden_states, forward_batch) + + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +# --------------------------------------------------------------------------- +# Layer type registry +# --------------------------------------------------------------------------- + +_DECODER_LAYER_TYPES = { + "attention": Qwen3_5AttentionDecoderLayer, + "linear_attention": Qwen3_5LinearDecoderLayer, +} + + +# --------------------------------------------------------------------------- +# Qwen3.5 Language Model (dense variant) +# --------------------------------------------------------------------------- + + +class Qwen3_5ForCausalLM(nn.Module): + """Qwen3.5 causal language model with hybrid attention. + + Alternates between full attention and GDN linear attention layers. + Dense (non-MoE) variant. 
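+
+    Layer layout example (see ``_get_layer_types``): with
+    ``full_attention_interval = 4`` and no explicit ``layers_block_type``,
+    layers 3, 7, 11, ... use full attention and all other layers use GDN
+    linear attention.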
+ """ + + def __init__(self, config): + super().__init__() + tc = _get_text_config(config) + self.config = tc + self.hidden_size = tc.hidden_size + self.vocab_size = tc.vocab_size + + # Embedding + self.embed_tokens = VocabParallelEmbedding(tc.vocab_size, tc.hidden_size) + + # Build hybrid decoder layers with sequential GDN indexing + layer_types = _get_layer_types(tc) + self.layer_types = layer_types + self.layers = nn.ModuleList() + gdn_count = 0 + self.full_attn_layer_ids = set() + for idx in range(tc.num_hidden_layers): + layer_type = layer_types[idx] + if layer_type == "linear_attention": + self.layers.append( + Qwen3_5LinearDecoderLayer(config, idx, gdn_layer_idx=gdn_count) + ) + gdn_count += 1 + else: + self.layers.append( + Qwen3_5AttentionDecoderLayer(config, idx) + ) + self.full_attn_layer_ids.add(idx) + self.num_gdn_layers = gdn_count + + # Final norm + self.norm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) + + logger.info( + "Qwen3_5ForCausalLM: %d layers (%d attention + %d GDN)", + tc.num_hidden_layers, + len(self.full_attn_layer_ids), + self.num_gdn_layers, + ) + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: Any, + input_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if input_embeds is None: + hidden_states = self.embed_tokens(input_ids) + else: + hidden_states = input_embeds + + residual = None + for layer in self.layers: + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + forward_batch=forward_batch, + ) + + # Final normalization + if residual is not None: + hidden_states, _ = self.norm(hidden_states, residual) + else: + hidden_states = self.norm(hidden_states) + + return hidden_states + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """Load HuggingFace checkpoint weights with name remapping.""" + stacked_params_mapping = [ + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters()) + loaded: Set[str] = set() + + for name, weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if "mtp" in name: + continue + if "visual" in name: + continue + if "language_model" in name: + name = name.replace("model.language_model.", "") + if name.startswith("model."): + name = name[len("model."):] + # NOTE: do NOT strip .self_attn — pymllm keeps it as a submodule + + # Handle stacked params (gate_up_proj = gate_proj + up_proj) + matched = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + if "mlp.experts" in name: + continue + name = name.replace(weight_name, param_name) + if name not in params_dict: + continue + param = params_dict[name] + # gate_up_proj is a plain Linear — manually place each shard + output_dim = param.shape[0] // 2 + param.data[shard_id * output_dim : (shard_id + 1) * output_dim].copy_( + weight + ) + matched = True + break + + if not matched: + if name not in params_dict: + continue + param = params_dict[name] + loader = getattr(param, "weight_loader", None) + if loader is not None: + loader(param, weight) + else: + # Squeeze conv1d weight from [C, 1, K] to [C, K] + if weight.dim() != param.dim(): + weight = weight.squeeze() + param.data.copy_(weight) + + loaded.add(name) + + logger.info("Loaded %d parameter tensors for Qwen3_5ForCausalLM", len(loaded)) + return loaded + + +# --------------------------------------------------------------------------- +# Qwen3.5 
Vision-Language Model +# --------------------------------------------------------------------------- + + +class Qwen3_5ForConditionalGeneration(nn.Module): + """Qwen3.5 multimodal model (text + vision). + + Inherits vision encoder from Qwen3VL and uses Qwen3.5's hybrid + language model. + """ + + def __init__(self, config): + super().__init__() + from pymllm.models.qwen3_vl import ( + Qwen3VLVisionModel, + ) + + self.config = config + tc = _get_text_config(config) + + # Vision encoder (reuse Qwen3VL's vision model) + vision_config = getattr(config, "vision_config", None) + if vision_config is not None: + self.visual = Qwen3VLVisionModel( + depth=getattr(vision_config, "depth", 27), + hidden_size=getattr(vision_config, "hidden_size", 1152), + hidden_act=getattr(vision_config, "hidden_act", "gelu_pytorch_tanh"), + intermediate_size=getattr(vision_config, "intermediate_size", 4304), + num_heads=getattr(vision_config, "num_heads", 16), + in_channels=getattr(vision_config, "in_channels", 3), + patch_size=getattr(vision_config, "patch_size", 16), + spatial_merge_size=getattr(vision_config, "spatial_merge_size", 2), + temporal_patch_size=getattr(vision_config, "temporal_patch_size", 2), + out_hidden_size=getattr(vision_config, "out_hidden_size", 3584), + num_position_embeddings=getattr( + vision_config, "num_position_embeddings", 2304 + ), + deepstack_visual_indexes=getattr( + vision_config, "deepstack_visual_indexes", [8, 16, 24] + ), + norm_eps=getattr(tc, "rms_norm_eps", 1e-6), + ) + else: + self.visual = None + + # Language model + self.model = Qwen3_5ForCausalLM(config) + + # Expose hybrid model metadata for ModelRunner + self.num_gdn_layers = self.model.num_gdn_layers + self.full_attn_layer_ids = self.model.full_attn_layer_ids + + # LM head (tied to embedding when tie_word_embeddings=True) + self.lm_head = Linear(tc.hidden_size, tc.vocab_size, bias=False) + if getattr(tc, "tie_word_embeddings", False): + self.lm_head.weight = self.model.embed_tokens.weight + + # Vision token IDs + self.image_token_id = getattr(config, "image_token_id", 151655) + self.video_token_id = getattr(config, "video_token_id", 151656) + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: Any, + input_embeds: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Process vision inputs if provided + if input_embeds is None and pixel_values is not None and self.visual is not None: + input_embeds = self.model.embed_tokens(input_ids) + # Run vision encoder + visual_embeds = self.visual(pixel_values, grid_thw=image_grid_thw) + # Replace image/video token positions with visual embeddings + mask = (input_ids == self.image_token_id) | (input_ids == self.video_token_id) + if mask.any(): + input_embeds[mask] = visual_embeds.reshape(-1, visual_embeds.shape[-1]) + + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + forward_batch=forward_batch, + input_embeds=input_embeds, + ) + + # LM head + logits = self.lm_head(hidden_states) + return logits + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + """Load weights, dispatching visual vs language params.""" + visual_weights = [] + language_weights = [] + + for name, weight in weights: + if "visual" in name or "model.visual" in name: + # Normalize visual weight names + name = name.replace("model.visual.", "visual.") + name = name.replace("attn.qkv.", "attn.qkv_proj.") + 
visual_weights.append((name, weight)) + else: + language_weights.append((name, weight)) + + # Load language model weights + self.model.load_weights(language_weights) + + # Load visual weights + if self.visual is not None and visual_weights: + params_dict = dict(self.named_parameters()) + for name, weight in visual_weights: + if name in params_dict: + param = params_dict[name] + loader = getattr(param, "weight_loader", None) + if loader is not None: + loader(param, weight) + else: + param.data.copy_(weight) + + logger.info("Qwen3_5ForConditionalGeneration weights loaded") diff --git a/pymllm/models/qwen3_vl.py b/pymllm/models/qwen3_vl.py new file mode 100644 index 000000000..3bee27c8d --- /dev/null +++ b/pymllm/models/qwen3_vl.py @@ -0,0 +1,1329 @@ +# Copyright 2025 Qwen Team +# Copyright 2025 SGLang Team +# Adapted for pymllm +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Inference-only Qwen3-VL model for pymllm. + +Adapted from sglang's Qwen3-VL implementation for pymllm's single-GPU +inference architecture. Uses pymllm layers (RadixAttention, RMSNorm, MLP) +and conforms to the pymllm forward interface:: + + model.forward(input_ids, positions, forward_batch) + +Designed for a single accelerator card — no tensor / pipeline parallelism. 
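+
+Vision inputs are read off the batch object itself: during prefill the
+top-level ``forward`` looks for ``forward_batch.pixel_values`` and
+``forward_batch.image_grid_thw``.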
+""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from pymllm.layers import RMSNorm, apply_mrope +from pymllm.layers.attention.radix_attention import RadixAttention +from pymllm.layers.mlp import MLP + +if TYPE_CHECKING: + from pymllm.engine.forward_batch import ForwardBatch + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Vision Encoder +# --------------------------------------------------------------------------- + + +class Qwen3VisionMLP(nn.Module): + """MLP block for the vision encoder.""" + + def __init__( + self, + in_features: int, + hidden_features: int, + hidden_act: str = "silu", + bias: bool = True, + ): + super().__init__() + self.linear_fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.linear_fc2 = nn.Linear(hidden_features, in_features, bias=bias) + if hidden_act == "gelu_pytorch_tanh": + self.act = nn.GELU(approximate="tanh") + elif hidden_act == "gelu": + self.act = nn.GELU() + else: + self.act = nn.SiLU() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear_fc2(self.act(self.linear_fc1(x))) + + +class Qwen3VLVisionPatchEmbed(nn.Module): + """3D convolution patch embedding for video/image patchification.""" + + def __init__( + self, + patch_size: int = 16, + temporal_patch_size: int = 2, + in_channels: int = 3, + embed_dim: int = 1152, + ): + super().__init__() + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.in_channels = in_channels + self.embed_dim = embed_dim + + kernel_size = [temporal_patch_size, patch_size, patch_size] + self.proj = nn.Conv3d( + in_channels, + embed_dim, + kernel_size=kernel_size, + stride=kernel_size, + bias=True, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + target_dtype = self.proj.weight.dtype + hidden_states = hidden_states.view( + -1, + self.in_channels, + self.temporal_patch_size, + self.patch_size, + self.patch_size, + ) + hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view( + -1, self.embed_dim + ) + return hidden_states + + +def _rotate_half(x: torch.Tensor) -> torch.Tensor: + """Rotate half the hidden dims of the input for RoPE.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +class Qwen3VisionAttention(nn.Module): + """Multi-head self-attention for the vision encoder (no KV cache).""" + + def __init__(self, embed_dim: int, num_heads: int): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + + self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim, bias=True) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=True) + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + ) -> torch.Tensor: + """Forward pass with variable-length sequences via cu_seqlens. + + Args: + x: [total_tokens, embed_dim] + cu_seqlens: [num_seqs + 1] cumulative sequence lengths + rotary_pos_emb_cos: [total_tokens, rotary_dim] + rotary_pos_emb_sin: [total_tokens, rotary_dim] + """ + seq_len = x.shape[0] + qkv = self.qkv_proj(x) + q, k, v = qkv.reshape(seq_len, 3, self.num_heads, self.head_dim).unbind(dim=1) + + # Apply rotary position embedding. 
+ # cos/sin are [total_tokens, head_dim // 2]. Following sglang's + # VisionAttention: double them to full head_dim and apply RoPE to + # all head dimensions (the rotation pairs (q[i], q[i + head_dim//2])). + cos = rotary_pos_emb_cos + sin = rotary_pos_emb_sin + if cos.shape[-1] * 2 == self.head_dim: + cos = torch.cat([cos, cos], dim=-1) + sin = torch.cat([sin, sin], dim=-1) + + cos = cos.unsqueeze(1) # [seq, 1, head_dim] + sin = sin.unsqueeze(1) # [seq, 1, head_dim] + + q = q * cos + _rotate_half(q) * sin + k = k * cos + _rotate_half(k) * sin + + # Scaled dot-product attention per variable-length sequence + output = torch.empty_like(q) + num_seqs = cu_seqlens.shape[0] - 1 + for i in range(num_seqs): + start = cu_seqlens[i].item() + end = cu_seqlens[i + 1].item() + qi = q[start:end].transpose(0, 1).unsqueeze(0) # [1, heads, seq, dim] + ki = k[start:end].transpose(0, 1).unsqueeze(0) + vi = v[start:end].transpose(0, 1).unsqueeze(0) + oi = F.scaled_dot_product_attention(qi, ki, vi) + output[start:end] = oi.squeeze(0).transpose(0, 1) + + output = output.reshape(seq_len, self.embed_dim) + return self.out_proj(output) + + +class Qwen3VisionBlock(nn.Module): + """Single vision transformer block.""" + + def __init__( + self, + dim: int, + num_heads: int, + intermediate_dim: int, + hidden_act: str = "silu", + norm_eps: float = 1e-6, + ): + super().__init__() + self.norm1 = nn.LayerNorm(dim, eps=norm_eps) + self.norm2 = nn.LayerNorm(dim, eps=norm_eps) + self.attn = Qwen3VisionAttention(embed_dim=dim, num_heads=num_heads) + self.mlp = Qwen3VisionMLP( + dim, intermediate_dim, hidden_act=hidden_act, bias=True + ) + + def forward( + self, + x: torch.Tensor, + cu_seqlens: torch.Tensor, + rotary_pos_emb_cos: torch.Tensor, + rotary_pos_emb_sin: torch.Tensor, + ) -> torch.Tensor: + x = x + self.attn( + self.norm1(x), + cu_seqlens=cu_seqlens, + rotary_pos_emb_cos=rotary_pos_emb_cos, + rotary_pos_emb_sin=rotary_pos_emb_sin, + ) + x = x + self.mlp(self.norm2(x)) + return x + + +class Qwen3VLVisionPatchMerger(nn.Module): + """Merges spatial patches to reduce sequence length. + + Groups ``spatial_merge_size ** 2`` consecutive patch tokens and projects + them to the language model hidden dimension. + """ + + def __init__( + self, + dim: int, + context_dim: int, + spatial_merge_size: int = 2, + use_postshuffle_norm: bool = False, + norm_eps: float = 1e-6, + ): + super().__init__() + self.hidden_size = context_dim * (spatial_merge_size**2) + self.use_postshuffle_norm = use_postshuffle_norm + self.norm = nn.LayerNorm( + self.hidden_size if use_postshuffle_norm else context_dim, eps=norm_eps + ) + self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True) + self.act_fn = nn.GELU() + self.linear_fc2 = nn.Linear(self.hidden_size, dim, bias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.use_postshuffle_norm: + x = self.norm(x.view(-1, self.hidden_size)) + else: + x = self.norm(x).view(-1, self.hidden_size) + x = self.act_fn(self.linear_fc1(x)) + return self.linear_fc2(x) + + +class Qwen3VLVisionModel(nn.Module): + """Complete vision encoder for Qwen3-VL. + + Produces patch embeddings from raw pixel values, applies a stack of + vision transformer blocks with 3D rotary embeddings, then merges + spatial patches. Supports "deep stack" where intermediate layer + outputs are captured and concatenated to the final output. 
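+
+    The feature width of the returned tensor is therefore
+    ``out_hidden_size * (1 + len(deepstack_visual_indexes))``; with the
+    defaults (3584 and three deepstack taps) each merged token carries
+    4 * 3584 = 14336 features.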
+ """ + + def __init__( + self, + depth: int = 27, + hidden_size: int = 1152, + hidden_act: str = "gelu_pytorch_tanh", + intermediate_size: int = 4304, + num_heads: int = 16, + in_channels: int = 3, + patch_size: int = 16, + spatial_merge_size: int = 2, + temporal_patch_size: int = 2, + out_hidden_size: int = 3584, + num_position_embeddings: int = 2304, + deepstack_visual_indexes: Optional[List[int]] = None, + norm_eps: float = 1e-6, + ): + super().__init__() + if deepstack_visual_indexes is None: + deepstack_visual_indexes = [8, 16, 24] + + self.hidden_size = hidden_size + self.num_heads = num_heads + self.num_position_embeddings = num_position_embeddings + self.num_grid_per_side = int(num_position_embeddings**0.5) + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.deepstack_visual_indexes = deepstack_visual_indexes + # Total output dim = out_hidden_size * (1 main + N deepstack mergers) + self.out_hidden_size = out_hidden_size * (1 + len(deepstack_visual_indexes)) + + self.patch_embed = Qwen3VLVisionPatchEmbed( + patch_size=patch_size, + temporal_patch_size=temporal_patch_size, + in_channels=in_channels, + embed_dim=hidden_size, + ) + + self.pos_embed = nn.Embedding(num_position_embeddings, hidden_size) + + head_dim = hidden_size // num_heads + self._init_rope_cache(head_dim) + + self.blocks = nn.ModuleList( + [ + Qwen3VisionBlock( + dim=hidden_size, + num_heads=num_heads, + intermediate_dim=intermediate_size, + hidden_act=hidden_act, + norm_eps=norm_eps, + ) + for _ in range(depth) + ] + ) + + self.merger = Qwen3VLVisionPatchMerger( + dim=out_hidden_size, + context_dim=hidden_size, + spatial_merge_size=spatial_merge_size, + norm_eps=norm_eps, + ) + + self.deepstack_merger_list = nn.ModuleList( + [ + Qwen3VLVisionPatchMerger( + dim=out_hidden_size, + context_dim=hidden_size, + spatial_merge_size=spatial_merge_size, + use_postshuffle_norm=True, + norm_eps=norm_eps, + ) + for _ in range(len(deepstack_visual_indexes)) + ] + ) + + def _init_rope_cache(self, head_dim: int, max_grid_size: int = 8192): + """Precompute cos/sin cache for 2D rotary embeddings.""" + rotary_dim = head_dim // 2 + inv_freq = 1.0 / ( + 10000.0 + ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim) + ) + t = torch.arange(max_grid_size, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) + self.register_buffer("cos_cache", torch.cos(freqs), persistent=False) + self.register_buffer("sin_cache", torch.sin(freqs), persistent=False) + + @property + def dtype(self) -> torch.dtype: + return self.patch_embed.proj.weight.dtype + + @property + def device(self) -> torch.device: + return self.patch_embed.proj.weight.device + + # -- Rotary position embedding helpers -- + + @staticmethod + def _rot_pos_ids(h: int, w: int, spatial_merge_size: int) -> torch.Tensor: + """Compute 2D rotary position IDs for a grid of *h* x *w* patches. + + The patches are re-ordered to group ``spatial_merge_size ** 2`` + neighbours together (matching the merger's token order). + + Returns tensor of shape ``[h*w, 2]`` with ``(height_pos, width_pos)``. 
+ """ + merge = spatial_merge_size + h_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + w_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + + h_ids = h_ids.reshape(h // merge, merge, w // merge, merge) + w_ids = w_ids.reshape(h // merge, merge, w // merge, merge) + + h_ids = h_ids.permute(0, 2, 1, 3).flatten() + w_ids = w_ids.permute(0, 2, 1, 3).flatten() + + return torch.stack([h_ids, w_ids], dim=-1) + + def rot_pos_emb( + self, grid_thw: List[List[int]] + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Compute rotary pos-emb cos/sin for all images/videos in the batch.""" + pos_ids = [] + for t, h, w in grid_thw: + base = self._rot_pos_ids(h, w, self.spatial_merge_size) + pos_ids.append(base if t == 1 else base.repeat(t, 1)) + + pos_ids = torch.cat(pos_ids, dim=0).to(self.device, non_blocking=True) + cos_combined = self.cos_cache[pos_ids].flatten(1) + sin_combined = self.sin_cache[pos_ids].flatten(1) + return cos_combined, sin_combined + + # -- Position embedding interpolation -- + + def _get_interpolation_indices(self, dim_size: int) -> np.ndarray: + indices = (np.arange(dim_size, dtype=np.float32) + 0.5) * ( + self.num_grid_per_side / dim_size + ) - 0.5 + return np.clip(indices, 0, self.num_grid_per_side - 1) + + def _calculate_indices_and_weights( + self, h_idxs: np.ndarray, w_idxs: np.ndarray + ) -> Tuple[List[np.ndarray], List[np.ndarray]]: + """Compute bilinear interpolation indices and weights.""" + side = self.num_grid_per_side + h_f = np.floor(h_idxs).astype(np.int64) + h_c = np.clip(h_f + 1, 0, side - 1) + dh = h_idxs - h_f + w_f = np.floor(w_idxs).astype(np.int64) + w_c = np.clip(w_f + 1, 0, side - 1) + dw = w_idxs - w_f + + indices = [ + (h_f[:, None] * side + w_f).flatten(), + (h_f[:, None] * side + w_c).flatten(), + (h_c[:, None] * side + w_f).flatten(), + (h_c[:, None] * side + w_c).flatten(), + ] + weights = [ + ((1 - dh)[:, None] * (1 - dw)).flatten(), + ((1 - dh)[:, None] * dw).flatten(), + (dh[:, None] * (1 - dw)).flatten(), + (dh[:, None] * dw).flatten(), + ] + return indices, weights + + def _get_position_embedding( + self, + patch_pos_embeds: List[torch.Tensor], + grid_ts: List[int], + grid_hs: List[int], + grid_ws: List[int], + ) -> torch.Tensor: + """Tile and reorganize position embeddings to align with the merged token order.""" + result_parts = [] + merge = self.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge, merge, w // merge, merge, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + result_parts.append(pos_embed) + return torch.cat(result_parts, dim=0) + + def fast_pos_embed_interpolate(self, grid_thw: torch.Tensor) -> torch.Tensor: + """Interpolate position embeddings via bilinear interpolation.""" + grid_thw_cpu = grid_thw.cpu().numpy() + temporal_dims = grid_thw_cpu[:, 0].tolist() + height_dims = grid_thw_cpu[:, 1].tolist() + width_dims = grid_thw_cpu[:, 2].tolist() + + device = self.pos_embed.weight.device + dtype = self.pos_embed.weight.dtype + + patches_size = [h * w for h, w in zip(height_dims, width_dims)] + total_patches = sum(patches_size) + all_indices_np = np.zeros((4, total_patches), dtype=np.int64) + all_weights_np = np.zeros((4, total_patches), dtype=np.float32) + + current_idx = 0 + for _t, h, w in zip(temporal_dims, height_dims, width_dims): + h_idxs = self._get_interpolation_indices(h) + w_idxs = self._get_interpolation_indices(w) + indices, weights = self._calculate_indices_and_weights(h_idxs, w_idxs) + 
end_idx = current_idx + h * w + for i in range(4): + all_indices_np[i, current_idx:end_idx] = indices[i] + all_weights_np[i, current_idx:end_idx] = weights[i] + current_idx = end_idx + + idx_tensor = torch.from_numpy(all_indices_np).to(device) + weight_tensor = torch.from_numpy(all_weights_np).to(dtype=dtype, device=device) + + pos_embeds = self.pos_embed(idx_tensor.view(-1)) + pos_embeds = pos_embeds.view(4, total_patches, -1) + patch_pos_embeds = (pos_embeds * weight_tensor.unsqueeze(-1)).sum(dim=0) + patch_pos_embeds = patch_pos_embeds.split(patches_size) + return self._get_position_embedding( + list(patch_pos_embeds), temporal_dims, height_dims, width_dims + ) + + # -- Forward -- + + def forward( + self, + x: torch.Tensor, + grid_thw: torch.Tensor, + ) -> torch.Tensor: + """Run the vision encoder. + + Args: + x: Pixel values, shape ``[total_patches, patch_dim]``. + grid_thw: Grid dimensions ``[num_images, 3]`` with ``(T, H, W)``. + + Returns: + Vision features of shape + ``[num_merged_tokens, out_hidden_size * (1 + num_deepstack)]``. + """ + x = x.to(device=self.device, dtype=self.dtype) + x = self.patch_embed(x) + + if isinstance(grid_thw, list): + grid_thw_list = grid_thw + grid_thw = torch.tensor(grid_thw, dtype=torch.int32) + else: + grid_thw_list = grid_thw.tolist() + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + x += pos_embeds + + rotary_pos_emb_cos, rotary_pos_emb_sin = self.rot_pos_emb(grid_thw_list) + + cu_seqlens = _compute_cu_seqlens_from_grid(grid_thw) + cu_seqlens = cu_seqlens.to(self.device, non_blocking=True) + + deepstack_features = [] + ds_idx = 0 + + for layer_num, blk in enumerate(self.blocks): + x = blk(x, cu_seqlens, rotary_pos_emb_cos, rotary_pos_emb_sin) + + if layer_num in self.deepstack_visual_indexes: + # x is [total_tokens, hidden]. The merger expects the last + # dim to be context_dim so it can group spatial_merge_size^2 + # tokens; reshape to [total_tokens, 1, hidden] so that the + # `.view(-1, hidden_size)` inside the merger collapses the + # spatial merge correctly. + ds_feat = self.deepstack_merger_list[ds_idx](x.unsqueeze(1)) + deepstack_features.append(ds_feat) + ds_idx += 1 + + x = self.merger(x.unsqueeze(1)) + + # Concatenate main + deepstack features along the feature dimension. + # Result: [num_merged_tokens, out_hidden_size * (1 + num_deepstack)] + hidden_states = torch.cat([x] + deepstack_features, dim=-1) + return hidden_states + + +def _compute_cu_seqlens_from_grid(grid_thw: torch.Tensor) -> torch.Tensor: + """Compute cumulative sequence lengths from grid dimensions.""" + grid_np = grid_thw.cpu().numpy() + seq_lens = (grid_np[:, 0] * grid_np[:, 1] * grid_np[:, 2]).astype(np.int32) + cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)]) + return torch.tensor(cu_seqlens, dtype=torch.int32) + + +def _build_cos_sin_cache( + head_dim: int, + rope_theta: float, + max_pos: int, + dtype: torch.dtype, +) -> torch.Tensor: + """Build a [max_pos, head_dim] cos/sin cache for M-RoPE. + + Layout: first ``head_dim // 2`` columns are cos values, second half are sin. + Each row corresponds to one position index. 
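+
+    For a position ``p``, ``cache[p, :head_dim // 2]`` holds
+    ``cos(p * inv_freq)`` and ``cache[p, head_dim // 2:]`` holds the matching
+    ``sin`` values.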
+ """ + inv_freq = 1.0 / ( + rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim) + ) + t = torch.arange(max_pos, dtype=torch.float32) + freqs = torch.outer(t, inv_freq) # [max_pos, head_dim // 2] + return torch.cat([torch.cos(freqs), torch.sin(freqs)], dim=-1).to(dtype) + + +def get_rope_index( + input_ids: torch.Tensor, + image_grid_thw: Optional[torch.Tensor], + image_token_id: int, + vision_start_token_id: int, + spatial_merge_size: int, +) -> Tuple[torch.Tensor, int]: + """Compute M-RoPE 3-D position IDs for one sequence. + + For text tokens all three (temporal, height, width) indices are equal to + the sequential counter. For image tokens the indices follow the spatial + grid ``(t, h, w)``. + + Args: + input_ids: Token IDs for one sequence, shape ``[T]``. + image_grid_thw: Grid dimensions for every image in the sequence, + shape ``[num_images, 3]``. ``None`` when there are no images. + image_token_id: Token ID used as placeholder for image patches. + vision_start_token_id: Token ID that precedes each image block. + spatial_merge_size: Number of patches merged per spatial dimension + (e.g. 2 → 2x2 merge, so llm_grid_h = H // 2). + + Returns: + ``(position_ids, mrope_position_delta)`` where ``position_ids`` has + shape ``[3, T]`` and ``mrope_position_delta`` is a Python ``int`` + equal to ``max_position_used + 1 - T``. + """ + total_tokens = input_ids.shape[0] + device = input_ids.device + position_ids = torch.zeros(3, total_tokens, dtype=torch.long, device=device) + + if image_grid_thw is None or image_grid_thw.shape[0] == 0: + pos = torch.arange(total_tokens, dtype=torch.long, device=device) + position_ids[0] = pos + position_ids[1] = pos + position_ids[2] = pos + return position_ids, 0 + + input_ids_cpu = input_ids.cpu().tolist() + grid_thw_list = image_grid_thw.cpu().tolist() + + llm_pos_ids_start = 0 + image_idx = 0 + i = 0 + + while i < total_tokens: + token = input_ids_cpu[i] + + if token == vision_start_token_id and image_idx < len(grid_thw_list): + # The vision_start token itself gets a regular sequential position. + position_ids[:, i] = llm_pos_ids_start + llm_pos_ids_start += 1 + i += 1 + + # Compute LLM-side grid dimensions (after spatial merging). + t_g = int(grid_thw_list[image_idx][0]) + h_g = int(grid_thw_list[image_idx][1]) + w_g = int(grid_thw_list[image_idx][2]) + llm_grid_t = t_g + llm_grid_h = h_g // spatial_merge_size + llm_grid_w = w_g // spatial_merge_size + num_image_tokens = llm_grid_t * llm_grid_h * llm_grid_w + + # Build per-patch 3-D indices. + t_idx = ( + torch.arange(llm_grid_t, device=device) + .view(-1, 1, 1) + .expand(-1, llm_grid_h, llm_grid_w) + .flatten() + ) + h_idx = ( + torch.arange(llm_grid_h, device=device) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_idx = ( + torch.arange(llm_grid_w, device=device) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + + img_start = i + img_end = i + num_image_tokens + position_ids[0, img_start:img_end] = t_idx + llm_pos_ids_start + position_ids[1, img_start:img_end] = h_idx + llm_pos_ids_start + position_ids[2, img_start:img_end] = w_idx + llm_pos_ids_start + + llm_pos_ids_start += max(llm_grid_t, llm_grid_h, llm_grid_w) + i += num_image_tokens + image_idx += 1 + else: + # Text token (including vision_end and all non-image tokens). 
+ position_ids[:, i] = llm_pos_ids_start + llm_pos_ids_start += 1 + i += 1 + + mrope_position_delta = llm_pos_ids_start - total_tokens + return position_ids, mrope_position_delta + + +# --------------------------------------------------------------------------- +# Text Decoder (Language Model) +# --------------------------------------------------------------------------- + + +class Qwen3VLAttention(nn.Module): + """Attention layer for the Qwen3-VL text decoder. + + Uses QK-norm (per-head RMSNorm on Q and K before RoPE) and + :class:`RadixAttention` for KV-cached inference. Applies + interleaved M-RoPE with a precomputed cos/sin cache. + """ + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + layer_id: int, + rope_theta: float = 5_000_000.0, + rms_norm_eps: float = 1e-6, + mrope_section: Tuple[int, int, int] = (24, 20, 20), + mrope_interleaved: bool = True, + max_position_embeddings: int = 32768, + ): + super().__init__() + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = head_dim + self.q_size = num_heads * head_dim + self.kv_size = num_kv_heads * head_dim + self.scaling = head_dim**-0.5 + self.mrope_section = list(mrope_section) + self.mrope_interleaved = mrope_interleaved + + # Fused QKV projection + self.qkv_proj = nn.Linear( + hidden_size, self.q_size + 2 * self.kv_size, bias=False + ) + + # Output projection + self.o_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=False) + + # QK normalization + self.q_norm = RMSNorm(head_dim, eps=rms_norm_eps) + self.k_norm = RMSNorm(head_dim, eps=rms_norm_eps) + + # Precomputed M-RoPE cos/sin cache: [max_pos, head_dim] + cos_sin = _build_cos_sin_cache( + head_dim, rope_theta, max_position_embeddings, torch.float32 + ) + self.register_buffer("cos_sin_cache", cos_sin, persistent=False) + + # Radix attention (single-GPU: heads == tp_heads) + self.attn = RadixAttention( + num_heads=num_heads, + head_dim=head_dim, + scaling=self.scaling, + num_kv_heads=num_kv_heads, + layer_id=layer_id, + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + forward_batch: "ForwardBatch", + ) -> torch.Tensor: + qkv = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + # Per-head QK normalization + q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)) + k = self.k_norm(k.view(-1, self.num_kv_heads, self.head_dim)) + + # Apply M-RoPE. positions is [3, T] for prefill (3-D) or may arrive + # as [T] for purely text-only batches; expand to [3, T] in that case. 
+ if positions.ndim == 1: + positions = positions.unsqueeze(0).expand(3, -1) + q, k = apply_mrope( + q, + k, + positions, + self.cos_sin_cache.to(q.dtype), + self.mrope_section, + self.mrope_interleaved, + ) + + q = q.reshape(-1, self.q_size) + k = k.reshape(-1, self.kv_size) + + # Attention with KV cache + attn_output = self.attn(q, k, v, forward_batch) + return self.o_proj(attn_output) + + +class Qwen3VLDecoderLayer(nn.Module): + """Single decoder layer for the Qwen3-VL text model.""" + + def __init__( + self, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + head_dim: int, + intermediate_size: int, + layer_id: int, + rope_theta: float = 5_000_000.0, + rms_norm_eps: float = 1e-6, + mrope_section: Tuple[int, int, int] = (24, 20, 20), + mrope_interleaved: bool = True, + max_position_embeddings: int = 32768, + ): + super().__init__() + self.self_attn = Qwen3VLAttention( + hidden_size=hidden_size, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + layer_id=layer_id, + rope_theta=rope_theta, + rms_norm_eps=rms_norm_eps, + mrope_section=mrope_section, + mrope_interleaved=mrope_interleaved, + max_position_embeddings=max_position_embeddings, + ) + self.mlp = MLP( + hidden_size=hidden_size, + intermediate_size=intermediate_size, + activation="silu", + use_fused_gate_up_proj=True, + use_bias_gate_up=False, + use_bias_down=False, + ) + self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + forward_batch: "ForwardBatch", + deepstack_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # Self-attention + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.self_attn(positions, hidden_states, forward_batch) + hidden_states = residual + hidden_states + + # Add deepstack embeddings after residual (matches HF ordering) + if deepstack_embeds is not None: + hidden_states = hidden_states + deepstack_embeds + + # MLP + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class Qwen3VLTextModel(nn.Module): + """Qwen3-VL text backbone (embedding + decoder layers + final norm).""" + + def __init__( + self, + vocab_size: int = 151936, + hidden_size: int = 4096, + intermediate_size: int = 22016, + num_hidden_layers: int = 32, + num_attention_heads: int = 32, + num_key_value_heads: int = 32, + head_dim: int = 128, + rope_theta: float = 5_000_000.0, + rms_norm_eps: float = 1e-6, + mrope_section: Tuple[int, int, int] = (24, 20, 20), + mrope_interleaved: bool = True, + max_position_embeddings: int = 32768, + ): + super().__init__() + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + + self.embed_tokens = nn.Embedding(vocab_size, hidden_size) + + self.layers = nn.ModuleList( + [ + Qwen3VLDecoderLayer( + hidden_size=hidden_size, + num_heads=num_attention_heads, + num_kv_heads=num_key_value_heads, + head_dim=head_dim, + intermediate_size=intermediate_size, + layer_id=layer_id, + rope_theta=rope_theta, + rms_norm_eps=rms_norm_eps, + mrope_section=mrope_section, + mrope_interleaved=mrope_interleaved, + max_position_embeddings=max_position_embeddings, + ) + for layer_id in range(num_hidden_layers) + ] + ) + + self.norm = RMSNorm(hidden_size, eps=rms_norm_eps) + + def forward( + self, + 
input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: "ForwardBatch", + input_embeds: Optional[torch.Tensor] = None, + input_deepstack_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if input_embeds is None: + hidden_states = self.embed_tokens(input_ids) + else: + hidden_states = input_embeds + + for layer_idx, layer in enumerate(self.layers): + ds_embeds = _get_deepstack_embeds( + layer_idx, input_deepstack_embeds, self.hidden_size + ) + hidden_states = layer( + positions, + hidden_states, + forward_batch, + deepstack_embeds=ds_embeds, + ) + + return self.norm(hidden_states) + + +def _get_deepstack_embeds( + layer_idx: int, + input_deepstack_embeds: Optional[torch.Tensor], + hidden_size: int, +) -> Optional[torch.Tensor]: + """Extract deepstack embeddings for a specific decoder layer.""" + if input_deepstack_embeds is None: + return None + num_deepstack = input_deepstack_embeds.shape[-1] // hidden_size + if layer_idx >= num_deepstack: + return None + start = hidden_size * layer_idx + return input_deepstack_embeds[:, start : start + hidden_size] + + +# --------------------------------------------------------------------------- +# Full Model: Qwen3VLForConditionalGeneration +# --------------------------------------------------------------------------- + + +class Qwen3VLForConditionalGeneration(nn.Module): + """Qwen3-VL multimodal model for conditional generation. + + Combines a vision encoder and text decoder. During prefill, image/video + tokens are replaced with visual features from the vision encoder. + During decode, the model runs only the text decoder. + + Forward interface:: + + logits = model.forward(input_ids, positions, forward_batch) + """ + + def __init__(self, config) -> None: + super().__init__() + self.config = config + + text_config = getattr(config, "text_config", config) + vision_config = getattr(config, "vision_config", None) + + # Vision encoder + if vision_config is not None: + self.visual = Qwen3VLVisionModel( + depth=getattr(vision_config, "depth", 27), + hidden_size=getattr(vision_config, "hidden_size", 1152), + hidden_act=getattr(vision_config, "hidden_act", "gelu_pytorch_tanh"), + intermediate_size=getattr(vision_config, "intermediate_size", 4304), + num_heads=getattr(vision_config, "num_heads", 16), + in_channels=getattr(vision_config, "in_channels", 3), + patch_size=getattr(vision_config, "patch_size", 16), + spatial_merge_size=getattr(vision_config, "spatial_merge_size", 2), + temporal_patch_size=getattr(vision_config, "temporal_patch_size", 2), + out_hidden_size=getattr(vision_config, "out_hidden_size", 3584), + num_position_embeddings=getattr( + vision_config, "num_position_embeddings", 2304 + ), + deepstack_visual_indexes=getattr( + vision_config, "deepstack_visual_indexes", [8, 16, 24] + ), + norm_eps=getattr(text_config, "rms_norm_eps", 1e-6), + ) + else: + self.visual = None + + # Text decoder + hidden_size = getattr(text_config, "hidden_size", 4096) + vocab_size = getattr(text_config, "vocab_size", 151936) + + # M-RoPE configuration -- mrope_section lives inside rope_scaling, + # NOT as a top-level attribute of text_config. 
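+        # Illustrative shape of the expected config entry (the values shown
+        # are just the fallback defaults used below):
+        #   text_config.rope_scaling = {"mrope_section": [24, 20, 20],
+        #                               "mrope_interleaved": True}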
+ rope_scaling = getattr(text_config, "rope_scaling", None) or {} + if isinstance(rope_scaling, dict): + mrope_section = rope_scaling.get("mrope_section", [24, 20, 20]) + mrope_interleaved = rope_scaling.get("mrope_interleaved", True) + else: + mrope_section = getattr(rope_scaling, "mrope_section", [24, 20, 20]) + mrope_interleaved = getattr(rope_scaling, "mrope_interleaved", True) + max_position_embeddings = getattr(text_config, "max_position_embeddings", 32768) + + self.model = Qwen3VLTextModel( + vocab_size=vocab_size, + hidden_size=hidden_size, + intermediate_size=getattr(text_config, "intermediate_size", 22016), + num_hidden_layers=getattr(text_config, "num_hidden_layers", 32), + num_attention_heads=getattr(text_config, "num_attention_heads", 32), + num_key_value_heads=getattr(text_config, "num_key_value_heads", 32), + head_dim=getattr(text_config, "head_dim", 128), + rope_theta=getattr(text_config, "rope_theta", 5_000_000.0), + rms_norm_eps=getattr(text_config, "rms_norm_eps", 1e-6), + mrope_section=tuple(mrope_section), + mrope_interleaved=bool(mrope_interleaved), + max_position_embeddings=max_position_embeddings, + ) + + # LM head — following sglang's pattern: always use lm_head.weight + # for matmul in forward(), so it works whether lm_head is nn.Embedding + # (tied) or nn.Linear (untied). + tie_word_embeddings = getattr(config, "tie_word_embeddings", False) + if tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False) + + # Token IDs for multimodal + self.image_token_id = getattr(config, "image_token_id", 151655) + self.video_token_id = getattr(config, "video_token_id", 151656) + self.vision_start_token_id = getattr(config, "vision_start_token_id", 151652) + + # Spatial merge size (needed for get_rope_index) + self.spatial_merge_size = ( + getattr(vision_config, "spatial_merge_size", 2) + if vision_config is not None + else 2 + ) + + # Deepstack config + if vision_config is not None: + ds_indexes = getattr(vision_config, "deepstack_visual_indexes", [8, 16, 24]) + self.num_deepstack_embeddings = len(ds_indexes) + else: + self.num_deepstack_embeddings = 0 + + self._hidden_size = hidden_size + + def get_input_embeddings(self) -> nn.Module: + return self.model.embed_tokens + + @torch.no_grad() + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: "ForwardBatch", + ) -> torch.Tensor: + """Run forward pass for Qwen3-VL. + + Args: + input_ids: Flattened input token IDs, shape ``[num_tokens]``. + positions: Position IDs, shape ``[num_tokens]`` (1-D, from model + runner). Overridden internally with 3-D M-RoPE positions. + forward_batch: :class:`ForwardBatch` with attention metadata. + + Returns: + Logits tensor of shape ``[num_tokens, vocab_size]``. + """ + pixel_values = getattr(forward_batch, "pixel_values", None) + image_grid_thw = getattr(forward_batch, "image_grid_thw", None) + + # ------------------------------------------------------------------ + # Build 3-D M-RoPE positions + # ------------------------------------------------------------------ + if forward_batch.forward_mode.is_extend(): + # Prefill: compute per-sequence 3-D position IDs from input_ids + # and image grids, then store per-request deltas for future decode. 
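+            # Per get_rope_index, each delta equals
+            # (max M-RoPE position used + 1) - sequence length; with images
+            # present it is typically <= 0 because an image block only advances
+            # positions by max(llm_grid_t, llm_grid_h, llm_grid_w) rather than
+            # by its full token count.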
+ mrope_positions_list: List[torch.Tensor] = [] + deltas: List[int] = [] + image_idx_offset = 0 + + for i in range(forward_batch.batch_size): + start = int(forward_batch.extend_start_loc[i].item()) + length = int(forward_batch.extend_seq_lens[i].item()) + seq_ids = input_ids[start : start + length] + + # Determine how many images belong to this sequence. + num_img = int((seq_ids == self.vision_start_token_id).sum().item()) + if image_grid_thw is not None and num_img > 0: + thw_seq = image_grid_thw[ + image_idx_offset : image_idx_offset + num_img + ] + image_idx_offset += num_img + else: + thw_seq = None + + pos3d, delta = get_rope_index( + seq_ids, + thw_seq, + self.image_token_id, + self.vision_start_token_id, + self.spatial_merge_size, + ) + mrope_positions_list.append(pos3d) + deltas.append(delta) + + # Concatenate across sequences: [3, total_extend_tokens] + positions = torch.cat(mrope_positions_list, dim=1) + forward_batch.mrope_position_deltas = torch.tensor( + deltas, dtype=torch.int64, device=input_ids.device + ) + else: + # Decode: each sequence emits exactly one token. Apply the stored + # per-request delta so the position matches the image extent. + stored_deltas = getattr(forward_batch, "mrope_position_deltas", None) + if stored_deltas is not None: + pos_1d = forward_batch.positions + stored_deltas + else: + pos_1d = forward_batch.positions + positions = pos_1d.unsqueeze(0).expand(3, -1) # [3, batch_size] + + input_embeds = None + input_deepstack_embeds = None + + if ( + pixel_values is not None + and image_grid_thw is not None + and self.visual is not None + and not forward_batch.forward_mode.is_decode() + ): + # Run vision encoder + vision_features = self.visual(pixel_values, grid_thw=image_grid_thw) + + # Separate main embeddings and deepstack embeddings + if self.num_deepstack_embeddings > 0: + vision_embeds = vision_features[:, : self._hidden_size] + deepstack_embeds = vision_features[:, self._hidden_size :] + else: + vision_embeds = vision_features + deepstack_embeds = None + + # Get text embeddings and replace image tokens with vision features + input_embeds = self.model.embed_tokens(input_ids) + image_mask = input_ids == self.image_token_id + if image_mask.any(): + input_embeds[image_mask] = vision_embeds.to(input_embeds.dtype) + + # Build per-token deepstack embeddings + if deepstack_embeds is not None and image_mask.any(): + input_deepstack_embeds = torch.zeros( + input_embeds.shape[0], + deepstack_embeds.shape[-1], + dtype=input_embeds.dtype, + device=input_embeds.device, + ) + input_deepstack_embeds[image_mask] = deepstack_embeds.to( + input_embeds.dtype + ) + + # Text decoder + hidden_states = self.model( + input_ids, + positions, + forward_batch, + input_embeds=input_embeds, + input_deepstack_embeds=input_deepstack_embeds, + ) + + # Prune hidden_states before lm_head to avoid a wasteful + # [total_tokens, vocab] matmul during prefill. Following sglang's + # LogitsProcessor._get_pruned_states(): in extend mode only keep + # the last token of each sequence; in decode mode all rows are + # already one-per-sequence. + if forward_batch.forward_mode.is_extend(): + if ( + forward_batch.extend_start_loc is not None + and forward_batch.extend_seq_lens is not None + ): + last_index = ( + forward_batch.extend_start_loc + forward_batch.extend_seq_lens - 1 + ).long() + hidden_states = hidden_states[last_index] + else: + hidden_states = hidden_states[-1:] + + # LM head: always use weight matrix directly for the linear + # projection. 
Works for both nn.Embedding (tied) and nn.Linear + # (untied). Matches sglang LogitsProcessor._compute_lm_head(). + logits = torch.matmul( + hidden_states.to(self.lm_head.weight.dtype), + self.lm_head.weight.T, + ) + + # Return LogitsProcessorOutput so that ModelRunner._process_logits + # skips redundant last-token gathering. + from pymllm.executor.model_runner import LogitsProcessorOutput + + return LogitsProcessorOutput(next_token_logits=logits) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None: + """Load weights from a HuggingFace checkpoint. + + Handles weight name remapping between HuggingFace Qwen3-VL + checkpoints and this model's parameter names. + """ + stacked_params_mapping = [ + # (param_name, weight_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".up_proj", 1), + (".gate_up_proj", ".gate_proj", 0), + ] + + params_dict = dict(self.named_parameters()) + + tie_word_embeddings = getattr(self.config, "tie_word_embeddings", False) + + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + # When weights are tied, lm_head.weight is the same tensor as + # embed_tokens.weight — skip the duplicate from the checkpoint. + if tie_word_embeddings and "lm_head.weight" in name: + continue + + name = _remap_weight_name(name) + + # Handle language model stacked parameters (QKV, gate_up) + handled = False + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name or "visual" in name: + continue + name = name.replace(weight_name, param_name) + if name not in params_dict: + continue + _load_stacked_weight(params_dict[name], loaded_weight, shard_id) + handled = True + break + + if handled: + continue + + # Handle vision encoder QKV stacking + if "visual" in name: + for qkv_key in (".attn.q.", ".attn.k.", ".attn.v."): + if qkv_key not in name: + continue + qkv_name = name.replace(qkv_key, ".attn.qkv_proj.") + if qkv_name in params_dict: + shard = {"q": 0, "k": 1, "v": 2}[qkv_key[-2]] + _load_vision_qkv_weight( + params_dict[qkv_name], loaded_weight, shard + ) + handled = True + break + + if handled: + continue + + # Direct parameter loading + if name in params_dict: + param = params_dict[name] + if param.data.shape == loaded_weight.shape: + param.data.copy_(loaded_weight) + else: + logger.warning( + "Shape mismatch: param %s (%s) vs loaded (%s), skipping.", + name, + param.data.shape, + loaded_weight.shape, + ) + + +# --------------------------------------------------------------------------- +# Weight loading helpers +# --------------------------------------------------------------------------- + + +def _remap_weight_name(name: str) -> str: + """Remap HuggingFace weight names to pymllm parameter names.""" + # transformers >= v4.52: model.language_model.* -> model.* + if name.startswith("model.language_model."): + name = name.replace("model.language_model.", "model.", 1) + # model.visual.* -> visual.* + elif name.startswith("model.visual."): + name = name.replace("model.visual.", "visual.", 1) + + # Vision attention QKV renaming (fused weights in checkpoint) + if "visual" in name: + name = name.replace("attn.qkv.", "attn.qkv_proj.") + + return name + + +def _load_stacked_weight( + param: nn.Parameter, + loaded_weight: torch.Tensor, + shard_id, +) -> None: + """Load one shard (q/k/v or gate/up) into a fused parameter. + + For QKV with GQA (grouped-query attention), Q has a different size + from K and V. 
The fused layout is ``[Q, K, V]`` where + ``Q_size = total - 2 * KV_size``. We must use cumulative offsets + rather than ``idx * shard_size`` to handle the asymmetry correctly. + """ + if isinstance(shard_id, str): + # QKV fused layout: [Q, K, V] + # Q may have a different size from K/V (GQA). + total_size = param.data.shape[0] + shard_size = loaded_weight.shape[0] + if shard_id == "q": + param.data[0:shard_size].copy_(loaded_weight) + elif shard_id == "k": + kv_size = shard_size + q_size = total_size - 2 * kv_size + param.data[q_size : q_size + kv_size].copy_(loaded_weight) + elif shard_id == "v": + kv_size = shard_size + q_size = total_size - 2 * kv_size + param.data[q_size + kv_size : q_size + 2 * kv_size].copy_( + loaded_weight + ) + else: + # gate_up: 0 -> gate, 1 -> up (same size, idx*size is correct) + shard_size = loaded_weight.shape[0] + param.data[shard_id * shard_size : (shard_id + 1) * shard_size].copy_( + loaded_weight + ) + + +def _load_vision_qkv_weight( + param: nn.Parameter, + loaded_weight: torch.Tensor, + shard_idx: int, +) -> None: + """Load a Q, K, or V weight shard into a fused QKV parameter.""" + shard_size = param.data.shape[0] // 3 + start = shard_idx * shard_size + param.data[start : start + shard_size].copy_(loaded_weight) diff --git a/pymllm/orchestrator/async_disk_io_process.py b/pymllm/orchestrator/async_disk_io_process.py deleted file mode 100644 index ef3fd5f00..000000000 --- a/pymllm/orchestrator/async_disk_io_process.py +++ /dev/null @@ -1,84 +0,0 @@ -""" -AsyncDiskIoProcess -- optional subprocess for asynchronous disk I/O. - -Handles weight loading, checkpoint saving, or other heavy disk operations -without blocking the scheduler or model runner. -""" - -import logging -from multiprocessing.connection import Connection -from typing import Any, Dict, Optional - -import zmq - -from pymllm.orchestrator.ipc_utils import create_zmq_socket - -logger = logging.getLogger(__name__) - - -class AsyncDiskIoProcess: - """Runs inside a subprocess. Performs disk I/O on behalf of the scheduler.""" - - def __init__(self, recv_addr: str): - self._recv_addr = recv_addr - - self._zmq_ctx: Optional[zmq.Context] = None - self._recv_sock: Optional[zmq.Socket] = None - - # ------------------------------------------------------------------ - # Lifecycle - # ------------------------------------------------------------------ - - def init_sockets(self) -> None: - self._zmq_ctx = zmq.Context() - self._recv_sock = create_zmq_socket( - self._zmq_ctx, zmq.PULL, self._recv_addr, bind=True, - ) - - def event_loop(self) -> None: - """Infinite loop: recv I/O request -> execute -> (optionally reply).""" - logger.info("AsyncDiskIoProcess event loop started") - while True: - io_request: Dict[str, Any] = self._recv_sock.recv_pyobj() - self._handle(io_request) - - # ------------------------------------------------------------------ - # I/O handling (placeholder) - # ------------------------------------------------------------------ - - def _handle(self, io_request: Dict[str, Any]) -> None: - """Dispatch an I/O request. - - TODO: implement weight loading, checkpoint save, etc. 
- """ - kind = io_request.get("kind") - logger.debug("AsyncDiskIoProcess received request kind=%s", kind) - - # ------------------------------------------------------------------ - # Cleanup - # ------------------------------------------------------------------ - - def shutdown(self) -> None: - if self._recv_sock is not None: - self._recv_sock.close() - if self._zmq_ctx is not None: - self._zmq_ctx.term() - - -def run_async_disk_io_process( - recv_addr: str, - pipe_writer: Connection, -) -> None: - """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = AsyncDiskIoProcess(recv_addr) - proc.init_sockets() - - pipe_writer.send({"status": "ready", "process": "async_disk_io"}) - pipe_writer.close() - - try: - proc.event_loop() - except KeyboardInterrupt: - pass - finally: - proc.shutdown() diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py index e9d5184b6..c2154e447 100644 --- a/pymllm/orchestrator/detokenizer_process.py +++ b/pymllm/orchestrator/detokenizer_process.py @@ -12,7 +12,7 @@ import zmq -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.ipc_utils import create_zmq_socket, setup_subprocess_logging logger = logging.getLogger(__name__) @@ -24,16 +24,19 @@ def __init__( self, recv_from_scheduler_addr: str, send_to_rr_addr: str, + tokenizer_cfg: Optional[Dict[str, Any]] = None, ): self._recv_from_scheduler_addr = recv_from_scheduler_addr self._send_to_rr_addr = send_to_rr_addr + self._tokenizer_cfg = tokenizer_cfg or {} self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_scheduler: Optional[zmq.Socket] = None self._send_to_rr: Optional[zmq.Socket] = None - # TODO: initialise the tokenizer (needed for decode) self._tokenizer = None + # Track previous decoded text per rid for incremental (delta) output + self._rid_to_prev_text: Dict[str, str] = {} # ------------------------------------------------------------------ # Lifecycle @@ -54,32 +57,102 @@ def init_sockets(self) -> None: bind=False, ) + def init_tokenizer(self) -> None: + """Load the tokenizer from the configured path.""" + tokenizer_path = self._tokenizer_cfg.get("tokenizer_path") + if tokenizer_path is None: + logger.warning( + "No tokenizer_path in tokenizer_cfg; detokenization disabled" + ) + return + + from transformers import AutoTokenizer + + trust_remote_code = self._tokenizer_cfg.get("trust_remote_code", False) + self._tokenizer = AutoTokenizer.from_pretrained( + tokenizer_path, + trust_remote_code=trust_remote_code, + ) + logger.info("Detokenizer loaded tokenizer from %s", tokenizer_path) + def event_loop(self) -> None: """Infinite loop: recv token IDs -> detokenize -> send text to RR.""" logger.info("DetokenizerProcess event loop started") while True: token_id_out = self._recv_from_scheduler.recv_pyobj() - str_out = self._detokenize(token_id_out) - self._send_to_rr.send_pyobj(str_out) + results = self._detokenize(token_id_out) + for result in results: + self._send_to_rr.send_pyobj(result) # ------------------------------------------------------------------ - # Detokenization (placeholder) + # Detokenization # ------------------------------------------------------------------ - def _detokenize(self, token_id_out: Dict[str, Any]) -> Dict[str, Any]: - """Convert token IDs to text. + def _detokenize(self, token_id_out: Dict[str, Any]) -> List[Dict[str, Any]]: + """Convert token IDs to text and fan out one result per rid. - TODO: replace with real tokenizer.decode() call and incremental - detokenization logic. 
+ The scheduler sends a batch dict with parallel lists keyed by + ``"rids"``, ``"output_ids"``, ``"finished_reasons"``, etc. + This method decodes each rid's output_ids and produces one result + dict per rid with keys ``"rid"`` (singular) and ``"finished"`` + (bool) as expected by ``RequestResponseProcess._recv_loop``. """ - output_ids: List[int] = token_id_out.get("output_token_ids", []) - # placeholder: join ids as string - text = "" # TODO: self._tokenizer.decode(output_ids) - return { - "rid": token_id_out.get("rid"), - "text": text, - "output_token_ids": output_ids, - } + rids: List[str] = token_id_out.get("rids", []) + output_ids: List[int] = token_id_out.get("output_ids", []) + finished_reasons: List[Optional[str]] = token_id_out.get("finished_reasons", []) + decode_ids: List[int] = token_id_out.get("decode_ids", []) + skip_special_tokens_list: List[bool] = token_id_out.get( + "skip_special_tokens", [] + ) + prompt_tokens_list: List[int] = token_id_out.get("prompt_tokens", []) + completion_tokens_list: List[int] = token_id_out.get("completion_tokens", []) + + results: List[Dict[str, Any]] = [] + + for i, rid in enumerate(rids): + finished_reason = finished_reasons[i] if i < len(finished_reasons) else None + is_finished = finished_reason is not None + skip_special = ( + skip_special_tokens_list[i] + if i < len(skip_special_tokens_list) + else True + ) + prompt_tokens = prompt_tokens_list[i] if i < len(prompt_tokens_list) else 0 + completion_tokens = ( + completion_tokens_list[i] if i < len(completion_tokens_list) else 0 + ) + + # Decode text from output_ids + if self._tokenizer is not None: + text = self._tokenizer.decode( + output_ids, + skip_special_tokens=skip_special, + ) + else: + text = "" + + # Compute incremental delta by diffing against previous text + prev_text = self._rid_to_prev_text.get(rid, "") + delta_text = text[len(prev_text):] + self._rid_to_prev_text[rid] = text + + # Clean up tracking when request finishes + if is_finished: + self._rid_to_prev_text.pop(rid, None) + + result: Dict[str, Any] = { + "rid": rid, + "text": text, + "delta": delta_text, + "output_token_ids": list(output_ids), + "finished": is_finished, + "finished_reason": finished_reason, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + } + results.append(result) + + return results # ------------------------------------------------------------------ # Cleanup @@ -98,10 +171,17 @@ def run_detokenizer_process( recv_from_scheduler_addr: str, send_to_rr_addr: str, pipe_writer: Connection, + tokenizer_cfg: Optional[Dict[str, Any]] = None, ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = DetokenizerProcess(recv_from_scheduler_addr, send_to_rr_addr) + setup_subprocess_logging((tokenizer_cfg or {}).get("log_level", "info")) + proc = DetokenizerProcess( + recv_from_scheduler_addr, + send_to_rr_addr, + tokenizer_cfg=tokenizer_cfg, + ) proc.init_sockets() + proc.init_tokenizer() pipe_writer.send({"status": "ready", "process": "detokenizer"}) pipe_writer.close() diff --git a/pymllm/orchestrator/ipc_utils.py b/pymllm/orchestrator/ipc_utils.py index faaf7a6d9..b464a3979 100644 --- a/pymllm/orchestrator/ipc_utils.py +++ b/pymllm/orchestrator/ipc_utils.py @@ -4,6 +4,7 @@ ZMQ sockets so that every process uses the same conventions. 
""" +import logging import os import tempfile from typing import Optional @@ -68,3 +69,24 @@ def close_zmq_socket(sock: zmq.Socket) -> None: sock.close() except zmq.ZMQError: pass + + +def setup_subprocess_logging(log_level: str = "info") -> None: + """Configure logging for a spawned subprocess. + + When Python spawns a subprocess (``mp.set_start_method('spawn')``), the + child starts with a blank logging configuration. Call this function at the + very beginning of every subprocess entry point so that log records are + emitted at the correct level. + + Parameters + ---------- + log_level + Case-insensitive level name, e.g. ``"debug"``, ``"info"``, ``"warning"``. + """ + level = getattr(logging, log_level.upper(), logging.INFO) + logging.basicConfig( + level=level, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + logging.getLogger("pymllm").setLevel(level) diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index b60966dd7..d850dd53e 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -1,143 +1,968 @@ """ -ModelRunnerProcess -- subprocess that executes model forward passes. +ModelRunnerProcess -- GPU-owning component that executes model forward passes. -Receives batches from the SchedulerProcess, runs the model forward + sampling, -and returns the results (logits, next_token_ids) back to the scheduler. +Instantiated **in-process** by :class:`SchedulerProcess` (sglang-style +architecture). The scheduler calls :meth:`_forward_batch` directly — +no inter-process communication is involved. + +This component owns the GPU: it holds a :class:`ModelRunner` with model +weights, KV-cache memory pools, and the attention backend. It also owns +the :class:`RadixCache` for prefix-aware KV reuse. + +RadixCache lifecycle +-------------------- +1. **match_prefix** — called during ``_allocate_extend`` before KV allocation. +2. **inc_lock_ref** — locks matched radix-tree nodes to prevent eviction. +3. **insert (prefill)** — inserts prompt KV indices after prefill. +4. **insert (completion)** — re-inserts the full sequence when a request finishes. +5. **dec_lock_ref** — unlocks radix-tree nodes when a request is freed. +6. **evict** — called when KV allocation fails to free stale cache entries. """ import logging -from multiprocessing.connection import Connection -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple -import zmq +import torch -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.mem_cache.radix_cache import RadixCache, RadixKey, TreeNode logger = logging.getLogger(__name__) +# Fraction of KV pool to try evicting when allocation fails. +_EVICT_FRACTION = 0.10 +# Maximum number of eviction retries before giving up. +_MAX_EVICT_RETRIES = 3 + class ModelRunnerProcess: - """Runs inside a subprocess. 
Owns the model and performs forward passes.""" + """GPU-owning component created in-process by SchedulerProcess.""" def __init__( self, - recv_from_scheduler_addr: str, - send_to_scheduler_addr: str, + gpu_id: int = 0, + server_config: Optional[Any] = None, + model_config: Optional[Any] = None, ): - self._recv_from_scheduler_addr = recv_from_scheduler_addr - self._send_to_scheduler_addr = send_to_scheduler_addr + self._gpu_id = gpu_id + self._server_config = server_config + self._model_config = model_config + + # The ModelRunner instance (created in init_model) + self._runner = None + self._is_hybrid: bool = False + + # RadixCache instance (created in init_model, after memory pools) + self._radix_cache: Optional[RadixCache] = None - self._zmq_ctx: Optional[zmq.Context] = None - self._recv_from_scheduler: Optional[zmq.Socket] = None - self._send_to_scheduler: Optional[zmq.Socket] = None + # GPU resource tracking: maps rid -> req_pool_idx (slot in ReqToTokenPool) + self._rid_to_req_pool_idx: Dict[str, int] = {} + # Maps rid -> kv_indices tensor (all KV-cache token indices for this request) + self._rid_to_kv_indices: Dict[str, torch.Tensor] = {} + # Maps rid -> input_ids used for prefill (needed for radix cache insert) + self._rid_to_input_ids: Dict[str, List[int]] = {} + # Maps rid -> list of generated (decode) token ids, appended each step. + # Used to build the full sequence for radix cache insert at completion. + self._rid_to_output_ids: Dict[str, List[int]] = {} + # Maps rid -> cache_protected_len: the length of the prefix that has + # already been inserted into the radix cache. When insert() returns + # prefix_len > cache_protected_len, the KV indices in the overlap + # range [cache_protected_len, prefix_len) are duplicates that must + # be freed from the allocator (the tree already holds cloned copies). + self._rid_to_cache_protected_len: Dict[str, int] = {} + # Maps rid -> (last_node, swa_boundary_id) for radix cache lock tracking + self._rid_to_radix_lock: Dict[str, Tuple[TreeNode, Optional[int]]] = {} + # Maps rid -> mrope_position_delta (M-RoPE positional offset per request) + # Populated during prefill; used to offset decode-step positions for + # multimodal models (Qwen3-VL) that consume more position indices than + # tokens due to 3-D image grid positions. + self._rid_to_mrope_delta: Dict[str, int] = {} - # TODO: initialise model, attention backend, memory pool, etc. - self._model = None + # GDN prefix cache state tracking (hybrid models only): + # Maps rid -> GDN track slot index in GDNPool (for snapshotting state) + self._rid_to_gdn_track_slot: Dict[str, int] = {} + # Maps radix tree node id -> GDN track slot index + self._node_id_to_gdn_track_slot: Dict[int, int] = {} # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ - def init_sockets(self) -> None: - self._zmq_ctx = zmq.Context() - self._recv_from_scheduler = create_zmq_socket( - self._zmq_ctx, - zmq.PULL, - self._recv_from_scheduler_addr, - bind=False, + def init_model(self) -> None: + """Create and initialise the ModelRunner and RadixCache. + + Must run inside the subprocess (after spawn) since it does CUDA init. 
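
        Note that the RadixCache is constructed only after
        ``ModelRunner.initialize()`` has set up the memory pools, because it
        is handed the runner's ``token_to_kv_pool_allocator`` so that evicted
        tree nodes can release their KV slots.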
+ """ + from pymllm.executor.model_runner import ModelRunner + + logger.info( + "ModelRunnerProcess: initialising ModelRunner on GPU %d", + self._gpu_id, ) - self._send_to_scheduler = create_zmq_socket( - self._zmq_ctx, - zmq.PUSH, - self._send_to_scheduler_addr, - bind=False, + self._runner = ModelRunner( + server_config=self._server_config, + model_config=self._model_config, + gpu_id=self._gpu_id, ) + self._runner.initialize() - def event_loop(self) -> None: - """Infinite loop: recv batch -> forward -> sample -> send result.""" - logger.info("ModelRunnerProcess event loop started") - while True: - batch = self._recv_from_scheduler.recv_pyobj() - result = self._forward_batch(batch) - self._send_to_scheduler.send_pyobj(result) + # Initialise RadixCache after memory pools are ready. + disable_cache = getattr(self._server_config, "disable_radix_cache", False) + self._is_hybrid = self._runner.num_gdn_layers > 0 + if self._is_hybrid and not disable_cache: + logger.info( + "ModelRunnerProcess: prefix caching ENABLED with GDN state " + "tracking (%d GDN layers)", + self._runner.num_gdn_layers, + ) + sliding_window = self._runner.sliding_window_size + page_size = getattr(self._server_config, "radix_cache_page_size", 1) + # For hybrid models, register an eviction callback so that evicted + # radix nodes free their associated GDN track slots. + evict_cb = self._on_radix_node_evict if self._is_hybrid else None + self._radix_cache = RadixCache( + page_size=page_size, + sliding_window_size=sliding_window, + disable=disable_cache, + token_to_kv_pool_allocator=self._runner.token_to_kv_pool_allocator, + on_node_evict=evict_cb, + ) + logger.info( + "ModelRunnerProcess: RadixCache initialized " + "(disable=%s, sliding_window=%s)", + disable_cache, + sliding_window, + ) + logger.info("ModelRunnerProcess: ModelRunner ready") # ------------------------------------------------------------------ - # Forward pass (placeholder) + # Forward pass # ------------------------------------------------------------------ def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: """Run the model forward pass and sampling for *batch*. - *batch* is a dict produced by ``SchedulerProcess.get_next_batch_to_run`` - whose ``"requests"`` list contains - :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput` objects. - - Returns a dict ``{"batch_id": ..., "finished": [...], "unfinished": [...]}`` - where each element of *finished* / *unfinished* is a plain output dict - containing at least ``"rid"`` and ``"output_token_ids"``. + *batch* is a dict produced by ``ScheduleBatch.to_batch_dict()`` + containing ``"forward_mode"``, ``"input_ids"``, ``"seq_lens"``, + ``"req_pool_indices"``, ``"requests"`` (metadata list), etc. - TODO: implement real forward pass, logits processing, and sampling. + Implements 6 phases: + 1. Cleanup: free GPU resources for rids no longer in the batch + 2. Prefix matching + KV allocation + 3. Build GPU tensors + 4. Forward + sample + 5. Radix cache insert (extend only) + 6. 
Build result dict """ - requests = batch.get("requests", []) - finished: List[Dict[str, Any]] = [] - unfinished: List[Dict[str, Any]] = [] + runner = self._runner + forward_mode = batch.get("forward_mode", "decode") + batch_size = batch.get("batch_size", 0) + requests_meta: List[Dict[str, Any]] = batch.get("requests", []) + + if batch_size == 0: + return {"batch_id": batch.get("batch_id"), "outputs": []} + + device = runner.device + + # Collect current batch rids + current_rids: Set[str] = {m["rid"] for m in requests_meta} + + # ============================================================== + # Phase 2: Prefix matching + KV allocation + # ============================================================== + # For extend batches, match_prefix is done inside _allocate_extend + # which may update extend_prefix_lens and extend_seq_lens. + if forward_mode == "extend": + out_cache_loc, actual_prefix_lens, actual_extend_lens = ( + self._allocate_extend(batch, requests_meta) + ) + else: + out_cache_loc = self._allocate_decode(batch, requests_meta) + actual_prefix_lens = None + actual_extend_lens = None + + # ============================================================== + # Phase 3: Build GPU tensors + # ============================================================== + if forward_mode == "extend" and actual_prefix_lens is not None: + # Rebuild input_ids and seq_lens using actual prefix matches. + # The scheduler sent tokens assuming prefix_len=0; we need to + # trim the input_ids to skip the prefix-matched tokens. + ( + input_ids_tensor, + seq_lens_tensor, + extend_seq_lens_t, + extend_prefix_lens_t, + ) = self._rebuild_extend_tensors( + batch, requests_meta, actual_prefix_lens, actual_extend_lens, device + ) + else: + input_ids_list: List[int] = batch["input_ids"] + seq_lens_list: List[int] = batch["seq_lens"] + input_ids_tensor = torch.tensor( + input_ids_list, dtype=torch.int32, device=device + ) + seq_lens_tensor = torch.tensor( + seq_lens_list, dtype=torch.int32, device=device + ) + extend_seq_lens_t = None + extend_prefix_lens_t = None + + # Build req_pool_indices from our own tracking (NOT from scheduler) + req_pool_indices = torch.tensor( + [self._rid_to_req_pool_idx[m["rid"]] for m in requests_meta], + dtype=torch.int64, + device=device, + ) + + out_cache_loc = out_cache_loc.to(torch.int64) - for req in requests: - # Support both TokenizedGenerateReqInput dataclass (normal path) and - # legacy plain dicts (defensive). 
- rid: str = req.rid if hasattr(req, "rid") else req.get("rid") - input_ids: List[int] = ( - req.input_ids if hasattr(req, "input_ids") else req.get("input_ids", []) + # ============================================================== + # Phase 4: Forward + sample + # ============================================================== + # Extract per-request sampling params + temperatures = [] + top_ps = [] + top_ks = [] + for m in requests_meta: + sp = m.get("sampling_params") or {} + temperatures.append(sp.get("temperature", 1.0)) + top_ps.append(sp.get("top_p", 1.0)) + top_ks.append(sp.get("top_k", -1)) + + temps_tensor = torch.tensor(temperatures, dtype=torch.float32, device=device) + top_ps_tensor = torch.tensor(top_ps, dtype=torch.float32, device=device) + top_ks_tensor = torch.tensor(top_ks, dtype=torch.int32, device=device) + + if forward_mode == "extend": + if extend_seq_lens_t is None: + extend_seq_lens_list: List[int] = batch["extend_seq_lens"] + extend_prefix_lens_list: List[int] = batch["extend_prefix_lens"] + extend_seq_lens_t = torch.tensor( + extend_seq_lens_list, dtype=torch.int32, device=device + ) + extend_prefix_lens_t = torch.tensor( + extend_prefix_lens_list, dtype=torch.int32, device=device + ) + + fb = runner.prepare_forward_batch_extend( + input_ids=input_ids_tensor, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens_tensor, + extend_seq_lens=extend_seq_lens_t, + extend_prefix_lens=extend_prefix_lens_t, + out_cache_loc=out_cache_loc, ) - mm_inputs: Optional[Dict[str, Any]] = ( - req.mm_inputs if hasattr(req, "mm_inputs") else req.get("mm_inputs") + + # Attach multimodal vision inputs to ForwardBatch so the + # model's vision encoder can process images during prefill. + # The tokenizer wraps processor output under "image_inputs"; + # fall back to top-level keys for direct dicts. + pixel_values_list = [] + image_grid_thw_list = [] + for m in requests_meta: + mm = m.get("mm_inputs") + if mm is None: + continue + # AutoProcessor output is nested under "image_inputs" + src = mm.get("image_inputs") if "image_inputs" in mm else mm + if src is None: + continue + pv = src.get("pixel_values") if hasattr(src, "get") else getattr(src, "pixel_values", None) + thw = src.get("image_grid_thw") if hasattr(src, "get") else getattr(src, "image_grid_thw", None) + if pv is not None: + if not isinstance(pv, torch.Tensor): + pv = torch.as_tensor(pv) + pixel_values_list.append(pv.to(device=device)) + if thw is not None: + if not isinstance(thw, torch.Tensor): + thw = torch.as_tensor(thw) + image_grid_thw_list.append(thw.to(device=device)) + if pixel_values_list: + fb.pixel_values = torch.cat(pixel_values_list, dim=0) + if image_grid_thw_list: + fb.image_grid_thw = torch.cat(image_grid_thw_list, dim=0) + else: + # Build mrope_position_deltas tensor for decode batches. + mrope_deltas = [ + self._rid_to_mrope_delta.get(m["rid"], 0) for m in requests_meta + ] + mrope_deltas_tensor = torch.tensor( + mrope_deltas, dtype=torch.int64, device=device ) - # TODO: actual model forward; pass input_ids and mm_inputs to the model. - next_token_ids: List[int] = [] # placeholder + fb = runner.prepare_forward_batch_decode( + input_ids=input_ids_tensor, + req_pool_indices=req_pool_indices, + seq_lens=seq_lens_tensor, + out_cache_loc=out_cache_loc, + mrope_position_deltas=mrope_deltas_tensor, + ) - output: Dict[str, Any] = { + logits_output = runner.forward(fb) + + # Persist M-RoPE position deltas for multimodal models (Qwen3-VL). 
+ # The model sets mrope_position_deltas on the ForwardBatch during + # prefill; we store them here so decode steps can retrieve them. + if ( + forward_mode == "extend" + and getattr(fb, "mrope_position_deltas", None) is not None + ): + deltas_cpu = fb.mrope_position_deltas.cpu().tolist() + for idx, m in enumerate(requests_meta): + self._rid_to_mrope_delta[m["rid"]] = int(deltas_cpu[idx]) + + next_token_ids = runner.sample( + logits_output, + fb, + temperatures=temps_tensor, + top_ps=top_ps_tensor, + top_ks=top_ks_tensor, + ) + + # ============================================================== + # Phase 4.5: Snapshot GDN state after extend (hybrid models) + # ============================================================== + if forward_mode == "extend" and self._is_hybrid: + self._track_gdn_state_after_extend(requests_meta) + + # ============================================================== + # Phase 5: Radix cache insert (extend only) + # ============================================================== + if forward_mode == "extend" and self._radix_cache is not None: + self._insert_into_radix_cache(requests_meta) + + # ============================================================== + # Phase 6: Build result & track output tokens + # ============================================================== + next_ids_cpu = next_token_ids.cpu().tolist() + outputs: List[Dict[str, Any]] = [] + for i, m in enumerate(requests_meta): + rid = m["rid"] + token_id = next_ids_cpu[i] if i < len(next_ids_cpu) else 0 + # Track output tokens for radix cache insert at completion + out_ids = self._rid_to_output_ids.get(rid) + if out_ids is not None: + out_ids.append(token_id) + + out: Dict[str, Any] = { "rid": rid, - "output_token_ids": next_token_ids, - "finished": True, + "output_token_ids": [token_id], } - # TODO: check EOS / max_tokens to decide finished vs. unfinished. - finished.append(output) + # Report actual prefix_len back to the scheduler so it can + # update its token budget tracking accurately. + if actual_prefix_lens is not None: + out["prefix_len"] = actual_prefix_lens[i] + outputs.append(out) return { "batch_id": batch.get("batch_id"), - "finished": finished, - "unfinished": unfinished, + "outputs": outputs, } + # ------------------------------------------------------------------ + # Tensor rebuild for prefix-matched extend + # ------------------------------------------------------------------ + + def _rebuild_extend_tensors( + self, + batch: Dict[str, Any], + requests_meta: List[Dict[str, Any]], + actual_prefix_lens: List[int], + actual_extend_lens: List[int], + device: str, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Rebuild input_ids and related tensors after prefix matching. + + The scheduler sent input_ids assuming no prefix cache hit. After + radix cache matching, we know the actual prefix lengths and must + trim the input_ids accordingly. + + Returns (input_ids, seq_lens, extend_seq_lens, extend_prefix_lens) + as GPU tensors. + """ + # Reconstruct trimmed input_ids: for each request, take only the + # tokens beyond the matched prefix. 
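        # Hypothetical example: full_input_ids = [11, 12, 13, 14, 15] with a
        # matched prefix_len of 3 contributes only [14, 15] here, so this
        # request's extend length is 2 while seq_lens still reports 5.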
+ new_input_ids: List[int] = [] + seq_lens_list: List[int] = batch["seq_lens"] + + for i, m in enumerate(requests_meta): + full_input_ids = m.get("input_ids", []) + prefix_len = actual_prefix_lens[i] + # Only send tokens after the prefix + new_input_ids.extend(full_input_ids[prefix_len:]) + + input_ids = torch.tensor(new_input_ids, dtype=torch.int32, device=device) + seq_lens = torch.tensor(seq_lens_list, dtype=torch.int32, device=device) + extend_seq_lens = torch.tensor( + actual_extend_lens, dtype=torch.int32, device=device + ) + extend_prefix_lens = torch.tensor( + actual_prefix_lens, dtype=torch.int32, device=device + ) + return input_ids, seq_lens, extend_seq_lens, extend_prefix_lens + + # ------------------------------------------------------------------ + # Radix cache insert + # ------------------------------------------------------------------ + + def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: + """Insert prefill KV indices into the radix cache for future reuse. + + Mirrors sglang's ``cache_unfinished_req`` pattern: + + 1. **Insert** the request's token → KV index mapping into the tree. + 2. **Free duplicates** — indices in ``[cache_protected_len, new_prefix_len)`` + are now owned by the tree; the request's copies are redundant. + 3. **Re-match + write-back** — fetch the tree's *own* indices via + ``match_prefix`` and write them into ``req_to_token_pool``, + replacing the just-freed entries. Without this step the pool + still points at freed slots → use-after-free during decode. + 4. **Update** ``cache_protected_len`` and radix lock. + """ + cache = self._radix_cache + if cache is None or cache.disable: + return + + runner = self._runner + gdn_pool = getattr(runner, "gdn_pool", None) + + for m in requests_meta: + rid = m["rid"] + input_ids = self._rid_to_input_ids.get(rid) + if input_ids is None: + continue + + slot = self._rid_to_req_pool_idx.get(rid) + if slot is None: + continue + + seq_len = len(input_ids) + kv_indices = runner.req_to_token_pool.req_to_token[slot, :seq_len].to( + torch.int64 + ) + + key = RadixKey(input_ids) + result = cache.insert(key, kv_indices) + new_prefix_len = result.prefix_len + + # --- Step 2: free duplicates --- + cache_protected_len = self._rid_to_cache_protected_len.get(rid, 0) + if new_prefix_len > cache_protected_len: + dup_indices = kv_indices[cache_protected_len:new_prefix_len] + if dup_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(dup_indices) + + # --- Step 3: re-match + write-back --- + # The tree now owns indices for [0, new_prefix_len). Fetch them + # and patch req_to_token_pool so the request reads the tree's + # (still-live) indices instead of the freed ones. + rematch = cache.match_prefix(key) + new_indices = rematch.indices + if len(new_indices) > cache_protected_len: + runner.req_to_token_pool.write( + (slot, slice(cache_protected_len, len(new_indices))), + new_indices[cache_protected_len:].to(torch.int32), + ) + + # --- Step 4: update tracking --- + self._rid_to_cache_protected_len[rid] = len(new_indices) + + # Update radix lock to cover the new (potentially deeper) node. 
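            # The previously held lock (if any) is released and a fresh lock
            # is taken on the node returned by the re-match, so the request's
            # now-longer cached prefix stays protected from eviction until the
            # request is released in _free_rid_resources.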
+ old_lock = self._rid_to_radix_lock.pop(rid, None) + if old_lock is not None: + old_node, old_swa = old_lock + cache.dec_lock_ref(old_node, old_swa) + new_last_node = rematch.last_node + if new_last_node is not None and len(new_indices) > 0: + swa_id = cache.inc_lock_ref(new_last_node) + self._rid_to_radix_lock[rid] = (new_last_node, swa_id) + + # --- GDN track slot association (hybrid models) --- + if gdn_pool is not None and result.last_node is not None: + track_slot = self._rid_to_gdn_track_slot.get(rid) + if track_slot is not None: + node_id = result.last_node.id + old_ts = self._node_id_to_gdn_track_slot.get(node_id) + if old_ts is None: + self._node_id_to_gdn_track_slot[node_id] = track_slot + else: + gdn_pool.free_track_slot(track_slot) + self._rid_to_gdn_track_slot.pop(rid, None) + + # ------------------------------------------------------------------ + # KV allocation helpers + # ------------------------------------------------------------------ + + def _allocate_extend( + self, batch: Dict[str, Any], requests_meta: List[Dict[str, Any]] + ) -> Tuple[torch.Tensor, List[int], List[int]]: + """Allocate req pool slots and KV tokens for an extend (prefill) batch. + + Performs radix cache prefix matching before allocation: + 1. For each request, call ``match_prefix`` to find cached KV indices. + 2. Write cached indices into ``ReqToTokenPool``. + 3. Only allocate new KV tokens for the non-cached suffix. + 4. Lock matched radix nodes to prevent eviction. + + Returns ``(out_cache_loc, actual_prefix_lens, actual_extend_lens)``. + ``out_cache_loc`` has shape ``[total_new_tokens]``. + """ + runner = self._runner + cache = self._radix_cache + batch_size = batch["batch_size"] + seq_lens: List[int] = batch["seq_lens"] + + # --- Step 1: Radix cache prefix matching --- + actual_prefix_lens: List[int] = [] + actual_extend_lens: List[int] = [] + matched_nodes: List[Optional[TreeNode]] = [] + # Cache the match results so we don't call match_prefix twice + cached_indices_list: List[Optional[torch.Tensor]] = [] + gdn_pool = getattr(runner, "gdn_pool", None) + + for i, m in enumerate(requests_meta): + full_input_ids: List[int] = m.get("input_ids", []) + full_seq_len = seq_lens[i] + + # Store input_ids for later radix cache insert + self._rid_to_input_ids[m["rid"]] = full_input_ids + + if cache is not None and not cache.disable and len(full_input_ids) > 0: + key = RadixKey(full_input_ids) + match_result = cache.match_prefix(key) + prefix_len = match_result.prefix_len + last_node = match_result.last_node + cached_indices = match_result.indices + else: + prefix_len = 0 + last_node = None + cached_indices = None + + # Hybrid model guard: only use a KV cache hit if the matched + # node has a GDN state snapshot. Without it, the full-attention + # layers would use cached KV while GDN layers start from zero, + # causing an attention/GDN state mismatch. Discard the hit so + # the entire prompt is processed from scratch. + if ( + gdn_pool is not None + and prefix_len > 0 + and last_node is not None + and self._node_id_to_gdn_track_slot.get(last_node.id) is None + ): + logger.debug( + "Discarding radix cache hit for rid=%s: no GDN state " + "for matched node (prefix_len=%d)", + m["rid"], prefix_len, + ) + prefix_len = 0 + last_node = None + cached_indices = None + + # Ensure at least 1 token is extended (not fully cached). + # A full cache hit (prefix_len == full_seq_len) would produce a + # 0-length input tensor that crashes CUDA kernels. Back off by 1 + # so the model always sees the last token. 
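            # Hypothetical example: an 8-token prompt that is fully cached
            # (prefix_len == 8) is reduced to prefix_len == 7 with a 1-token
            # extend, so the forward pass still computes logits for the last
            # position.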
+ if prefix_len >= full_seq_len: + prefix_len = full_seq_len - 1 + if cached_indices is not None: + cached_indices = cached_indices[:prefix_len] + + extend_len = full_seq_len - prefix_len + actual_prefix_lens.append(prefix_len) + actual_extend_lens.append(extend_len) + matched_nodes.append(last_node) + cached_indices_list.append(cached_indices) + + if prefix_len > 0: + logger.info( + "Radix cache hit for rid=%s: %d/%d tokens reused (%.1f%%)", + m["rid"], + prefix_len, + full_seq_len, + 100.0 * prefix_len / full_seq_len, + ) + + total_new_tokens = sum(actual_extend_lens) + + # --- Step 2: Allocate req pool slots --- + slots = runner.req_to_token_pool.alloc(batch_size) + if slots is None: + raise RuntimeError("Failed to allocate req pool slots for extend batch") + + # --- Step 3: Allocate KV tokens (with eviction retry) --- + out_cache_loc = self._alloc_kv_with_eviction(total_new_tokens) + if out_cache_loc is None: + for s in slots: + runner.req_to_token_pool.free(s) + raise RuntimeError( + f"Failed to allocate {total_new_tokens} KV tokens for extend batch " + f"(even after eviction)" + ) + + # --- Step 4: Write indices into req_to_token_pool --- + offset = 0 + for i, m in enumerate(requests_meta): + rid = m["rid"] + slot = slots[i] + prefix_len = actual_prefix_lens[i] + extend_len = actual_extend_lens[i] + full_seq_len = seq_lens[i] + + # Write cached prefix indices (from the match result we saved) + cached_indices = cached_indices_list[i] + if cached_indices is not None and prefix_len > 0: + runner.req_to_token_pool.write( + (slot, slice(0, prefix_len)), + cached_indices[:prefix_len].to(torch.int32), + ) + + # Write new KV indices for the suffix + kv_indices = out_cache_loc[offset : offset + extend_len] + runner.req_to_token_pool.write( + (slot, slice(prefix_len, full_seq_len)), kv_indices + ) + + self._rid_to_req_pool_idx[rid] = slot + self._rid_to_kv_indices[rid] = kv_indices.clone() + self._rid_to_output_ids[rid] = [] + # The prefix portion is already protected in the radix cache + # (from a previous request's insert). We start with this as + # cache_protected_len so that subsequent insert() calls know + # which range is already covered. + self._rid_to_cache_protected_len[rid] = actual_prefix_lens[i] + offset += extend_len + + # GDN state management: restore from track slot on cache hit, or reset + if gdn_pool is not None: + for i, m in enumerate(requests_meta): + rid = m["rid"] + working_slot = slots[i] + prefix_len = actual_prefix_lens[i] + node = matched_nodes[i] + + if prefix_len > 0 and node is not None: + # Cache hit — try to restore GDN state from the track slot + # associated with the matched radix node. + track_slot = self._node_id_to_gdn_track_slot.get(node.id) + if track_slot is not None: + gdn_pool.copy_states(track_slot, working_slot) + logger.debug( + "GDN state restored for rid=%s from track_slot=%d " + "(prefix_len=%d)", + rid, track_slot, prefix_len, + ) + else: + # Cache hit but no GDN snapshot — reset to zero. + # This can happen if the track slot was evicted. 
+ idx = torch.tensor( + [working_slot], dtype=torch.int64, device=runner.device + ) + gdn_pool.reset_states(idx) + logger.debug( + "GDN state reset for rid=%s (cache hit but no " + "track slot, prefix_len=%d)", + rid, prefix_len, + ) + else: + # No cache hit — fresh request, zero-init + idx = torch.tensor( + [working_slot], dtype=torch.int64, device=runner.device + ) + gdn_pool.reset_states(idx) + + # Allocate a track slot only when the radix cache is enabled; + # track slots are freed via the eviction callback so they must + # be associated with a node, which only happens when cache is on. + if cache is not None and not cache.disable: + ts = gdn_pool.alloc_track_slot() + if ts is not None: + self._rid_to_gdn_track_slot[rid] = ts + + # --- Step 5: Lock matched radix nodes --- + if cache is not None and not cache.disable: + for i, m in enumerate(requests_meta): + node = matched_nodes[i] + if node is not None and actual_prefix_lens[i] > 0: + swa_boundary_id = cache.inc_lock_ref(node) + self._rid_to_radix_lock[m["rid"]] = (node, swa_boundary_id) + + return out_cache_loc, actual_prefix_lens, actual_extend_lens + + def _alloc_kv_with_eviction(self, num_tokens: int) -> Optional[torch.Tensor]: + """Try to allocate KV tokens, evicting from radix cache if needed.""" + runner = self._runner + cache = self._radix_cache + + if num_tokens == 0: + return torch.empty(0, dtype=torch.int32, device=runner.device) + + # First attempt: direct allocation + result = runner.token_to_kv_pool_allocator.alloc(num_tokens) + if result is not None: + return result + + # Eviction loop: try evicting from radix cache to free space + if cache is None or cache.disable: + return None + + for attempt in range(_MAX_EVICT_RETRIES): + evictable = cache.evictable_size() + if evictable == 0: + logger.warning( + "KV allocation failed: need %d tokens, no evictable cache entries", + num_tokens, + ) + return None + + # Evict a fraction of the cache (at least what we need) + evict_target = max( + num_tokens, + int(runner.token_to_kv_pool_allocator.size * _EVICT_FRACTION), + ) + evict_result = cache.evict(evict_target) + logger.info( + "Radix cache eviction attempt %d: evicted %d tokens (target=%d)", + attempt + 1, + evict_result.full_evicted, + evict_target, + ) + + # Retry allocation + result = runner.token_to_kv_pool_allocator.alloc(num_tokens) + if result is not None: + return result + + return None + + def _allocate_decode( + self, batch: Dict[str, Any], requests_meta: List[Dict[str, Any]] + ) -> torch.Tensor: + """Allocate 1 KV token per request for a decode step. + + Returns ``out_cache_loc`` tensor of shape ``[batch_size]``. + """ + runner = self._runner + batch_size = batch["batch_size"] + seq_lens: List[int] = batch["seq_lens"] + + # Allocate 1 new KV token per request (with eviction retry) + out_cache_loc = self._alloc_kv_with_eviction(batch_size) + if out_cache_loc is None: + raise RuntimeError( + f"Failed to allocate {batch_size} KV tokens for decode batch" + ) + + # Write the new KV token index into each request's mapping + for i, m in enumerate(requests_meta): + rid = m["rid"] + slot = self._rid_to_req_pool_idx.get(rid) + if slot is None: + logger.warning("Decode step for unknown rid=%s, skipping KV write", rid) + continue + + cur_seq_len = seq_lens[i] + kv_new = out_cache_loc[i : i + 1] + # The scheduler increments req.seq_len by 1 after every step, so + # seq_lens[i] == (number of tokens in the KV cache INCLUDING the + # token being decoded now). 
The new token's slot must therefore be + # written at index seq_lens[i] - 1, matching the position used by + # prepare_forward_batch_decode (positions = seq_lens - 1) and the + # window FlashInfer reads (req_to_token_pool[slot, 0:seq_lens[i]]). + write_pos = cur_seq_len - 1 + runner.req_to_token_pool.write( + (slot, slice(write_pos, write_pos + 1)), kv_new + ) + + # Append to tracked kv_indices + prev = self._rid_to_kv_indices.get(rid) + if prev is not None: + self._rid_to_kv_indices[rid] = torch.cat([prev, kv_new]) + else: + self._rid_to_kv_indices[rid] = kv_new.clone() + + return out_cache_loc + + # ------------------------------------------------------------------ + # Resource cleanup + # ------------------------------------------------------------------ + + def _free_rid_resources(self, rid: str) -> None: + """Free GPU resources (req pool slot + KV indices) for a finished rid. + + KV index ownership model (when radix cache is enabled): + + ``req_to_token_pool[slot]`` contains three regions after + ``insert()`` returns ``new_prefix_len``:: + + [0, cache_protected_len) + Indices shared with the radix tree from a previous insert. + **Do not free** — the tree already owns them. + + [cache_protected_len, new_prefix_len) + Indices allocated by THIS request that turned out to overlap + with tree nodes inserted concurrently. The tree already + holds cloned copies → these are duplicates → **free them**. + + [new_prefix_len, total_len) + Indices that ``insert()`` just added to the tree (cloned). + The tree now owns the underlying KV pool slots. + **Do not free** — the tree will free during eviction. + + When the radix cache is disabled, all KV indices are freed directly. + """ + runner = self._runner + cache = self._radix_cache + + slot = self._rid_to_req_pool_idx.pop(rid, None) + kv_indices = self._rid_to_kv_indices.pop(rid, None) + input_ids = self._rid_to_input_ids.pop(rid, None) + output_ids = self._rid_to_output_ids.pop(rid, None) + cache_protected_len = self._rid_to_cache_protected_len.pop(rid, 0) + radix_lock = self._rid_to_radix_lock.pop(rid, None) + self._rid_to_mrope_delta.pop(rid, None) + + # Free GDN track slot (if any) — the slot's association with a + # radix node is managed separately via _node_id_to_gdn_track_slot + # and the eviction callback; here we just remove the rid mapping. + self._rid_to_gdn_track_slot.pop(rid, None) + + cache_enabled = cache is not None and not cache.disable + + # ---------------------------------------------------------- + # Phase 1: Read all KV indices BEFORE freeing anything. + # ---------------------------------------------------------- + prompt_len = len(input_ids) if input_ids is not None else 0 + decode_len = len(output_ids) if output_ids else 0 + total_len = prompt_len + decode_len + + all_kv_indices: Optional[torch.Tensor] = None + if slot is not None and input_ids is not None: + all_kv_indices = runner.req_to_token_pool.req_to_token[slot, :total_len].to( + torch.int64 + ) + + # ---------------------------------------------------------- + # Phase 2: Insert into radix cache (if enabled). + # ---------------------------------------------------------- + did_insert = False + if cache_enabled and all_kv_indices is not None: + if self._is_hybrid and decode_len > 0: + # Hybrid model: insert only prompt tokens (not decode) + # because GDN state is only tracked at the prompt boundary. 
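                # Hypothetical example: a 100-token prompt followed by 20
                # generated tokens inserts only all_kv_indices[:100] into the
                # tree; the 20 decode-token KV slots are freed below because
                # no GDN snapshot covers the extended sequence.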
+ prompt_kv = all_kv_indices[:prompt_len] + decode_kv = all_kv_indices[prompt_len:] + key = RadixKey(list(input_ids)) + result = cache.insert(key, prompt_kv) + new_prefix_len = result.prefix_len + + # Free duplicate KV indices in the overlap region. + if new_prefix_len > cache_protected_len: + dup_indices = prompt_kv[cache_protected_len:new_prefix_len] + if dup_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(dup_indices) + + # Free decode KV indices (tree does not own them) + if decode_kv.numel() > 0: + runner.token_to_kv_pool_allocator.free(decode_kv) + else: + # Non-hybrid or no decode tokens: insert full sequence + full_token_ids = list(input_ids) + if output_ids: + full_token_ids.extend(output_ids) + key = RadixKey(full_token_ids) + result = cache.insert(key, all_kv_indices) + new_prefix_len = result.prefix_len + + # Free duplicate KV indices in the overlap region. + if new_prefix_len > cache_protected_len: + dup_indices = all_kv_indices[cache_protected_len:new_prefix_len] + if dup_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(dup_indices) + + did_insert = True + + # ---------------------------------------------------------- + # Phase 3: Unlock radix cache nodes. + # ---------------------------------------------------------- + if cache_enabled and radix_lock is not None: + node, swa_boundary_id = radix_lock + cache.dec_lock_ref(node, swa_boundary_id) + + # ---------------------------------------------------------- + # Phase 4: Free KV indices not owned by the radix cache. + # ---------------------------------------------------------- + if not did_insert: + if cache_enabled and all_kv_indices is not None: + # Cache enabled but insert skipped (shouldn't happen in + # normal flow). Tree owns [0, cache_protected_len); + # free the rest. + tail = all_kv_indices[cache_protected_len:] + if tail.numel() > 0: + runner.token_to_kv_pool_allocator.free(tail) + elif not cache_enabled: + # Cache disabled — free all newly-allocated KV indices. + if all_kv_indices is not None and all_kv_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(all_kv_indices) + elif kv_indices is not None and kv_indices.numel() > 0: + runner.token_to_kv_pool_allocator.free(kv_indices) + + # ---------------------------------------------------------- + # Phase 5: Free the req pool slot. + # ---------------------------------------------------------- + if slot is not None: + runner.req_to_token_pool.free(slot) + + logger.debug( + "Freed resources for rid=%s (slot=%s, kv_tokens=%d)", + rid, + slot, + kv_indices.numel() if kv_indices is not None else 0, + ) + + # ------------------------------------------------------------------ + # GDN state tracking helpers (hybrid models) + # ------------------------------------------------------------------ + + def _track_gdn_state_after_extend( + self, requests_meta: List[Dict[str, Any]] + ) -> None: + """Snapshot working GDN state into each request's track slot. + + Called immediately after ``runner.forward()`` for extend batches so + that the FINAL recurrent/conv state (after processing the full prompt) + is saved. The track slot is later associated with a radix node in + ``_insert_into_radix_cache``. 
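
        A later request whose prompt hits this prefix in the radix cache can
        then copy the snapshot from the track slot into its own working slot
        (see ``_allocate_extend``) instead of replaying the whole prompt
        through the GDN layers.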
+ """ + gdn_pool = getattr(self._runner, "gdn_pool", None) + if gdn_pool is None: + return + + for m in requests_meta: + rid = m["rid"] + working_slot = self._rid_to_req_pool_idx.get(rid) + track_slot = self._rid_to_gdn_track_slot.get(rid) + if working_slot is not None and track_slot is not None: + gdn_pool.copy_states(working_slot, track_slot) + + def _on_radix_node_evict(self, node_id: int) -> None: + """Callback invoked by RadixCache when a node is evicted. + + Frees the GDN track slot associated with the evicted node. + """ + track_slot = self._node_id_to_gdn_track_slot.pop(node_id, None) + if track_slot is not None: + gdn_pool = getattr(self._runner, "gdn_pool", None) + if gdn_pool is not None: + gdn_pool.free_track_slot(track_slot) + logger.debug( + "Freed GDN track slot %d for evicted node %d", + track_slot, node_id, + ) + # ------------------------------------------------------------------ # Cleanup # ------------------------------------------------------------------ def shutdown(self) -> None: - if self._recv_from_scheduler is not None: - self._recv_from_scheduler.close() - if self._send_to_scheduler is not None: - self._send_to_scheduler.close() - if self._zmq_ctx is not None: - self._zmq_ctx.term() - - -def run_model_runner_process( - recv_from_scheduler_addr: str, - send_to_scheduler_addr: str, - pipe_writer: Connection, -) -> None: - """Entry point for ``torch.multiprocessing.Process(target=...)``.""" - proc = ModelRunnerProcess(recv_from_scheduler_addr, send_to_scheduler_addr) - proc.init_sockets() - - pipe_writer.send({"status": "ready", "process": "model_runner"}) - pipe_writer.close() - - try: - proc.event_loop() - except KeyboardInterrupt: - pass - finally: - proc.shutdown() + if self._runner is not None: + self._runner.shutdown() diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py index fa9d92ece..5c72a14c4 100644 --- a/pymllm/orchestrator/request_response_process.py +++ b/pymllm/orchestrator/request_response_process.py @@ -65,8 +65,12 @@ def __init__( self._loop_task: Optional[asyncio.Task] = None - def start(self, loop: asyncio.AbstractEventLoop) -> None: - """Kick off the background send/recv tasks on *loop*.""" + def start(self) -> None: + """Bind ZMQ sockets. Background tasks are started lazily by + :meth:`listen` on the first :meth:`add_request` call, so they + always run on the correct event loop regardless of whether the + caller is uvicorn, ``loop.run_until_complete``, or anything else. + """ self._zmq_ctx = zmq.asyncio.Context() self._send_to_tokenizer = create_zmq_socket( self._zmq_ctx, @@ -80,7 +84,20 @@ def start(self, loop: asyncio.AbstractEventLoop) -> None: self._recv_from_detokenizer_addr, bind=True, ) + + def listen(self) -> None: + """Start the send/recv background tasks on the **current** running + event loop. Idempotent — subsequent calls are no-ops while the + tasks are still alive. + + Called automatically by :meth:`add_request`, so callers never need + to invoke this directly. + """ + if self._loop_task is not None and not self._loop_task.done(): + return + loop = asyncio.get_running_loop() self._loop_task = loop.create_task(self._run()) + logger.debug("RequestResponseProcess: background tasks started") async def add_request( self, request: GenerateReqInput @@ -98,6 +115,8 @@ async def add_request( Callers should ``await state.event.wait()`` in a loop, consuming ``state.out_list`` entries until ``state.finished`` is ``True``. 
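
        A minimal consumption loop might look like this (a sketch only; it
        assumes this coroutine returns the per-request ``ReqState`` and that
        ``rr`` is this RequestResponseProcess instance)::

            state = await rr.add_request(request)
            while True:
                await state.event.wait()
                state.event.clear()
                for out in state.out_list:
                    ...  # consume one streamed output dict
                state.out_list.clear()
                if state.finished:
                    break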
""" + self.listen() + if request.is_single: rid = request.rid if isinstance(request.rid, str) else request.rid[0] state = ReqState() diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 8f2d9a958..8594a8997 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -1,27 +1,28 @@ """ -SchedulerProcess -- the central scheduling hub. +SchedulerProcess -- the central scheduling and inference hub. Receives tokenized requests from the TokenizerProcess, organises them into -batches, dispatches batches to the ModelRunnerProcess for forward passes, -collects results, and streams finished token IDs to the DetokenizerProcess. +batches, runs model forward passes via the **in-process** model runner +(sglang-style), and streams finished token IDs to the DetokenizerProcess. -Supports two modes: - 1. Legacy ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj - 2. Shared queue fast path: Read rid from shared queue and metadata from shared memory +Architecture: the scheduler owns the :class:`ModelRunnerProcess` directly +(same process, direct function calls). GPU resources (KV cache, req pool +slots) are freed immediately when requests finish — no cross-process +communication needed. -When the shared queue fast path is active the scheduler also handles CUDA IPC -tensor reconstruction via -:func:`~pymllm.orchestrator.cuda_ipc_transport.unwrap_mm_inputs_from_ipc`. +Request ingestion supports two modes: + 1. ZMQ path: Receive TokenizedGenerateReqInput via ZMQ recv_pyobj + 2. Shared queue fast path: Read from shared memory + multiprocessing queue -The main ``event_loop`` scheduler flow:: +The main ``event_loop``:: while True: recv_requests() process_input_requests() - batch = get_next_batch_to_run() + batch = get_next_batch_to_run() # also frees finished GPU resources if batch: - run_batch(batch) - process_batch_result(batch) + result = run_batch(batch) # direct call to model runner + process_batch_result(batch, result) stream_output() """ @@ -34,16 +35,297 @@ import zmq -from pymllm.engine.io_struct import TokenizedGenerateReqInput +from pymllm.engine.forward_batch import ForwardMode +from pymllm.engine.io_struct import BatchTokenIDOutput, TokenizedGenerateReqInput from pymllm.orchestrator.cuda_ipc_transport import ( TensorTransportMode, unwrap_mm_inputs_from_ipc, ) -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.ipc_utils import create_zmq_socket, setup_subprocess_logging from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) +# Default scheduling limits +_DEFAULT_MAX_RUNNING_REQUESTS = 256 +_DEFAULT_MAX_PREFILL_TOKENS = 8192 +_DEFAULT_MAX_TOTAL_TOKENS = 131072 +_DEFAULT_MAX_NEW_TOKENS = 32768 + + +# ====================================================================== +# Req -- per-request state tracker +# ====================================================================== + + +class Req: + """Tracks a single request through its lifecycle (prefill -> decode -> finish). + + Created by :meth:`SchedulerProcess.process_input_requests` from a + :class:`~pymllm.engine.io_struct.TokenizedGenerateReqInput`. 
+ """ + + __slots__ = ( + "rid", + "input_ids", + "input_text", + "sampling_params", + "mm_inputs", + "stream", + "return_logprob", + "logprob_start_len", + "top_logprobs_num", + # KV-cache state + "req_pool_idx", + "seq_len", + # Prefix-cache hit (set during scheduling when radix cache is active) + "prefix_len", + # Generation state + "output_ids", + "finished_reason", + "is_prefilled", + # Sampling parameters (parsed) + "max_new_tokens", + "temperature", + "top_p", + "top_k", + "stop_token_ids", + # Streaming + "read_offset", + # Prompt length (for token accounting) + "prompt_len", + ) + + def __init__( + self, + rid: str, + input_ids: List[int], + input_text: str = "", + sampling_params: Optional[Dict[str, Any]] = None, + mm_inputs: Optional[Dict[str, Any]] = None, + stream: bool = False, + return_logprob: bool = False, + logprob_start_len: int = -1, + top_logprobs_num: int = 0, + ): + self.rid = rid + self.input_ids = list(input_ids) + self.input_text = input_text + self.mm_inputs = mm_inputs + self.stream = stream + self.return_logprob = return_logprob + self.logprob_start_len = logprob_start_len + self.top_logprobs_num = top_logprobs_num + + # Parse sampling params + sp = sampling_params or {} + self.sampling_params = sp + self.max_new_tokens: int = sp.get("max_new_tokens", _DEFAULT_MAX_NEW_TOKENS) + self.temperature: float = sp.get("temperature", 1.0) + self.top_p: float = sp.get("top_p", 1.0) + self.top_k: int = sp.get("top_k", -1) + self.stop_token_ids: List[int] = list(sp.get("stop_token_ids", [])) + + # KV-cache state (assigned during scheduling) + self.req_pool_idx: int = -1 + self.seq_len: int = len(input_ids) + # Number of prefix tokens served from the radix/KV cache (0 = no hit). + # Updated by process_batch_result when the model runner reports a + # prefix cache hit. Used in _free_req_resources to correctly + # release the token budget. + self.prefix_len: int = 0 + + # Generation state + self.output_ids: List[int] = [] + self.finished_reason: Optional[str] = None + self.is_prefilled: bool = False + + # Streaming + self.read_offset: int = 0 + + # Prompt length + self.prompt_len: int = len(input_ids) + + def check_finished(self, eos_token_id: Optional[int] = None) -> bool: + """Check if this request has reached a finish condition. + + Sets ``finished_reason`` and returns True if finished. + Checks: + 1. EOS token in the latest generated token + 2. 
``max_new_tokens`` reached + """ + if self.finished_reason is not None: + return True + + if self.output_ids: + last_token = self.output_ids[-1] + # Check model EOS token + if eos_token_id is not None and last_token == eos_token_id: + self.finished_reason = "eos" + return True + # Check stop token IDs from sampling params + if last_token in self.stop_token_ids: + self.finished_reason = "eos" + return True + + # Check max_new_tokens + if len(self.output_ids) >= self.max_new_tokens: + self.finished_reason = "length" + return True + + return False + + @property + def is_finished(self) -> bool: + return self.finished_reason is not None + + def abort(self) -> None: + """Mark this request as aborted.""" + self.finished_reason = "abort" + + def __repr__(self) -> str: + return ( + f"Req(rid={self.rid!r}, seq_len={self.seq_len}, " + f"out={len(self.output_ids)}, finished={self.finished_reason})" + ) + + +# ====================================================================== +# ScheduleBatch -- batch container +# ====================================================================== + + +class ScheduleBatch: + """Wraps a list of :class:`Req` objects for a single forward pass. + + Provides helpers to assemble the batch dict sent to the ModelRunnerProcess + in the format expected by :class:`~pymllm.engine.forward_batch.ForwardBatch`. + """ + + def __init__(self, reqs: List[Req], forward_mode: ForwardMode): + self.reqs = reqs + self.forward_mode = forward_mode + + @property + def batch_size(self) -> int: + return len(self.reqs) + + def prepare_for_extend(self) -> Dict[str, Any]: + """Assemble a batch dict for prefill / extend forward pass. + + Returns a dict with flattened ``input_ids``, per-request ``positions``, + ``req_pool_indices``, ``seq_lens``, ``extend_seq_lens``, + ``extend_prefix_lens``, and request metadata. + + Note: The scheduler sends the **full** input_ids (no prefix trimming). + The ModelRunnerProcess performs radix cache prefix matching and + rebuilds the tensors with actual prefix lengths before the forward + pass. The ``extend_prefix_lens`` here are always 0 from the + scheduler; they serve as placeholders. + """ + all_input_ids: List[int] = [] + all_positions: List[int] = [] + req_pool_indices: List[int] = [] + seq_lens: List[int] = [] + extend_seq_lens: List[int] = [] + extend_prefix_lens: List[int] = [] + requests_meta: List[Dict[str, Any]] = [] + + for req in self.reqs: + input_len = len(req.input_ids) + + # Send full input_ids; model runner will trim based on prefix + all_input_ids.extend(req.input_ids) + all_positions.extend(range(input_len)) + req_pool_indices.append(req.req_pool_idx) + seq_lens.append(req.seq_len) + extend_seq_lens.append(input_len) + extend_prefix_lens.append(0) + requests_meta.append( + { + "rid": req.rid, + "input_ids": req.input_ids, + "mm_inputs": req.mm_inputs, + "sampling_params": req.sampling_params, + "return_logprob": req.return_logprob, + "logprob_start_len": req.logprob_start_len, + "top_logprobs_num": req.top_logprobs_num, + } + ) + + return { + "forward_mode": "extend", + "batch_size": self.batch_size, + "input_ids": all_input_ids, + "positions": all_positions, + "req_pool_indices": req_pool_indices, + "seq_lens": seq_lens, + "extend_seq_lens": extend_seq_lens, + "extend_prefix_lens": extend_prefix_lens, + "requests": requests_meta, + "batch_id": id(self), + "created_at": time.time(), + } + + def prepare_for_decode(self) -> Dict[str, Any]: + """Assemble a batch dict for decode forward pass (one token per request). 
+ + Returns a dict with one input token per request (the last generated + token), positions at ``seq_len``, and request metadata. + """ + all_input_ids: List[int] = [] + all_positions: List[int] = [] + req_pool_indices: List[int] = [] + seq_lens: List[int] = [] + requests_meta: List[Dict[str, Any]] = [] + + for req in self.reqs: + # For decode, the input is the last generated token + if req.output_ids: + all_input_ids.append(req.output_ids[-1]) + else: + # Fallback: last input token (shouldn't happen normally) + all_input_ids.append(req.input_ids[-1]) + all_positions.append(req.seq_len) + req_pool_indices.append(req.req_pool_idx) + seq_lens.append(req.seq_len) + requests_meta.append( + { + "rid": req.rid, + "sampling_params": req.sampling_params, + "return_logprob": req.return_logprob, + "logprob_start_len": req.logprob_start_len, + "top_logprobs_num": req.top_logprobs_num, + } + ) + + return { + "forward_mode": "decode", + "batch_size": self.batch_size, + "input_ids": all_input_ids, + "positions": all_positions, + "req_pool_indices": req_pool_indices, + "seq_lens": seq_lens, + "requests": requests_meta, + "batch_id": id(self), + "created_at": time.time(), + } + + def to_batch_dict(self) -> Dict[str, Any]: + """Build the batch dict appropriate for the current forward mode.""" + if self.forward_mode.is_extend(): + return self.prepare_for_extend() + else: + return self.prepare_for_decode() + + def __repr__(self) -> str: + return f"ScheduleBatch(mode={self.forward_mode.name}, size={self.batch_size})" + + +# ====================================================================== +# SchedulerProcess +# ====================================================================== + class SchedulerProcess: """Runs inside a subprocess. Central hub that drives the inference loop.""" @@ -51,19 +333,29 @@ class SchedulerProcess: def __init__( self, recv_from_tokenizer_addr: str, - send_to_model_runner_addr: str, - recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, + server_config: Optional[Any] = None, + model_config: Optional[Any] = None, + gpu_id: int = 0, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, tensor_transport_mode: TensorTransportMode = "default", + # Scheduling limits + max_running_requests: int = _DEFAULT_MAX_RUNNING_REQUESTS, + max_prefill_tokens: int = _DEFAULT_MAX_PREFILL_TOKENS, + max_total_tokens: int = _DEFAULT_MAX_TOTAL_TOKENS, + eos_token_ids: Optional[List[int]] = None, + default_max_new_tokens: int = _DEFAULT_MAX_NEW_TOKENS, ): - # ZMQ addresses + # ZMQ addresses (tokenizer + detokenizer only) self._recv_from_tokenizer_addr = recv_from_tokenizer_addr - self._send_to_model_runner_addr = send_to_model_runner_addr - self._recv_from_model_runner_addr = recv_from_model_runner_addr self._send_to_detokenizer_addr = send_to_detokenizer_addr + # Model config (for in-process model runner, sglang-style) + self._server_config = server_config + self._model_config = model_config + self._gpu_id = gpu_id + # Shared queue configuration self._shared_queue = shared_queue self._enable_shared_queue = enable_shared_queue @@ -72,16 +364,53 @@ def __init__( # ZMQ runtime objects (initialised in init_sockets) self._zmq_ctx: Optional[zmq.Context] = None self._recv_from_tokenizer: Optional[zmq.Socket] = None - self._send_to_model_runner: Optional[zmq.Socket] = None - self._recv_from_model_runner: Optional[zmq.Socket] = None self._send_to_detokenizer: Optional[zmq.Socket] = None self._poller: Optional[zmq.Poller] = None - # Request management + # In-process model 
runner (initialised in init_model) + self._model_runner = None + + # Request management -- three-stage pipeline self._waiting_queue: Deque[TokenizedGenerateReqInput] = deque() - self._running_batch: Optional[Dict[str, Any]] = None + self._pending_queue: List[Req] = [] + self._running_batch: List[Req] = [] self._finished: List[Dict[str, Any]] = [] + # Scheduling limits + self._max_running_requests = max_running_requests + self._max_prefill_tokens = max_prefill_tokens + + # KV-cache token budget (simplified single-GPU tracking). + self._max_total_tokens = max_total_tokens + self._used_tokens: int = 0 + + # EOS token(s) for finish detection + self._eos_token_ids: List[int] = list(eos_token_ids) if eos_token_ids else [] + + # Default max_new_tokens (from model config or fallback) + self._default_max_new_tokens = default_max_new_tokens + + # Monotonic request-slot counter (simplified; no GPU pool access) + self._next_req_pool_idx: int = 0 + + # ------ Throughput metrics (sglang-style interval logging) ------ + # How often (in decode batches) to log throughput stats. + self._decode_log_interval: int = ( + server_config.decode_log_interval + if server_config is not None and hasattr(server_config, "decode_log_interval") + else 40 + ) + # Accumulators reset at each log interval + self._num_prefill_tokens: int = 0 # new prefill tokens (excluding cache hits) + self._num_prefill_cache_tokens: int = 0 # prefill tokens served from cache + self._num_decode_tokens: int = 0 # generated decode tokens + self._num_prefill_reqs: int = 0 # prefill requests count + # Timestamps for throughput calculation + self._last_prefill_stats_tic: float = time.time() + self._last_decode_stats_tic: float = time.time() + # Forward pass counters + self._forward_ct_decode: int = 0 + # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ @@ -95,18 +424,6 @@ def init_sockets(self) -> None: self._recv_from_tokenizer_addr, bind=False, ) - self._send_to_model_runner = create_zmq_socket( - self._zmq_ctx, - zmq.PUSH, - self._send_to_model_runner_addr, - bind=True, - ) - self._recv_from_model_runner = create_zmq_socket( - self._zmq_ctx, - zmq.PULL, - self._recv_from_model_runner_addr, - bind=True, - ) self._send_to_detokenizer = create_zmq_socket( self._zmq_ctx, zmq.PUSH, @@ -118,6 +435,22 @@ def init_sockets(self) -> None: self._poller = zmq.Poller() self._poller.register(self._recv_from_tokenizer, zmq.POLLIN) + def init_model(self) -> None: + """Create and initialise the in-process model runner (sglang-style). + + Must be called after ``init_sockets`` and inside the subprocess + (after spawn) since it performs CUDA initialisation. 
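+
+        Illustrative call order inside the subprocess (a sketch; the exact
+        constructor arguments are elided)::
+
+            proc = SchedulerProcess(...)
+            proc.init_sockets()
+            proc.init_model()   # CUDA init happens here, after spawn
+            proc.event_loop()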
+ """ + from pymllm.orchestrator.model_runner_process import ModelRunnerProcess + + self._model_runner = ModelRunnerProcess( + gpu_id=self._gpu_id, + server_config=self._server_config, + model_config=self._model_config, + ) + self._model_runner.init_model() + logger.info("In-process model runner initialised on GPU %d", self._gpu_id) + def event_loop(self) -> None: """Infinite scheduling loop.""" logger.info( @@ -170,6 +503,8 @@ def _recv_from_zmq(self) -> None: self._waiting_queue = type(self._waiting_queue)( r for r in self._waiting_queue if r.rid != rid ) + # Also abort from pending queue + self._abort_request(rid) else: self._waiting_queue.append(msg) @@ -236,90 +571,398 @@ def _recv_from_shared_queue(self) -> None: # ------------------------------------------------------------------ def process_input_requests(self) -> None: - """Pre-process and validate requests sitting in ``_waiting_queue``. - - TODO: attach sampling params, allocate KV-cache slots, etc. + """Convert raw :class:`TokenizedGenerateReqInput` in ``_waiting_queue`` + into :class:`Req` objects and move them to ``_pending_queue``. + + For each request: + 1. Parse sampling params (max_new_tokens, temperature, top_p, top_k, + stop_token_ids with defaults from EOS token). + 2. Create a ``Req`` object. + 3. Move from ``_waiting_queue`` to ``_pending_queue``. """ - pass + while self._waiting_queue: + raw = self._waiting_queue.popleft() + + # Merge EOS token into stop_token_ids if not already present + sp = dict(raw.sampling_params) if raw.sampling_params else {} + # Inject model-aware default for max_new_tokens when not provided + if "max_new_tokens" not in sp: + sp["max_new_tokens"] = self._default_max_new_tokens + stop_ids = list(sp.get("stop_token_ids", [])) + for eid in self._eos_token_ids: + if eid not in stop_ids: + stop_ids.append(eid) + sp["stop_token_ids"] = stop_ids + + req = Req( + rid=raw.rid, + input_ids=raw.input_ids, + input_text=raw.input_text, + sampling_params=sp, + mm_inputs=raw.mm_inputs, + stream=raw.stream, + return_logprob=raw.return_logprob, + logprob_start_len=raw.logprob_start_len, + top_logprobs_num=raw.top_logprobs_num, + ) + self._pending_queue.append(req) + logger.debug("Processed input request %s (len=%d)", req.rid, req.seq_len) # ------------------------------------------------------------------ # Step 3: build the next batch # ------------------------------------------------------------------ - def get_next_batch_to_run(self) -> Optional[Dict[str, Any]]: - """Select requests from ``_waiting_queue`` and form a batch. - - TODO: implement real batching / scheduling policy. + def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: + """Implements continuous batching with two phases. + + 1. **Filter finished**: Remove finished requests from + ``_running_batch`` and free their token budget. + 2. **Schedule new prefills**: From ``_pending_queue``, admit + requests that fit within the token budget and + ``max_running_requests``. + 3. **Build batch**: + - If new prefill requests exist -> EXTEND batch + - Else if running decode requests exist -> DECODE batch + - Else -> None (idle) + + Note on prefix cache: The actual prefix matching is done by the + ModelRunnerProcess (which owns the RadixCache). The scheduler + uses ``input_len`` as a conservative budget estimate. The model + runner reports back actual ``prefix_len`` in results, and the + scheduler adjusts ``_used_tokens`` accordingly in + ``process_batch_result``. 
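+
+        The admission check applied to each pending request can be sketched
+        as a single predicate (same constraints as the code below)::
+
+            can_admit = (
+                len(running) + len(new_prefill) < max_running_requests
+                and used_tokens + len(req.input_ids) <= max_total_tokens
+                and len(req.input_ids) <= prefill_token_budget
+            )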
""" - if not self._waiting_queue: - return None + # Phase 1: filter finished requests from running batch + still_running: List[Req] = [] + for req in self._running_batch: + if req.is_finished: + self._model_runner._free_rid_resources(req.rid) + self._free_req_resources(req) + else: + still_running.append(req) + self._running_batch = still_running + + # Phase 2: schedule new prefill requests from pending queue + new_prefill: List[Req] = [] + remaining_pending: List[Req] = [] + prefill_token_budget = self._max_prefill_tokens + + for req in self._pending_queue: + input_len = len(req.input_ids) + total_running = len(self._running_batch) + len(new_prefill) + + # Check capacity constraints. + # We reserve the full input_len as KV budget (conservative). + # If the model runner finds a prefix cache hit, some tokens + # won't need new KV allocation; the budget is corrected in + # process_batch_result. + can_fit_request = total_running < self._max_running_requests + can_fit_tokens = (self._used_tokens + input_len) <= self._max_total_tokens + can_fit_prefill = input_len <= prefill_token_budget + + if can_fit_request and can_fit_tokens and can_fit_prefill: + # Allocate req pool slot + req.req_pool_idx = self._next_req_pool_idx + self._next_req_pool_idx += 1 + # Reserve token budget (full input_len as conservative estimate) + self._used_tokens += input_len + prefill_token_budget -= input_len + new_prefill.append(req) + logger.debug( + "Scheduled prefill for %s (len=%d, used=%d/%d)", + req.rid, + input_len, + self._used_tokens, + self._max_total_tokens, + ) + else: + remaining_pending.append(req) - batch_requests: List[Dict[str, Any]] = [] - # TODO: respect max_running_requests, memory budget, etc. - while self._waiting_queue: - batch_requests.append(self._waiting_queue.popleft()) + self._pending_queue = remaining_pending - batch = { - "requests": batch_requests, - "batch_id": id(batch_requests), - "created_at": time.time(), - } - return batch + # Phase 3: build batch + if new_prefill: + return ScheduleBatch(new_prefill, ForwardMode.EXTEND) + elif self._running_batch: + return ScheduleBatch(self._running_batch, ForwardMode.DECODE) + else: + return None # ------------------------------------------------------------------ # Step 4: run the batch via ModelRunnerProcess # ------------------------------------------------------------------ - def run_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: - """Send *batch* to ModelRunnerProcess and wait for the result. + def run_batch(self, batch: ScheduleBatch) -> Dict[str, Any]: + """Execute the batch via the in-process model runner (sglang-style). - This is a **blocking** call: the scheduler is synchronous with the - model runner for simplicity. Overlap scheduling can be added later. + Direct function call — no ZMQ serialisation overhead. """ - self._send_to_model_runner.send_pyobj(batch) - result = self._recv_from_model_runner.recv_pyobj() - return result + batch_dict = batch.to_batch_dict() + return self._model_runner._forward_batch(batch_dict) # ------------------------------------------------------------------ # Step 5: process batch result # ------------------------------------------------------------------ def process_batch_result( - self, batch: Dict[str, Any], result: Dict[str, Any] + self, batch: ScheduleBatch, result: Dict[str, Any] ) -> None: """Handle the result returned by the ModelRunnerProcess. - TODO: check completion status (EOS, max_tokens), manage KV-cache, - split finished vs. unfinished requests. + For each request in the result: + 1. 
Update ``prefix_len`` from the model runner's radix cache hit. + 2. Adjust ``_used_tokens`` if a prefix cache hit was found (the + scheduler over-reserved during scheduling). + 3. Append new token(s) to ``req.output_ids``. + 4. Increment ``req.seq_len``. + 5. Call ``req.check_finished()`` (EOS token, max_new_tokens). + 6. If prefill request: mark ``req.is_prefilled = True``, move to + running batch for decode. + 7. If finished: collect for output, free KV-cache budget. """ - finished_requests = result.get("finished", []) - unfinished_requests = result.get("unfinished", []) - - self._finished.extend(finished_requests) - - # Put unfinished requests back for the next iteration - for req in unfinished_requests: - self._waiting_queue.appendleft(req) + # Build a rid -> Req lookup for the batch + rid_to_req: Dict[str, Req] = {req.rid: req for req in batch.reqs} + + # The result may contain per-request outputs in "finished" and + # "unfinished" lists, or a flat "outputs" list. Handle both. + output_items: List[Dict[str, Any]] = [] + output_items.extend(result.get("finished", [])) + output_items.extend(result.get("unfinished", [])) + if "outputs" in result: + output_items.extend(result["outputs"]) + + for out in output_items: + rid = out.get("rid") + req = rid_to_req.get(rid) + if req is None: + logger.warning("Result for unknown rid=%s, skipping", rid) + continue + + # Update prefix_len from model runner's radix cache matching. + # The model runner reports the actual prefix_len it found. + # The scheduler originally reserved full input_len in + # get_next_batch_to_run; correct the over-reservation now. + if "prefix_len" in out and batch.forward_mode.is_extend(): + actual_prefix_len = out["prefix_len"] + if actual_prefix_len > req.prefix_len: + saved = actual_prefix_len - req.prefix_len + req.prefix_len = actual_prefix_len + # Give back the over-reserved tokens. The model runner + # reused cached KV for `saved` tokens, so those tokens + # do not consume new KV pool slots. 
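+                    # Worked example (illustrative numbers): a 100-token
+                    # prompt reserved 100 KV tokens at scheduling time; if
+                    # the runner now reports prefix_len=60, then saved=60
+                    # and the reservation shrinks to the 40 tokens that
+                    # actually needed fresh KV slots.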
+ self._used_tokens = max(0, self._used_tokens - saved) + logger.info( + "Prefix cache hit for rid=%s: %d tokens reused, " + "budget adjusted by -%d (used=%d/%d)", + rid, + actual_prefix_len, + saved, + self._used_tokens, + self._max_total_tokens, + ) + + # Append generated token(s) + new_token_ids = out.get("output_token_ids", []) + if isinstance(new_token_ids, int): + new_token_ids = [new_token_ids] + req.output_ids.extend(new_token_ids) + req.seq_len += len(new_token_ids) + + # Update token budget for newly generated tokens + self._used_tokens += len(new_token_ids) + + # Check finish conditions + req.check_finished(eos_token_id=self._eos_token_ids[0] if self._eos_token_ids else None) + + # Process batch requests based on forward mode + if batch.forward_mode.is_extend(): + # Prefill batch: mark as prefilled and route + for req in batch.reqs: + req.is_prefilled = True + if req.is_finished: + self._collect_finished_output(req) + self._model_runner._free_rid_resources(req.rid) + self._free_req_resources(req) + else: + self._running_batch.append(req) + + # --- Accumulate prefill metrics --- + total_input = 0 + total_cached = 0 + for req in batch.reqs: + total_input += req.prompt_len + total_cached += req.prefix_len + self._num_prefill_tokens += total_input - total_cached + self._num_prefill_cache_tokens += total_cached + self._num_prefill_reqs += len(batch.reqs) + self._log_prefill_stats() + else: + # Decode batch: check finish and collect + new_running: List[Req] = [] + for req in batch.reqs: + if req.is_finished: + self._collect_finished_output(req) + self._model_runner._free_rid_resources(req.rid) + self._free_req_resources(req) + else: + new_running.append(req) + self._running_batch = new_running + + # --- Accumulate decode metrics --- + self._num_decode_tokens += batch.batch_size # 1 token per request + self._forward_ct_decode += 1 + if ( + self._decode_log_interval > 0 + and self._forward_ct_decode % self._decode_log_interval == 0 + ): + self._log_decode_stats() # ------------------------------------------------------------------ # Step 6: stream output to DetokenizerProcess # ------------------------------------------------------------------ def stream_output(self) -> None: - """Send finished token-ID outputs to the DetokenizerProcess.""" + """Send finished/streaming outputs to the DetokenizerProcess. + + Produces :class:`~pymllm.engine.io_struct.BatchTokenIDOutput`-compatible + dicts. For streaming requests, intermediate tokens are also sent. 
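+
+        An illustrative per-request payload (all values are made up)::
+
+            {
+                "rids": ["req-42"],
+                "finished_reasons": [None],           # still generating
+                "decode_ids": [1234, 5678],           # tokens since read_offset
+                "read_offsets": [3],
+                "output_ids": [11, 22, 33, 1234, 5678],
+                "skip_special_tokens": [True],
+                "prompt_tokens": [17],
+                "completion_tokens": [5],
+            }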
+ """ + # Collect streaming outputs from running requests + for req in self._running_batch: + if req.stream and len(req.output_ids) > req.read_offset: + decode_ids = req.output_ids[req.read_offset :] + output = { + "rids": [req.rid], + "finished_reasons": [None], + "decode_ids": decode_ids, + "read_offsets": [req.read_offset], + "output_ids": list(req.output_ids), + "skip_special_tokens": [True], + "prompt_tokens": [req.prompt_len], + "completion_tokens": [len(req.output_ids)], + } + req.read_offset = len(req.output_ids) + self._send_to_detokenizer.send_pyobj(output) + + # Send finished outputs while self._finished: item = self._finished.pop(0) self._send_to_detokenizer.send_pyobj(item) + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _log_prefill_stats(self) -> None: + """Log prefill throughput at INFO level (called after each prefill batch).""" + now = time.time() + elapsed = now - self._last_prefill_stats_tic + self._last_prefill_stats_tic = now + + if elapsed > 0: + input_throughput = self._num_prefill_tokens / elapsed + else: + input_throughput = 0.0 + + logger.info( + "Prefill batch: %d reqs, " + "new tokens: %d, " + "cached tokens: %d, " + "input throughput: %.2f token/s", + self._num_prefill_reqs, + self._num_prefill_tokens, + self._num_prefill_cache_tokens, + input_throughput, + ) + # Reset accumulators + self._num_prefill_tokens = 0 + self._num_prefill_cache_tokens = 0 + self._num_prefill_reqs = 0 + + def _log_decode_stats(self) -> None: + """Log decode throughput at INFO level (called every decode_log_interval batches).""" + now = time.time() + elapsed = now - self._last_decode_stats_tic + self._last_decode_stats_tic = now + + if elapsed > 0: + gen_throughput = self._num_decode_tokens / elapsed + else: + gen_throughput = 0.0 + + logger.info( + "Decode: %d steps, " + "gen tokens: %d, " + "running: %d reqs, " + "gen throughput: %.2f token/s", + self._forward_ct_decode, + self._num_decode_tokens, + len(self._running_batch), + gen_throughput, + ) + # Reset accumulators + self._num_decode_tokens = 0 + self._forward_ct_decode = 0 + + def _collect_finished_output(self, req: Req) -> None: + """Build a finished output dict and add it to ``_finished``.""" + decode_ids = req.output_ids[req.read_offset :] + output: Dict[str, Any] = { + "rids": [req.rid], + "finished_reasons": [req.finished_reason], + "decode_ids": decode_ids, + "read_offsets": [req.read_offset], + "output_ids": list(req.output_ids), + "skip_special_tokens": [True], + "prompt_tokens": [req.prompt_len], + "completion_tokens": [len(req.output_ids)], + } + self._finished.append(output) + logger.debug( + "Request %s finished: reason=%s, tokens=%d", + req.rid, + req.finished_reason, + len(req.output_ids), + ) + + def _free_req_resources(self, req: Req) -> None: + """Release KV-cache token budget for a finished request. + + The budget was charged as follows: + - At scheduling: ``+input_len`` (full prompt as conservative estimate) + - After prefix correction: ``-prefix_len`` (cached prefix doesn't need + new KV allocation; model runner manages those via radix cache) + - At each decode step: ``+1`` per generated token + + So the net charge for this request is: + ``(input_len - prefix_len) + num_decode_tokens`` + = ``seq_len - prefix_len`` + + We release exactly that amount. 
+ """ + tokens_to_free = req.seq_len - req.prefix_len + self._used_tokens = max(0, self._used_tokens - tokens_to_free) + req.req_pool_idx = -1 + + def _abort_request(self, rid: str) -> None: + """Abort a request by rid from pending or running queues.""" + # Remove from pending queue + self._pending_queue = [r for r in self._pending_queue if r.rid != rid] + # Abort in running batch + for req in self._running_batch: + if req.rid == rid: + req.abort() + break + # ------------------------------------------------------------------ # Cleanup # ------------------------------------------------------------------ def shutdown(self) -> None: + if self._model_runner is not None: + self._model_runner.shutdown() for sock in ( self._recv_from_tokenizer, - self._send_to_model_runner, - self._recv_from_model_runner, self._send_to_detokenizer, ): if sock is not None: @@ -330,25 +973,38 @@ def shutdown(self) -> None: def run_scheduler_process( recv_from_tokenizer_addr: str, - send_to_model_runner_addr: str, - recv_from_model_runner_addr: str, send_to_detokenizer_addr: str, pipe_writer: Connection, shared_queue: Optional[TensorQueue] = None, enable_shared_queue: bool = False, tensor_transport_mode: TensorTransportMode = "default", + log_level: str = "info", + default_max_new_tokens: int = _DEFAULT_MAX_NEW_TOKENS, + eos_token_ids: Optional[List[int]] = None, + server_config: Optional[Any] = None, + model_config: Optional[Any] = None, + gpu_id: int = 0, ) -> None: - """Entry point for ``torch.multiprocessing.Process(target=...)``.""" + """Entry point for ``torch.multiprocessing.Process(target=...)``. + + The scheduler process now also owns the model runner (sglang-style), + so model initialisation happens here. + """ + setup_subprocess_logging(log_level) proc = SchedulerProcess( recv_from_tokenizer_addr, - send_to_model_runner_addr, - recv_from_model_runner_addr, send_to_detokenizer_addr, + server_config=server_config, + model_config=model_config, + gpu_id=gpu_id, shared_queue=shared_queue, enable_shared_queue=enable_shared_queue, tensor_transport_mode=tensor_transport_mode, + default_max_new_tokens=default_max_new_tokens, + eos_token_ids=eos_token_ids, ) proc.init_sockets() + proc.init_model() pipe_writer.send({"status": "ready", "process": "scheduler"}) pipe_writer.close() diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 587a7c1ea..703618a40 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -35,7 +35,7 @@ from pymllm.engine.io_struct import TokenizedGenerateReqInput from pymllm.orchestrator.cuda_ipc_transport import MmItemMemoryPool, TensorTransportMode -from pymllm.orchestrator.ipc_utils import create_zmq_socket +from pymllm.orchestrator.ipc_utils import create_zmq_socket, setup_subprocess_logging from pymllm.orchestrator.shared_memory_queue import SharedMemoryManager, TensorQueue logger = logging.getLogger(__name__) @@ -352,6 +352,7 @@ def _tokenize( ) # Accept a list for robustness; take the first element. 
         input_text = str(text[0]) if isinstance(text, list) else str(text)
+        logger.debug(f"Tokenizing input text {input_text}")
 
         encode_kwargs: Dict[str, Any] = {
             "add_special_tokens": True,
@@ -485,6 +486,7 @@ def run_tokenizer_process(
     shared_queue: Optional[TensorQueue] = None,
 ) -> None:
     """Entry point for ``torch.multiprocessing.Process(target=...)``."""
+    setup_subprocess_logging(tokenizer_cfg.get("log_level", "info"))
     proc = TokenizerProcess(
         recv_from_rr_addr, send_to_scheduler_addr, tokenizer_cfg, shared_queue
     )
diff --git a/pymllm/parsers/__init__.py b/pymllm/parsers/__init__.py
new file mode 100644
index 000000000..5ac5c2922
--- /dev/null
+++ b/pymllm/parsers/__init__.py
@@ -0,0 +1,10 @@
+"""Output parsers for reasoning (thinking) content and tool calls."""
+
+from pymllm.parsers.reasoning_parser import ReasoningParser
+from pymllm.parsers.tool_call_parser import ToolCallParser, ToolCallItem
+
+__all__ = [
+    "ReasoningParser",
+    "ToolCallParser",
+    "ToolCallItem",
+]
diff --git a/pymllm/parsers/reasoning_parser.py b/pymllm/parsers/reasoning_parser.py
new file mode 100644
index 000000000..1f73c7885
--- /dev/null
+++ b/pymllm/parsers/reasoning_parser.py
@@ -0,0 +1,212 @@
+"""Reasoning / thinking content parser.
+
+Separates ``<think>...</think>`` (or model-specific markers) from normal
+assistant content. Supports both one-shot and incremental streaming modes.
+
+Usage::
+
+    # Non-streaming
+    parser = ReasoningParser("qwen3")
+    reasoning, content = parser.parse_non_stream(full_text)
+
+    # Streaming
+    parser = ReasoningParser("qwen3")
+    for delta in deltas:
+        reasoning_delta, content_delta = parser.parse_stream_chunk(delta)
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple, Type
+
+
+# ---------------------------------------------------------------------------
+# Detector registry
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class _DetectorConfig:
+    start: str
+    end: str
+    force: bool  # True = always assume reasoning at start
+
+
+_DETECTOR_MAP: Dict[str, _DetectorConfig] = {
+    # DeepSeek-R1: always starts in reasoning mode
+    "deepseek-r1": _DetectorConfig("<think>", "</think>", force=True),
+    # Qwen3: optional thinking (controlled by request)
+    "qwen3": _DetectorConfig("<think>", "</think>", force=False),
+    # Qwen3 forced thinking
+    "qwen3-thinking": _DetectorConfig("<think>", "</think>", force=True),
+    # GLM-4.5
+    "glm45": _DetectorConfig("<think>", "</think>", force=False),
+    # Kimi
+    "kimi": _DetectorConfig("\u25c1think\u25b7", "\u25c1/think\u25b7", force=False),
+}
+
+
+# ---------------------------------------------------------------------------
+# ReasoningParser
+# ---------------------------------------------------------------------------
+
+
+class ReasoningParser:
+    """Model-agnostic reasoning content parser.
+
+    Parameters
+    ----------
+    model_type
+        Key into the detector registry (e.g. ``"qwen3"``, ``"deepseek-r1"``).
+    stream_reasoning
+        If ``True``, stream reasoning content incrementally as it arrives.
+        If ``False``, buffer reasoning until the end tag is found.
+    """
+
+    SUPPORTED = set(_DETECTOR_MAP)
+
+    def __init__(self, model_type: str, stream_reasoning: bool = True):
+        cfg = _DETECTOR_MAP.get(model_type)
+        if cfg is None:
+            raise ValueError(
+                f"Unknown reasoning parser {model_type!r}. 
" + f"Supported: {sorted(_DETECTOR_MAP)}" + ) + self._start = cfg.start + self._end = cfg.end + self._force = cfg.force + self._stream_reasoning = stream_reasoning + + # -- streaming state -- + self._buffer = "" + self._in_reasoning = cfg.force + self._start_consumed = False # True once start tag has been stripped + self._done = False # True once end tag has been seen + + # ------------------------------------------------------------------ # + # Non-streaming + # ------------------------------------------------------------------ # + + def parse_non_stream(self, text: str) -> Tuple[Optional[str], str]: + """Parse complete text. + + Returns ``(reasoning_content, content)`` where either may be empty. + """ + start_idx = text.find(self._start) + end_idx = text.find(self._end) + + if start_idx == -1 and not self._force: + return None, text + + # Determine boundaries + if self._force and start_idx == -1: + # Model didn't emit explicit start tag; treat prefix as reasoning + reason_start = 0 + else: + reason_start = start_idx + len(self._start) + + before = text[:start_idx] if start_idx != -1 else "" + + if end_idx != -1 and end_idx >= reason_start: + reasoning = text[reason_start:end_idx] + after = text[end_idx + len(self._end) :] + else: + reasoning = text[reason_start:] + after = "" + + content = (before + after).strip() + reasoning = reasoning.strip() + return reasoning or None, content + + # ------------------------------------------------------------------ # + # Streaming + # ------------------------------------------------------------------ # + + def parse_stream_chunk(self, delta: str) -> Tuple[str, str]: + """Parse an incremental streaming delta. + + Returns ``(reasoning_delta, content_delta)``. Either may be ``""``. + """ + if not delta: + return "", "" + + if self._done: + return "", delta + + self._buffer += delta + reasoning_out = "" + content_out = "" + + # In forced reasoning mode, consume the start tag if it appears + # (the model may or may not emit it explicitly). 
+ if self._in_reasoning and not self._start_consumed: + idx = self._buffer.find(self._start) + if idx != -1: + # Start tag found — strip it and any text before it + self._buffer = self._buffer[idx + len(self._start) :] + self._start_consumed = True + elif _could_be_partial(self._buffer, self._start): + # Might be a partial start tag — hold the buffer + return "", "" + else: + # No start tag coming — mark consumed and continue + self._start_consumed = True + + if not self._in_reasoning: + # --- look for start tag --- + idx = self._buffer.find(self._start) + if idx != -1: + content_out += self._buffer[:idx] + self._buffer = self._buffer[idx + len(self._start) :] + self._in_reasoning = True + self._start_consumed = True + elif _could_be_partial(self._buffer, self._start): + # Potential partial match at tail — hold the buffer + safe = len(self._buffer) - len(self._start) + 1 + if safe > 0: + content_out += self._buffer[:safe] + self._buffer = self._buffer[safe:] + return "", content_out + else: + content_out += self._buffer + self._buffer = "" + return "", content_out + + if self._in_reasoning: + # --- look for end tag --- + idx = self._buffer.find(self._end) + if idx != -1: + reasoning_out += self._buffer[:idx] + after = self._buffer[idx + len(self._end) :] + self._buffer = "" + self._in_reasoning = False + self._done = True + if after: + content_out += after + elif _could_be_partial(self._buffer, self._end): + safe = len(self._buffer) - len(self._end) + 1 + if safe > 0: + reasoning_out += self._buffer[:safe] + self._buffer = self._buffer[safe:] + else: + reasoning_out += self._buffer + self._buffer = "" + + if not self._stream_reasoning: + reasoning_out = "" + + return reasoning_out, content_out + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _could_be_partial(text: str, pattern: str) -> bool: + """Return True if *text* ends with a prefix of *pattern*.""" + for i in range(1, len(pattern)): + if text.endswith(pattern[:i]): + return True + return False diff --git a/pymllm/parsers/tool_call_parser.py b/pymllm/parsers/tool_call_parser.py new file mode 100644 index 000000000..fdfe93914 --- /dev/null +++ b/pymllm/parsers/tool_call_parser.py @@ -0,0 +1,433 @@ +"""Tool-call (function-calling) output parser. + +Extracts structured tool calls from model output text. Supports both +one-shot and incremental streaming modes. 
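+
+For illustration, a qwen25-style call embedded in raw model output looks
+roughly like this (function name and arguments are made-up examples)::
+
+    Sure, let me check.
+    <tool_call>
+    {"name": "get_weather", "arguments": {"city": "Paris"}}
+    </tool_call>
+
+When streamed, ``parse_stream_chunk`` first emits the plain content
+("Sure, let me check."), then a :class:`ToolCallItem` carrying only the
+function name once ``"name"`` is complete, and finally argument-delta items
+until the closing tag arrives.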
+
+Formats supported:
+
+* **qwen25** — ``<tool_call>{"name":...,"arguments":...}</tool_call>``
+* **llama3** — ``<|python_tag|>{"name":...,"parameters":...}``
+* **hermes** — ``<tool_call>{"name":...,"arguments":...}</tool_call>`` (same tags, Hermes schema)
+
+Usage::
+
+    # Non-streaming
+    parser = ToolCallParser("qwen25", tools=tools_list)
+    content, tool_calls = parser.parse_non_stream(full_text)
+
+    # Streaming
+    parser = ToolCallParser("qwen25", tools=tools_list)
+    for delta in deltas:
+        content_delta, tool_call_deltas = parser.parse_stream_chunk(delta)
+"""
+
+from __future__ import annotations
+
+import json
+import re
+import uuid
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+
+# ---------------------------------------------------------------------------
+# Data structures
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class ToolCallItem:
+    """A single parsed tool call."""
+
+    name: Optional[str] = None
+    arguments: str = ""
+    tool_call_id: str = ""
+    index: int = 0
+
+    def to_openai_dict(self, streaming: bool = True) -> Dict[str, Any]:
+        """Convert to OpenAI ``tool_calls[]`` element format.
+
+        Parameters
+        ----------
+        streaming
+            If True, include ``index`` (streaming delta format).
+            If False, omit ``index`` (non-streaming message format).
+        """
+        d: Dict[str, Any] = {"type": "function", "function": {}}
+        if streaming:
+            d["index"] = self.index
+        if self.tool_call_id:
+            d["id"] = self.tool_call_id
+        fn: Dict[str, Any] = d["function"]
+        if self.name is not None:
+            fn["name"] = self.name
+        fn["arguments"] = self.arguments or ""
+        return d
+
+
+# ---------------------------------------------------------------------------
+# Detector base
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class _FormatConfig:
+    bot_token: str
+    end_token: str
+    # Regex to extract individual call bodies between bot/end tokens.
+    # If None, the entire text between bot and end tokens is one call.
+    call_regex: Optional[str] = None
+
+
+_FORMAT_MAP: Dict[str, _FormatConfig] = {
+    "qwen25": _FormatConfig(
+        bot_token="<tool_call>\n",
+        end_token="\n</tool_call>",
+    ),
+    "qwen3_coder": _FormatConfig(
+        bot_token="<tool_call>",
+        end_token="</tool_call>",
+    ),
+    "hermes": _FormatConfig(
+        bot_token="<tool_call>\n",
+        end_token="\n</tool_call>",
+    ),
+    "llama3": _FormatConfig(
+        bot_token="<|python_tag|>",
+        end_token="",  # Llama3 uses EOT, detected via EOS
+    ),
+}
+
+
+# ---------------------------------------------------------------------------
+# ToolCallParser
+# ---------------------------------------------------------------------------
+
+
+class ToolCallParser:
+    """Model-agnostic tool-call parser.
+
+    Parameters
+    ----------
+    model_type
+        Key into the format registry (e.g. ``"qwen25"``, ``"llama3"``).
+    tools
+        The ``tools`` list from the OpenAI chat request (used to resolve
+        function names).
+    """
+
+    SUPPORTED = set(_FORMAT_MAP)
+
+    def __init__(self, model_type: str, tools: Optional[List[Any]] = None):
+        cfg = _FORMAT_MAP.get(model_type)
+        if cfg is None:
+            raise ValueError(
+                f"Unknown tool-call parser {model_type!r}. 
" + f"Supported: {sorted(_FORMAT_MAP)}" + ) + self._bot = cfg.bot_token + self._end = cfg.end_token + self._model_type = model_type + self._tools = tools or [] + + # -- streaming state -- + self._buffer = "" + self._in_call = False + self._current_tool_idx = 0 + self._current_call_buf = "" + self._prev_args_len = 0 + self._name_sent = False + self._completed_calls: List[ToolCallItem] = [] + + # ------------------------------------------------------------------ # + # Non-streaming + # ------------------------------------------------------------------ # + + def has_tool_call(self, text: str) -> bool: + """Return True if *text* contains a tool-call pattern.""" + return self._bot in text + + def parse_non_stream( + self, text: str + ) -> Tuple[str, List[ToolCallItem]]: + """Parse complete text. + + Returns ``(remaining_content, tool_calls)``. + """ + if not self.has_tool_call(text): + return text, [] + + tool_calls: List[ToolCallItem] = [] + normal_parts: List[str] = [] + + remaining = text + idx = 0 + while True: + bot_pos = remaining.find(self._bot) + if bot_pos == -1: + normal_parts.append(remaining) + break + normal_parts.append(remaining[:bot_pos]) + remaining = remaining[bot_pos + len(self._bot) :] + + if self._end: + end_pos = remaining.find(self._end) + if end_pos == -1: + call_body = remaining + remaining = "" + else: + call_body = remaining[:end_pos] + remaining = remaining[end_pos + len(self._end) :] + else: + call_body = remaining + remaining = "" + + parsed = self._parse_call_body(call_body.strip()) + if parsed is not None: + parsed.index = idx + parsed.tool_call_id = _make_tool_call_id() + tool_calls.append(parsed) + idx += 1 + + content = "".join(normal_parts).strip() + return content, tool_calls + + # ------------------------------------------------------------------ # + # Streaming + # ------------------------------------------------------------------ # + + def parse_stream_chunk( + self, delta: str + ) -> Tuple[str, List[ToolCallItem]]: + """Parse an incremental streaming delta. + + Returns ``(content_delta, tool_call_items)``. + + For tool call items: + - First item for a call: ``name`` is set, ``arguments`` is ``""``. + - Subsequent items: ``name`` is ``None``, ``arguments`` is the new + characters appended (argument delta). 
+ """ + if not delta: + return "", [] + + self._buffer += delta + content_out = "" + items: List[ToolCallItem] = [] + + while True: + if not self._in_call: + # --- look for bot token --- + bot_pos = self._buffer.find(self._bot) + if bot_pos != -1: + content_out += self._buffer[:bot_pos] + self._buffer = self._buffer[bot_pos + len(self._bot) :] + self._in_call = True + self._current_call_buf = "" + self._prev_args_len = 0 + self._name_sent = False + continue # try to process call content + else: + # Check for partial bot token at tail + if self._bot and _could_be_partial(self._buffer, self._bot): + safe = len(self._buffer) - len(self._bot) + 1 + if safe > 0: + content_out += self._buffer[:safe] + self._buffer = self._buffer[safe:] + else: + content_out += self._buffer + self._buffer = "" + break + + if self._in_call: + # --- look for end token --- + if self._end: + end_pos = self._buffer.find(self._end) + if end_pos != -1: + self._current_call_buf += self._buffer[:end_pos] + self._buffer = self._buffer[end_pos + len(self._end) :] + # Emit final tool call + item = self._finalize_call() + if item is not None: + items.append(item) + self._in_call = False + self._current_tool_idx += 1 + continue # there may be more calls + else: + # Accumulate and stream arguments + self._current_call_buf += self._buffer + self._buffer = "" + item = self._stream_partial_call() + if item is not None: + items.append(item) + break + else: + # No end token (e.g. Llama3) — accumulate everything + self._current_call_buf += self._buffer + self._buffer = "" + item = self._stream_partial_call() + if item is not None: + items.append(item) + break + + return content_out, items + + def flush(self) -> List[ToolCallItem]: + """Flush any remaining buffered tool call (call at request end).""" + items: List[ToolCallItem] = [] + if self._in_call and self._current_call_buf.strip(): + item = self._finalize_call() + if item is not None: + items.append(item) + self._in_call = False + return items + + # ------------------------------------------------------------------ # + # Internal helpers + # ------------------------------------------------------------------ # + + def _parse_call_body(self, body: str) -> Optional[ToolCallItem]: + """Parse a single call body (JSON or qwen3_coder XML-style).""" + if self._model_type == "qwen3_coder": + return self._parse_qwen3_coder_body(body) + try: + obj = json.loads(body) + except json.JSONDecodeError: + return None + name = obj.get("name") + args = obj.get("arguments") or obj.get("parameters") or {} + if isinstance(args, dict): + args = json.dumps(args, ensure_ascii=False) + return ToolCallItem(name=name, arguments=args) + + @staticmethod + def _parse_qwen3_coder_body(body: str) -> Optional[ToolCallItem]: + """Parse qwen3_coder XML-style: ``V...``.""" + # Extract function name + func_m = re.search(r"]+)>", body) + if func_m is None: + return None + name = func_m.group(1) + # Extract parameters + params: Dict[str, Any] = {} + for pm in re.finditer( + r"]+)>(.*?)(?:|(?=))", + body, + re.DOTALL, + ): + key = pm.group(1) + val = pm.group(2).strip() + # Try to parse as JSON value, otherwise keep as string + try: + params[key] = json.loads(val) + except (json.JSONDecodeError, ValueError): + params[key] = val + return ToolCallItem( + name=name, + arguments=json.dumps(params, ensure_ascii=False), + ) + + def _stream_partial_call(self) -> Optional[ToolCallItem]: + """Try to extract streaming information from the partial call.""" + body = self._current_call_buf.strip() + if not body: + return None + + # 
Try to extract name first + if not self._name_sent: + name = self._try_extract_name(body) + if name is not None: + self._name_sent = True + return ToolCallItem( + name=name, + arguments="", + tool_call_id=_make_tool_call_id(), + index=self._current_tool_idx, + ) + return None + + # Stream argument characters + args_str = self._try_extract_args_partial(body) + if args_str is not None and len(args_str) > self._prev_args_len: + new_chars = args_str[self._prev_args_len :] + self._prev_args_len = len(args_str) + return ToolCallItem( + name=None, + arguments=new_chars, + index=self._current_tool_idx, + ) + return None + + def _finalize_call(self) -> Optional[ToolCallItem]: + """Finalize a complete call — emit any remaining argument chars.""" + parsed = self._parse_call_body(self._current_call_buf.strip()) + if parsed is None: + return None + + if not self._name_sent: + # Entire call came at once + parsed.index = self._current_tool_idx + parsed.tool_call_id = _make_tool_call_id() + return parsed + + # Name was already sent — emit remaining arguments + full_args = parsed.arguments + new_chars = full_args[self._prev_args_len :] + if new_chars: + return ToolCallItem( + name=None, + arguments=new_chars, + index=self._current_tool_idx, + ) + return None + + def _try_extract_name(self, partial: str) -> Optional[str]: + """Try to extract function name from partial call body.""" + if self._model_type == "qwen3_coder": + m = re.search(r"]+)>", partial) + return m.group(1) if m else None + m = re.search(r'"name"\s*:\s*"([^"]+)"', partial) + return m.group(1) if m else None + + def _try_extract_args_partial(self, partial: str) -> Optional[str]: + """Try to extract partial arguments from call body.""" + if self._model_type == "qwen3_coder": + # Build JSON incrementally from V tags + params: Dict[str, Any] = {} + for pm in re.finditer( + r"]+)>(.*?)(?:)", + partial, + re.DOTALL, + ): + key = pm.group(1) + val = pm.group(2).strip() + try: + params[key] = json.loads(val) + except (json.JSONDecodeError, ValueError): + params[key] = val + if params: + return json.dumps(params, ensure_ascii=False) + return None + m = re.search(r'"arguments"\s*:\s*(\{.*)', partial, re.DOTALL) + if m: + return m.group(1) + m = re.search(r'"parameters"\s*:\s*(\{.*)', partial, re.DOTALL) + if m: + return m.group(1) + return None + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_tool_call_id() -> str: + return f"call_{uuid.uuid4().hex[:24]}" + + +def _could_be_partial(text: str, pattern: str) -> bool: + for i in range(1, len(pattern)): + if text.endswith(pattern[:i]): + return True + return False diff --git a/pymllm/server/launch.py b/pymllm/server/launch.py index 83a222f7e..b9f603220 100644 --- a/pymllm/server/launch.py +++ b/pymllm/server/launch.py @@ -1,17 +1,936 @@ +"""pymllm HTTP server -- RESTful API entry point. + +This module implements a FastAPI-based HTTP server that wraps the pymllm +:class:`Engine` and exposes OpenAI-compatible and native REST endpoints, +following the architecture of sglang's ``http_server.py``. 
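+
+A minimal client call against a running server (a sketch: host, port, model
+name and the standard OpenAI response shape are assumed, not taken from the
+deployment's config)::
+
+    import requests
+
+    resp = requests.post(
+        "http://localhost:8000/v1/chat/completions",
+        json={
+            "model": "my-model",
+            "messages": [{"role": "user", "content": "Hello!"}],
+        },
+    )
+    print(resp.json()["choices"][0]["message"]["content"])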
+ +Endpoints +--------- +* ``GET /health`` -- liveness probe +* ``GET /v1/models`` -- list served models (OpenAI-compatible) +* ``POST /generate`` -- native generate (streaming via SSE) +* ``POST /v1/completions`` -- OpenAI-compatible completions +* ``POST /v1/chat/completions`` -- OpenAI-compatible chat completions +* ``GET /model_info`` -- model metadata +* ``GET /server_info`` -- runtime config dump +* ``POST /flush_cache`` -- flush internal caches +* ``POST /abort_request`` -- cancel a running request +""" + +import asyncio +import logging +import os +import time +import uuid +from contextlib import asynccontextmanager +from typing import Any, AsyncIterator, Dict, List, Optional, Union + +import orjson +import uvicorn +import uvloop +from fastapi import FastAPI, HTTPException, Request +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import ORJSONResponse, Response, StreamingResponse +from pydantic import BaseModel, Field + +from pymllm.configs.global_config import get_global_config, make_args, read_args from pymllm.engine.launch import Engine -from pymllm.configs.global_config import make_args, read_args + +logger = logging.getLogger(__name__) +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + +# --------------------------------------------------------------------------- +# Global handles (populated at startup) +# --------------------------------------------------------------------------- +_engine: Optional[Engine] = None +_tokenizer: Optional[Any] = None + + +def _get_engine() -> Engine: + """Return the running engine or raise.""" + if _engine is None: + raise RuntimeError("Engine not initialised") + return _engine + + +# --------------------------------------------------------------------------- +# Pydantic request / response models +# --------------------------------------------------------------------------- + + +class GenerateRequest(BaseModel): + """Body for ``POST /generate``.""" + + text: Optional[Union[List[str], str]] = None + input_ids: Optional[Union[List[List[int]], List[int]]] = None + sampling_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None + image_data: Optional[Any] = None + audio_data: Optional[Any] = None + video_data: Optional[Any] = None + return_logprob: Optional[Union[List[bool], bool]] = None + logprob_start_len: Optional[Union[List[int], int]] = None + top_logprobs_num: Optional[Union[List[int], int]] = None + lora_path: Optional[Union[List[Optional[str]], str]] = None + session_params: Optional[Union[List[Dict[str, Any]], Dict[str, Any]]] = None + stream: bool = False + rid: Optional[Union[List[str], str]] = None + + model_config = {"extra": "allow"} # forward unknown keys as extra_options + + +# -- OpenAI-compatible models ----------------------------------------------- + + +class ImageUrl(BaseModel): + url: str + detail: Optional[str] = "auto" + + +class ContentPart(BaseModel): + type: str + text: Optional[str] = None + image_url: Optional[ImageUrl] = None + + +class ChatMessage(BaseModel): + role: str + content: Optional[Union[str, List[ContentPart]]] = None + name: Optional[str] = None + tool_calls: Optional[List[Any]] = None + tool_call_id: Optional[str] = None + + model_config = {"extra": "allow"} + + +class StreamOptions(BaseModel): + include_usage: Optional[bool] = False + continuous_usage_stats: Optional[bool] = False + + +class ToolFunction(BaseModel): + name: str + description: Optional[str] = None + parameters: Optional[Dict[str, Any]] = None + + +class Tool(BaseModel): + type: str = 
"function" + function: ToolFunction + + +class ChatCompletionRequest(BaseModel): + """OpenAI ``POST /v1/chat/completions`` body.""" + + model: str = "" + messages: List[ChatMessage] + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + max_tokens: Optional[int] = None + max_completion_tokens: Optional[int] = None + stream: bool = False + stream_options: Optional[StreamOptions] = None + stop: Optional[Union[str, List[str]]] = None + n: int = 1 + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + repetition_penalty: Optional[float] = None + seed: Optional[int] = None + logprobs: Optional[bool] = None + top_logprobs: Optional[int] = None + user: Optional[str] = None + # Tool calling + tools: Optional[List[Tool]] = None + tool_choice: Optional[Union[str, Dict[str, Any]]] = None + # Reasoning control + separate_reasoning: bool = True + stream_reasoning: bool = True + # Pass-through to tokenizer.apply_chat_template (e.g. enable_thinking) + chat_template_kwargs: Optional[Dict[str, Any]] = None + + model_config = {"extra": "allow"} + + +class CompletionRequest(BaseModel): + """OpenAI ``POST /v1/completions`` body.""" + + model: str = "" + prompt: Union[str, List[str]] + temperature: Optional[float] = None + top_p: Optional[float] = None + top_k: Optional[int] = None + max_tokens: Optional[int] = None + stream: bool = False + stream_options: Optional[StreamOptions] = None + stop: Optional[Union[str, List[str]]] = None + n: int = 1 + frequency_penalty: Optional[float] = None + presence_penalty: Optional[float] = None + repetition_penalty: Optional[float] = None + seed: Optional[int] = None + echo: bool = False + logprobs: Optional[int] = None + user: Optional[str] = None + + model_config = {"extra": "allow"} + + +class AbortRequest(BaseModel): + rid: Optional[str] = None + + +# --------------------------------------------------------------------------- +# FastAPI application & lifespan +# --------------------------------------------------------------------------- + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Startup / shutdown hooks for the FastAPI app.""" + global _engine, _tokenizer + _engine = app.state.engine # type: ignore[attr-defined] + + # Load tokenizer in server process for apply_chat_template + cfg = get_global_config() + try: + from transformers import AutoTokenizer + + _tokenizer = AutoTokenizer.from_pretrained( + str(cfg.server.tokenizer_path), + trust_remote_code=cfg.server.trust_remote_code, + ) + logger.info( + "Loaded tokenizer for chat template: %s", cfg.server.tokenizer_path + ) + except Exception as e: + logger.warning("Failed to load tokenizer for chat template: %s", e) + + logger.info( + "HTTP server ready at http://%s:%s", + cfg.server.host, + cfg.server.port, + ) + yield + # Shutdown + if _engine is not None: + _engine.shutdown() + _engine = None + + +app = FastAPI(lifespan=lifespan) +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# --------------------------------------------------------------------------- +# Exception handlers +# --------------------------------------------------------------------------- + + +@app.exception_handler(HTTPException) +async def http_exception_handler(request: Request, exc: HTTPException): + return ORJSONResponse( + content={"error": {"message": exc.detail, "code": exc.status_code}}, + status_code=exc.status_code, + ) + + +# 
--------------------------------------------------------------------------- +# Health / info endpoints +# --------------------------------------------------------------------------- + + +@app.get("/health") +@app.get("/health_generate") +async def health(): + """Liveness probe.""" + return Response(status_code=200) + + +@app.get("/model_info") +async def model_info(): + """Return basic model metadata.""" + cfg = get_global_config() + hf_cfg = cfg.model.hf_config + return { + "model_path": str(cfg.server.model_path), + "tokenizer_path": str(cfg.server.tokenizer_path), + "served_model_name": cfg.server.served_model_name, + "model_type": getattr(hf_cfg, "model_type", None) if hf_cfg else None, + "architectures": getattr(hf_cfg, "architectures", None) if hf_cfg else None, + } + + +@app.get("/server_info") +async def server_info(): + """Dump runtime server configuration.""" + import dataclasses as _dc + + cfg = get_global_config() + return _dc.asdict(cfg.server) + + +@app.get("/v1/models") +async def list_models(): + """OpenAI-compatible model listing.""" + cfg = get_global_config() + model_name = cfg.server.served_model_name or str(cfg.server.model_path) + return { + "object": "list", + "data": [_model_card(model_name)], + } + + +@app.get("/v1/models/{model_id:path}") +async def retrieve_model(model_id: str): + """OpenAI-compatible single model retrieval.""" + cfg = get_global_config() + model_name = cfg.server.served_model_name or str(cfg.server.model_path) + if model_id != model_name: + raise HTTPException( + status_code=404, + detail=f"Model '{model_id}' not found. Available: '{model_name}'", + ) + return _model_card(model_name) + + +def _model_card(model_name: str) -> Dict[str, Any]: + """Build an OpenAI-compatible Model object.""" + return { + "id": model_name, + "object": "model", + "created": int(time.time()), + "owned_by": "pymllm", + } + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +# Map internal finish reasons to OpenAI-standard values. 
+_FINISH_REASON_MAP = { + "eos": "stop", + "stop": "stop", + "length": "length", + "abort": "stop", +} + + +def _normalize_finish_reason(reason: Optional[str]) -> Optional[str]: + """Convert internal finish reason to OpenAI-compatible value.""" + if reason is None: + return None + return _FINISH_REASON_MAP.get(reason, reason) + + +def _build_sampling_params( + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + max_tokens: Optional[int] = None, + stop: Optional[Union[str, List[str]]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + repetition_penalty: Optional[float] = None, + seed: Optional[int] = None, + **extra: Any, +) -> Dict[str, Any]: + """Build a sampling_params dict from OpenAI-style fields.""" + params: Dict[str, Any] = {} + if temperature is not None: + params["temperature"] = temperature + if top_p is not None: + params["top_p"] = top_p + if top_k is not None: + params["top_k"] = top_k + if max_tokens is not None: + params["max_new_tokens"] = max_tokens + if stop is not None: + params["stop"] = stop if isinstance(stop, list) else [stop] + if frequency_penalty is not None: + params["frequency_penalty"] = frequency_penalty + if presence_penalty is not None: + params["presence_penalty"] = presence_penalty + if repetition_penalty is not None: + params["repetition_penalty"] = repetition_penalty + if seed is not None: + params["seed"] = seed + params.update(extra) + return params + + +def _messages_to_prompt( + messages: List[ChatMessage], + chat_template_kwargs: Optional[Dict[str, Any]] = None, +) -> str: + """Render chat messages into a prompt string via the model's chat template. + + Uses ``tokenizer.apply_chat_template()`` when available (handles Llama, + Qwen, Mistral, etc. automatically). Falls back to ChatML format. + + Parameters + ---------- + chat_template_kwargs + Extra keyword arguments forwarded to ``apply_chat_template`` + (e.g. ``enable_thinking=True`` for Qwen3). + """ + # Flatten each message into a plain dict for the tokenizer. + msg_dicts: List[Dict[str, Any]] = [] + for msg in messages: + content = msg.content + if isinstance(content, list): + # Multimodal: extract only text parts for the prompt string. 
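+            # e.g. [{"type": "text", "text": "Describe this"},
+            #       {"type": "image_url", "image_url": {...}}] -> "Describe this"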
+ text_parts = [p.text for p in content if p.type == "text" and p.text] + content = "\n".join(text_parts) if text_parts else "" + elif content is None: + content = "" + d: Dict[str, Any] = {"role": msg.role, "content": content} + if msg.name is not None: + d["name"] = msg.name + msg_dicts.append(d) + + tokenizer = _tokenizer + if tokenizer is not None and hasattr(tokenizer, "apply_chat_template"): + try: + extra = dict(chat_template_kwargs) if chat_template_kwargs else {} + return tokenizer.apply_chat_template( + msg_dicts, + tokenize=False, + add_generation_prompt=True, + **extra, + ) + except Exception as e: + logger.warning("apply_chat_template failed, using fallback: %s", e) + + # Fallback: ChatML format (Qwen-style) + parts: List[str] = [] + for m in msg_dicts: + parts.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>") + parts.append("<|im_start|>assistant\n") + return "\n".join(parts) + + +def _extract_image_data(messages: List[ChatMessage]) -> Optional[List[str]]: + """Extract image URLs / base64 strings from multimodal content parts.""" + images: List[str] = [] + for msg in messages: + if not isinstance(msg.content, list): + continue + for part in msg.content: + if part.type == "image_url" and part.image_url is not None: + images.append(part.image_url.url) + return images if images else None + + +def _make_completion_id() -> str: + return f"cmpl-{uuid.uuid4().hex[:24]}" + + +def _make_chat_completion_id() -> str: + return f"chatcmpl-{uuid.uuid4().hex[:24]}" + + +# --------------------------------------------------------------------------- +# Native generate endpoint +# --------------------------------------------------------------------------- + + +@app.api_route("/generate", methods=["POST", "PUT"]) +async def generate(obj: GenerateRequest, request: Request): + """Native generation endpoint. Supports SSE streaming.""" + engine = _get_engine() + + # Collect extra fields as extra_options + known = set(GenerateRequest.model_fields.keys()) + extra_options = {k: v for k, v in obj.model_dump().items() if k not in known} + + kwargs: Dict[str, Any] = { + "prompt": obj.text, + "input_ids": obj.input_ids, + "sampling_params": obj.sampling_params, + "image_data": obj.image_data, + "audio_data": obj.audio_data, + "video_data": obj.video_data, + "return_logprob": obj.return_logprob, + "logprob_start_len": obj.logprob_start_len, + "top_logprobs_num": obj.top_logprobs_num, + "lora_path": obj.lora_path, + "session_params": obj.session_params, + "stream": obj.stream, + "rid": obj.rid, + **extra_options, + } + # Strip None values so Engine defaults are used + kwargs = {k: v for k, v in kwargs.items() if v is not None} + + if obj.stream: + + async def _stream() -> AsyncIterator[bytes]: + try: + async for chunk in engine.generate_async(**kwargs): + if await request.is_disconnected(): + break + # Skip empty intermediate chunks (e.g. 
special tokens + # stripped by the detokenizer) + if not chunk.get("delta") and not chunk.get("finished"): + continue + yield b"data: " + orjson.dumps(chunk) + b"\n\n" + except Exception as e: + err = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps(err) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse(_stream(), media_type="text/event-stream") + + try: + results = [] + async for item in engine.generate_async(**kwargs): + results.append(item) + result = results[0] if len(results) == 1 else results + return ORJSONResponse(result) + except Exception as e: + logger.error("[generate] Error: %s", e) + raise HTTPException(status_code=400, detail=str(e)) + + +# --------------------------------------------------------------------------- +# OpenAI-compatible /v1/completions +# --------------------------------------------------------------------------- + + +@app.post("/v1/completions") +async def openai_completions(obj: CompletionRequest, request: Request): + """OpenAI-compatible text completion endpoint.""" + engine = _get_engine() + sp = _build_sampling_params( + temperature=obj.temperature, + top_p=obj.top_p, + top_k=obj.top_k, + max_tokens=obj.max_tokens, + stop=obj.stop, + frequency_penalty=obj.frequency_penalty, + presence_penalty=obj.presence_penalty, + repetition_penalty=obj.repetition_penalty, + seed=obj.seed, + ) + cfg = get_global_config() + model_name = obj.model or cfg.server.served_model_name or str(cfg.server.model_path) + include_usage = ( + obj.stream_options is not None and obj.stream_options.include_usage + ) + + if obj.stream: + + async def _stream() -> AsyncIterator[bytes]: + comp_id = _make_completion_id() + prompt_tokens = 0 + completion_tokens = 0 + try: + async for chunk in engine.generate_async( + prompt=obj.prompt, sampling_params=sp, stream=True + ): + if await request.is_disconnected(): + break + prompt_tokens = chunk.get("prompt_tokens", prompt_tokens) + completion_tokens = chunk.get("completion_tokens", completion_tokens) + delta_text = chunk.get("delta", "") + finish_reason = _normalize_finish_reason( + chunk.get("finished_reason") + ) + # Skip empty intermediate chunks + if not delta_text and finish_reason is None: + continue + sse: Dict[str, Any] = { + "id": comp_id, + "object": "text_completion", + "created": int(time.time()), + "model": model_name, + "choices": [ + { + "index": 0, + "text": delta_text, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + } + yield b"data: " + orjson.dumps(sse) + b"\n\n" + except Exception as e: + err = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps(err) + b"\n\n" + # Final usage-only chunk (OpenAI stream_options.include_usage) + if include_usage: + usage_chunk: Dict[str, Any] = { + "id": comp_id, + "object": "text_completion", + "created": int(time.time()), + "model": model_name, + "choices": [], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } + yield b"data: " + orjson.dumps(usage_chunk) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse(_stream(), media_type="text/event-stream") + + try: + results = [] + async for item in engine.generate_async( + prompt=obj.prompt, sampling_params=sp + ): + results.append(item) + choices = [] + prompt_tokens = 0 + completion_tokens = 0 + for i, r in enumerate(results): + choices.append( + { + "index": i, + "text": r.get("text", ""), + "logprobs": None, + "finish_reason": _normalize_finish_reason( + 
r.get("finished_reason", "stop") + ), + } + ) + prompt_tokens += r.get("prompt_tokens", 0) + completion_tokens += r.get("completion_tokens", 0) + + return ORJSONResponse( + { + "id": _make_completion_id(), + "object": "text_completion", + "created": int(time.time()), + "model": model_name, + "choices": choices, + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } + ) + except Exception as e: + logger.error("[v1/completions] Error: %s", e) + raise HTTPException(status_code=400, detail=str(e)) + + +# --------------------------------------------------------------------------- +# OpenAI-compatible /v1/chat/completions +# --------------------------------------------------------------------------- + + +@app.post("/v1/chat/completions") +async def openai_chat_completions(obj: ChatCompletionRequest, request: Request): + """OpenAI-compatible chat completion endpoint with reasoning & tool-call parsing.""" + engine = _get_engine() + cfg = get_global_config() + # Auto-enable thinking when reasoning_parser is configured and the + # client didn't explicitly set enable_thinking. + chat_kwargs = dict(obj.chat_template_kwargs) if obj.chat_template_kwargs else {} + if cfg.server.reasoning_parser and "enable_thinking" not in chat_kwargs: + chat_kwargs["enable_thinking"] = True + prompt = _messages_to_prompt(obj.messages, chat_template_kwargs=chat_kwargs or None) + image_data = _extract_image_data(obj.messages) + + # max_completion_tokens takes precedence over max_tokens (OpenAI convention) + max_tokens = obj.max_completion_tokens if obj.max_completion_tokens is not None else obj.max_tokens + + sp = _build_sampling_params( + temperature=obj.temperature, + top_p=obj.top_p, + top_k=obj.top_k, + max_tokens=max_tokens, + stop=obj.stop, + frequency_penalty=obj.frequency_penalty, + presence_penalty=obj.presence_penalty, + repetition_penalty=obj.repetition_penalty, + seed=obj.seed, + ) + cfg = get_global_config() + model_name = obj.model or cfg.server.served_model_name or str(cfg.server.model_path) + include_usage = ( + obj.stream_options is not None and obj.stream_options.include_usage + ) + + # Resolve parsers from server config + reasoning_type = cfg.server.reasoning_parser + tool_call_type = cfg.server.tool_call_parser + + gen_kwargs: Dict[str, Any] = { + "prompt": prompt, + "sampling_params": sp, + } + if image_data is not None: + gen_kwargs["image_data"] = image_data + + if obj.stream: + + async def _stream() -> AsyncIterator[bytes]: + from pymllm.parsers import ReasoningParser, ToolCallParser + + comp_id = _make_chat_completion_id() + created = int(time.time()) + first = True + prompt_tokens = 0 + completion_tokens = 0 + has_tool_calls = False # track across entire stream + + # Instantiate streaming parsers + r_parser = ( + ReasoningParser(reasoning_type, stream_reasoning=obj.stream_reasoning) + if reasoning_type and obj.separate_reasoning + else None + ) + tc_parser = ( + ToolCallParser(tool_call_type, tools=obj.tools) + if tool_call_type and obj.tools + else None + ) + + def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: + sse: Dict[str, Any] = { + "id": comp_id, + "object": "chat.completion.chunk", + "created": created, + "model": model_name, + "choices": [ + { + "index": 0, + "delta": delta, + "logprobs": None, + "finish_reason": finish, + } + ], + } + return b"data: " + orjson.dumps(sse) + b"\n\n" + + try: + async for chunk in engine.generate_async(**gen_kwargs, stream=True): + if 
await request.is_disconnected(): + break + prompt_tokens = chunk.get("prompt_tokens", prompt_tokens) + completion_tokens = chunk.get("completion_tokens", completion_tokens) + + raw_delta = chunk.get("delta", "") + finish_reason = _normalize_finish_reason( + chunk.get("finished_reason") + ) + + # --- Phase 1: reasoning parser --- + reasoning_delta = "" + content_delta = raw_delta + if r_parser and raw_delta: + reasoning_delta, content_delta = r_parser.parse_stream_chunk( + raw_delta + ) + + # --- Phase 2: tool-call parser --- + tool_items: list = [] + if tc_parser and content_delta: + content_delta, tool_items = tc_parser.parse_stream_chunk( + content_delta + ) + + # --- Emit chunks --- + # Role chunk (first) + if first: + yield _make_sse({"role": "assistant"}) + first = False + + # Reasoning content + if reasoning_delta: + yield _make_sse({"reasoning_content": reasoning_delta}) + + # Tool call deltas + if tool_items: + has_tool_calls = True + for tc in tool_items: + yield _make_sse({"tool_calls": [tc.to_openai_dict()]}) + + # Normal content + if content_delta: + yield _make_sse({"content": content_delta}) + + # Finish + if finish_reason is not None: + # Flush remaining tool call data + if tc_parser: + remaining = tc_parser.flush() + for tc in remaining: + has_tool_calls = True + yield _make_sse({"tool_calls": [tc.to_openai_dict()]}) + if has_tool_calls: + finish_reason = "tool_calls" + yield _make_sse({}, finish=finish_reason) + + except Exception as e: + err = {"error": {"message": str(e)}} + yield b"data: " + orjson.dumps(err) + b"\n\n" + # Final usage-only chunk + if include_usage: + usage_chunk: Dict[str, Any] = { + "id": comp_id, + "object": "chat.completion.chunk", + "created": created, + "model": model_name, + "choices": [], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + }, + } + yield b"data: " + orjson.dumps(usage_chunk) + b"\n\n" + yield b"data: [DONE]\n\n" + + return StreamingResponse(_stream(), media_type="text/event-stream") + + # -- Non-streaming -- + try: + from pymllm.parsers import ReasoningParser, ToolCallParser + + r = {} + async for item in engine.generate_async(**gen_kwargs): + r = item + prompt_tokens = r.get("prompt_tokens", 0) + completion_tokens = r.get("completion_tokens", 0) + text = r.get("text", "") + finish_reason = _normalize_finish_reason(r.get("finished_reason", "stop")) + + # Parse reasoning + reasoning_content = None + if reasoning_type and obj.separate_reasoning: + rp = ReasoningParser(reasoning_type) + reasoning_content, text = rp.parse_non_stream(text) + + # Parse tool calls + tool_calls_list = None + if tool_call_type and obj.tools: + tp = ToolCallParser(tool_call_type, tools=obj.tools) + if tp.has_tool_call(text): + text, parsed_calls = tp.parse_non_stream(text) + if parsed_calls: + tool_calls_list = [tc.to_openai_dict(streaming=False) for tc in parsed_calls] + finish_reason = "tool_calls" + + message: Dict[str, Any] = {"role": "assistant", "content": text or None} + if reasoning_content: + message["reasoning_content"] = reasoning_content + if tool_calls_list: + message["tool_calls"] = tool_calls_list + + return ORJSONResponse( + { + "id": _make_chat_completion_id(), + "object": "chat.completion", + "created": int(time.time()), + "model": model_name, + "choices": [ + { + "index": 0, + "message": message, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + 
"total_tokens": prompt_tokens + completion_tokens, + }, + } + ) + except Exception as e: + logger.error("[v1/chat/completions] Error: %s", e) + raise HTTPException(status_code=400, detail=str(e)) + + +# --------------------------------------------------------------------------- +# Administrative endpoints +# --------------------------------------------------------------------------- + + +@app.api_route("/flush_cache", methods=["GET", "POST"]) +async def flush_cache(): + """Placeholder cache flush.""" + return Response(content="Cache flushed.\n", status_code=200) + + +@app.post("/abort_request") +async def abort_request(obj: AbortRequest): + """Abort a running request by rid.""" + engine = _get_engine() + if obj.rid and engine._rr_process is not None: + await engine._rr_process.abort_request(obj.rid) + return Response(status_code=200) + raise HTTPException(status_code=400, detail="Missing or invalid rid") + + +# --------------------------------------------------------------------------- +# Prepare args helper +# --------------------------------------------------------------------------- def _prepare_args(): + """Parse CLI arguments into the global config singleton.""" parser = make_args() read_args(parser=parser) -def main(): +# --------------------------------------------------------------------------- +# Server launcher +# --------------------------------------------------------------------------- + + +def launch_server(): + """Launch the pymllm Engine then start the uvicorn HTTP server. + + This function mirrors sglang's ``launch_server``: it first boots all engine + subprocesses (tokenizer, scheduler, model-runner, detokenizer) and then + hands off to uvicorn to serve HTTP traffic. + """ _prepare_args() + cfg = get_global_config() + engine = Engine() engine.launch() + # Attach engine to app.state so the lifespan hook can pick it up. + app.state.engine = engine # type: ignore[attr-defined] + + logger.info( + "Starting HTTP server on %s:%s (root_path=%r)", + cfg.server.host, + cfg.server.port, + cfg.server.fastapi_root_path, + ) + + uvicorn.run( + app, + host=cfg.server.host, + port=cfg.server.port, + root_path=cfg.server.fastapi_root_path, + log_level=cfg.server.log_level, + timeout_keep_alive=5, + loop="uvloop", + ) + + +def main(): + """CLI entry point.""" + launch_server() + if __name__ == "__main__": main() From 506d61aaf567df64c27665cf796c4ee726a54c6f Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Tue, 17 Mar 2026 02:27:12 -0700 Subject: [PATCH 34/42] fix(attention): refine FlashInfer backend logic and improve RadixCache eviction handling - Adjusted logic in FlashInferAttnBackend to ensure proper handling of prefix lengths and ragged token extensions. - Enhanced RadixCache to correctly update evictable size when a childless node gains a child. - Added debug logging in ModelRunnerProcess for better traceability during cache operations and token allocations. - Ensured at least two tokens are extended to avoid edge cases in FlashInfer and CUDA kernel crashes. 
--- pymllm/layers/attention/flashinfer_backend.py | 28 +++-- pymllm/mem_cache/radix_cache.py | 19 ++++ pymllm/orchestrator/model_runner_process.py | 106 +++++++++++++++--- 3 files changed, 132 insertions(+), 21 deletions(-) diff --git a/pymllm/layers/attention/flashinfer_backend.py b/pymllm/layers/attention/flashinfer_backend.py index 479fb5cec..206947e4f 100644 --- a/pymllm/layers/attention/flashinfer_backend.py +++ b/pymllm/layers/attention/flashinfer_backend.py @@ -351,10 +351,17 @@ def init_forward_metadata(self, forward_batch: ForwardBatch) -> None: # Extend / prefill prefix_lens = forward_batch.extend_prefix_lens extend_no_prefix = ( - forward_batch.extend_prefix_lens_cpu is not None - and not any(forward_batch.extend_prefix_lens_cpu) + forward_batch.extend_prefix_lens_cpu is None + or not any(forward_batch.extend_prefix_lens_cpu) ) - use_ragged = extend_no_prefix + # use_ragged=True: match sglang's default. + # - extend_no_prefix=True → ragged-only (pure prefill, no cache) + # - extend_no_prefix=False → ragged+paged merge (cache hit) + # The paged wrapper covers only the cached prefix (prefix_lens), + # the ragged wrapper covers the new extend tokens. No overlap. + # NOTE: to avoid a FlashInfer edge-case with 1-token ragged + # extends, _allocate_extend guarantees extend_len >= 2. + use_ragged = True self.indices_updater_prefill.update( forward_batch.req_pool_indices, @@ -829,9 +836,12 @@ def update( ) else: if use_ragged: + # Merge path: paged covers ONLY the cached prefix so there + # is no overlap with the ragged (extend) tokens. paged_kernel_lens = prefix_lens - paged_kernel_lens_sum = paged_kernel_lens.sum().item() + paged_kernel_lens_sum = int(paged_kernel_lens.sum().item()) else: + # Paged-only path: covers the full sequence. paged_kernel_lens = seq_lens paged_kernel_lens_sum = seq_lens_sum @@ -872,9 +882,13 @@ def _update_sliding_window( paged_kernel_lens_sum = int(paged_kernel_lens.sum().item()) kv_start_idx = seq_lens - paged_kernel_lens else: - # Full-context portion. - paged_kernel_lens = seq_lens - paged_kernel_lens_sum = seq_lens_sum + # Full-context SWA wrapper: same split as non-SWA. + if use_ragged: + paged_kernel_lens = prefix_lens + paged_kernel_lens_sum = int(paged_kernel_lens.sum().item()) + else: + paged_kernel_lens = seq_lens + paged_kernel_lens_sum = seq_lens_sum kv_start_idx = None kv_indptr = self.kv_indptr[wrapper_id] diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py index 441a8c097..a472d5085 100644 --- a/pymllm/mem_cache/radix_cache.py +++ b/pymllm/mem_cache/radix_cache.py @@ -698,6 +698,16 @@ def _add_leaf( value: torch.Tensor, swa_tombstone: bool = False, ) -> TreeNode: + # If parent was a childless (leaf) node, it will no longer be + # evictable after gaining a child. Adjust the size counter. 
+ if ( + len(parent.children) == 0 + and parent != self.root_node + and parent.lock_ref == 0 + and not parent.evicted + ): + self._evictable_size -= len(parent.key) + new_node = TreeNode() new_node.parent = parent new_node.key = key @@ -711,6 +721,13 @@ def _add_leaf( def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode: """Split *child* at *split_len*, returning the new parent node.""" + logger.debug( + "[SPLIT] node_id=%d key_len=%d split_len=%d " + "parent_val[:4]=%s child_val[:4]=%s", + child.id, len(key), split_len, + child.value[:min(split_len, 4)].tolist() if child.value is not None else [], + child.value[split_len:split_len+4].tolist() if child.value is not None and len(child.value) > split_len else [], + ) new_node = TreeNode() new_node.children[_child_key(key[split_len:], self.page_size)] = child new_node.parent = child.parent @@ -742,6 +759,8 @@ def _delete_leaf(self, node: TreeNode) -> None: self._evictable_size -= len(node.key) if self.supports_swa and not node.swa_tombstone: self._swa_evictable_size -= len(node.key) + # Mark as evicted so node.evicted returns True. + node.value = None if self.on_node_evict is not None: self.on_node_evict(node.id) diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index d850dd53e..af5574de2 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -419,6 +419,7 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: still points at freed slots → use-after-free during decode. 4. **Update** ``cache_protected_len`` and radix lock. """ + _dbg = logger.isEnabledFor(logging.DEBUG) cache = self._radix_cache if cache is None or cache.disable: return @@ -441,14 +442,32 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: torch.int64 ) + if _dbg: + logger.debug( + "[CACHE INSERT] rid=%s seq_len=%d pool[slot=%d,0:%d]=%s", + rid, seq_len, slot, min(seq_len, 8), + kv_indices[:min(seq_len, 8)].tolist(), + ) + key = RadixKey(input_ids) result = cache.insert(key, kv_indices) new_prefix_len = result.prefix_len # --- Step 2: free duplicates --- cache_protected_len = self._rid_to_cache_protected_len.get(rid, 0) + if _dbg: + logger.debug( + "[CACHE INSERT] rid=%s insert prefix_len=%d cache_protected=%d", + rid, new_prefix_len, cache_protected_len, + ) if new_prefix_len > cache_protected_len: dup_indices = kv_indices[cache_protected_len:new_prefix_len] + if _dbg: + logger.debug( + "[CACHE INSERT] rid=%s freeing dup [%d:%d]=%s", + rid, cache_protected_len, new_prefix_len, + dup_indices[:min(len(dup_indices), 8)].tolist(), + ) if dup_indices.numel() > 0: runner.token_to_kv_pool_allocator.free(dup_indices) @@ -458,7 +477,23 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: # (still-live) indices instead of the freed ones. 
rematch = cache.match_prefix(key) new_indices = rematch.indices + if _dbg: + logger.debug( + "[CACHE INSERT] rid=%s rematch len=%d indices[:8]=%s", + rid, len(new_indices), new_indices[:min(len(new_indices), 8)].tolist(), + ) + if cache.page_size == 1: + assert len(new_indices) == seq_len, ( + f"Re-match length mismatch after insert: " + f"{len(new_indices)=}, {seq_len=}, rid={rid}" + ) if len(new_indices) > cache_protected_len: + if _dbg: + logger.debug( + "[CACHE INSERT] rid=%s write-back pool[slot=%d,%d:%d]=%s", + rid, slot, cache_protected_len, len(new_indices), + new_indices[cache_protected_len:cache_protected_len+8].tolist(), + ) runner.req_to_token_pool.write( (slot, slice(cache_protected_len, len(new_indices))), new_indices[cache_protected_len:].to(torch.int32), @@ -558,12 +593,17 @@ def _allocate_extend( last_node = None cached_indices = None - # Ensure at least 1 token is extended (not fully cached). - # A full cache hit (prefix_len == full_seq_len) would produce a - # 0-length input tensor that crashes CUDA kernels. Back off by 1 - # so the model always sees the last token. - if prefix_len >= full_seq_len: - prefix_len = full_seq_len - 1 + # Ensure at least 2 tokens are extended (not nearly fully cached). + # Reasons: + # 1. A full cache hit (prefix_len == full_seq_len) would produce a + # 0-length input tensor that crashes CUDA kernels. + # 2. A 1-token extend triggers an edge case in FlashInfer's + # ragged forward_return_lse (qo_len=1, kv_len=1, causal=True) + # where s1 (log-partition) is computed incorrectly, causing + # the cascade merge to produce wrong logits → EOS. + # By ensuring extend_len >= 2, we avoid both issues. + if prefix_len >= full_seq_len - 1 and full_seq_len >= 2: + prefix_len = full_seq_len - 2 if cached_indices is not None: cached_indices = cached_indices[:prefix_len] @@ -575,18 +615,41 @@ def _allocate_extend( if prefix_len > 0: logger.info( - "Radix cache hit for rid=%s: %d/%d tokens reused (%.1f%%)", + "Radix cache hit for rid=%s: %d/%d tokens reused (%.1f%%) " + "node_id=%s cached_kv[:8]=%s", m["rid"], prefix_len, full_seq_len, 100.0 * prefix_len / full_seq_len, + last_node.id if last_node is not None else None, + cached_indices[:min(prefix_len, 8)].tolist() + if cached_indices is not None else [], + ) + logger.info( + "Radix cache tree after match: evictable=%d protected=%d", + cache.evictable_size(), + cache.protected_size(), ) total_new_tokens = sum(actual_extend_lens) + # --- Step 1.5: Lock matched radix nodes BEFORE allocation --- + # This MUST happen before any allocation that could trigger eviction. + # Without locking first, _alloc_kv_with_eviction could evict the + # matched nodes, freeing their KV pool slots and causing + # use-after-free when we later read from cached_indices. 
+ if cache is not None and not cache.disable: + for i, m in enumerate(requests_meta): + node = matched_nodes[i] + if node is not None and actual_prefix_lens[i] > 0: + swa_boundary_id = cache.inc_lock_ref(node) + self._rid_to_radix_lock[m["rid"]] = (node, swa_boundary_id) + # --- Step 2: Allocate req pool slots --- slots = runner.req_to_token_pool.alloc(batch_size) if slots is None: + # Rollback locks on failure + self._unlock_matched_nodes(requests_meta) raise RuntimeError("Failed to allocate req pool slots for extend batch") # --- Step 3: Allocate KV tokens (with eviction retry) --- @@ -594,6 +657,8 @@ def _allocate_extend( if out_cache_loc is None: for s in slots: runner.req_to_token_pool.free(s) + # Rollback locks on failure + self._unlock_matched_nodes(requests_meta) raise RuntimeError( f"Failed to allocate {total_new_tokens} KV tokens for extend batch " f"(even after eviction)" @@ -611,6 +676,11 @@ def _allocate_extend( # Write cached prefix indices (from the match result we saved) cached_indices = cached_indices_list[i] if cached_indices is not None and prefix_len > 0: + logger.debug( + "[ALLOC EXTEND] rid=%s writing prefix[0:%d] to pool[slot=%d]: %s", + rid, prefix_len, slot, + cached_indices[:min(prefix_len, 8)].tolist(), + ) runner.req_to_token_pool.write( (slot, slice(0, prefix_len)), cached_indices[:prefix_len].to(torch.int32), @@ -678,16 +748,24 @@ def _allocate_extend( if ts is not None: self._rid_to_gdn_track_slot[rid] = ts - # --- Step 5: Lock matched radix nodes --- - if cache is not None and not cache.disable: - for i, m in enumerate(requests_meta): - node = matched_nodes[i] - if node is not None and actual_prefix_lens[i] > 0: - swa_boundary_id = cache.inc_lock_ref(node) - self._rid_to_radix_lock[m["rid"]] = (node, swa_boundary_id) + # (Locking already done in Step 1.5 above) return out_cache_loc, actual_prefix_lens, actual_extend_lens + def _unlock_matched_nodes(self, requests_meta: List[Dict[str, Any]]) -> None: + """Rollback radix locks acquired during match_prefix. + + Called when allocation fails after locking matched nodes. + """ + cache = self._radix_cache + if cache is None or cache.disable: + return + for m in requests_meta: + lock = self._rid_to_radix_lock.pop(m["rid"], None) + if lock is not None: + node, swa_id = lock + cache.dec_lock_ref(node, swa_id) + def _alloc_kv_with_eviction(self, num_tokens: int) -> Optional[torch.Tensor]: """Try to allocate KV tokens, evicting from radix cache if needed.""" runner = self._runner From a420a0534ba6474ac176d51aea95c42176eaafe6 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Tue, 17 Mar 2026 02:35:09 -0700 Subject: [PATCH 35/42] refactor: improve code readability and structure across multiple modules - Consolidated multi-line statements for better clarity in `launch.py`, `cuda_graph_runner.py`, and `model_runner.py`. - Enhanced logging and debug information in `model_runner_process.py` for improved traceability during cache operations. - Streamlined comments and documentation to remove references to sglang architecture, focusing on the current implementation. - Adjusted logic in `flashinfer_backend.py` to simplify prefix length handling. - Improved formatting in `scheduler_process.py` for better readability of request handling logic. 
--- pymllm/engine/launch.py | 10 +-- pymllm/executor/cuda_graph_runner.py | 3 +- pymllm/executor/model_runner.py | 51 +++++++++---- pymllm/layers/attention/flashinfer_backend.py | 7 +- pymllm/models/qwen3_vl.py | 10 ++- pymllm/orchestrator/model_runner_process.py | 72 +++++++++++++------ pymllm/orchestrator/scheduler_process.py | 29 ++++---- pymllm/server/launch.py | 8 +-- 8 files changed, 119 insertions(+), 71 deletions(-) diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index e5214511f..5e3a2ef00 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -215,21 +215,21 @@ def _launch_processes(self) -> None: procs_and_readers.append((tokenizer_proc, tokenizer_reader, "tokenizer")) # Determine default max_new_tokens from model config (if available) - model_max_new_tokens = _get_model_default_max_new_tokens( - cfg.model.hf_config - ) + model_max_new_tokens = _get_model_default_max_new_tokens(cfg.model.hf_config) scheduler_kwargs = {} if model_max_new_tokens is not None: scheduler_kwargs["default_max_new_tokens"] = model_max_new_tokens # Extract EOS token ID(s) from model config - eos_token_ids = _get_eos_token_ids(cfg.model.hf_config, model_path=cfg.server.model_path) + eos_token_ids = _get_eos_token_ids( + cfg.model.hf_config, model_path=cfg.server.model_path + ) if eos_token_ids: scheduler_kwargs["eos_token_ids"] = eos_token_ids logger.info("EOS token IDs for scheduler: %s", eos_token_ids) # Model runner config — passed to the scheduler process which now - # owns the model runner in-process (sglang-style architecture). + # owns the model runner in-process. scheduler_kwargs["server_config"] = cfg.server scheduler_kwargs["model_config"] = cfg.model scheduler_kwargs["gpu_id"] = cfg.server.base_gpu_id diff --git a/pymllm/executor/cuda_graph_runner.py b/pymllm/executor/cuda_graph_runner.py index fe4fb0e92..7fa674b7b 100644 --- a/pymllm/executor/cuda_graph_runner.py +++ b/pymllm/executor/cuda_graph_runner.py @@ -3,8 +3,7 @@ Captures CUDA graphs for a set of discrete batch sizes so that the decode forward pass can be replayed without CPU-side kernel-launch overhead. -Simplified from sglang's ``CudaGraphRunner`` for pymllm's single-GPU -architecture. Handles: +``CudaGraphRunner`` for pymllm's single-GPU architecture. Handles: * Pre-allocated input buffers (avoids per-step allocations) * CUDA-graph capture for each batch size diff --git a/pymllm/executor/model_runner.py b/pymllm/executor/model_runner.py index 6d6f33fea..f39bf4f02 100644 --- a/pymllm/executor/model_runner.py +++ b/pymllm/executor/model_runner.py @@ -1,7 +1,6 @@ """ModelRunner runs the forward passes of the models. -Simplified from sglang's ``ModelRunner`` for pymllm's single-GPU inference -architecture. Handles: +pymllm's single-GPU inference architecture. 
Handles: * Model loading (HuggingFace checkpoint via ``transformers``) * KV-cache memory pool initialisation @@ -329,7 +328,9 @@ def _extract_model_metadata(self) -> None: # Hybrid model metadata (GDN layers) self.num_gdn_layers: int = getattr(self.model, "num_gdn_layers", 0) - self.full_attn_layer_ids: set = getattr(self.model, "full_attn_layer_ids", set()) + self.full_attn_layer_ids: set = getattr( + self.model, "full_attn_layer_ids", set() + ) logger.info( "Model metadata: layers=%d, q_heads=%d, kv_heads=%d, " @@ -521,7 +522,9 @@ def init_memory_pool(self) -> None: gdn_head_k_dim = getattr(text_config, "linear_key_head_dim", 128) gdn_head_v_dim = getattr(text_config, "linear_value_head_dim", 128) gdn_conv_kernel = getattr(text_config, "linear_conv_kernel_dim", 4) - gdn_conv_dim = gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + gdn_conv_dim = ( + gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + ) self.gdn_pool = GDNPool( max_reqs=self.max_running_requests, @@ -542,7 +545,11 @@ def init_memory_pool(self) -> None: self.max_total_num_tokens, self.max_running_requests, self.token_to_kv_pool._mem_bytes() / (1 << 30), - *([self.gdn_pool.mem_bytes() / (1 << 30)] if self.gdn_pool is not None else []), + *( + [self.gdn_pool.mem_bytes() / (1 << 30)] + if self.gdn_pool is not None + else [] + ), ) def _profile_max_num_tokens(self) -> int: @@ -594,23 +601,39 @@ def _profile_max_num_tokens(self) -> int: gdn_head_k_dim = getattr(text_config, "linear_key_head_dim", 128) gdn_head_v_dim = getattr(text_config, "linear_value_head_dim", 128) gdn_conv_kernel = getattr(text_config, "linear_conv_kernel_dim", 4) - gdn_conv_dim = gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + gdn_conv_dim = ( + gdn_num_k_heads * gdn_head_k_dim * 2 + gdn_num_v_heads * gdn_head_v_dim + ) # Estimate GDN pool memory for max_running_requests # Track slots add max_reqs_est extra slots for prefix cache snapshots - max_reqs_est = min( - max(int(rest_memory_bytes / cell_size / self.context_len * 512), 2048), - 4096, - ) if self.server_config.max_running_requests is None else self.server_config.max_running_requests + max_reqs_est = ( + min( + max( + int(rest_memory_bytes / cell_size / self.context_len * 512), + 2048, + ), + 4096, + ) + if self.server_config.max_running_requests is None + else self.server_config.max_running_requests + ) pool_size = max_reqs_est + 1 + max_reqs_est # +track_slots recurrent_bytes = ( - self.num_gdn_layers * pool_size * gdn_num_v_heads - * gdn_head_v_dim * gdn_head_k_dim * 4 # float32 + self.num_gdn_layers + * pool_size + * gdn_num_v_heads + * gdn_head_v_dim + * gdn_head_k_dim + * 4 # float32 ) dtype_size = torch.tensor([], dtype=self.dtype).element_size() conv_bytes = ( - self.num_gdn_layers * pool_size * gdn_conv_dim - * (gdn_conv_kernel - 1) * dtype_size + self.num_gdn_layers + * pool_size + * gdn_conv_dim + * (gdn_conv_kernel - 1) + * dtype_size ) gdn_pool_bytes = recurrent_bytes + conv_bytes rest_memory_bytes -= gdn_pool_bytes diff --git a/pymllm/layers/attention/flashinfer_backend.py b/pymllm/layers/attention/flashinfer_backend.py index 206947e4f..85b785f6d 100644 --- a/pymllm/layers/attention/flashinfer_backend.py +++ b/pymllm/layers/attention/flashinfer_backend.py @@ -350,11 +350,10 @@ def init_forward_metadata(self, forward_batch: ForwardBatch) -> None: else: # Extend / prefill prefix_lens = forward_batch.extend_prefix_lens - extend_no_prefix = ( - forward_batch.extend_prefix_lens_cpu is None - or not 
any(forward_batch.extend_prefix_lens_cpu) + extend_no_prefix = forward_batch.extend_prefix_lens_cpu is None or not any( + forward_batch.extend_prefix_lens_cpu ) - # use_ragged=True: match sglang's default. + # use_ragged=True # - extend_no_prefix=True → ragged-only (pure prefill, no cache) # - extend_no_prefix=False → ragged+paged merge (cache hit) # The paged wrapper covers only the cached prefix (prefix_lens), diff --git a/pymllm/models/qwen3_vl.py b/pymllm/models/qwen3_vl.py index 3bee27c8d..ffa20f115 100644 --- a/pymllm/models/qwen3_vl.py +++ b/pymllm/models/qwen3_vl.py @@ -152,7 +152,7 @@ def forward( q, k, v = qkv.reshape(seq_len, 3, self.num_heads, self.head_dim).unbind(dim=1) # Apply rotary position embedding. - # cos/sin are [total_tokens, head_dim // 2]. Following sglang's + # cos/sin are [total_tokens, head_dim // 2]. # VisionAttention: double them to full head_dim and apply RoPE to # all head dimensions (the rotation pairs (q[i], q[i + head_dim//2])). cos = rotary_pos_emb_cos @@ -1154,7 +1154,7 @@ def forward( ) # Prune hidden_states before lm_head to avoid a wasteful - # [total_tokens, vocab] matmul during prefill. Following sglang's + # [total_tokens, vocab] matmul during prefill. # LogitsProcessor._get_pruned_states(): in extend mode only keep # the last token of each sequence; in decode mode all rows are # already one-per-sequence. @@ -1172,7 +1172,7 @@ def forward( # LM head: always use weight matrix directly for the linear # projection. Works for both nn.Embedding (tied) and nn.Linear - # (untied). Matches sglang LogitsProcessor._compute_lm_head(). + # (untied). logits = torch.matmul( hidden_states.to(self.lm_head.weight.dtype), self.lm_head.weight.T, @@ -1307,9 +1307,7 @@ def _load_stacked_weight( elif shard_id == "v": kv_size = shard_size q_size = total_size - 2 * kv_size - param.data[q_size + kv_size : q_size + 2 * kv_size].copy_( - loaded_weight - ) + param.data[q_size + kv_size : q_size + 2 * kv_size].copy_(loaded_weight) else: # gate_up: 0 -> gate, 1 -> up (same size, idx*size is correct) shard_size = loaded_weight.shape[0] diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index af5574de2..f6fc709b2 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -1,8 +1,8 @@ """ ModelRunnerProcess -- GPU-owning component that executes model forward passes. -Instantiated **in-process** by :class:`SchedulerProcess` (sglang-style -architecture). The scheduler calls :meth:`_forward_batch` directly — +Instantiated **in-process** by :class:`SchedulerProcess` +The scheduler calls :meth:`_forward_batch` directly — no inter-process communication is involved. 
This component owns the GPU: it holds a :class:`ModelRunner` with model @@ -267,8 +267,16 @@ def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: src = mm.get("image_inputs") if "image_inputs" in mm else mm if src is None: continue - pv = src.get("pixel_values") if hasattr(src, "get") else getattr(src, "pixel_values", None) - thw = src.get("image_grid_thw") if hasattr(src, "get") else getattr(src, "image_grid_thw", None) + pv = ( + src.get("pixel_values") + if hasattr(src, "get") + else getattr(src, "pixel_values", None) + ) + thw = ( + src.get("image_grid_thw") + if hasattr(src, "get") + else getattr(src, "image_grid_thw", None) + ) if pv is not None: if not isinstance(pv, torch.Tensor): pv = torch.as_tensor(pv) @@ -408,8 +416,6 @@ def _rebuild_extend_tensors( def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: """Insert prefill KV indices into the radix cache for future reuse. - Mirrors sglang's ``cache_unfinished_req`` pattern: - 1. **Insert** the request's token → KV index mapping into the tree. 2. **Free duplicates** — indices in ``[cache_protected_len, new_prefix_len)`` are now owned by the tree; the request's copies are redundant. @@ -445,8 +451,11 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: if _dbg: logger.debug( "[CACHE INSERT] rid=%s seq_len=%d pool[slot=%d,0:%d]=%s", - rid, seq_len, slot, min(seq_len, 8), - kv_indices[:min(seq_len, 8)].tolist(), + rid, + seq_len, + slot, + min(seq_len, 8), + kv_indices[: min(seq_len, 8)].tolist(), ) key = RadixKey(input_ids) @@ -458,15 +467,19 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: if _dbg: logger.debug( "[CACHE INSERT] rid=%s insert prefix_len=%d cache_protected=%d", - rid, new_prefix_len, cache_protected_len, + rid, + new_prefix_len, + cache_protected_len, ) if new_prefix_len > cache_protected_len: dup_indices = kv_indices[cache_protected_len:new_prefix_len] if _dbg: logger.debug( "[CACHE INSERT] rid=%s freeing dup [%d:%d]=%s", - rid, cache_protected_len, new_prefix_len, - dup_indices[:min(len(dup_indices), 8)].tolist(), + rid, + cache_protected_len, + new_prefix_len, + dup_indices[: min(len(dup_indices), 8)].tolist(), ) if dup_indices.numel() > 0: runner.token_to_kv_pool_allocator.free(dup_indices) @@ -480,7 +493,9 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: if _dbg: logger.debug( "[CACHE INSERT] rid=%s rematch len=%d indices[:8]=%s", - rid, len(new_indices), new_indices[:min(len(new_indices), 8)].tolist(), + rid, + len(new_indices), + new_indices[: min(len(new_indices), 8)].tolist(), ) if cache.page_size == 1: assert len(new_indices) == seq_len, ( @@ -491,8 +506,13 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: if _dbg: logger.debug( "[CACHE INSERT] rid=%s write-back pool[slot=%d,%d:%d]=%s", - rid, slot, cache_protected_len, len(new_indices), - new_indices[cache_protected_len:cache_protected_len+8].tolist(), + rid, + slot, + cache_protected_len, + len(new_indices), + new_indices[ + cache_protected_len : cache_protected_len + 8 + ].tolist(), ) runner.req_to_token_pool.write( (slot, slice(cache_protected_len, len(new_indices))), @@ -587,7 +607,8 @@ def _allocate_extend( logger.debug( "Discarding radix cache hit for rid=%s: no GDN state " "for matched node (prefix_len=%d)", - m["rid"], prefix_len, + m["rid"], + prefix_len, ) prefix_len = 0 last_node = None @@ -622,8 +643,9 @@ def _allocate_extend( full_seq_len, 100.0 * prefix_len / 
full_seq_len, last_node.id if last_node is not None else None, - cached_indices[:min(prefix_len, 8)].tolist() - if cached_indices is not None else [], + cached_indices[: min(prefix_len, 8)].tolist() + if cached_indices is not None + else [], ) logger.info( "Radix cache tree after match: evictable=%d protected=%d", @@ -678,8 +700,10 @@ def _allocate_extend( if cached_indices is not None and prefix_len > 0: logger.debug( "[ALLOC EXTEND] rid=%s writing prefix[0:%d] to pool[slot=%d]: %s", - rid, prefix_len, slot, - cached_indices[:min(prefix_len, 8)].tolist(), + rid, + prefix_len, + slot, + cached_indices[: min(prefix_len, 8)].tolist(), ) runner.req_to_token_pool.write( (slot, slice(0, prefix_len)), @@ -719,7 +743,9 @@ def _allocate_extend( logger.debug( "GDN state restored for rid=%s from track_slot=%d " "(prefix_len=%d)", - rid, track_slot, prefix_len, + rid, + track_slot, + prefix_len, ) else: # Cache hit but no GDN snapshot — reset to zero. @@ -731,7 +757,8 @@ def _allocate_extend( logger.debug( "GDN state reset for rid=%s (cache hit but no " "track slot, prefix_len=%d)", - rid, prefix_len, + rid, + prefix_len, ) else: # No cache hit — fresh request, zero-init @@ -1034,7 +1061,8 @@ def _on_radix_node_evict(self, node_id: int) -> None: gdn_pool.free_track_slot(track_slot) logger.debug( "Freed GDN track slot %d for evicted node %d", - track_slot, node_id, + track_slot, + node_id, ) # ------------------------------------------------------------------ diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 8594a8997..8e0ba9e22 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -2,8 +2,8 @@ SchedulerProcess -- the central scheduling and inference hub. Receives tokenized requests from the TokenizerProcess, organises them into -batches, runs model forward passes via the **in-process** model runner -(sglang-style), and streams finished token IDs to the DetokenizerProcess. +batches, runs model forward passes via the **in-process** model runner, +and streams finished token IDs to the DetokenizerProcess. Architecture: the scheduler owns the :class:`ModelRunnerProcess` directly (same process, direct function calls). GPU resources (KV cache, req pool @@ -351,7 +351,7 @@ def __init__( self._recv_from_tokenizer_addr = recv_from_tokenizer_addr self._send_to_detokenizer_addr = send_to_detokenizer_addr - # Model config (for in-process model runner, sglang-style) + # Model config (for in-process model runner) self._server_config = server_config self._model_config = model_config self._gpu_id = gpu_id @@ -393,18 +393,19 @@ def __init__( # Monotonic request-slot counter (simplified; no GPU pool access) self._next_req_pool_idx: int = 0 - # ------ Throughput metrics (sglang-style interval logging) ------ + # ------ Throughput metrics ------ # How often (in decode batches) to log throughput stats. 
self._decode_log_interval: int = ( server_config.decode_log_interval - if server_config is not None and hasattr(server_config, "decode_log_interval") + if server_config is not None + and hasattr(server_config, "decode_log_interval") else 40 ) # Accumulators reset at each log interval - self._num_prefill_tokens: int = 0 # new prefill tokens (excluding cache hits) - self._num_prefill_cache_tokens: int = 0 # prefill tokens served from cache - self._num_decode_tokens: int = 0 # generated decode tokens - self._num_prefill_reqs: int = 0 # prefill requests count + self._num_prefill_tokens: int = 0 # new prefill tokens (excluding cache hits) + self._num_prefill_cache_tokens: int = 0 # prefill tokens served from cache + self._num_decode_tokens: int = 0 # generated decode tokens + self._num_prefill_reqs: int = 0 # prefill requests count # Timestamps for throughput calculation self._last_prefill_stats_tic: float = time.time() self._last_decode_stats_tic: float = time.time() @@ -436,7 +437,7 @@ def init_sockets(self) -> None: self._poller.register(self._recv_from_tokenizer, zmq.POLLIN) def init_model(self) -> None: - """Create and initialise the in-process model runner (sglang-style). + """Create and initialise the in-process model runner. Must be called after ``init_sockets`` and inside the subprocess (after spawn) since it performs CUDA initialisation. @@ -693,7 +694,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]: # ------------------------------------------------------------------ def run_batch(self, batch: ScheduleBatch) -> Dict[str, Any]: - """Execute the batch via the in-process model runner (sglang-style). + """Execute the batch via the in-process model runner. Direct function call — no ZMQ serialisation overhead. """ @@ -772,7 +773,9 @@ def process_batch_result( self._used_tokens += len(new_token_ids) # Check finish conditions - req.check_finished(eos_token_id=self._eos_token_ids[0] if self._eos_token_ids else None) + req.check_finished( + eos_token_id=self._eos_token_ids[0] if self._eos_token_ids else None + ) # Process batch requests based on forward mode if batch.forward_mode.is_extend(): @@ -987,7 +990,7 @@ def run_scheduler_process( ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``. - The scheduler process now also owns the model runner (sglang-style), + The scheduler process now also owns the model runner, so model initialisation happens here. """ setup_subprocess_logging(log_level) diff --git a/pymllm/server/launch.py b/pymllm/server/launch.py index b9f603220..a328c0c6d 100644 --- a/pymllm/server/launch.py +++ b/pymllm/server/launch.py @@ -1,8 +1,7 @@ """pymllm HTTP server -- RESTful API entry point. This module implements a FastAPI-based HTTP server that wraps the pymllm -:class:`Engine` and exposes OpenAI-compatible and native REST endpoints, -following the architecture of sglang's ``http_server.py``. +:class:`Engine` and exposes OpenAI-compatible and native REST endpoints. Endpoints --------- @@ -896,9 +895,8 @@ def _prepare_args(): def launch_server(): """Launch the pymllm Engine then start the uvicorn HTTP server. - This function mirrors sglang's ``launch_server``: it first boots all engine - subprocesses (tokenizer, scheduler, model-runner, detokenizer) and then - hands off to uvicorn to serve HTTP traffic. + It first boots all engine subprocesses (tokenizer, scheduler, model-runner, detokenizer) + and then hands off to uvicorn to serve HTTP traffic. 
""" _prepare_args() cfg = get_global_config() From 9d33d0daa92d1c718e1fccf5a397c74a6c98ae70 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Tue, 17 Mar 2026 03:09:13 -0700 Subject: [PATCH 36/42] chore: update installation instructions and add new skills for pymllm - Modified .gitignore to include install directories for better build management. - Updated pyproject.toml to allow skipping CMake builds for faster installations. - Enhanced README.md and docs/index.rst with tips for CUDA-only users to streamline installation. - Introduced new skills for installing pymllm and linking C++ libraries, providing users with clear workflows for different installation modes. --- .claude/skills/install-pymllm/SKILL.md | 70 +++++++++++++++++++++ .claude/skills/link-pymllm-lib/SKILL.md | 83 +++++++++++++++++++++++++ .gitignore | 2 +- README.md | 9 +++ docs/index.rst | 11 ++++ pyproject.toml | 2 + 6 files changed, 176 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/install-pymllm/SKILL.md create mode 100644 .claude/skills/link-pymllm-lib/SKILL.md diff --git a/.claude/skills/install-pymllm/SKILL.md b/.claude/skills/install-pymllm/SKILL.md new file mode 100644 index 000000000..f53c31945 --- /dev/null +++ b/.claude/skills/install-pymllm/SKILL.md @@ -0,0 +1,70 @@ +--- +name: install-pymllm +description: Install the pymllm Python package. Asks the user whether to do a full build (with CMake C++ compilation) or a fast install (Python-only, skip CMake). Use when the user asks to install, set up, or reinstall pymllm. +--- + +# Install pymllm + +## Goal + +Help the user install the `pymllm` package with the right configuration for their use case. + +## Workflow + +### Step 1: Ask the user which install mode they want + +Use `AskUserQuestion` to present two options: + +**Full Install (with C++ build)** +- Compiles the C++ mllm runtime and FFI extension via CMake +- Required if the user needs mobile inference, model conversion with FFI, or CPU/QNN backends +- Slower (several minutes depending on the machine) +- Command: `pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall` + +**Fast Install (Python-only, skip CMake)** +- Skips the entire CMake build step +- Only installs the pure Python package +- Recommended for users who only use CUDA backends (FlashInfer, TileLang) and do not need the C++ mllm runtime +- Much faster (seconds) +- Command: `SKBUILD_WHEEL_CMAKE=false pip install -e .` + +### Step 2: Ask editable or non-editable + +Use `AskUserQuestion` to ask: + +- **Editable (`pip install -e .`)**: For active development. Python imports point to the source tree. Changes to `.py` files take effect immediately without reinstalling. +- **Non-editable (wheel)**: For stable usage. Installs a wheel into site-packages. + +### Step 3: Execute the install + +Based on user choices, run the appropriate command: + +| Mode | Editable | Command | +|------|----------|---------| +| Full | Yes | `pip install -e -v .` | +| Full | No | `pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall` | +| Fast | Yes | `SKBUILD_WHEEL_CMAKE=false pip install -e .` | +| Fast | No | `SKBUILD_WHEEL_CMAKE=false pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall` | + +### Step 4: Post-install for editable + full build + +If the user chose **editable + full build**, the compiled `.so` files live in a build directory (e.g. `build/bin/`), not in the source tree. The Python code at `pymllm/__init__.py` looks for libraries at `pymllm/lib/MllmFFIExtension.so`. 
A symlink is needed to bridge this gap. + +**Invoke the `/link-pymllm-lib` skill** to help the user set up the symlink. + +### Step 5: Install optional CUDA dependencies + +If the user chose fast install, suggest installing CUDA extras: + +```bash +pip install pymllm[cuda] +``` + +This pulls in `tilelang`, `flashinfer-python`, and `pyzmq`. + +## Important Notes + +- The project root must contain `pyproject.toml` with `scikit-build-core` as the build backend. +- The `wheel.cmake = true` flag in `pyproject.toml` controls whether CMake runs. The env var `SKBUILD_WHEEL_CMAKE=false` overrides it at install time without modifying the file. +- For non-editable full builds, the `.so` files are bundled inside the wheel automatically — no symlink needed. +- For fast installs, `pymllm.is_mobile_available()` will return `False` since no C++ libraries are present. This is expected. diff --git a/.claude/skills/link-pymllm-lib/SKILL.md b/.claude/skills/link-pymllm-lib/SKILL.md new file mode 100644 index 000000000..b8d9760f2 --- /dev/null +++ b/.claude/skills/link-pymllm-lib/SKILL.md @@ -0,0 +1,83 @@ +--- +name: link-pymllm-lib +description: Create or update the pymllm/lib symlink to point to a C++ build directory's bin/ folder. Required after editable installs with C++ builds so that Python can find the compiled .so libraries. Use when the user asks to link, fix, or set up pymllm native libraries. +--- + +# Link pymllm lib + +## Goal + +Create a symlink at `pymllm/lib` pointing to the correct build output directory so that an editable-installed pymllm can load the compiled C++ shared libraries (`MllmFFIExtension.so`, `libMllmRT.so`, etc.). + +## Background + +When pymllm is installed in editable mode (`pip install -e .`), Python imports from the source tree directly. The C++ libraries are compiled into `/bin/` by CMake, but pymllm looks for them at `pymllm/lib/`. A symlink bridges this gap: + +``` +pymllm/lib -> //bin +``` + +## Workflow + +### Step 1: Detect available build directories + +Scan the project root for directories matching the pattern `build*/bin/` that contain `MllmFFIExtension.so` (or `.dylib` on macOS). List all valid candidates. + +Common build directories and their corresponding platforms: + +| Build directory | Platform / Config | Typical build command | +|----------------|-------------------|----------------------| +| `build/bin` | X86 CPU only | `python task.py tasks/build_x86.yaml` | +| `build-x86-cuda/bin` | X86 + CUDA | `python task.py tasks/build_x86_cuda.yaml` | +| `build-qnn-aot/bin` | X86 + QNN AOT | `python task.py tasks/build_x86_qnn_aot.yaml` | +| `build-android-arm64-v8a-qnn/bin` | Android ARM + QNN | `python task.py tasks/build_android_qnn.yaml` | + +### Step 2: Ask the user which build to link + +Use `AskUserQuestion` to let the user pick from the detected build directories. Show each option with its path and the platform it corresponds to. + +If no build directories with `.so` files are found, inform the user they need to build first: + +```bash +pip install -r requirements.txt +python task.py tasks/build_x86.yaml # or another build task +``` + +### Step 3: Check existing symlink + +Before creating a new symlink, check if `pymllm/lib` already exists: + +- If it's a symlink, show where it currently points and confirm replacement. +- If it's a real directory, warn the user and ask before removing it. +- If it doesn't exist, proceed directly. 
+ +### Step 4: Create the symlink + +```bash +ln -sfn //bin /pymllm/lib +``` + +Use `ln -sfn` to atomically replace any existing symlink. + +### Step 5: Verify + +After creating the symlink, verify by checking that the target `.so` file is accessible: + +```bash +ls -la pymllm/lib/MllmFFIExtension.so +``` + +Then run a quick Python check: + +```bash +python -c "import pymllm; print('mobile available:', pymllm.is_mobile_available())" +``` + +If `is_mobile_available()` returns `True`, the link is correct. + +## Important Notes + +- The symlink target must be an **absolute path** for reliability. +- On macOS, the library extension is `.dylib` instead of `.so`. +- Android build directories (e.g., `build-android-arm64-v8a-qnn/bin`) contain ARM binaries that cannot run on x86 hosts. Warn the user if they select one of these on a non-ARM machine. +- If the user has multiple build directories, they can re-run this skill anytime to switch which build pymllm uses. diff --git a/.gitignore b/.gitignore index 7f14b37ec..b441a62eb 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ tasks/mllmteam* # Building files and binary build*/ -install*/ +/install*/ mllm-sdk-*/ mllm-install-*/ diff --git a/README.md b/README.md index 92dc29a6b..88666692c 100644 --- a/README.md +++ b/README.md @@ -308,6 +308,15 @@ mllm provides a set of model converters to convert models from other popular mod bash ./scripts/install_pymllm.sh ``` +> **Tip for CUDA-only users:** If you only use CUDA backends (e.g., FlashInfer, TileLang) and do not need the C++ mllm runtime, you can skip the CMake build to speed up installation significantly: +> +> ```shell +> SKBUILD_WHEEL_CMAKE=false pip install -e . +> pip install pymllm[cuda] +> ``` +> +> This installs only the pure Python package without compiling the C++ components. + **future:** Once PyPI approves the creation of the mllm organization, we will publish it there. Afterwards, you can use the command below to install it in the future. diff --git a/docs/index.rst b/docs/index.rst index 1f06ef487..3db7d58e2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -246,6 +246,17 @@ mllm provides a set of model converters to convert models from other popular mod bash ./scripts/install_pymllm.sh +.. tip:: + + **For CUDA-only users:** If you only use CUDA backends (e.g., FlashInfer, TileLang) and do not need the C++ mllm runtime, you can skip the CMake build to speed up installation significantly: + + .. code-block:: shell + + SKBUILD_WHEEL_CMAKE=false pip install -e . + pip install pymllm[cuda] + + This installs only the pure Python package without compiling the C++ components. + **future:** Once PyPI approves the creation of the mllm organization, we will publish it there. Afterwards, you can use the command below to install it in the future. diff --git a/pyproject.toml b/pyproject.toml index d752ddc1b..ce64b2ee1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,8 @@ first_party_detection = false target-version = ["py310", "py311", "py312"] [tool.scikit-build] +# Set to false or use env var SKBUILD_WHEEL_CMAKE=false to skip CMake build +wheel.cmake = true # ABI-agnostic wheel wheel.py-api = "py3" cmake.args = [ From fd1622638178157ad181d623dd0867a39aaff826 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Tue, 17 Mar 2026 06:26:13 -0700 Subject: [PATCH 37/42] refactor: enhance installation instructions and improve cache management - Updated SKILL.md to clarify installation steps and CUDA dependency handling. 
- Added new commands for different installation modes, including CUDA support. - Introduced a base class for prefix caches and implemented ChunkCache and MambaRadixCache for improved memory management. - Enhanced model runner and scheduler processes to limit CPU threads and optimize performance during inference. - Improved logging and structure across multiple modules for better traceability and maintainability. --- .claude/skills/install-pymllm/SKILL.md | 37 +- pymllm/configs/server_config.py | 3 +- pymllm/executor/model_runner.py | 127 +++- pymllm/mem_cache/__init__.py | 47 +- pymllm/mem_cache/base_prefix_cache.py | 206 ++++++ pymllm/mem_cache/chunk_cache.py | 74 +++ pymllm/mem_cache/mamba_radix_cache.py | 653 ++++++++++++++++++++ pymllm/mem_cache/radix_cache.py | 208 +++---- pymllm/orchestrator/detokenizer_process.py | 5 + pymllm/orchestrator/model_runner_process.py | 122 +++- pymllm/orchestrator/scheduler_process.py | 43 +- pymllm/orchestrator/tokenizer_process.py | 5 + pymllm/server/launch.py | 101 ++- 13 files changed, 1417 insertions(+), 214 deletions(-) create mode 100644 pymllm/mem_cache/base_prefix_cache.py create mode 100644 pymllm/mem_cache/chunk_cache.py create mode 100644 pymllm/mem_cache/mamba_radix_cache.py diff --git a/.claude/skills/install-pymllm/SKILL.md b/.claude/skills/install-pymllm/SKILL.md index f53c31945..d9d637989 100644 --- a/.claude/skills/install-pymllm/SKILL.md +++ b/.claude/skills/install-pymllm/SKILL.md @@ -35,32 +35,34 @@ Use `AskUserQuestion` to ask: - **Editable (`pip install -e .`)**: For active development. Python imports point to the source tree. Changes to `.py` files take effect immediately without reinstalling. - **Non-editable (wheel)**: For stable usage. Installs a wheel into site-packages. -### Step 3: Execute the install +### Step 3: Ask whether the user needs CUDA optional dependencies -Based on user choices, run the appropriate command: +Use `AskUserQuestion` to ask whether the user needs CUDA support (FlashInfer, TileLang, pyzmq, etc.). -| Mode | Editable | Command | -|------|----------|---------| -| Full | Yes | `pip install -e -v .` | -| Full | No | `pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall` | -| Fast | Yes | `SKBUILD_WHEEL_CMAKE=false pip install -e .` | -| Fast | No | `SKBUILD_WHEEL_CMAKE=false pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall` | +This determines whether to append `[cuda]` to the install specifier (e.g. `pip install -e ".[cuda]"` instead of `pip install -e .`). -### Step 4: Post-install for editable + full build +**This applies to ALL install modes.** For fast-install users this is especially important since the CUDA packages are the primary compute backend. -If the user chose **editable + full build**, the compiled `.so` files live in a build directory (e.g. `build/bin/`), not in the source tree. The Python code at `pymllm/__init__.py` looks for libraries at `pymllm/lib/MllmFFIExtension.so`. A symlink is needed to bridge this gap. +### Step 4: Execute the install -**Invoke the `/link-pymllm-lib` skill** to help the user set up the symlink. +Based on user choices, compose and run the appropriate command. The install specifier is either `.` or `".[cuda]"` depending on Step 3. -### Step 5: Install optional CUDA dependencies +| Mode | Editable | CUDA | Command | +|------|----------|------|---------| +| Full | Yes | No | `pip install -e -v .` | +| Full | Yes | Yes | `pip install -e -v ".[cuda]"` | +| Full | No | No | `pip wheel -v -w dist . 
&& pip install dist/*.whl --force-reinstall` | +| Full | No | Yes | `pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall && pip install "pymllm[cuda]"` | +| Fast | Yes | No | `SKBUILD_WHEEL_CMAKE=false pip install -e .` | +| Fast | Yes | Yes | `SKBUILD_WHEEL_CMAKE=false pip install -e ".[cuda]"` | +| Fast | No | No | `SKBUILD_WHEEL_CMAKE=false pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall` | +| Fast | No | Yes | `SKBUILD_WHEEL_CMAKE=false pip wheel -v -w dist . && pip install dist/*.whl --force-reinstall && pip install "pymllm[cuda]"` | -If the user chose fast install, suggest installing CUDA extras: +### Step 5: Post-install for editable + full build -```bash -pip install pymllm[cuda] -``` +If the user chose **editable + full build**, the compiled `.so` files live in a build directory (e.g. `build/bin/`), not in the source tree. The Python code at `pymllm/__init__.py` looks for libraries at `pymllm/lib/MllmFFIExtension.so`. A symlink is needed to bridge this gap. -This pulls in `tilelang`, `flashinfer-python`, and `pyzmq`. +**Invoke the `/link-pymllm-lib` skill** to help the user set up the symlink. ## Important Notes @@ -68,3 +70,4 @@ This pulls in `tilelang`, `flashinfer-python`, and `pyzmq`. - The `wheel.cmake = true` flag in `pyproject.toml` controls whether CMake runs. The env var `SKBUILD_WHEEL_CMAKE=false` overrides it at install time without modifying the file. - For non-editable full builds, the `.so` files are bundled inside the wheel automatically — no symlink needed. - For fast installs, `pymllm.is_mobile_available()` will return `False` since no C++ libraries are present. This is expected. +- The `[cuda]` optional dependencies are defined in `pyproject.toml` under `[project.optional-dependencies]`. diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 8727f7c13..304d328ea 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -82,8 +82,9 @@ class ServerConfig: # Feature switches # --------------------------------------------------------------------- # enable_shared_queue: bool = False # Use shared memory queue for fast IPC - disable_radix_cache: bool = False # Disable radix-tree prefix caching + disable_radix_cache: bool = False # Disable radix-tree prefix caching (uses ChunkCache) radix_cache_page_size: int = 1 # Number of tokens per KV-pool page in RadixCache + enable_mamba_cache: bool = False # Use MambaRadixCache for SSM state caching # CUDA IPC transport for multimodal GPU tensors. # Requires enable_shared_queue=True to take effect. diff --git a/pymllm/executor/model_runner.py b/pymllm/executor/model_runner.py index f39bf4f02..e73272f1b 100644 --- a/pymllm/executor/model_runner.py +++ b/pymllm/executor/model_runner.py @@ -39,7 +39,7 @@ import logging import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch from torch import nn @@ -64,6 +64,14 @@ logger = logging.getLogger(__name__) +def _suppress_cpu_threads() -> None: + """Limit PyTorch intra-op threads to 1 for GPU inference. + + Reference: sglang ``ModelRunner``. 
+ """ + torch.set_num_threads(1) + + # --------------------------------------------------------------------------- # Utility: GPU memory query # --------------------------------------------------------------------------- @@ -110,6 +118,97 @@ class LogitsProcessorOutput: hidden_states: Optional[torch.Tensor] = None +# --------------------------------------------------------------------------- +# Penalty helpers +# --------------------------------------------------------------------------- + + +def _apply_penalties( + logits: torch.Tensor, + token_histories: List[List[int]], + repetition_penalties: torch.Tensor, + frequency_penalties: torch.Tensor, + presence_penalties: torch.Tensor, +) -> torch.Tensor: + """Apply repetition, frequency, and presence penalties to logits in-place. + + - **repetition_penalty** (multiplicative, default 1.0): + For each token that appeared in the history, if logit > 0 divide + by the penalty, else multiply by it. Values > 1 discourage repetition. + + - **frequency_penalty** (additive, default 0.0): + Subtract ``penalty * count(token)`` from the logit for each token + that appeared in the history. The more a token appears, the + stronger the penalty. + + - **presence_penalty** (additive, default 0.0): + Subtract ``penalty`` from the logit for each token that appeared + at least once in the history (binary, not count-based). + + Parameters + ---------- + logits : [batch_size, vocab_size] + token_histories : list of list of int, length batch_size + repetition_penalties : [batch_size] + frequency_penalties : [batch_size] + presence_penalties : [batch_size] + """ + logits = logits.clone() + batch_size, vocab_size = logits.shape + device = logits.device + + for i in range(batch_size): + history = token_histories[i] + if not history: + continue + + rep_p = repetition_penalties[i].item() + freq_p = frequency_penalties[i].item() + pres_p = presence_penalties[i].item() + + # Skip if all penalties are neutral + if rep_p == 1.0 and freq_p == 0.0 and pres_p == 0.0: + continue + + # Count token occurrences + token_counts: Dict[int, int] = {} + for t in history: + if 0 <= t < vocab_size: + token_counts[t] = token_counts.get(t, 0) + 1 + + if not token_counts: + continue + + token_ids = list(token_counts.keys()) + token_ids_t = torch.tensor(token_ids, dtype=torch.long, device=device) + selected_logits = logits[i, token_ids_t] + + # Repetition penalty (multiplicative) + if rep_p != 1.0: + selected_logits = torch.where( + selected_logits > 0, + selected_logits / rep_p, + selected_logits * rep_p, + ) + + # Frequency penalty (additive, proportional to count) + if freq_p != 0.0: + counts = torch.tensor( + [token_counts[t] for t in token_ids], + dtype=torch.float32, + device=device, + ) + selected_logits = selected_logits - freq_p * counts + + # Presence penalty (additive, binary) + if pres_p != 0.0: + selected_logits = selected_logits - pres_p + + logits[i, token_ids_t] = selected_logits + + return logits + + # --------------------------------------------------------------------------- # ModelRunner # --------------------------------------------------------------------------- @@ -194,6 +293,13 @@ def initialize(self) -> None: if self.device == "cuda": torch.cuda.set_device(self.gpu_id) + # Limit PyTorch CPU threads to 1 for GPU inference. + # PyTorch's default (= CPU core count) causes OpenMP thread pool + # spin-wait that wastes CPU. GPU models don't benefit from CPU + # parallelism. Reference: sglang ModelRunner. 
+ if self.device != "cpu": + _suppress_cpu_threads() + # Set default dtype torch.set_default_dtype(self.dtype) @@ -1067,10 +1173,12 @@ def sample( temperatures: Optional[torch.Tensor] = None, top_ps: Optional[torch.Tensor] = None, top_ks: Optional[torch.Tensor] = None, + penalty_params: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """Sample next-token IDs from logits. - Supports per-request temperature, top-p, and top-k. + Supports per-request temperature, top-p, top-k, and penalties + (repetition, frequency, presence). Parameters ---------- @@ -1084,6 +1192,11 @@ def sample( Per-request top-p, shape ``[batch_size]``. top_ks Per-request top-k, shape ``[batch_size]``. + penalty_params + Optional dict with keys ``repetition_penalties``, + ``frequency_penalties``, ``presence_penalties`` (tensors of + shape ``[batch_size]``), and ``token_histories`` (list of + list of int). Returns ------- @@ -1101,6 +1214,16 @@ def sample( if logits.numel() == 0: return torch.empty(0, dtype=torch.int32, device=self.device) + # Apply penalties to logits before temperature/sampling. + if penalty_params is not None: + logits = _apply_penalties( + logits, + penalty_params["token_histories"], + penalty_params["repetition_penalties"], + penalty_params["frequency_penalties"], + penalty_params["presence_penalties"], + ) + # Greedy path: temperature=0 (or all zeros) → argmax, no sampling. if temperatures is not None: all_greedy = bool((temperatures < 1e-6).all()) diff --git a/pymllm/mem_cache/__init__.py b/pymllm/mem_cache/__init__.py index c2ce06eba..cc449e426 100644 --- a/pymllm/mem_cache/__init__.py +++ b/pymllm/mem_cache/__init__.py @@ -1,37 +1,46 @@ -from pymllm.mem_cache.memory_pool import ( - KVPool, - ReqToTokenPool, - TokenToKVPoolAllocator, - make_full_attention_net_mem_pool, - make_req_to_token_pool, -) -from pymllm.mem_cache.radix_cache import ( +from pymllm.mem_cache.base_prefix_cache import ( + BasePrefixCache, EvictResult, InsertResult, MatchResult, - RadixCache, RadixKey, - TreeNode, hash_bytes, hash_to_int64, hash_token_ids, ) +from pymllm.mem_cache.chunk_cache import ChunkCache +from pymllm.mem_cache.mamba_radix_cache import MambaRadixCache, MambaTreeNode +from pymllm.mem_cache.memory_pool import ( + KVPool, + ReqToTokenPool, + TokenToKVPoolAllocator, + make_full_attention_net_mem_pool, + make_req_to_token_pool, +) +from pymllm.mem_cache.radix_cache import RadixCache, TreeNode __all__ = [ - # memory_pool - "KVPool", - "TokenToKVPoolAllocator", - "ReqToTokenPool", - "make_full_attention_net_mem_pool", - "make_req_to_token_pool", - # radix_cache - "RadixCache", + # base_prefix_cache + "BasePrefixCache", "RadixKey", - "TreeNode", "MatchResult", "InsertResult", "EvictResult", "hash_token_ids", "hash_to_int64", "hash_bytes", + # radix_cache + "RadixCache", + "TreeNode", + # chunk_cache + "ChunkCache", + # mamba_radix_cache + "MambaRadixCache", + "MambaTreeNode", + # memory_pool + "KVPool", + "TokenToKVPoolAllocator", + "ReqToTokenPool", + "make_full_attention_net_mem_pool", + "make_req_to_token_pool", ] diff --git a/pymllm/mem_cache/base_prefix_cache.py b/pymllm/mem_cache/base_prefix_cache.py new file mode 100644 index 000000000..a49355d6e --- /dev/null +++ b/pymllm/mem_cache/base_prefix_cache.py @@ -0,0 +1,206 @@ +"""Abstract base class and shared data types for prefix cache implementations. + +All concrete caches (:class:`RadixCache`, :class:`ChunkCache`, +:class:`MambaRadixCache`) inherit from :class:`BasePrefixCache` and share +the data classes defined here. 
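+
+For example (editor's illustrative sketch, not part of the module), two keys
+with identical leading tokens but different ``extra_key`` namespaces never
+share prefix nodes, and page hashes are chained so that equal pages at
+different positions hash differently::
+
+    from pymllm.mem_cache import RadixKey, hash_token_ids
+
+    a = RadixKey([151644, 872, 198])                               # default namespace
+    b = RadixKey([151644, 872, 198], extra_key="lora:my-adapter")  # hypothetical adapter tag
+    # len(a) == 3; a[0:2] returns a new RadixKey that keeps a.extra_key.
+
+    h0 = hash_token_ids([151644, 872])               # hash of the first page
+    h1 = hash_token_ids([198, 1234], prior_hash=h0)  # position-aware chain hash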
+""" + +from __future__ import annotations + +import hashlib +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Iterator, List, Optional, Tuple, Union + +import torch + + +# ====================================================================== +# Hashing utilities +# ====================================================================== + + +def hash_token_ids( + token_ids: List[Union[int, Tuple[int, ...]]], + prior_hash: Optional[str] = None, +) -> str: + """SHA-256 hash of a token-id page with optional chain-hash. + + Each token is encoded as a 4-byte little-endian unsigned integer; + tuples (bigram / EAGLE) hash each element in order. When *prior_hash* + is supplied the digest is seeded with the raw bytes of the previous + hash, making the result position-aware. + """ + hasher = hashlib.sha256() + if prior_hash: + hasher.update(bytes.fromhex(prior_hash)) + for t in token_ids: + if isinstance(t, tuple): + for elem in t: + hasher.update(elem.to_bytes(4, byteorder="little", signed=False)) + else: + hasher.update(t.to_bytes(4, byteorder="little", signed=False)) + return hasher.hexdigest() + + +def hash_to_int64(hex_str: str) -> int: + """Convert a hex digest to a signed 64-bit integer (first 16 hex chars).""" + val = int(hex_str[:16], 16) + return val - (1 << 64) if val >= (1 << 63) else val + + +def hash_bytes(data: bytes) -> int: + """SHA-256 -> unsigned 64-bit int. Useful for multimodal embedding keys.""" + return int.from_bytes(hashlib.sha256(data).digest()[:8], "big", signed=False) + + +# ====================================================================== +# Compound lookup key +# ====================================================================== + + +class RadixKey: + """Compound lookup key: token-id sequence + optional namespace tag. + + ``extra_key`` isolates independent namespaces so that sequences with + identical leading tokens but different adapters / LoRA ids / multimodal + context hashes never share prefix nodes. + """ + + __slots__ = ("token_ids", "extra_key") + + def __init__( + self, + token_ids: List[Union[int, Tuple[int, ...]]], + extra_key: Optional[str] = None, + ): + self.token_ids = token_ids + self.extra_key = extra_key + + def __len__(self) -> int: + return len(self.token_ids) + + def __iter__(self) -> Iterator: + return iter(self.token_ids) + + def __getitem__(self, idx: Union[int, slice]) -> RadixKey: + if isinstance(idx, slice): + return RadixKey(self.token_ids[idx], self.extra_key) + return RadixKey([self.token_ids[idx]], self.extra_key) + + def __repr__(self) -> str: + preview = self.token_ids[:10] + tail = "..." 
if len(self.token_ids) > 10 else "" + return f"RadixKey(extra={self.extra_key!r}, toks={preview}{tail})" + + +# ====================================================================== +# Result data classes +# ====================================================================== + + +@dataclass +class MatchResult: + """Returned by :meth:`BasePrefixCache.match_prefix`.""" + + indices: torch.Tensor + last_node: Any = None + prefix_len: int = 0 + # SSM / Mamba support + mamba_branching_seqlen: Optional[int] = None + + +@dataclass +class InsertResult: + """Returned by :meth:`BasePrefixCache.insert`.""" + + prefix_len: int = 0 + last_node: Any = None + # SSM / Mamba support: True when mamba state already existed in tree + mamba_exist: bool = False + + +@dataclass +class EvictResult: + """Returned by :meth:`BasePrefixCache.evict`.""" + + full_evicted: int = 0 + swa_evicted: int = 0 + mamba_evicted: int = 0 + + +# ====================================================================== +# Abstract base class +# ====================================================================== + + +class BasePrefixCache(ABC): + """Abstract interface for all prefix cache implementations. + + Concrete implementations: + + * :class:`~pymllm.mem_cache.radix_cache.RadixCache` -- radix-tree with + SWA tombstone support + * :class:`~pymllm.mem_cache.chunk_cache.ChunkCache` -- no-op fallback + (``disable_radix_cache=True``) + * :class:`~pymllm.mem_cache.mamba_radix_cache.MambaRadixCache` -- radix-tree + with independent Mamba/SSM state tracking + """ + + @abstractmethod + def reset(self) -> None: + """Clear all cached state and re-initialise.""" + ... + + @abstractmethod + def match_prefix(self, key: RadixKey) -> MatchResult: + """Find the longest cached prefix of *key*.""" + ... + + @abstractmethod + def insert( + self, + key: RadixKey, + value: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> InsertResult: + """Insert *key*/*value* into the cache.""" + ... + + @abstractmethod + def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: + """Evict tokens to free memory.""" + ... + + @abstractmethod + def inc_lock_ref(self, node: Any) -> Optional[Any]: + """Lock *node* (and ancestors) to prevent eviction. + + Returns an opaque token (e.g. ``swa_boundary_id``) that must be + passed back to :meth:`dec_lock_ref`. + """ + ... + + @abstractmethod + def dec_lock_ref(self, node: Any, **kwargs: Any) -> None: + """Unlock *node* (and ancestors).""" + ... + + # ------------------------------------------------------------------ + # Size queries (default implementations return 0) + # ------------------------------------------------------------------ + + def evictable_size(self) -> int: + return 0 + + def swa_evictable_size(self) -> int: + return 0 + + def protected_size(self) -> int: + return 0 + + def swa_protected_size(self) -> int: + return 0 + + def total_size(self) -> int: + return 0 diff --git a/pymllm/mem_cache/chunk_cache.py b/pymllm/mem_cache/chunk_cache.py new file mode 100644 index 000000000..c53b2b69e --- /dev/null +++ b/pymllm/mem_cache/chunk_cache.py @@ -0,0 +1,74 @@ +"""No-op prefix cache used when ``disable_radix_cache=True``. + +Every request is fully computed from scratch -- no prefix sharing, no +tree structure, no eviction logic. This is the simplest possible +:class:`~pymllm.mem_cache.base_prefix_cache.BasePrefixCache` implementation. 
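+
+Illustrative sketch (editor's example): every lookup misses and every
+mutation is a no-op, so callers can run the exact same code path as with a
+real radix cache::
+
+    from pymllm.mem_cache import ChunkCache, RadixKey
+
+    cache = ChunkCache()
+    m = cache.match_prefix(RadixKey([1, 2, 3]))
+    # m.indices is empty and m.prefix_len == 0 -- always a cache miss
+    cache.insert(RadixKey([1, 2, 3]))   # returns an empty InsertResult
+    cache.evict(1024)                   # returns an empty EvictResult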
+""" + +from __future__ import annotations + +from typing import Any, Optional + +import torch + +from pymllm.mem_cache.base_prefix_cache import ( + BasePrefixCache, + EvictResult, + InsertResult, + MatchResult, + RadixKey, +) + + +class ChunkCache(BasePrefixCache): + """No-op prefix cache: no prefix sharing, no eviction. + + When the radix cache is disabled, this class replaces it so that + the rest of the system can call the same interface without branching. + + Parameters + ---------- + token_to_kv_pool_allocator: + Pool allocator used to free KV indices on request completion. + device: + Device for empty tensors returned by :meth:`match_prefix`. + """ + + def __init__( + self, + token_to_kv_pool_allocator: Any = None, + device: torch.device = torch.device("cpu"), + ): + self.pool = token_to_kv_pool_allocator + self.device = device + + def reset(self) -> None: + pass + + def match_prefix(self, key: RadixKey) -> MatchResult: + """Always returns an empty match (no prefix sharing).""" + return MatchResult( + indices=torch.empty(0, dtype=torch.int64, device=self.device), + last_node=None, + ) + + def insert( + self, + key: RadixKey, + value: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> InsertResult: + """No-op: nothing is cached.""" + return InsertResult() + + def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: + """No-op: nothing to evict.""" + return EvictResult() + + def inc_lock_ref(self, node: Any) -> Optional[Any]: + """No-op: nothing to lock.""" + return None + + def dec_lock_ref(self, node: Any, **kwargs: Any) -> None: + """No-op: nothing to unlock.""" + pass diff --git a/pymllm/mem_cache/mamba_radix_cache.py b/pymllm/mem_cache/mamba_radix_cache.py new file mode 100644 index 000000000..bee8027e6 --- /dev/null +++ b/pymllm/mem_cache/mamba_radix_cache.py @@ -0,0 +1,653 @@ +"""Radix-tree KV cache with independent Mamba/SSM state tracking. + +Extends :class:`~pymllm.mem_cache.radix_cache.RadixCache` with dual-tracked +state for hybrid models that combine full attention layers and SSM (Mamba / +GDN) layers. Each tree node stores both: + +- ``value``: KV-pool indices for full-attention layers +- ``mamba_value``: state-pool indices for SSM layers + +The two pools have **independent reference counting and LRU eviction**: +Mamba state can be evicted more aggressively than full KV cache. + +Reference: sglang ``MambaRadixCache``. +""" + +from __future__ import annotations + +import heapq +import logging +import time +from collections import defaultdict +from typing import Any, Callable, Dict, List, Optional, Tuple + +import torch + +from pymllm.mem_cache.base_prefix_cache import ( + BasePrefixCache, + EvictResult, + InsertResult, + MatchResult, + RadixKey, +) +from pymllm.mem_cache.radix_cache import ( + TreeNode as _BaseTreeNode, + _child_key, + _key_match, + _next_node_id, +) + +logger = logging.getLogger(__name__) + + +# ====================================================================== +# Mamba-aware tree node +# ====================================================================== + + +class MambaTreeNode: + """Tree node with dual KV + Mamba state tracking. + + Invariant: ``full_lock_ref >= mamba_lock_ref``. If Mamba state is + locked, full KV must also be locked; full KV alone can be locked + without locking Mamba state. 
+ """ + + __slots__ = ( + "children", + "parent", + "key", + "value", + "mamba_value", + "full_lock_ref", + "mamba_lock_ref", + "last_access_time", + "hit_count", + "id", + # LRU doubly-linked list pointers (full) + "prev", + "next", + # LRU doubly-linked list pointers (mamba) + "mamba_prev", + "mamba_next", + ) + + def __init__(self) -> None: + self.children: Dict[Any, MambaTreeNode] = defaultdict(MambaTreeNode) + self.parent: Optional[MambaTreeNode] = None + self.key: Optional[RadixKey] = None + self.value: Optional[torch.Tensor] = None + self.mamba_value: Optional[torch.Tensor] = None + + self.full_lock_ref: int = 0 + self.mamba_lock_ref: int = 0 + + self.last_access_time: float = time.monotonic() + self.hit_count: int = 0 + self.id: int = _next_node_id() + + # LRU list pointers + self.prev: Optional[MambaTreeNode] = None + self.next: Optional[MambaTreeNode] = None + self.mamba_prev: Optional[MambaTreeNode] = None + self.mamba_next: Optional[MambaTreeNode] = None + + @property + def evicted(self) -> bool: + return self.value is None + + @property + def mamba_tombstone(self) -> bool: + """Node has full KV but Mamba state was evicted.""" + return self.value is not None and self.mamba_value is None + + def __lt__(self, other: MambaTreeNode) -> bool: + return self.last_access_time < other.last_access_time + + +# ====================================================================== +# Doubly-linked LRU list +# ====================================================================== + + +class LRUList: + """Intrusive doubly-linked list for LRU ordering. + + Supports two modes via *mamba* flag: uses ``prev``/``next`` or + ``mamba_prev``/``mamba_next`` pointers on :class:`MambaTreeNode`. + """ + + def __init__(self, mamba: bool = False): + self.mamba = mamba + if mamba: + self._prv = "mamba_prev" + self._nxt = "mamba_next" + self._lock = "mamba_lock_ref" + else: + self._prv = "prev" + self._nxt = "next" + self._lock = "full_lock_ref" + + # Sentinel head (MRU side) and tail (LRU side) + self.head = MambaTreeNode() + self.tail = MambaTreeNode() + setattr(self.head, self._nxt, self.tail) + setattr(self.tail, self._prv, self.head) + self._cache: Dict[int, MambaTreeNode] = {} + + def __len__(self) -> int: + return len(self._cache) + + def __contains__(self, node: Optional[MambaTreeNode]) -> bool: + return node is not None and node.id in self._cache + + # -- Mutations -------------------------------------------------------- + + def insert_mru(self, node: MambaTreeNode) -> None: + """Insert *node* at the MRU (head) position.""" + self._cache[node.id] = node + self._add_after(self.head, node) + + def remove(self, node: MambaTreeNode) -> None: + """Remove *node* from the list.""" + self._cache.pop(node.id, None) + self._unlink(node) + + def touch_mru(self, node: MambaTreeNode) -> None: + """Move an existing *node* to the MRU position.""" + if node.id not in self._cache: + return + self._unlink(node) + self._add_after(self.head, node) + + def touch_node_and_parents_mru( + self, node: MambaTreeNode, root: MambaTreeNode + ) -> None: + """Move *node* and all ancestors up to *root* to MRU. + + Child is more recently used than parent. 
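+
+        After the call the list reads, from the MRU head towards the LRU
+        tail: ``node``, ``node.parent``, ``node.parent.parent``, ... with
+        *root* itself excluded. In mamba mode, ancestors whose
+        ``mamba_value`` has already been evicted are skipped rather than
+        re-linked.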
+ """ + prev = self.head + cur = node + while cur != root: + if cur.id in self._cache: + if self.mamba and cur.mamba_value is None: + cur = cur.parent + continue + self._unlink(cur) + self._add_after(prev, cur) + prev = cur + cur = cur.parent + + # -- Queries ---------------------------------------------------------- + + def get_lru_leaf_unlocked(self) -> Optional[MambaTreeNode]: + """Return the LRU leaf node with lock_ref == 0, or ``None``.""" + x = getattr(self.tail, self._prv) + while x != self.head: + if getattr(x, self._lock) == 0 and len(x.children) == 0: + return x + x = getattr(x, self._prv) + return None + + def get_lru_unlocked(self) -> Optional[MambaTreeNode]: + """Return the LRU node with lock_ref == 0, or ``None``.""" + x = getattr(self.tail, self._prv) + while x != self.head: + if getattr(x, self._lock) == 0: + return x + x = getattr(x, self._prv) + return None + + # -- Internal --------------------------------------------------------- + + def _add_after(self, old: MambaTreeNode, new: MambaTreeNode) -> None: + nxt = getattr(old, self._nxt) + setattr(new, self._prv, old) + setattr(new, self._nxt, nxt) + setattr(nxt, self._prv, new) + setattr(old, self._nxt, new) + + def _unlink(self, node: MambaTreeNode) -> None: + prv = getattr(node, self._prv) + nxt = getattr(node, self._nxt) + if prv is not None: + setattr(prv, self._nxt, nxt) + if nxt is not None: + setattr(nxt, self._prv, prv) + setattr(node, self._prv, None) + setattr(node, self._nxt, None) + + +# ====================================================================== +# MambaRadixCache +# ====================================================================== + + +class MambaRadixCache(BasePrefixCache): + """Radix tree with independent Mamba/SSM state tracking. + + Parameters + ---------- + page_size: + Number of tokens per KV-pool page. + token_to_kv_pool_allocator: + Pool allocator for full-attention KV indices. + mamba_pool: + Pool object for Mamba/SSM state. Must support ``alloc_track_slot()``, + ``free_track_slot(slot)``, ``copy_states(src, dst)``. + on_node_evict: + Optional callback invoked with node id on eviction. 
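+
+    Illustrative sketch (editor's example; a real deployment passes a KV
+    allocator and a Mamba state pool, both omitted here, so nothing is
+    actually freed)::
+
+        import torch
+        from pymllm.mem_cache import MambaRadixCache, RadixKey
+
+        cache = MambaRadixCache(page_size=1)
+        cache.insert(
+            RadixKey([1, 2, 3, 4]),
+            value=torch.tensor([10, 11, 12, 13]),   # KV-pool indices
+            mamba_value=torch.tensor([7]),          # SSM state slot
+        )
+        m = cache.match_prefix(RadixKey([1, 2, 3, 4, 5]))
+        # expect m.prefix_len == 4; m.mamba_branching_seqlen stays None while
+        # the matched path still holds its Mamba state.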
+ """ + + def __init__( + self, + page_size: int = 1, + token_to_kv_pool_allocator: Any = None, + mamba_pool: Any = None, + on_node_evict: Optional[Callable[[int], None]] = None, + ): + self.page_size = page_size + self.pool = token_to_kv_pool_allocator + self.mamba_pool = mamba_pool + self.on_node_evict = on_node_evict + + if self.pool is not None and hasattr(self.pool, "device"): + self.device = self.pool.device + else: + self.device = torch.device("cpu") + + # Dual LRU lists + self.full_lru = LRUList(mamba=False) + self.mamba_lru = LRUList(mamba=True) + + # Size counters + self._full_evictable: int = 0 + self._full_protected: int = 0 + self._mamba_evictable: int = 0 + self._mamba_protected: int = 0 + + self.reset() + + # ------------------------------------------------------------------ + # Size queries + # ------------------------------------------------------------------ + + def evictable_size(self) -> int: + return self._full_evictable + + def protected_size(self) -> int: + return self._full_protected + + def mamba_evictable_size(self) -> int: + return self._mamba_evictable + + def mamba_protected_size(self) -> int: + return self._mamba_protected + + def total_size(self) -> int: + total = 0 + stack = [self.root_node] + while stack: + n = stack.pop() + if n.value is not None: + total += len(n.value) + stack.extend(c for c in n.children.values() if not c.evicted) + return total + + # ------------------------------------------------------------------ + # BasePrefixCache interface + # ------------------------------------------------------------------ + + def reset(self) -> None: + self.root_node = MambaTreeNode() + self.root_node.key = RadixKey([]) + self.root_node.value = torch.tensor([], dtype=torch.int64) + self.root_node.mamba_value = torch.tensor([], dtype=torch.int64) + self.root_node.full_lock_ref = 1 + self.root_node.mamba_lock_ref = 1 + self._full_evictable = 0 + self._full_protected = 0 + self._mamba_evictable = 0 + self._mamba_protected = 0 + self.full_lru = LRUList(mamba=False) + self.mamba_lru = LRUList(mamba=True) + + def match_prefix(self, key: RadixKey) -> MatchResult: + """Find longest cached prefix. 
Also returns ``mamba_branching_seqlen``.""" + empty = MatchResult( + indices=torch.empty(0, dtype=torch.int64, device=self.device), + last_node=self.root_node, + ) + if len(key) == 0: + return empty + + key = self._page_align_key(key) + if len(key) == 0: + return empty + + node = self.root_node + values: List[torch.Tensor] = [] + mamba_branching_seqlen: Optional[int] = None + total_matched = 0 + + while len(key) > 0: + ck = _child_key(key, self.page_size) + if ck not in node.children: + break + child = node.children[ck] + child.hit_count += 1 + plen = _key_match(child.key, key, self.page_size) + + if plen < len(child.key): + new_node = self._split_node(child.key, child, plen) + values.append(new_node.value) + # Track mamba branching point + if mamba_branching_seqlen is None and new_node.mamba_tombstone: + mamba_branching_seqlen = total_matched + total_matched += len(new_node.value) + node = new_node + break + + values.append(child.value) + if mamba_branching_seqlen is None and child.mamba_tombstone: + mamba_branching_seqlen = total_matched + total_matched += len(child.value) + node = child + key = key[plen:] + + # Update LRU for matched path + self.full_lru.touch_node_and_parents_mru(node, self.root_node) + self.mamba_lru.touch_node_and_parents_mru(node, self.root_node) + + cat = ( + torch.cat(values) + if values + else torch.empty(0, dtype=torch.int64, device=self.device) + ) + return MatchResult( + indices=cat, + last_node=node, + prefix_len=len(cat), + mamba_branching_seqlen=mamba_branching_seqlen, + ) + + def insert( + self, + key: RadixKey, + value: Optional[torch.Tensor] = None, + *, + mamba_value: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> InsertResult: + """Insert with both full KV and Mamba state values.""" + if value is None: + value = torch.tensor(key.token_ids, dtype=torch.int64) + + if len(key) == 0: + return InsertResult() + + node = self.root_node + total_prefix = 0 + mamba_exist = False + + ck = _child_key(key, self.page_size) + while len(key) > 0 and ck in node.children: + node = node.children[ck] + plen = _key_match(node.key, key, self.page_size) + total_prefix += plen + key = key[plen:] + value = value[plen:] + + if plen < len(node.key): + node = self._split_node(node.key, node, plen) + + # Check if mamba state already exists + if node.mamba_value is not None: + mamba_exist = True + + if len(key) > 0: + ck = _child_key(key, self.page_size) + + if len(key) > 0: + new_leaf = self._add_leaf(node, key, value, mamba_value=mamba_value) + node = new_leaf + elif mamba_value is not None and node.mamba_value is None: + # Existing node gains mamba state (un-tombstone) + node.mamba_value = mamba_value.clone() + self.mamba_lru.insert_mru(node) + self._mamba_evictable += len(node.value) + + return InsertResult( + prefix_len=total_prefix, last_node=node, mamba_exist=mamba_exist + ) + + def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: + """Evict full KV and/or Mamba state tokens. + + Phase 1: Evict full KV leaves (frees both KV and Mamba state). + Phase 2: Evict Mamba state from internal nodes (tombstone mamba). 
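+
+        ``num_tokens`` drives phase 1 and ``swa_num_tokens`` drives phase 2;
+        for example ``evict(num_tokens=0, swa_num_tokens=256)`` leaves the
+        full-attention KV untouched and only tombstones up to 256 tokens'
+        worth of SSM state on the least recently used unlocked nodes.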
+ """ + full_evicted = 0 + mamba_evicted = 0 + + # Phase 1: full leaf eviction + if num_tokens > 0: + while full_evicted < num_tokens: + node = self.full_lru.get_lru_leaf_unlocked() + if node is None: + break + n = len(node.value) + self._free_full_indices(node.value) + if node.mamba_value is not None: + self._free_mamba_value(node.mamba_value) + mamba_evicted += n + full_evicted += n + self._delete_leaf(node) + + # Cascade: parent may become evictable leaf + p = node.parent + if ( + p is not None + and p != self.root_node + and len(p.children) == 0 + and p.full_lock_ref == 0 + ): + # Will be picked up in next iteration via LRU + pass + + # Phase 2: mamba-only tombstone eviction + target_mamba = swa_num_tokens + if target_mamba > 0 and mamba_evicted < target_mamba: + while mamba_evicted < target_mamba: + node = self.mamba_lru.get_lru_unlocked() + if node is None: + break + if node.mamba_value is None: + continue + n = len(node.mamba_value) + self._free_mamba_value(node.mamba_value) + node.mamba_value = None + self.mamba_lru.remove(node) + self._mamba_evictable -= n + mamba_evicted += n + + return EvictResult( + full_evicted=full_evicted, mamba_evicted=mamba_evicted + ) + + def inc_lock_ref(self, node: MambaTreeNode) -> Optional[Any]: + """Lock full KV and Mamba state from *node* to root. + + Full lock propagates up to root. Mamba lock only applies to + the node itself (not ancestors). + """ + if node is None: + return None + + # Lock mamba on the node itself + if node.mamba_value is not None: + if node.mamba_lock_ref == 0 and node in self.mamba_lru: + self._mamba_evictable -= len(node.mamba_value) + self._mamba_protected += len(node.mamba_value) + node.mamba_lock_ref += 1 + + # Lock full KV up to root + cur = node + while cur != self.root_node: + if cur.full_lock_ref == 0: + self._full_evictable -= len(cur.key) + self._full_protected += len(cur.key) + cur.full_lock_ref += 1 + cur = cur.parent + return None + + def dec_lock_ref(self, node: MambaTreeNode, **kwargs: Any) -> None: + """Unlock full KV and Mamba state.""" + if node is None: + return + + # Unlock mamba on the node itself + if node.mamba_lock_ref > 0: + node.mamba_lock_ref -= 1 + if node.mamba_lock_ref == 0 and node.mamba_value is not None: + self._mamba_evictable += len(node.mamba_value) + self._mamba_protected -= len(node.mamba_value) + + # Unlock full KV up to root + cur = node + while cur != self.root_node: + if cur.full_lock_ref == 1: + self._full_evictable += len(cur.key) + self._full_protected -= len(cur.key) + cur.full_lock_ref -= 1 + cur = cur.parent + + # ------------------------------------------------------------------ + # Internal: tree manipulation + # ------------------------------------------------------------------ + + def _add_leaf( + self, + parent: MambaTreeNode, + key: RadixKey, + value: torch.Tensor, + mamba_value: Optional[torch.Tensor] = None, + ) -> MambaTreeNode: + # Parent may lose leaf status + if ( + len(parent.children) == 0 + and parent != self.root_node + and parent.full_lock_ref == 0 + and not parent.evicted + ): + self._full_evictable -= len(parent.key) + + new_node = MambaTreeNode() + new_node.parent = parent + new_node.key = key + new_node.value = value.clone() + parent.children[_child_key(key, self.page_size)] = new_node + + # Track in full LRU + self.full_lru.insert_mru(new_node) + self._full_evictable += len(key) + + # Track mamba state if provided + if mamba_value is not None: + new_node.mamba_value = mamba_value.clone() + self.mamba_lru.insert_mru(new_node) + self._mamba_evictable += 
len(key) + + return new_node + + def _split_node( + self, key: RadixKey, child: MambaTreeNode, split_len: int + ) -> MambaTreeNode: + """Split *child* at *split_len*, returning the new parent node.""" + new_node = MambaTreeNode() + new_node.children[_child_key(key[split_len:], self.page_size)] = child + new_node.parent = child.parent + new_node.full_lock_ref = child.full_lock_ref + new_node.mamba_lock_ref = child.mamba_lock_ref + new_node.key = child.key[:split_len] + new_node.value = child.value[:split_len].clone() + + # Split mamba value + if child.mamba_value is not None: + new_node.mamba_value = child.mamba_value[:split_len].clone() + child.mamba_value = child.mamba_value[split_len:].clone() + + child.parent = new_node + child.key = child.key[split_len:] + child.value = child.value[split_len:].clone() + new_node.parent.children[_child_key(key, self.page_size)] = new_node + + # Update LRU lists: insert new_node, keep child + self.full_lru.insert_mru(new_node) + if new_node.mamba_value is not None: + self.mamba_lru.insert_mru(new_node) + + return new_node + + def _delete_leaf(self, node: MambaTreeNode) -> None: + ck = _child_key(node.key, self.page_size) + node.parent.children.pop(ck, None) + + # Remove from LRU lists + if node in self.full_lru: + self.full_lru.remove(node) + self._full_evictable -= len(node.key) + + if node.mamba_value is not None and node in self.mamba_lru: + self.mamba_lru.remove(node) + self._mamba_evictable -= len(node.key) + + node.value = None + node.mamba_value = None + + if self.on_node_evict is not None: + self.on_node_evict(node.id) + + # ------------------------------------------------------------------ + # Internal: memory management + # ------------------------------------------------------------------ + + def _free_full_indices(self, indices: torch.Tensor) -> None: + if self.pool is not None and len(indices) > 0: + self.pool.free(indices) + + def _free_mamba_value(self, mamba_value: torch.Tensor) -> None: + if self.mamba_pool is not None and len(mamba_value) > 0: + for idx in mamba_value.tolist(): + self.mamba_pool.free_track_slot(int(idx)) + + def _page_align_key(self, key: RadixKey) -> RadixKey: + if self.page_size == 1: + return key + aligned = len(key) // self.page_size * self.page_size + return key[:aligned] + + def pretty_print(self) -> None: + """Print the tree structure to stdout.""" + self._print_helper(self.root_node, 0) + print( + f"total={self.total_size()} " + f"full_evictable={self._full_evictable} " + f"mamba_evictable={self._mamba_evictable}" + ) + + def _print_helper(self, node: MambaTreeNode, indent: int) -> None: + stack = [(node, indent)] + while stack: + n, ind = stack.pop() + toks = n.key.token_ids[:10] if n.key else [] + klen = len(n.key) if n.key else 0 + has_mamba = n.mamba_value is not None + print( + f"{' ' * ind}[{klen}] {toks} " + f"full_lock={n.full_lock_ref} mamba_lock={n.mamba_lock_ref} " + f"mamba={'Y' if has_mamba else 'N'}" + ) + for c in n.children.values(): + stack.append((c, ind + 1)) diff --git a/pymllm/mem_cache/radix_cache.py b/pymllm/mem_cache/radix_cache.py index a472d5085..80f3d6f1f 100644 --- a/pymllm/mem_cache/radix_cache.py +++ b/pymllm/mem_cache/radix_cache.py @@ -1,5 +1,4 @@ -"""Lightweight radix-tree KV cache with SWA and multimodal support. - +"""Radix-tree KV cache with SWA and multimodal support. 
Supports: - Multi-batch serving on a single GPU @@ -12,87 +11,29 @@ from __future__ import annotations -import hashlib import heapq import logging import time from collections import defaultdict -from dataclasses import dataclass -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch -logger = logging.getLogger(__name__) - - -def hash_token_ids( - token_ids: List[Union[int, Tuple[int, ...]]], - prior_hash: Optional[str] = None, -) -> str: - """SHA-256 hash of a token-id page with optional chain-hash. - - Each token is encoded as a 4-byte little-endian unsigned integer; - tuples (bigram / EAGLE) hash each element in order. When *prior_hash* - is supplied the digest is seeded with the raw bytes of the previous - hash, making the result position-aware. - """ - hasher = hashlib.sha256() - if prior_hash: - hasher.update(bytes.fromhex(prior_hash)) - for t in token_ids: - if isinstance(t, tuple): - for elem in t: - hasher.update(elem.to_bytes(4, byteorder="little", signed=False)) - else: - hasher.update(t.to_bytes(4, byteorder="little", signed=False)) - return hasher.hexdigest() - - -def hash_to_int64(hex_str: str) -> int: - """Convert a hex digest to a signed 64-bit integer (first 16 hex chars).""" - val = int(hex_str[:16], 16) - return val - (1 << 64) if val >= (1 << 63) else val - - -def hash_bytes(data: bytes) -> int: - """SHA-256 → unsigned 64-bit int. Useful for multimodal embedding keys.""" - return int.from_bytes(hashlib.sha256(data).digest()[:8], "big", signed=False) - - -class RadixKey: - """Compound lookup key: token-id sequence + optional namespace tag. - - ``extra_key`` isolates independent namespaces so that sequences with - identical leading tokens but different adapters / LoRA ids / multimodal - context hashes never share prefix nodes. - """ - - __slots__ = ("token_ids", "extra_key") +from pymllm.mem_cache.base_prefix_cache import ( + BasePrefixCache, + EvictResult, + InsertResult, + MatchResult, + RadixKey, + hash_token_ids, +) - def __init__( - self, - token_ids: List[Union[int, Tuple[int, ...]]], - extra_key: Optional[str] = None, - ): - self.token_ids = token_ids - self.extra_key = extra_key - - def __len__(self) -> int: - return len(self.token_ids) - - def __iter__(self) -> Iterator: - return iter(self.token_ids) - - def __getitem__(self, idx: Union[int, slice]) -> RadixKey: - if isinstance(idx, slice): - return RadixKey(self.token_ids[idx], self.extra_key) - return RadixKey([self.token_ids[idx]], self.extra_key) +logger = logging.getLogger(__name__) - def __repr__(self) -> str: - preview = self.token_ids[:10] - tail = "..." 
if len(self.token_ids) > 10 else "" - return f"RadixKey(extra={self.extra_key!r}, toks={preview}{tail})" +# ====================================================================== +# Tree node +# ====================================================================== _node_counter: int = 0 @@ -149,6 +90,11 @@ def __lt__(self, other: TreeNode) -> bool: return self.last_access_time < other.last_access_time +# ====================================================================== +# Helper functions +# ====================================================================== + + def _key_match(key0: RadixKey, key1: RadixKey, page_size: int) -> int: """Return the length of the common prefix (page-aligned when *page_size* > 1).""" if key0.extra_key != key1.extra_key: @@ -175,33 +121,13 @@ def _child_key(key: RadixKey, page_size: int) -> Any: return (key.extra_key, plain) if key.extra_key is not None else plain -@dataclass -class MatchResult: - """Returned by :meth:`RadixCache.match_prefix`.""" - - indices: torch.Tensor - last_node: TreeNode - prefix_len: int = 0 - - -@dataclass -class InsertResult: - """Returned by :meth:`RadixCache.insert`.""" - - prefix_len: int = 0 - last_node: Optional[TreeNode] = None - - -@dataclass -class EvictResult: - """Returned by :meth:`RadixCache.evict`.""" +# ====================================================================== +# RadixCache +# ====================================================================== - full_evicted: int = 0 - swa_evicted: int = 0 - -class RadixCache: - """Lightweight radix tree for KV-cache prefix sharing. +class RadixCache(BasePrefixCache): + """Radix tree for KV-cache prefix sharing. Parameters ---------- @@ -212,24 +138,22 @@ class RadixCache: If set, enables SWA mode. The cache tracks which nodes have had their SWA KV freed (tombstoned) and constrains prefix matching so that the sliding-window invariant is maintained. - disable: - When *True* every public method is a no-op (useful for ablation). token_to_kv_pool_allocator: Optional pool allocator with ``free(indices)`` (and ``free_swa`` for SWA mode). When *None*, index tensors are simply discarded. + on_node_evict: + Optional callback invoked with the node id when a node is evicted. 
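+
+    Illustrative sketch (editor's example; no allocator is passed, so freed
+    index tensors are simply discarded)::
+
+        import torch
+        from pymllm.mem_cache import RadixCache, RadixKey
+
+        cache = RadixCache(page_size=1)
+        cache.insert(RadixKey([1, 2, 3, 4]), torch.arange(4))
+        m = cache.match_prefix(RadixKey([1, 2, 3, 9]))
+        # expect m.prefix_len == 3, with m.indices covering tokens [1, 2, 3]
+        token = cache.inc_lock_ref(m.last_node)   # None here; an id in SWA mode
+        cache.dec_lock_ref(m.last_node, token)
+        cache.evict(num_tokens=4)                 # drop LRU leaves under memory pressure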
""" def __init__( self, page_size: int = 1, sliding_window_size: Optional[int] = None, - disable: bool = False, token_to_kv_pool_allocator: Any = None, on_node_evict: Optional[Callable[[int], None]] = None, ): self.page_size = page_size self.sliding_window_size = sliding_window_size - self.disable = disable self.pool = token_to_kv_pool_allocator self.on_node_evict = on_node_evict @@ -245,6 +169,10 @@ def __init__( def supports_swa(self) -> bool: return self.sliding_window_size is not None + # ------------------------------------------------------------------ + # Size queries + # ------------------------------------------------------------------ + def evictable_size(self) -> int: return self._evictable_size @@ -257,6 +185,21 @@ def protected_size(self) -> int: def swa_protected_size(self) -> int: return self._swa_protected_size + def total_size(self) -> int: + """Total number of cached tokens (including tombstoned).""" + total = 0 + stack: List[TreeNode] = [self.root_node] + while stack: + n = stack.pop() + if n.value is not None: + total += len(n.value) + stack.extend(c for c in n.children.values() if not c.evicted) + return total + + # ------------------------------------------------------------------ + # BasePrefixCache interface + # ------------------------------------------------------------------ + def reset(self) -> None: """Clear all cached state and re-initialise the root node.""" self.root_node = TreeNode() @@ -283,7 +226,7 @@ def match_prefix(self, key: RadixKey) -> MatchResult: indices=torch.empty(0, dtype=torch.int64, device=self.device), last_node=self.root_node, ) - if self.disable or len(key) == 0: + if len(key) == 0: return empty key = self._page_align_key(key) @@ -310,6 +253,7 @@ def insert( *, prev_prefix_len: int = 0, swa_evicted_seqlen: int = 0, + **kwargs: Any, ) -> InsertResult: """Insert *key*/*value* into the tree. @@ -327,8 +271,6 @@ def insert( previously evicted. Used to decide whether a tombstoned node can be un-tombstoned with the incoming value. """ - if self.disable: - return InsertResult() if value is None: value = torch.tensor(key.token_ids, dtype=torch.int64) if self.supports_swa: @@ -346,9 +288,6 @@ def evict(self, num_tokens: int, swa_num_tokens: int = 0) -> EvictResult: Full eviction removes leaf nodes entirely; SWA eviction tombstones internal nodes (freeing SWA KV but retaining full-attn KV). """ - if self.disable: - return EvictResult() - full_evicted = 0 swa_evicted = 0 @@ -415,7 +354,7 @@ def inc_lock_ref(self, node: TreeNode) -> Optional[int]: Returns ``swa_boundary_id`` that must be passed back to :meth:`dec_lock_ref`. In non-SWA mode, returns ``None``. 
""" - if self.disable or node is None: + if node is None: return None swa_locked = 0 @@ -447,10 +386,10 @@ def inc_lock_ref(self, node: TreeNode) -> Optional[int]: return swa_boundary_id def dec_lock_ref( - self, node: TreeNode, swa_boundary_id: Optional[int] = None + self, node: TreeNode, swa_boundary_id: Optional[int] = None, **kwargs: Any ) -> None: """Unlock nodes from *node* up to root.""" - if self.disable or node is None: + if node is None: return dec_swa = True @@ -471,16 +410,9 @@ def dec_lock_ref( cur = cur.parent - def total_size(self) -> int: - """Total number of cached tokens (including tombstoned).""" - total = 0 - stack: List[TreeNode] = [self.root_node] - while stack: - n = stack.pop() - if n.value is not None: - total += len(n.value) - stack.extend(c for c in n.children.values() if not c.evicted) - return total + # ------------------------------------------------------------------ + # Hashing & pretty-print + # ------------------------------------------------------------------ def compute_node_hash(self, node: TreeNode) -> List[str]: """Compute position-aware SHA-256 hashes for *node* (one per page). @@ -523,6 +455,10 @@ def pretty_print(self) -> None: ) ) + # ------------------------------------------------------------------ + # Internal: match + # ------------------------------------------------------------------ + def _match_normal(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode]: node = self.root_node now = time.monotonic() @@ -593,6 +529,10 @@ def _match_swa(self, key: RadixKey) -> Tuple[List[torch.Tensor], TreeNode, int]: return values, best_node, best_count + # ------------------------------------------------------------------ + # Internal: insert + # ------------------------------------------------------------------ + def _insert_normal( self, node: TreeNode, key: RadixKey, value: torch.Tensor ) -> Tuple[int, TreeNode]: @@ -613,9 +553,6 @@ def _insert_normal( value = value[plen:] if plen < len(node.key): - # Partial match: split the node. ``node`` must advance to - # the NEW parent so that any remaining key is added as a - # sibling of the tail, not a child of it. node = self._split_node(node.key, node, plen) if len(key) > 0: ck = _child_key(key, self.page_size) @@ -691,6 +628,10 @@ def _insert_swa( return total_prefix + # ------------------------------------------------------------------ + # Internal: tree manipulation + # ------------------------------------------------------------------ + def _add_leaf( self, parent: TreeNode, @@ -698,8 +639,6 @@ def _add_leaf( value: torch.Tensor, swa_tombstone: bool = False, ) -> TreeNode: - # If parent was a childless (leaf) node, it will no longer be - # evictable after gaining a child. Adjust the size counter. 
if ( len(parent.children) == 0 and parent != self.root_node @@ -724,9 +663,15 @@ def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNod logger.debug( "[SPLIT] node_id=%d key_len=%d split_len=%d " "parent_val[:4]=%s child_val[:4]=%s", - child.id, len(key), split_len, - child.value[:min(split_len, 4)].tolist() if child.value is not None else [], - child.value[split_len:split_len+4].tolist() if child.value is not None and len(child.value) > split_len else [], + child.id, + len(key), + split_len, + child.value[:min(split_len, 4)].tolist() + if child.value is not None + else [], + child.value[split_len : split_len + 4].tolist() + if child.value is not None and len(child.value) > split_len + else [], ) new_node = TreeNode() new_node.children[_child_key(key[split_len:], self.page_size)] = child @@ -759,7 +704,6 @@ def _delete_leaf(self, node: TreeNode) -> None: self._evictable_size -= len(node.key) if self.supports_swa and not node.swa_tombstone: self._swa_evictable_size -= len(node.key) - # Mark as evicted so node.evicted returns True. node.value = None if self.on_node_evict is not None: self.on_node_evict(node.id) @@ -768,6 +712,10 @@ def _tombstone_node(self, node: TreeNode) -> None: node.swa_tombstone = True self._swa_evictable_size -= len(node.key) + # ------------------------------------------------------------------ + # Internal: collection helpers + # ------------------------------------------------------------------ + def _collect_evictable_leaves(self) -> List[TreeNode]: leaves: List[TreeNode] = [] stack: List[TreeNode] = [self.root_node] diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py index c2154e447..b2c1609a9 100644 --- a/pymllm/orchestrator/detokenizer_process.py +++ b/pymllm/orchestrator/detokenizer_process.py @@ -175,6 +175,11 @@ def run_detokenizer_process( ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" setup_subprocess_logging((tokenizer_cfg or {}).get("log_level", "info")) + + # Limit CPU threads — detokenizer doesn't need PyTorch parallelism. + import torch + torch.set_num_threads(1) + proc = DetokenizerProcess( recv_from_scheduler_addr, send_to_rr_addr, diff --git a/pymllm/orchestrator/model_runner_process.py b/pymllm/orchestrator/model_runner_process.py index f6fc709b2..a514ac2e9 100644 --- a/pymllm/orchestrator/model_runner_process.py +++ b/pymllm/orchestrator/model_runner_process.py @@ -24,7 +24,10 @@ import torch -from pymllm.mem_cache.radix_cache import RadixCache, RadixKey, TreeNode +from pymllm.mem_cache.base_prefix_cache import BasePrefixCache, RadixKey +from pymllm.mem_cache.chunk_cache import ChunkCache +from pymllm.mem_cache.mamba_radix_cache import MambaRadixCache +from pymllm.mem_cache.radix_cache import RadixCache logger = logging.getLogger(__name__) @@ -69,8 +72,9 @@ def __init__( # range [cache_protected_len, prefix_len) are duplicates that must # be freed from the allocator (the tree already holds cloned copies). self._rid_to_cache_protected_len: Dict[str, int] = {} - # Maps rid -> (last_node, swa_boundary_id) for radix cache lock tracking - self._rid_to_radix_lock: Dict[str, Tuple[TreeNode, Optional[int]]] = {} + # Maps rid -> (last_node, lock_token) for radix cache lock tracking. + # last_node type depends on the cache implementation (TreeNode, MambaTreeNode, etc.) 
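+        # The lock token is whatever the cache's inc_lock_ref() returned: a
+        # swa_boundary_id int for an SWA-enabled RadixCache, otherwise None
+        # (ChunkCache and MambaRadixCache always return None).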
+ self._rid_to_radix_lock: Dict[str, Tuple[Any, Optional[Any]]] = {} # Maps rid -> mrope_position_delta (M-RoPE positional offset per request) # Populated during prefill; used to offset decode-step positions for # multimodal models (Qwen3-VL) that consume more position indices than @@ -105,34 +109,63 @@ def init_model(self) -> None: ) self._runner.initialize() - # Initialise RadixCache after memory pools are ready. + # Initialise prefix cache after memory pools are ready. + self._radix_cache = self._create_prefix_cache() + logger.info("ModelRunnerProcess: ModelRunner ready") + + def _create_prefix_cache(self) -> BasePrefixCache: + """Factory: create the appropriate prefix cache based on config.""" disable_cache = getattr(self._server_config, "disable_radix_cache", False) self._is_hybrid = self._runner.num_gdn_layers > 0 - if self._is_hybrid and not disable_cache: + enable_mamba_cache = getattr(self._server_config, "enable_mamba_cache", False) + sliding_window = self._runner.sliding_window_size + page_size = getattr(self._server_config, "radix_cache_page_size", 1) + allocator = self._runner.token_to_kv_pool_allocator + + if disable_cache: + device = allocator.device if allocator is not None else torch.device("cpu") + logger.info("ModelRunnerProcess: using ChunkCache (radix cache disabled)") + return ChunkCache( + token_to_kv_pool_allocator=allocator, + device=device, + ) + + if enable_mamba_cache: + mamba_pool = getattr(self._runner, "gdn_pool", None) + logger.info( + "ModelRunnerProcess: using MambaRadixCache " + "(mamba_pool=%s, page_size=%d)", + "available" if mamba_pool is not None else "none", + page_size, + ) + evict_cb = self._on_radix_node_evict if self._is_hybrid else None + return MambaRadixCache( + page_size=page_size, + token_to_kv_pool_allocator=allocator, + mamba_pool=mamba_pool, + on_node_evict=evict_cb, + ) + + # Standard RadixCache (with optional SWA) + if self._is_hybrid: logger.info( "ModelRunnerProcess: prefix caching ENABLED with GDN state " "tracking (%d GDN layers)", self._runner.num_gdn_layers, ) - sliding_window = self._runner.sliding_window_size - page_size = getattr(self._server_config, "radix_cache_page_size", 1) - # For hybrid models, register an eviction callback so that evicted - # radix nodes free their associated GDN track slots. 
evict_cb = self._on_radix_node_evict if self._is_hybrid else None - self._radix_cache = RadixCache( + logger.info( + "ModelRunnerProcess: using RadixCache " + "(sliding_window=%s, page_size=%d)", + sliding_window, + page_size, + ) + return RadixCache( page_size=page_size, sliding_window_size=sliding_window, - disable=disable_cache, - token_to_kv_pool_allocator=self._runner.token_to_kv_pool_allocator, + token_to_kv_pool_allocator=allocator, on_node_evict=evict_cb, ) - logger.info( - "ModelRunnerProcess: RadixCache initialized " - "(disable=%s, sliding_window=%s)", - disable_cache, - sliding_window, - ) - logger.info("ModelRunnerProcess: ModelRunner ready") # ------------------------------------------------------------------ # Forward pass @@ -223,16 +256,50 @@ def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: temperatures = [] top_ps = [] top_ks = [] + repetition_penalties = [] + frequency_penalties = [] + presence_penalties = [] for m in requests_meta: sp = m.get("sampling_params") or {} temperatures.append(sp.get("temperature", 1.0)) top_ps.append(sp.get("top_p", 1.0)) top_ks.append(sp.get("top_k", -1)) + repetition_penalties.append(sp.get("repetition_penalty", 1.0)) + frequency_penalties.append(sp.get("frequency_penalty", 0.0)) + presence_penalties.append(sp.get("presence_penalty", 0.0)) temps_tensor = torch.tensor(temperatures, dtype=torch.float32, device=device) top_ps_tensor = torch.tensor(top_ps, dtype=torch.float32, device=device) top_ks_tensor = torch.tensor(top_ks, dtype=torch.int32, device=device) + # Collect token histories for penalty computation. + # Each entry is (input_ids + output_ids_so_far) for the request. + has_penalties = ( + any(p != 1.0 for p in repetition_penalties) + or any(p != 0.0 for p in frequency_penalties) + or any(p != 0.0 for p in presence_penalties) + ) + penalty_params = None + if has_penalties: + token_histories = [] + for m in requests_meta: + rid = m["rid"] + input_ids = self._rid_to_input_ids.get(rid, []) + output_ids = self._rid_to_output_ids.get(rid, []) + token_histories.append(list(input_ids) + list(output_ids)) + penalty_params = { + "repetition_penalties": torch.tensor( + repetition_penalties, dtype=torch.float32, device=device + ), + "frequency_penalties": torch.tensor( + frequency_penalties, dtype=torch.float32, device=device + ), + "presence_penalties": torch.tensor( + presence_penalties, dtype=torch.float32, device=device + ), + "token_histories": token_histories, + } + if forward_mode == "extend": if extend_seq_lens_t is None: extend_seq_lens_list: List[int] = batch["extend_seq_lens"] @@ -325,6 +392,7 @@ def _forward_batch(self, batch: Dict[str, Any]) -> Dict[str, Any]: temperatures=temps_tensor, top_ps=top_ps_tensor, top_ks=top_ks_tensor, + penalty_params=penalty_params, ) # ============================================================== @@ -427,7 +495,7 @@ def _insert_into_radix_cache(self, requests_meta: List[Dict[str, Any]]) -> None: """ _dbg = logger.isEnabledFor(logging.DEBUG) cache = self._radix_cache - if cache is None or cache.disable: + if cache is None: return runner = self._runner @@ -570,7 +638,7 @@ def _allocate_extend( # --- Step 1: Radix cache prefix matching --- actual_prefix_lens: List[int] = [] actual_extend_lens: List[int] = [] - matched_nodes: List[Optional[TreeNode]] = [] + matched_nodes: List[Optional[Any]] = [] # Cache the match results so we don't call match_prefix twice cached_indices_list: List[Optional[torch.Tensor]] = [] gdn_pool = getattr(runner, "gdn_pool", None) @@ -582,7 +650,7 @@ def 
_allocate_extend( # Store input_ids for later radix cache insert self._rid_to_input_ids[m["rid"]] = full_input_ids - if cache is not None and not cache.disable and len(full_input_ids) > 0: + if cache is not None and len(full_input_ids) > 0: key = RadixKey(full_input_ids) match_result = cache.match_prefix(key) prefix_len = match_result.prefix_len @@ -660,7 +728,7 @@ def _allocate_extend( # Without locking first, _alloc_kv_with_eviction could evict the # matched nodes, freeing their KV pool slots and causing # use-after-free when we later read from cached_indices. - if cache is not None and not cache.disable: + if cache is not None: for i, m in enumerate(requests_meta): node = matched_nodes[i] if node is not None and actual_prefix_lens[i] > 0: @@ -770,7 +838,7 @@ def _allocate_extend( # Allocate a track slot only when the radix cache is enabled; # track slots are freed via the eviction callback so they must # be associated with a node, which only happens when cache is on. - if cache is not None and not cache.disable: + if cache is not None: ts = gdn_pool.alloc_track_slot() if ts is not None: self._rid_to_gdn_track_slot[rid] = ts @@ -785,7 +853,7 @@ def _unlock_matched_nodes(self, requests_meta: List[Dict[str, Any]]) -> None: Called when allocation fails after locking matched nodes. """ cache = self._radix_cache - if cache is None or cache.disable: + if cache is None: return for m in requests_meta: lock = self._rid_to_radix_lock.pop(m["rid"], None) @@ -807,7 +875,7 @@ def _alloc_kv_with_eviction(self, num_tokens: int) -> Optional[torch.Tensor]: return result # Eviction loop: try evicting from radix cache to free space - if cache is None or cache.disable: + if cache is None: return None for attempt in range(_MAX_EVICT_RETRIES): @@ -931,7 +999,7 @@ def _free_rid_resources(self, rid: str) -> None: # and the eviction callback; here we just remove the rid mapping. self._rid_to_gdn_track_slot.pop(rid, None) - cache_enabled = cache is not None and not cache.disable + cache_enabled = cache is not None # ---------------------------------------------------------- # Phase 1: Read all KV indices BEFORE freeing anything. diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index 8e0ba9e22..fa9046061 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -23,6 +23,8 @@ if batch: result = run_batch(batch) # direct call to model runner process_batch_result(batch, result) + else: + idle_sleeper.sleep() # block until ZMQ data or timeout stream_output() """ @@ -48,11 +50,38 @@ # Default scheduling limits _DEFAULT_MAX_RUNNING_REQUESTS = 256 +_DEFAULT_IDLE_POLL_TIMEOUT_MS = 1000 _DEFAULT_MAX_PREFILL_TOKENS = 8192 _DEFAULT_MAX_TOTAL_TOKENS = 131072 _DEFAULT_MAX_NEW_TOKENS = 32768 +# ====================================================================== +# IdleSleeper -- avoid busy-looping when no work is available +# ====================================================================== + + +class IdleSleeper: + """Block the scheduler thread when idle using ZMQ Poller. + + Avoids 100% CPU spinning when no requests are pending. The poller + wakes immediately when data arrives on any registered socket, so + request latency is not affected. 
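+
+    Illustrative sketch of the intended event-loop usage (editor's example;
+    ``recv_socket`` and the batch helpers are placeholders -- the real
+    scheduler registers its tokenizer socket in ``init_sockets``)::
+
+        sleeper = IdleSleeper([recv_socket], poll_timeout_ms=1000)
+        while True:
+            batch = get_next_batch_to_run()
+            if batch is not None:
+                process_batch(batch)
+            else:
+                sleeper.sleep()   # blocks until a message arrives or 1 s elapses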
+ """ + + def __init__( + self, sockets: list, poll_timeout_ms: int = _DEFAULT_IDLE_POLL_TIMEOUT_MS + ): + self.poller = zmq.Poller() + for s in sockets: + self.poller.register(s, zmq.POLLIN) + self.poll_timeout_ms = poll_timeout_ms + + def sleep(self) -> None: + """Block until data arrives on any registered socket, or timeout.""" + self.poller.poll(self.poll_timeout_ms) + + # ====================================================================== # Req -- per-request state tracker # ====================================================================== @@ -436,6 +465,10 @@ def init_sockets(self) -> None: self._poller = zmq.Poller() self._poller.register(self._recv_from_tokenizer, zmq.POLLIN) + # Idle sleeper: blocks the event loop when no batch is ready, + # wakes immediately on incoming ZMQ messages. + self._idle_sleeper = IdleSleeper([self._recv_from_tokenizer]) + def init_model(self) -> None: """Create and initialise the in-process model runner. @@ -466,6 +499,10 @@ def event_loop(self) -> None: if batch is not None: result = self.run_batch(batch) self.process_batch_result(batch, result) + else: + # No work available -- sleep until a new request arrives + # on the ZMQ socket (or timeout). Avoids busy-looping. + self._idle_sleeper.sleep() self.stream_output() # ------------------------------------------------------------------ @@ -525,7 +562,7 @@ def _recv_from_shared_queue(self) -> None: """ while True: try: - rid, shm_name, mm_inputs = self._shared_queue.get(timeout=0.0001) + rid, shm_name, mm_inputs = self._shared_queue.get(timeout=0.002) # Read metadata from shared memory (and unlink immediately) metadata: TokenizedGenerateReqInput = SharedMemoryManager.read_metadata( @@ -830,8 +867,10 @@ def stream_output(self) -> None: Produces :class:`~pymllm.engine.io_struct.BatchTokenIDOutput`-compatible dicts. For streaming requests, intermediate tokens are also sent. """ - # Collect streaming outputs from running requests + # Collect streaming outputs from running requests (skip aborted) for req in self._running_batch: + if req.finished_reason == "abort": + continue if req.stream and len(req.output_ids) > req.read_offset: decode_ids = req.output_ids[req.read_offset :] output = { diff --git a/pymllm/orchestrator/tokenizer_process.py b/pymllm/orchestrator/tokenizer_process.py index 703618a40..44a4c897c 100644 --- a/pymllm/orchestrator/tokenizer_process.py +++ b/pymllm/orchestrator/tokenizer_process.py @@ -487,6 +487,11 @@ def run_tokenizer_process( ) -> None: """Entry point for ``torch.multiprocessing.Process(target=...)``.""" setup_subprocess_logging(tokenizer_cfg.get("log_level", "info")) + + # Limit CPU threads — tokenizer doesn't need PyTorch parallelism. 
+ import torch + torch.set_num_threads(1) + proc = TokenizerProcess( recv_from_rr_addr, send_to_scheduler_addr, tokenizer_cfg, shared_queue ) diff --git a/pymllm/server/launch.py b/pymllm/server/launch.py index a328c0c6d..1dab4325d 100644 --- a/pymllm/server/launch.py +++ b/pymllm/server/launch.py @@ -17,6 +17,7 @@ """ import asyncio +import contextlib import logging import os import time @@ -38,6 +39,62 @@ logger = logging.getLogger(__name__) asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) + +# --------------------------------------------------------------------------- +# Disconnect-aware async generator wrapper +# --------------------------------------------------------------------------- + +_DISCONNECT_CHECK_INTERVAL = 1.0 # seconds + + +async def _iter_with_disconnect_check( + agen: AsyncIterator, + request: Request, + interval: float = _DISCONNECT_CHECK_INTERVAL, +) -> AsyncIterator: + """Wrap an async generator, periodically checking for client disconnect. + + The standard ``async for chunk in agen`` pattern only checks between + items. If the generator blocks waiting for the next item (e.g. waiting + for a decode step), a client disconnect goes unnoticed. + + This wrapper uses ``asyncio.wait`` with a timeout so that + ``request.is_disconnected()`` is polled every *interval* seconds even + while waiting for the next item. + + When a disconnect is detected, the underlying generator is closed via + ``aclose()`` which triggers its ``finally`` cleanup (abort logic). + """ + aiter = agen.__aiter__() + while True: + # Start fetching the next item without blocking indefinitely. + next_task = asyncio.ensure_future(aiter.__anext__()) + try: + while True: + done, _ = await asyncio.wait({next_task}, timeout=interval) + if done: + break + # Timeout: check if client is still connected. + if await request.is_disconnected(): + next_task.cancel() + with contextlib.suppress( + asyncio.CancelledError, StopAsyncIteration + ): + await next_task + # Close the generator to trigger its finally block. + await agen.aclose() + return + except Exception: + next_task.cancel() + with contextlib.suppress(asyncio.CancelledError, StopAsyncIteration): + await next_task + raise + + try: + yield next_task.result() + except StopAsyncIteration: + return + # --------------------------------------------------------------------------- # Global handles (populated at startup) # --------------------------------------------------------------------------- @@ -471,10 +528,9 @@ async def generate(obj: GenerateRequest, request: Request): if obj.stream: async def _stream() -> AsyncIterator[bytes]: + gen = engine.generate_async(**kwargs) try: - async for chunk in engine.generate_async(**kwargs): - if await request.is_disconnected(): - break + async for chunk in _iter_with_disconnect_check(gen, request): # Skip empty intermediate chunks (e.g. 
special tokens # stripped by the detokenizer) if not chunk.get("delta") and not chunk.get("finished"): @@ -483,19 +539,24 @@ async def _stream() -> AsyncIterator[bytes]: except Exception as e: err = {"error": {"message": str(e)}} yield b"data: " + orjson.dumps(err) + b"\n\n" + finally: + await gen.aclose() yield b"data: [DONE]\n\n" return StreamingResponse(_stream(), media_type="text/event-stream") + gen = engine.generate_async(**kwargs) try: results = [] - async for item in engine.generate_async(**kwargs): + async for item in _iter_with_disconnect_check(gen, request): results.append(item) result = results[0] if len(results) == 1 else results return ORJSONResponse(result) except Exception as e: logger.error("[generate] Error: %s", e) raise HTTPException(status_code=400, detail=str(e)) + finally: + await gen.aclose() # --------------------------------------------------------------------------- @@ -530,12 +591,11 @@ async def _stream() -> AsyncIterator[bytes]: comp_id = _make_completion_id() prompt_tokens = 0 completion_tokens = 0 + gen = engine.generate_async( + prompt=obj.prompt, sampling_params=sp, stream=True + ) try: - async for chunk in engine.generate_async( - prompt=obj.prompt, sampling_params=sp, stream=True - ): - if await request.is_disconnected(): - break + async for chunk in _iter_with_disconnect_check(gen, request): prompt_tokens = chunk.get("prompt_tokens", prompt_tokens) completion_tokens = chunk.get("completion_tokens", completion_tokens) delta_text = chunk.get("delta", "") @@ -563,6 +623,8 @@ async def _stream() -> AsyncIterator[bytes]: except Exception as e: err = {"error": {"message": str(e)}} yield b"data: " + orjson.dumps(err) + b"\n\n" + finally: + await gen.aclose() # Final usage-only chunk (OpenAI stream_options.include_usage) if include_usage: usage_chunk: Dict[str, Any] = { @@ -582,11 +644,12 @@ async def _stream() -> AsyncIterator[bytes]: return StreamingResponse(_stream(), media_type="text/event-stream") + gen = engine.generate_async( + prompt=obj.prompt, sampling_params=sp + ) try: results = [] - async for item in engine.generate_async( - prompt=obj.prompt, sampling_params=sp - ): + async for item in _iter_with_disconnect_check(gen, request): results.append(item) choices = [] prompt_tokens = 0 @@ -622,6 +685,8 @@ async def _stream() -> AsyncIterator[bytes]: except Exception as e: logger.error("[v1/completions] Error: %s", e) raise HTTPException(status_code=400, detail=str(e)) + finally: + await gen.aclose() # --------------------------------------------------------------------------- @@ -714,10 +779,9 @@ def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: } return b"data: " + orjson.dumps(sse) + b"\n\n" + gen = engine.generate_async(**gen_kwargs, stream=True) try: - async for chunk in engine.generate_async(**gen_kwargs, stream=True): - if await request.is_disconnected(): - break + async for chunk in _iter_with_disconnect_check(gen, request): prompt_tokens = chunk.get("prompt_tokens", prompt_tokens) completion_tokens = chunk.get("completion_tokens", completion_tokens) @@ -776,6 +840,8 @@ def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: except Exception as e: err = {"error": {"message": str(e)}} yield b"data: " + orjson.dumps(err) + b"\n\n" + finally: + await gen.aclose() # Final usage-only chunk if include_usage: usage_chunk: Dict[str, Any] = { @@ -796,11 +862,12 @@ def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: return StreamingResponse(_stream(), media_type="text/event-stream") # 
-- Non-streaming -- + gen = engine.generate_async(**gen_kwargs) try: from pymllm.parsers import ReasoningParser, ToolCallParser r = {} - async for item in engine.generate_async(**gen_kwargs): + async for item in _iter_with_disconnect_check(gen, request): r = item prompt_tokens = r.get("prompt_tokens", 0) completion_tokens = r.get("completion_tokens", 0) @@ -853,6 +920,8 @@ def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: except Exception as e: logger.error("[v1/chat/completions] Error: %s", e) raise HTTPException(status_code=400, detail=str(e)) + finally: + await gen.aclose() # --------------------------------------------------------------------------- From a6a993a6a2b6238bdcff48bc3ce7fc027f812989 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Mar 2026 06:34:54 +0000 Subject: [PATCH 38/42] refactor: enhance configuration management and improve process health monitoring - Updated installation instructions in to clarify build steps for x86 Qualcomm AOT. - Modified to prevent re-initialization and ensure singleton behavior. - Added CORS origin configuration in for better API security. - Implemented health monitoring in the class to track subprocess liveness. - Enhanced request handling in to manage queued requests and clean up stale states. - Improved error handling and logging across various modules for better traceability. Signed-off-by: chenghuaWang <2923277184@qq.com> --- docs/qnn_backend/aot_execute.rst | 5 + pymllm/configs/global_config.py | 32 +++-- pymllm/configs/server_config.py | 1 + pymllm/engine/launch.py | 68 +++++++--- pymllm/executor/model_runner.py | 7 +- pymllm/orchestrator/detokenizer_process.py | 10 ++ pymllm/orchestrator/ipc_utils.py | 13 ++ .../orchestrator/request_response_process.py | 57 +++++++- pymllm/orchestrator/scheduler_process.py | 36 +++-- pymllm/server/launch.py | 127 ++++++++++++++---- 10 files changed, 286 insertions(+), 70 deletions(-) diff --git a/docs/qnn_backend/aot_execute.rst b/docs/qnn_backend/aot_execute.rst index 6b03834c0..7fd2a9a6b 100644 --- a/docs/qnn_backend/aot_execute.rst +++ b/docs/qnn_backend/aot_execute.rst @@ -60,6 +60,10 @@ Taking ``qwen3_qnn_aot`` as an example, the detailed steps are as follows. pip install -e . # link lib to pymllm's dir, so that tvm ffi can find the lib + # + # NOTE:! build x86 qualcomm aot first ! + source /bin/envsetup.sh + python task.py tasks/build_x86_qnn_aot.yaml ln -s /bin/ mllm/pymllm/lib @@ -82,6 +86,7 @@ Taking ``qwen3_qnn_aot`` as an example, the detailed steps are as follows. .. code-block:: shell # In the mllm-v2 project root directory + source /bin/envsetup.sh python task.py tasks/build_x86_qnn_aot.yaml # Run the compiler program diff --git a/pymllm/configs/global_config.py b/pymllm/configs/global_config.py index 711de3cd1..6ec68dda2 100644 --- a/pymllm/configs/global_config.py +++ b/pymllm/configs/global_config.py @@ -23,7 +23,7 @@ from pymllm.configs.quantization_config import QuantizationConfig -@dataclass +@dataclass(init=False) class GlobalConfig: """Singleton that holds every sub-config pymllm needs. @@ -36,22 +36,36 @@ class GlobalConfig: cfg.model.hidden_size cfg.quantization.method cfg.server.host - """ - server: "ServerConfig" = field(default=None, repr=False) # type: ignore[assignment] - model: ModelConfig = field(default_factory=ModelConfig) - quantization: QuantizationConfig = field(default_factory=QuantizationConfig) + .. 
note:: + + Always use :meth:`get_instance` (or the module-level + :func:`get_global_config` shortcut) to obtain the singleton. + ``GlobalConfig()`` is safe to call multiple times — the second and + subsequent calls return the existing instance without re-initialising + fields. + """ - _initialized: bool = field(default=False, repr=False) + server: "ServerConfig" + model: ModelConfig + quantization: QuantizationConfig + _initialized: bool def __new__(cls): if not hasattr(cls, "_instance") or cls._instance is None: cls._instance = super().__new__(cls) return cls._instance - def __post_init__(self): - if self.server is None: - self.server = ServerConfig(model_path=None) + def __init__(self): + # Guard: skip re-initialisation on repeated GlobalConfig() calls. + # The dataclass auto-generated __init__ is disabled (init=False) so + # this custom __init__ has full control. + if getattr(self, "_initialized", False): + return + self.server = ServerConfig(model_path=None) + self.model = ModelConfig() + self.quantization = QuantizationConfig() + self._initialized = True @classmethod def get_instance(cls) -> "GlobalConfig": diff --git a/pymllm/configs/server_config.py b/pymllm/configs/server_config.py index 304d328ea..92d02e05e 100644 --- a/pymllm/configs/server_config.py +++ b/pymllm/configs/server_config.py @@ -31,6 +31,7 @@ class ServerConfig: admin_api_key: Optional[str] = None served_model_name: Optional[str] = None file_storage_path: Path = Path("mllm_storage") + cors_allow_origins: list[str] = field(default_factory=lambda: ["*"]) # --------------------------------------------------------------------- # # Scheduling and memory diff --git a/pymllm/engine/launch.py b/pymllm/engine/launch.py index 5e3a2ef00..8fd39caab 100644 --- a/pymllm/engine/launch.py +++ b/pymllm/engine/launch.py @@ -2,6 +2,8 @@ import atexit import logging import os +import threading +import time import uuid from pathlib import Path from typing import Any, AsyncIterator, Dict, List, Optional, Union @@ -21,7 +23,7 @@ from pymllm.configs import get_global_config from pymllm.engine.io_struct import GenerateReqInput -from pymllm.orchestrator.ipc_utils import make_ipc_address +from pymllm.orchestrator.ipc_utils import cleanup_ipc_files, make_ipc_address from pymllm.orchestrator.request_response_process import ( ReqState, RequestResponseProcess, @@ -129,18 +131,46 @@ class Engine: def __init__(self): self._subprocesses: List[mp.Process] = [] self._rr_process: Optional[RequestResponseProcess] = None + self._ipc_uid: Optional[str] = None + self._subprocess_healthy: bool = True self._config_logging() self._set_default_torch_dtype() self._check_model_and_tokenizer() + @property + def is_healthy(self) -> bool: + """True if engine and all subprocesses are alive.""" + return self._subprocess_healthy + def launch(self) -> None: self._launch_processes() + self._start_health_monitor() atexit.register(self.shutdown) + def _start_health_monitor(self) -> None: + """Start a daemon thread that checks subprocess liveness.""" + + def _monitor(): + while self._subprocess_healthy: + for proc in self._subprocesses: + if not proc.is_alive(): + logger.error( + "Subprocess pid=%s died unexpectedly (exitcode=%s)", + proc.pid, + proc.exitcode, + ) + self._subprocess_healthy = False + return + time.sleep(5) + + t = threading.Thread(target=_monitor, daemon=True, name="engine-health-monitor") + t.start() + def _launch_processes(self) -> None: """Spawn all subprocess workers and wire up ZMQ IPC channels.""" mp.set_start_method("spawn", force=True) uid = 
str(os.getpid()) + self._ipc_uid = uid # IPC addresses for ZMQ communication between processes addr_request_response_to_tokenizer: str = make_ipc_address( @@ -310,7 +340,7 @@ def _launch_processes(self) -> None: print(colored(line, "magenta", attrs=["bold"])) print() except Exception as e: - logger.debug(f"Failed to print banner: {e}") + logger.debug("Failed to print banner: %s", e) print("🚀 pymllm FIRED UP! 🚀\n") else: print("🚀 pymllm FIRED UP! 🚀\n") @@ -358,7 +388,8 @@ def generate( request.normalize_batch_and_arguments() async def _run() -> Union[Dict[str, Any], List[Dict[str, Any]]]: - result = await self._rr_process.add_request(request) + max_queued = get_global_config().server.max_queued_requests + result = await self._rr_process.add_request(request, max_queued=max_queued) if request.is_single: single_rid = rid if isinstance(rid, str) else rid[0] return await self._wait_for_final_result(single_rid, result) # type: ignore[arg-type] @@ -370,12 +401,11 @@ async def _run() -> Union[Dict[str, Any], List[Dict[str, Any]]]: ) return list(outputs) + loop = asyncio.new_event_loop() try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - return loop.run_until_complete(_run()) + return loop.run_until_complete(_run()) + finally: + loop.close() async def generate_async( self, @@ -421,7 +451,8 @@ async def generate_async( extra_options=kwargs, ) request.normalize_batch_and_arguments() - result = await self._rr_process.add_request(request) + max_queued = get_global_config().server.max_queued_requests + result = await self._rr_process.add_request(request, max_queued=max_queued) if request.is_single: single_rid = rid if isinstance(rid, str) else rid[0] # type: ignore[index] @@ -533,13 +564,17 @@ def shutdown(self) -> None: """Terminate all subprocesses.""" if self._rr_process is not None: try: - loop = asyncio.get_event_loop() - if loop.is_running(): - loop.create_task(self._rr_process.shutdown()) - else: + loop = asyncio.get_running_loop() + # Loop is running (e.g. called from uvicorn shutdown) — + # schedule cleanup as a fire-and-forget task. + loop.create_task(self._rr_process.shutdown()) + except RuntimeError: + # No running loop — create a temporary one for cleanup. + loop = asyncio.new_event_loop() + try: loop.run_until_complete(self._rr_process.shutdown()) - except Exception: - pass + finally: + loop.close() for proc in self._subprocesses: if proc.is_alive(): proc.terminate() @@ -547,6 +582,9 @@ def shutdown(self) -> None: if proc.is_alive(): proc.kill() self._subprocesses.clear() + # Clean up IPC socket files + if self._ipc_uid is not None: + cleanup_ipc_files(self._ipc_uid) logger.info("All subprocesses shut down") def _set_default_torch_dtype(self): diff --git a/pymllm/executor/model_runner.py b/pymllm/executor/model_runner.py index e73272f1b..d60b9d899 100644 --- a/pymllm/executor/model_runner.py +++ b/pymllm/executor/model_runner.py @@ -65,10 +65,7 @@ def _suppress_cpu_threads() -> None: - """Limit PyTorch intra-op threads to 1 for GPU inference. - - Reference: sglang ``ModelRunner``. - """ + """Limit PyTorch intra-op threads to 1 for GPU inference.""" torch.set_num_threads(1) @@ -296,7 +293,7 @@ def initialize(self) -> None: # Limit PyTorch CPU threads to 1 for GPU inference. # PyTorch's default (= CPU core count) causes OpenMP thread pool # spin-wait that wastes CPU. GPU models don't benefit from CPU - # parallelism. Reference: sglang ModelRunner. + # parallelism. 
if self.device != "cpu": _suppress_cpu_threads() diff --git a/pymllm/orchestrator/detokenizer_process.py b/pymllm/orchestrator/detokenizer_process.py index b2c1609a9..1bbda98d0 100644 --- a/pymllm/orchestrator/detokenizer_process.py +++ b/pymllm/orchestrator/detokenizer_process.py @@ -100,6 +100,16 @@ def _detokenize(self, token_id_out: Dict[str, Any]) -> List[Dict[str, Any]]: rids: List[str] = token_id_out.get("rids", []) output_ids: List[int] = token_id_out.get("output_ids", []) finished_reasons: List[Optional[str]] = token_id_out.get("finished_reasons", []) + + # NOTE: The scheduler currently sends one rid per message. The shared + # output_ids list is the complete output for that single rid. If + # batched sending is ever added, each rid will need its own output_ids. + if len(rids) > 1: + logger.warning( + "Detokenizer received %d rids in one message; " + "output_ids are shared -- results may be incorrect", + len(rids), + ) decode_ids: List[int] = token_id_out.get("decode_ids", []) skip_special_tokens_list: List[bool] = token_id_out.get( "skip_special_tokens", [] diff --git a/pymllm/orchestrator/ipc_utils.py b/pymllm/orchestrator/ipc_utils.py index b464a3979..abb59849a 100644 --- a/pymllm/orchestrator/ipc_utils.py +++ b/pymllm/orchestrator/ipc_utils.py @@ -71,6 +71,19 @@ def close_zmq_socket(sock: zmq.Socket) -> None: pass +def cleanup_ipc_files(unique_id: Optional[str] = None) -> None: + """Remove IPC socket files for the given engine (or all if no id given).""" + import glob as _glob + + suffix = f"_{unique_id}" if unique_id else "" + pattern = os.path.join(_IPC_DIR, f"pymllm_*{suffix}") + for f in _glob.glob(pattern): + try: + os.unlink(f) + except OSError: + pass + + def setup_subprocess_logging(log_level: str = "info") -> None: """Configure logging for a spawned subprocess. diff --git a/pymllm/orchestrator/request_response_process.py b/pymllm/orchestrator/request_response_process.py index 5c72a14c4..f59ffa51e 100644 --- a/pymllm/orchestrator/request_response_process.py +++ b/pymllm/orchestrator/request_response_process.py @@ -14,6 +14,7 @@ import asyncio import dataclasses import logging +import time from typing import Any, Dict, List, Optional, Union import zmq @@ -39,6 +40,7 @@ class ReqState: out_list: List[Dict[str, Any]] = dataclasses.field(default_factory=list) finished: bool = False event: asyncio.Event = dataclasses.field(default_factory=asyncio.Event) + created_at: float = dataclasses.field(default_factory=time.time) class RequestResponseProcess: @@ -100,7 +102,9 @@ def listen(self) -> None: logger.debug("RequestResponseProcess: background tasks started") async def add_request( - self, request: GenerateReqInput + self, + request: GenerateReqInput, + max_queued: Optional[int] = None, ) -> Union[ReqState, List[ReqState]]: """Enqueue request(s) and return the corresponding :class:`ReqState`(s). @@ -112,10 +116,15 @@ async def add_request( independent messages. Returns a ``List[ReqState]`` in the same order as the input rids. - Callers should ``await state.event.wait()`` in a loop, consuming - ``state.out_list`` entries until ``state.finished`` is ``True``. + Parameters + ---------- + max_queued + If set, raise ``RuntimeError`` when the queue already has this many + items (back-pressure / overload protection). 
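The ReqState objects returned by add_request are still consumed the way the earlier docstring described: wait on state.event, drain state.out_list, and stop once state.finished is set. A minimal illustrative consumer (a hypothetical helper, not code from this patch; the event re-arming detail is an assumption):

async def drain_req_state(state) -> list:
    # Collect every output chunk produced for one request (sketch only).
    chunks = []
    while True:
        await state.event.wait()
        state.event.clear()              # assumed: re-arm for later results
        while state.out_list:
            chunks.append(state.out_list.pop(0))
        if state.finished:
            return chunks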
""" self.listen() + if max_queued is not None and self._request_queue.qsize() >= max_queued: + raise RuntimeError("Server overloaded: too many queued requests") if request.is_single: rid = request.rid if isinstance(request.rid, str) else request.rid[0] @@ -146,7 +155,8 @@ async def abort_request(self, rid: str) -> None: state.finished = True state.out_list.append({"rid": rid, "error": "aborted", "finished": True}) state.event.set() - await self._send_to_tokenizer.send_pyobj({"rid": rid, "abort": True}) + if self._send_to_tokenizer is not None: + await self._send_to_tokenizer.send_pyobj({"rid": rid, "abort": True}) async def shutdown(self) -> None: if self._loop_task is not None: @@ -174,10 +184,26 @@ async def _send_loop(self) -> None: request = await self._request_queue.get() await self._send_to_tokenizer.send_pyobj(request) + # Stale state cleanup constants + _STALE_TIMEOUT = 1800 # 30 minutes + _CLEANUP_INTERVAL = 60 # seconds + async def _recv_loop(self) -> None: """Receive decoded results from DetokenizerProcess and dispatch to ReqStates.""" + last_cleanup = time.time() while True: - result = await self._recv_from_detokenizer.recv_pyobj() + # Use a timeout so that stale-state cleanup runs even when no + # results are flowing back from the detokenizer. + try: + result = await asyncio.wait_for( + self._recv_from_detokenizer.recv_pyobj(), + timeout=self._CLEANUP_INTERVAL, + ) + except asyncio.TimeoutError: + self._cleanup_stale_states() + last_cleanup = time.time() + continue + rid = result.get("rid") state = self._rid_to_state.get(rid) if state is None: @@ -187,3 +213,24 @@ async def _recv_loop(self) -> None: if result.get("finished", False): state.finished = True state.event.set() + + # Also run cleanup on the normal path when enough time has passed + now = time.time() + if now - last_cleanup > self._CLEANUP_INTERVAL: + last_cleanup = now + self._cleanup_stale_states() + + def _cleanup_stale_states(self) -> None: + """Remove request states that have been pending longer than ``_STALE_TIMEOUT``.""" + now = time.time() + stale = [ + r + for r, s in self._rid_to_state.items() + if not s.finished and (now - s.created_at) > self._STALE_TIMEOUT + ] + for r in stale: + logger.warning("Cleaning stale request state: rid=%s", r) + s = self._rid_to_state.pop(r) + s.finished = True + s.out_list.append({"rid": r, "error": "timeout", "finished": True}) + s.event.set() diff --git a/pymllm/orchestrator/scheduler_process.py b/pymllm/orchestrator/scheduler_process.py index fa9046061..3bc3466a1 100644 --- a/pymllm/orchestrator/scheduler_process.py +++ b/pymllm/orchestrator/scheduler_process.py @@ -175,12 +175,13 @@ def __init__( # Prompt length self.prompt_len: int = len(input_ids) - def check_finished(self, eos_token_id: Optional[int] = None) -> bool: + def check_finished(self) -> bool: """Check if this request has reached a finish condition. Sets ``finished_reason`` and returns True if finished. Checks: - 1. EOS token in the latest generated token + 1. Stop token (EOS tokens are merged into stop_token_ids during + :meth:`SchedulerProcess.process_input_requests`) 2. 
``max_new_tokens`` reached """ if self.finished_reason is not None: @@ -188,11 +189,6 @@ def check_finished(self, eos_token_id: Optional[int] = None) -> bool: if self.output_ids: last_token = self.output_ids[-1] - # Check model EOS token - if eos_token_id is not None and last_token == eos_token_id: - self.finished_reason = "eos" - return True - # Check stop token IDs from sampling params if last_token in self.stop_token_ids: self.finished_reason = "eos" return True @@ -403,7 +399,7 @@ def __init__( self._waiting_queue: Deque[TokenizedGenerateReqInput] = deque() self._pending_queue: List[Req] = [] self._running_batch: List[Req] = [] - self._finished: List[Dict[str, Any]] = [] + self._finished: Deque[Dict[str, Any]] = deque() # Scheduling limits self._max_running_requests = max_running_requests @@ -809,10 +805,8 @@ def process_batch_result( # Update token budget for newly generated tokens self._used_tokens += len(new_token_ids) - # Check finish conditions - req.check_finished( - eos_token_id=self._eos_token_ids[0] if self._eos_token_ids else None - ) + # Check finish conditions (EOS tokens already in stop_token_ids) + req.check_finished() # Process batch requests based on forward mode if batch.forward_mode.is_extend(): @@ -888,7 +882,7 @@ def stream_output(self) -> None: # Send finished outputs while self._finished: - item = self._finished.pop(0) + item = self._finished.popleft() self._send_to_detokenizer.send_pyobj(item) # ------------------------------------------------------------------ @@ -1033,6 +1027,19 @@ def run_scheduler_process( so model initialisation happens here. """ setup_subprocess_logging(log_level) + + # Extract scheduling limits from server_config (fall back to defaults) + max_running = _DEFAULT_MAX_RUNNING_REQUESTS + max_prefill = _DEFAULT_MAX_PREFILL_TOKENS + max_total = _DEFAULT_MAX_TOTAL_TOKENS + if server_config is not None: + if getattr(server_config, "max_running_requests", None) is not None: + max_running = server_config.max_running_requests + if getattr(server_config, "max_prefill_tokens", None) is not None: + max_prefill = server_config.max_prefill_tokens + if getattr(server_config, "max_total_tokens", None) is not None: + max_total = server_config.max_total_tokens + proc = SchedulerProcess( recv_from_tokenizer_addr, send_to_detokenizer_addr, @@ -1042,6 +1049,9 @@ def run_scheduler_process( shared_queue=shared_queue, enable_shared_queue=enable_shared_queue, tensor_transport_mode=tensor_transport_mode, + max_running_requests=max_running, + max_prefill_tokens=max_prefill, + max_total_tokens=max_total, default_max_new_tokens=default_max_new_tokens, eos_token_ids=eos_token_ids, ) diff --git a/pymllm/server/launch.py b/pymllm/server/launch.py index 1dab4325d..7f756d46d 100644 --- a/pymllm/server/launch.py +++ b/pymllm/server/launch.py @@ -274,13 +274,38 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) +# NOTE: CORS middleware is added in launch_server() after config is loaded, +# so that cors_allow_origins from ServerConfig can be used. + + +# --------------------------------------------------------------------------- +# Authentication middleware +# --------------------------------------------------------------------------- + +# Paths that are always accessible without an API key (liveness probes). 
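From a client's point of view, the new middleware means every endpoint except the exempt health probes defined just below now requires an Authorization: Bearer header whenever api_key is configured, and the queue back-pressure added earlier surfaces as HTTP 429. An illustrative client sketch (address, key and payload are placeholders, not values from this patch):

import time
import requests

BASE = "http://127.0.0.1:8000"                        # placeholder address
HEADERS = {"Authorization": "Bearer my-secret-key"}   # must match the server's api_key

def complete(prompt: str, retries: int = 3) -> dict:
    payload = {"model": "default", "prompt": prompt, "max_tokens": 32}
    for attempt in range(retries):
        resp = requests.post(f"{BASE}/v1/completions", json=payload,
                             headers=HEADERS, timeout=60)
        if resp.status_code == 429:        # server overloaded: back off and retry
            time.sleep(2 ** attempt)
            continue
        resp.raise_for_status()
        return resp.json()
    raise RuntimeError("server stayed overloaded")

# Health probes stay key-free; a 503 here means a subprocess died.
print(requests.get(f"{BASE}/health").status_code)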
+_AUTH_EXEMPT_PATHS = frozenset({"/health", "/health_generate"}) + + +@app.middleware("http") +async def _auth_middleware(request: Request, call_next): + """Enforce ``Authorization: Bearer `` when ``api_key`` is configured.""" + cfg = get_global_config() + api_key = cfg.server.api_key + if api_key is None: + # No key configured — open access. + return await call_next(request) + if request.url.path in _AUTH_EXEMPT_PATHS: + return await call_next(request) + auth = request.headers.get("Authorization", "") + if auth == f"Bearer {api_key}": + return await call_next(request) + admin_key = cfg.server.admin_api_key + if admin_key and auth == f"Bearer {admin_key}": + return await call_next(request) + return ORJSONResponse( + status_code=401, + content={"error": {"message": "Invalid or missing API key", "code": 401}}, + ) # --------------------------------------------------------------------------- @@ -304,7 +329,10 @@ async def http_exception_handler(request: Request, exc: HTTPException): @app.get("/health") @app.get("/health_generate") async def health(): - """Liveness probe.""" + """Liveness / readiness probe. Returns 503 if subprocesses died.""" + engine = _engine + if engine is None or not engine.is_healthy: + return Response(status_code=503) return Response(status_code=200) @@ -322,13 +350,19 @@ async def model_info(): } +_SERVER_INFO_REDACT = frozenset({"api_key", "admin_api_key"}) + + @app.get("/server_info") async def server_info(): - """Dump runtime server configuration.""" + """Dump runtime server configuration (sensitive fields redacted).""" import dataclasses as _dc cfg = get_global_config() - return _dc.asdict(cfg.server) + d = _dc.asdict(cfg.server) + for k in _SERVER_INFO_REDACT: + d.pop(k, None) + return d @app.get("/v1/models") @@ -537,7 +571,8 @@ async def _stream() -> AsyncIterator[bytes]: continue yield b"data: " + orjson.dumps(chunk) + b"\n\n" except Exception as e: - err = {"error": {"message": str(e)}} + logger.error("[generate] stream error: %s", e, exc_info=True) + err = {"error": {"message": "Internal server error"}} yield b"data: " + orjson.dumps(err) + b"\n\n" finally: await gen.aclose() @@ -550,11 +585,22 @@ async def _stream() -> AsyncIterator[bytes]: results = [] async for item in _iter_with_disconnect_check(gen, request): results.append(item) + if not results: + raise HTTPException(status_code=500, detail="No output from engine") result = results[0] if len(results) == 1 else results return ORJSONResponse(result) - except Exception as e: - logger.error("[generate] Error: %s", e) + except HTTPException: + raise + except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) + except RuntimeError as e: + if "too many queued" in str(e): + raise HTTPException(status_code=429, detail=str(e)) + logger.error("[generate] Error: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error") + except Exception as e: + logger.error("[generate] Error: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error") finally: await gen.aclose() @@ -567,6 +613,12 @@ async def _stream() -> AsyncIterator[bytes]: @app.post("/v1/completions") async def openai_completions(obj: CompletionRequest, request: Request): """OpenAI-compatible text completion endpoint.""" + if obj.n > 1: + raise HTTPException(status_code=400, detail="n > 1 is not supported") + if obj.echo: + raise HTTPException(status_code=400, detail="echo is not yet supported") + if obj.logprobs is not None and obj.logprobs > 0: + raise 
HTTPException(status_code=400, detail="logprobs is not yet supported") engine = _get_engine() sp = _build_sampling_params( temperature=obj.temperature, @@ -621,7 +673,8 @@ async def _stream() -> AsyncIterator[bytes]: } yield b"data: " + orjson.dumps(sse) + b"\n\n" except Exception as e: - err = {"error": {"message": str(e)}} + logger.error("[v1/completions] stream error: %s", e, exc_info=True) + err = {"error": {"message": "Internal server error"}} yield b"data: " + orjson.dumps(err) + b"\n\n" finally: await gen.aclose() @@ -682,9 +735,16 @@ async def _stream() -> AsyncIterator[bytes]: }, } ) - except Exception as e: - logger.error("[v1/completions] Error: %s", e) + except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) + except RuntimeError as e: + if "too many queued" in str(e): + raise HTTPException(status_code=429, detail=str(e)) + logger.error("[v1/completions] Error: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error") + except Exception as e: + logger.error("[v1/completions] Error: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error") finally: await gen.aclose() @@ -697,6 +757,10 @@ async def _stream() -> AsyncIterator[bytes]: @app.post("/v1/chat/completions") async def openai_chat_completions(obj: ChatCompletionRequest, request: Request): """OpenAI-compatible chat completion endpoint with reasoning & tool-call parsing.""" + if obj.n > 1: + raise HTTPException(status_code=400, detail="n > 1 is not supported") + if obj.logprobs: + raise HTTPException(status_code=400, detail="logprobs is not yet supported") engine = _get_engine() cfg = get_global_config() # Auto-enable thinking when reasoning_parser is configured and the @@ -721,7 +785,6 @@ async def openai_chat_completions(obj: ChatCompletionRequest, request: Request): repetition_penalty=obj.repetition_penalty, seed=obj.seed, ) - cfg = get_global_config() model_name = obj.model or cfg.server.served_model_name or str(cfg.server.model_path) include_usage = ( obj.stream_options is not None and obj.stream_options.include_usage @@ -838,7 +901,8 @@ def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: yield _make_sse({}, finish=finish_reason) except Exception as e: - err = {"error": {"message": str(e)}} + logger.error("[v1/chat/completions] stream error: %s", e, exc_info=True) + err = {"error": {"message": "Internal server error"}} yield b"data: " + orjson.dumps(err) + b"\n\n" finally: await gen.aclose() @@ -917,9 +981,16 @@ def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: }, } ) - except Exception as e: - logger.error("[v1/chat/completions] Error: %s", e) + except ValueError as e: raise HTTPException(status_code=400, detail=str(e)) + except RuntimeError as e: + if "too many queued" in str(e): + raise HTTPException(status_code=429, detail=str(e)) + logger.error("[v1/chat/completions] Error: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error") + except Exception as e: + logger.error("[v1/chat/completions] Error: %s", e, exc_info=True) + raise HTTPException(status_code=500, detail="Internal server error") finally: await gen.aclose() @@ -931,8 +1002,8 @@ def _make_sse(delta: Dict[str, Any], finish: Optional[str] = None) -> bytes: @app.api_route("/flush_cache", methods=["GET", "POST"]) async def flush_cache(): - """Placeholder cache flush.""" - return Response(content="Cache flushed.\n", status_code=200) + """Cache flush (not yet implemented).""" + 
raise HTTPException(status_code=501, detail="Cache flush not implemented") @app.post("/abort_request") @@ -964,12 +1035,22 @@ def _prepare_args(): def launch_server(): """Launch the pymllm Engine then start the uvicorn HTTP server. - It first boots all engine subprocesses (tokenizer, scheduler, model-runner, detokenizer) + It first boots all engine subprocesses (tokenizer, scheduler, model-runner, detokenizer) and then hands off to uvicorn to serve HTTP traffic. """ _prepare_args() cfg = get_global_config() + # Add CORS middleware (after config is loaded so origins are configurable). + origins = cfg.server.cors_allow_origins + app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=(origins != ["*"]), + allow_methods=["*"], + allow_headers=["*"], + ) + engine = Engine() engine.launch() From a78e3a0acfaa7b979527873fbab759e9e09b4bab Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Mar 2026 07:55:10 +0000 Subject: [PATCH 39/42] feat(mllm-kernel): introduce new Marlin kernel implementations for efficient tensor operations - Added ScalarType class to represent various floating point and integer types, enhancing type flexibility. - Implemented awq_marlin_repack and gptq_marlin_repack kernels for optimized weight repacking in CUDA. - Developed dequantization functions for fast conversion of quantized values to floating point formats. - Introduced permute_cols_kernel for efficient column permutation in matrix operations. - Enhanced kernel templates for better configurability and performance across different data types. - Improved shared memory management and synchronization mechanisms for better execution efficiency. Signed-off-by: chenghuaWang <2923277184@qq.com> --- .../include/mllm_kernel/scalar_type.hpp | 260 +++ .../csrc/gemm/marlin/awq_marlin_repack.cuh | 251 +++ .../cuda/csrc/gemm/marlin/dequant.h | 504 ++++++ .../cuda/csrc/gemm/marlin/gptq_marlin.cuh | 1001 +++++++++++ .../csrc/gemm/marlin/gptq_marlin_repack.cuh | 362 ++++ .../cuda/csrc/gemm/marlin/kernel.h | 32 + .../cuda/csrc/gemm/marlin/marlin.cuh | 89 + .../cuda/csrc/gemm/marlin/marlin_dtypes.cuh | 77 + .../cuda/csrc/gemm/marlin/marlin_template.h | 1514 +++++++++++++++++ mllm-kernel/mllm_kernel/cuda/jit/__init__.py | 11 +- .../mllm_kernel/cuda/jit/awq_marlin_repack.py | 78 + .../mllm_kernel/cuda/jit/gptq_marlin.py | 213 +++ pymllm/executor/model_runner.py | 9 + pymllm/layers/base.py | 8 +- pymllm/layers/linear.py | 161 +- pymllm/layers/quantize_base.py | 275 +++ pymllm/quantization/QUANTIZATION.md | 257 +++ pymllm/quantization/__init__.py | 18 + pymllm/quantization/methods/__init__.py | 15 + pymllm/quantization/methods/awq_marlin.py | 524 ++++++ pymllm/quantization/methods/awq_w4a16.py | 0 pymllm/quantization/quant_config.py | 203 +++ pymllm/quantization/quant_recipe.py | 3 - 23 files changed, 5804 insertions(+), 61 deletions(-) create mode 100644 mllm-kernel/include/mllm_kernel/scalar_type.hpp create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/awq_marlin_repack.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/dequant.h create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin_repack.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/kernel.h create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin.cuh create mode 100644 mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_dtypes.cuh create mode 100644 
mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_template.h create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/awq_marlin_repack.py create mode 100644 mllm-kernel/mllm_kernel/cuda/jit/gptq_marlin.py create mode 100644 pymllm/layers/quantize_base.py create mode 100644 pymllm/quantization/QUANTIZATION.md create mode 100644 pymllm/quantization/methods/awq_marlin.py delete mode 100644 pymllm/quantization/methods/awq_w4a16.py create mode 100644 pymllm/quantization/quant_config.py delete mode 100644 pymllm/quantization/quant_recipe.py diff --git a/mllm-kernel/include/mllm_kernel/scalar_type.hpp b/mllm-kernel/include/mllm_kernel/scalar_type.hpp new file mode 100644 index 000000000..def41a12b --- /dev/null +++ b/mllm-kernel/include/mllm_kernel/scalar_type.hpp @@ -0,0 +1,260 @@ +#pragma once + +#include +#include +#ifndef __CUDACC__ +#include +#endif + +namespace host { + +// +// ScalarType can represent a wide range of floating point and integer types, +// in particular it can be used to represent sub-byte data types (something +// that torch.dtype currently does not support). +// +class ScalarType { + public: + enum NanRepr : uint8_t { + NAN_NONE = 0, // nans are not supported + NAN_IEEE_754 = 1, // nans are: exp all 1s, mantissa not all 0s + NAN_EXTD_RANGE_MAX_MIN = 2, // nans are: exp all 1s, mantissa all 1s + + NAN_REPR_ID_MAX + }; + + constexpr ScalarType(uint8_t exponent, uint8_t mantissa, bool signed_, int32_t bias, bool finite_values_only = false, + NanRepr nan_repr = NAN_IEEE_754) + : exponent(exponent), + mantissa(mantissa), + signed_(signed_), + bias(bias), + finite_values_only(finite_values_only), + nan_repr(nan_repr) {}; + + static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits - 1, true, bias); } + + static constexpr ScalarType uint(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits, false, bias); } + + // IEEE 754 compliant floating point type + static constexpr ScalarType float_IEEE754(uint8_t exponent, uint8_t mantissa) { + assert(mantissa > 0 && exponent > 0); + return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754); + } + + // IEEE 754 non-compliant floating point type + static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa, bool finite_values_only, NanRepr nan_repr) { + assert(nan_repr < NAN_REPR_ID_MAX); + assert(mantissa > 0 && exponent > 0); + assert(nan_repr != NAN_IEEE_754); + return ScalarType(exponent, mantissa, true, 0, finite_values_only, nan_repr); + } + + uint8_t const exponent; // size of the exponent field (0 for integer types) + uint8_t const mantissa; // size of the mantissa field (size of the integer + // excluding the sign bit for integer types) + bool const signed_; // flag if the type supports negative numbers (i.e. has a + // sign bit) + int32_t const bias; // stored values equal value + bias, + // used for quantized type + + // Extra Floating point info + bool const finite_values_only; // i.e. no +/-inf if true + NanRepr const nan_repr; // how NaNs are represented + // (not applicable for integer types) + + using Id = int64_t; + + private: + // Field size in id + template + static constexpr size_t member_id_field_width() { + using T = std::decay_t; + return std::is_same_v ? 1 : sizeof(T) * 8; + } + + template + static constexpr auto reduce_members_helper(Fn f, Init val, Member member, Rest... 
rest) { + auto new_val = f(val, member); + if constexpr (sizeof...(rest) > 0) { + return reduce_members_helper(f, new_val, rest...); + } else { + return new_val; + }; + } + + template + constexpr auto reduce_members(Fn f, Init init) const { + // Should be in constructor order for `from_id` + return reduce_members_helper(f, init, exponent, mantissa, signed_, bias, finite_values_only, nan_repr); + }; + + template + static constexpr auto reduce_member_types(Fn f, Init init) { + constexpr auto dummy_type = ScalarType(0, 0, false, 0, false, NAN_NONE); + return dummy_type.reduce_members(f, init); + }; + + static constexpr auto id_size_bits() { + return reduce_member_types([](int acc, auto member) -> int { return acc + member_id_field_width(); }, 0); + } + + public: + constexpr Id id() const { + static_assert(id_size_bits() <= sizeof(Id) * 8, "ScalarType id is too large to be stored"); + + auto or_and_advance = [](std::pair result, auto member) -> std::pair { + auto [id, bit_offset] = result; + auto constexpr bits = member_id_field_width(); + return {id | (int64_t(member) & ((uint64_t(1) << bits) - 1)) << bit_offset, bit_offset + bits}; + }; + return reduce_members(or_and_advance, std::pair{}).first; + } + + static constexpr ScalarType from_id(Id id) { + auto extract_and_advance = [id](auto result, auto member) { + using T = decltype(member); + auto [tuple, bit_offset] = result; + auto constexpr bits = member_id_field_width(); + auto extracted_val = static_cast((int64_t(id) >> bit_offset) & ((uint64_t(1) << bits) - 1)); + auto new_tuple = std::tuple_cat(tuple, std::make_tuple(extracted_val)); + return std::pair{new_tuple, bit_offset + bits}; + }; + + auto [tuple_args, _] = reduce_member_types(extract_and_advance, std::pair, int>{}); + return std::apply([](auto... 
args) { return ScalarType(args...); }, tuple_args); + } + + constexpr int64_t size_bits() const { return mantissa + exponent + is_signed(); } + constexpr bool is_signed() const { return signed_; } + constexpr bool is_integer() const { return exponent == 0; } + constexpr bool is_floating_point() const { return exponent > 0; } + constexpr bool is_ieee_754() const { return is_floating_point() && finite_values_only == false && nan_repr == NAN_IEEE_754; } + constexpr bool has_nans() const { return is_floating_point() && nan_repr != NAN_NONE; } + constexpr bool has_infs() const { return is_floating_point() && finite_values_only == false; } + constexpr bool has_bias() const { return bias != 0; } + +#ifndef __CUDACC__ + private: + double _floating_point_max() const { + assert(mantissa <= 52 && exponent <= 11); + + uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1; + if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) { max_mantissa -= 1; } + + uint64_t max_exponent = (uint64_t(1) << exponent) - 2; + if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) { + assert(exponent < 11); + max_exponent += 1; + } + + uint64_t exponent_bias = (uint64_t(1) << (exponent - 1)) - 1; + uint64_t exponent_bias_double = (uint64_t(1) << 10) - 1; // double e = 11 + + uint64_t max_exponent_double = max_exponent - exponent_bias + exponent_bias_double; + + uint64_t double_raw = (max_mantissa << (52 - mantissa)) | (max_exponent_double << 52); + + return *reinterpret_cast(&double_raw); + } + + constexpr std::variant _raw_max() const { + if (is_floating_point()) { + return {_floating_point_max()}; + } else { + assert(size_bits() < 64 || (size_bits() == 64 && is_signed())); + return {(int64_t(1) << mantissa) - 1}; + } + } + + constexpr std::variant _raw_min() const { + if (is_floating_point()) { + assert(is_signed()); + constexpr uint64_t sign_bit_double = (uint64_t(1) << 63); + + double max = _floating_point_max(); + uint64_t max_raw = *reinterpret_cast(&max); + uint64_t min_raw = max_raw | sign_bit_double; + return {*reinterpret_cast(&min_raw)}; + } else { + assert(!is_signed() || size_bits() <= 64); + if (is_signed()) { + return {INT64_MIN >> (64 - size_bits())}; + } else { + return {int64_t(0)}; + } + } + } + + public: + constexpr std::variant max() const { + return std::visit([this](auto x) -> std::variant { return {x - bias}; }, _raw_max()); + } + + constexpr std::variant min() const { + return std::visit([this](auto x) -> std::variant { return {x - bias}; }, _raw_min()); + } +#endif // __CUDACC__ + + public: + std::string str() const { + if (is_floating_point()) { + auto ret = "float" + std::to_string(size_bits()) + "_e" + std::to_string(exponent) + "m" + std::to_string(mantissa); + if (!is_ieee_754()) { + if (finite_values_only) { ret += "f"; } + if (nan_repr != NAN_NONE) { ret += "n"; } + } + return ret; + } else { + auto ret = ((is_signed()) ? 
"int" : "uint") + std::to_string(size_bits()); + if (has_bias()) { ret += "b" + std::to_string(bias); } + return ret; + } + } + + constexpr bool operator==(ScalarType const& other) const { + return mantissa == other.mantissa && exponent == other.exponent && bias == other.bias && signed_ == other.signed_ + && finite_values_only == other.finite_values_only && nan_repr == other.nan_repr; + } +}; + +using ScalarTypeId = ScalarType::Id; + +// "rust style" names +static inline constexpr auto kS4 = ScalarType::int_(4); +static inline constexpr auto kU4 = ScalarType::uint(4); +static inline constexpr auto kU4B8 = ScalarType::uint(4, 8); +static inline constexpr auto kS8 = ScalarType::int_(8); +static inline constexpr auto kU8 = ScalarType::uint(8); +static inline constexpr auto kU8B128 = ScalarType::uint(8, 128); + +static inline constexpr auto kFE2M1f = ScalarType::float_(2, 1, true, ScalarType::NAN_NONE); +static inline constexpr auto kFE3M2f = ScalarType::float_(3, 2, true, ScalarType::NAN_NONE); +static inline constexpr auto kFE4M3fn = ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); +static inline constexpr auto kFE8M0fnu = ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN); +static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2); +static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7); +static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10); + +// Fixed width style names +static inline constexpr auto kInt4 = kS4; +static inline constexpr auto kUint4 = kU4; +static inline constexpr auto kUint4b8 = kU4B8; +static inline constexpr auto kInt8 = kS8; +static inline constexpr auto kUint8 = kU8; +static inline constexpr auto kUint8b128 = kU8B128; + +static inline constexpr auto kFloat4_e2m1f = kFE2M1f; +static inline constexpr auto kFloat6_e3m2f = kFE3M2f; +static inline constexpr auto kFloat8_e4m3fn = kFE4M3fn; +static inline constexpr auto kFloat8_e5m2 = kFE5M2; +static inline constexpr auto kFloat16_e8m7 = kFE8M7; +static inline constexpr auto kFloat16_e5m10 = kFE5M10; + +// colloquial names +static inline constexpr auto kHalf = kFE5M10; +static inline constexpr auto kFloat16 = kHalf; +static inline constexpr auto kBFloat16 = kFE8M7; + +static inline constexpr auto kFloat16Id = kFloat16.id(); +} // namespace host diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/awq_marlin_repack.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/awq_marlin_repack.cuh new file mode 100644 index 000000000..71ace4470 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/awq_marlin_repack.cuh @@ -0,0 +1,251 @@ +#pragma once + +#include + +#include + +#include "marlin.cuh" + +namespace device::marlin { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +template +__global__ void awq_marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr, int size_k, int size_n) { + return; +} +#else + +template +__global__ void awq_marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, uint32_t* __restrict__ out_ptr, int size_k, int size_n) { + constexpr int pack_factor = 32 / num_bits; + + int k_tiles = size_k / tile_k_size; + int n_tiles = size_n / tile_n_size; + int block_k_tiles = div_ceil(k_tiles, (int)gridDim.x); + + auto start_k_tile = blockIdx.x * block_k_tiles; + if (start_k_tile >= k_tiles) { + return; + } + + int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles); + + // Wait until the next thread tile has been loaded to shared memory. 
+ auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + extern __shared__ int4 sh[]; + + constexpr int tile_n_ints = tile_n_size / pack_factor; + + constexpr int stage_n_threads = tile_n_ints / 4; + constexpr int stage_k_threads = tile_k_size; + constexpr int stage_size = stage_k_threads * stage_n_threads; + + auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) { + if (n_tile_id >= n_tiles) { + cp_async_fence(); + return; + } + + int first_n = n_tile_id * tile_n_size; + int first_n_packed = first_n / pack_factor; + + int4* sh_ptr = sh + stage_size * pipe; + + if (threadIdx.x < stage_size) { + auto k_id = threadIdx.x / stage_n_threads; + auto n_id = threadIdx.x % stage_n_threads; + + int first_k = k_tile_id * tile_k_size; + + cp_async4( + &sh_ptr[k_id * stage_n_threads + n_id], + reinterpret_cast( + &(b_q_weight_ptr[(first_k + k_id) * (size_n / pack_factor) + first_n_packed + (n_id * 4)]))); + } + + cp_async_fence(); + }; + + auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) { + if (n_tile_id >= n_tiles) { + return; + } + + auto warp_id = threadIdx.x / 32; + auto th_id = threadIdx.x % 32; + + if (warp_id >= 4) { + return; + } + + int tc_col = th_id / 4; + int tc_row = (th_id % 4) * 2; + + constexpr int tc_offsets[4] = {0, 1, 8, 9}; + + int cur_n = warp_id * 16 + tc_col; + int cur_n_packed = cur_n / pack_factor; + int cur_n_pos = cur_n % pack_factor; + + constexpr int sh_stride = tile_n_ints; + constexpr uint32_t mask = (1 << num_bits) - 1; + + int4* sh_stage_ptr = sh + stage_size * pipe; + uint32_t* sh_stage_int_ptr = reinterpret_cast(sh_stage_ptr); + + // Undo interleaving + int cur_n_pos_unpacked; + if constexpr (num_bits == 4) { + constexpr int undo_pack[8] = {0, 4, 1, 5, 2, 6, 3, 7}; + cur_n_pos_unpacked = undo_pack[cur_n_pos]; + } else { + constexpr int undo_pack[4] = {0, 2, 1, 3}; + cur_n_pos_unpacked = undo_pack[cur_n_pos]; + } + + uint32_t vals[8]; +#pragma unroll + for (int i = 0; i < 4; i++) { + int cur_elem = tc_row + tc_offsets[i]; + + int packed_src_0 = sh_stage_int_ptr[cur_n_packed + sh_stride * cur_elem]; + int packed_src_1 = sh_stage_int_ptr[cur_n_packed + (8 / pack_factor) + sh_stride * cur_elem]; + + vals[i] = (packed_src_0 >> (cur_n_pos_unpacked * num_bits)) & mask; + vals[4 + i] = (packed_src_1 >> (cur_n_pos_unpacked * num_bits)) & mask; + } + + constexpr int tile_size_val = tile_k_size * tile_n_size / pack_factor; + int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size_val; + + // Result of: + // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h + if constexpr (num_bits == 4) { + constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + + uint32_t res = 0; +#pragma unroll + for (int i = 0; i < 8; i++) { + res |= vals[pack_idx[i]] << (i * 4); + } + + out_ptr[out_offset + th_id * 4 + warp_id] = res; + + } else { + constexpr int pack_idx[4] = {0, 2, 1, 3}; + + uint32_t res1 = 0; + uint32_t res2 = 0; +#pragma unroll + for (int i = 0; i < 4; i++) { + res1 |= vals[pack_idx[i]] << (i * 8); + res2 |= vals[4 + pack_idx[i]] << (i * 8); + } + + out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1; + out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2; + } + }; + + auto 
start_pipes = [&](int k_tile_id, int n_tile_id) { +#pragma unroll + for (int pipe = 0; pipe < repack_stages - 1; pipe++) { + fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe); + } + + wait_for_stage(); + }; +#pragma unroll + for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) { + int n_tile_id = 0; + + start_pipes(k_tile_id, n_tile_id); + + while (n_tile_id < n_tiles) { +#pragma unroll + for (int pipe = 0; pipe < repack_stages; pipe++) { + fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id, n_tile_id + pipe + repack_stages - 1); + repack_tile(pipe, k_tile_id, n_tile_id + pipe); + wait_for_stage(); + } + n_tile_id += repack_stages; + } + } +} +#endif + +} // namespace device::marlin + +// Host wrapper +void awq_marlin_repack( + tvm::ffi::TensorView out, tvm::ffi::TensorView b_q_weight, int64_t size_k, int64_t size_n, int64_t num_bits) { + using namespace host; + using namespace device::marlin; + + // Validate alignment + RuntimeCheck(size_k % tile_k_size == 0, "size_k = ", size_k, " is not divisible by tile_k_size = ", tile_k_size); + RuntimeCheck(size_n % tile_n_size == 0, "size_n = ", size_n, " is not divisible by tile_n_size = ", tile_n_size); + RuntimeCheck(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. Got = ", num_bits); + + int const pack_factor = 32 / num_bits; + + // Validate tensors + SymbolicDevice cuda_device; + cuda_device.set_options(); + + TensorMatcher({size_k, size_n / pack_factor}).with_dtype().with_device(cuda_device).verify(b_q_weight); + + TensorMatcher({size_k / tile_size, size_n * tile_size / pack_factor}) + .with_dtype() + .with_device(cuda_device) + .verify(out); + + // Get device and stream + auto device = cuda_device.unwrap(); + auto stream = LaunchKernel::resolve_device(device); + + // Get pointers + auto* b_q_weight_ptr = reinterpret_cast(b_q_weight.data_ptr()); + auto* out_ptr = reinterpret_cast(out.data_ptr()); + + // Get device attributes + int blocks = 0; + cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, device.device_id); + + int max_shared_mem = 0; + cudaDeviceGetAttribute(&max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, device.device_id); + RuntimeCheck(max_shared_mem > 0, "max_shared_mem must be > 0"); + + // Dispatch based on num_bits + if (num_bits == 4) { + cudaFuncSetAttribute( + awq_marlin_repack_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); + LaunchKernel(blocks, repack_threads, stream, max_shared_mem)( + awq_marlin_repack_kernel, + b_q_weight_ptr, + out_ptr, + static_cast(size_k), + static_cast(size_n)); + } else if (num_bits == 8) { + cudaFuncSetAttribute( + awq_marlin_repack_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); + LaunchKernel(blocks, repack_threads, stream, max_shared_mem)( + awq_marlin_repack_kernel, + b_q_weight_ptr, + out_ptr, + static_cast(size_k), + static_cast(size_n)); + } else { + RuntimeCheck(false, "Unsupported repack config: num_bits = ", num_bits); + } +} diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/dequant.h b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/dequant.h new file mode 100644 index 000000000..d194cf3ec --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/dequant.h @@ -0,0 +1,504 @@ +/* +Fast Dequantization (Converting INT4/INT8/FP4/FP8 to FP16/BF16) + +The process of fast dequantization can be summarized as a combination +of bitwise operations and floating-point computations: + +weight =>(bit_op / bitwise operations)=> +f16_value =>(flop / floating-point 
computation)=> +dequantized_weight + +Since the dequantized weights typically require subtracting the zero point and +applying a scale factor, the floating-point computation step can be fused with +the zero-point subtraction and scaling operations. + +The following are the parts that need to be modified for the fused operation +of zero-point subtraction and scaling. + +## INT4 => FP16/BF16 or INT8 => FP16 + +The floating-point computation is `__hsub2` + +If has zero points: + + flop(bit_op(weight)) - flop(bit_op(zp)) + = sub(bit_op(weight), bias) - sub(bit_op(zp), bias) + = bit_op(weight) - bit_op(zp) + +so we don't need additional modification. + +If has float zero points: + + flop(bit_op(weight)) - fzp + = sub(bit_op(weight), bias) - fzp + = bit_op(weight) - (fzp + bias) + +where the `fzp + bias` can be computed at weight loading. But this +may have accuracy issue, so we should not use this in most cases. + +If has not zero points: + + scale(flop(bit_op(weight))) + = scale(sub(bit_op(weight), bias)) + = scale(bit_op(weight)) - scale(bias) + = fma(bit_op(weight), scale_factor, scale(bias)) + +where the `scale(bias)` can be cached. But this may have accuracy issue, +so we should not use this in most cases. + + +## INT8 => BF16 + +INT8 => BF16 is a special case, it use byte_perm instead of flop. +We cannot fused byte_perm with scaling. + + +## FP4/FP8 => FP16/BF16 + + scale(flop(bit_op(weight))) + = scale(mul(bit_op(weight), multiplier)) + = mul(bit_op(weight), scale_factor * multiplier) + +where `scale_factor * multiplier` can be computed at weight loading. + +*/ + +#include "marlin_dtypes.cuh" + +namespace device::marlin { + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 +// Lookup-table based 3-input logical operation; explicitly used for +// dequantization as the compiler does not seem to automatically recognize it in +// all cases. +template +__device__ inline int lop3(int a, int b, int c) { + int res; + asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" : "=r"(res) : "r"(a), "r"(b), "r"(c), "n"(lut)); + return res; +} + +// Constructs destination register by taking bytes from 2 sources (based on +// mask) +template +__device__ inline uint32_t prmt(uint32_t a) { + uint32_t res; + asm volatile("prmt.b32 %0, %1, %2, %3;\n" : "=r"(res) : "r"(a), "n"(start_byte), "n"(mask)); + return res; +} + +template +__device__ inline void dequant(int q, scalar_t2* frag_b); + +// +// Efficiently dequantize 4bit values packed in an int32 value into a full +// B-fragment of 4 fp16 values. We mostly follow the strategy in the link below, +// with some small changes: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L215-L287 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L327-L385 +// +template<> +__device__ inline void dequant(int q, half2* frag_b) { + const int MASK = 0x000f000f; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. 
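+  // Bit trick: OR-ing each 4-bit value into the low bits of a half whose
+  // exponent field is 0x6400 (1024.0 in fp16) produces exactly 1024 + q_i.
+  // The 1024 bias either cancels against an identically-encoded zero point
+  // (see the fused zero-point/scale notes in the file header) or is removed
+  // explicitly by the biased variants below via their SUB/MUL/ADD constants.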
+ int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + q >>= 4; + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + + frag_b[0] = *reinterpret_cast(&lo); + frag_b[1] = *reinterpret_cast(&hi); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + // clang-format off + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + // clang-format on + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. + const int SUB = 0x64086408; + const int MUL = 0x2c002c00; + const int ADD = 0xd480d480; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), *reinterpret_cast(&SUB)); + frag_b[1] = + __hfma2(*reinterpret_cast(&hi), *reinterpret_cast(&MUL), *reinterpret_cast(&ADD)); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + dequant(q, frag_b); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + const int LO = 0x000f000f; + const int HI = 0x00f000f0; + const int EX = 0x64006400; + // Guarantee that the `(a & b) | c` operations are LOP3s. + // clang-format off + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + // clang-format on + // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point + // directly into `SUB` and `ADD`. + const int SUB = 0x64006400; + const int MUL = 0x2c002c00; + const int ADD = 0xd400d400; + frag_b[0] = __hsub2(*reinterpret_cast(&lo), *reinterpret_cast(&SUB)); + frag_b[1] = + __hfma2(*reinterpret_cast(&hi), *reinterpret_cast(&MUL), *reinterpret_cast(&ADD)); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + static constexpr uint32_t MASK = 0x000f000f; + static constexpr uint32_t EX = 0x43004300; + + // Guarantee that the `(a & b) | c` operations are LOP3s. 
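+  // Same idea in bfloat16: with exponent field 0x4300 (128.0 in bf16) the
+  // mantissa ulp is exactly 1, so OR-ing the nibble in yields 128 + q_i; the
+  // 128 bias is subtracted by the biased variants below or cancels against
+  // the zero point.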
+ // clang-format off + int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + q >>= 4; + int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + // clang-format on + + frag_b[0] = *reinterpret_cast(&lo); + frag_b[1] = *reinterpret_cast(&hi); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + dequant(q, frag_b); + + static constexpr uint32_t SUB = 0x43084308; + + frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast(&SUB)); + frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast(&SUB)); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + dequant(q, frag_b); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + dequant(q, frag_b); + + static constexpr uint32_t SUB = 0x43004300; + + frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast(&SUB)); + frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast(&SUB)); +} + +// +// Fast Int8ToFp16/Int8ToBf16: Efficiently dequantize 8bit int values to fp16 or +// bf16 Reference: +// - FP16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L53-L85 +// - BF16: +// https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h#L125-L175 +// +template<> +__device__ inline void dequant(int q, half2* frag_b) { + static constexpr uint32_t mask_for_elt_01 = 0x5250; + static constexpr uint32_t mask_for_elt_23 = 0x5351; + static constexpr uint32_t start_byte_for_fp16 = 0x64646464; + + uint32_t lo = prmt(q); + uint32_t hi = prmt(q); + + frag_b[0] = *reinterpret_cast(&lo); + frag_b[1] = *reinterpret_cast(&hi); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + dequant(q, frag_b); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + dequant(q, frag_b); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + dequant(q, frag_b); + + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64006400; + frag_b[0] = __hsub2(frag_b[0], *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); + frag_b[1] = __hsub2(frag_b[1], *reinterpret_cast(&I8s_TO_F16s_MAGIC_NUM)); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + float fp32_intermediates[4]; + uint32_t* fp32_intermediates_casted = reinterpret_cast(fp32_intermediates); + + static constexpr uint32_t fp32_base = 0x4B000000; + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388736.f; + fp32_intermediates[1] -= 8388736.f; + fp32_intermediates[2] -= 8388736.f; + fp32_intermediates[3] -= 8388736.f; + + uint32_t* bf16_result_ptr = reinterpret_cast(frag_b); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], fp32_intermediates_casted[3], 0x7632); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + float fp32_intermediates[4]; + uint32_t* fp32_intermediates_casted 
= reinterpret_cast(fp32_intermediates); + + static constexpr uint32_t fp32_base = 0x4B000000; + fp32_intermediates_casted[0] = __byte_perm(q, fp32_base, 0x7650); + fp32_intermediates_casted[1] = __byte_perm(q, fp32_base, 0x7652); + fp32_intermediates_casted[2] = __byte_perm(q, fp32_base, 0x7651); + fp32_intermediates_casted[3] = __byte_perm(q, fp32_base, 0x7653); + + fp32_intermediates[0] -= 8388608.f; + fp32_intermediates[1] -= 8388608.f; + fp32_intermediates[2] -= 8388608.f; + fp32_intermediates[3] -= 8388608.f; + + uint32_t* bf16_result_ptr = reinterpret_cast(frag_b); + bf16_result_ptr[0] = __byte_perm(fp32_intermediates_casted[0], fp32_intermediates_casted[1], 0x7632); + bf16_result_ptr[1] = __byte_perm(fp32_intermediates_casted[2], fp32_intermediates_casted[3], 0x7632); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + // Constants for FP8 (E4M3) and FP16 formats + constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5; + constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP8_EXPONENT; + constexpr int MASK = 0x7F007F00; + + // Extract and shift FP8 values to FP16 format + int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + q <<= 8; + int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + dequant(q, frag_b); + + // Constants for FP8 (E4M3) and FP16 formats + constexpr int FP8_EXPONENT = 4, FP16_EXPONENT = 5; + + // Construct and apply exponent bias + constexpr int BIAS_OFFSET = (1 << (FP16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1)); + const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET)); + + // Convert to half2 and apply bias + frag_b[1] = __hmul2(frag_b[1], bias_reg); + frag_b[0] = __hmul2(frag_b[0], bias_reg); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + // Constants for FP8 (E4M3) and BF16 formats + constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8; + constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT; + + constexpr int MASK = 0x7F007F00; + + // Extract and shift FP8 values to BF16 format + int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + q <<= 8; + int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + dequant(q, frag_b); + + // Constants for FP8 (E4M3) and BF16 formats + constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8; + + // Construct and apply exponent bias + constexpr int BIAS_OFFSET = (1 << (BF16_EXPONENT - 1)) - (1 << (FP8_EXPONENT - 1)); + // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent + // position + constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23; + const nv_bfloat162 bias_reg = __float2bfloat162_rn(*reinterpret_cast(&BIAS)); + + // Convert to bfloat162 and apply bias + frag_b[1] = __hmul2(frag_b[1], bias_reg); + frag_b[0] = __hmul2(frag_b[0], bias_reg); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + // Constants for FP4 (E2M1) and FP16 formats + constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5; + constexpr int RIGHT_SHIFT = FP16_EXPONENT - FP4_EXPONENT; + constexpr int MASK = 0x70007000; + + // Extract and shift FP4 values to FP16 format + int Out1 = (q & 
0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + q <<= 4; + int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +} + +template<> +__device__ inline void dequant(int q, half2* frag_b) { + dequant(q, frag_b); + + // Constants for FP4 (E2M1) and FP16 formats + constexpr int FP4_EXPONENT = 2, FP16_EXPONENT = 5; + + // Construct and apply exponent bias + constexpr int BIAS_OFFSET = (1 << (FP16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1)); + const half2 bias_reg = __float2half2_rn(float(1 << BIAS_OFFSET)); + + // Convert to half2 and apply bias + frag_b[1] = __hmul2(frag_b[1], bias_reg); + frag_b[0] = __hmul2(frag_b[0], bias_reg); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + // Constants for FP4 (E2M1) and FP16 formats + constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8; + constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP4_EXPONENT; + constexpr int MASK = 0x70007000; + + // Extract and shift FP4 values to FP16 format + int Out1 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + q <<= 4; + int Out2 = (q & 0x80008000) | ((q & MASK) >> RIGHT_SHIFT); + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +} + +template<> +__device__ inline void dequant(int q, nv_bfloat162* frag_b) { + dequant(q, frag_b); + + // Constants for FP4 (E2M1) and BF16 formats + constexpr int FP4_EXPONENT = 2, BF16_EXPONENT = 8; + + // Construct and apply exponent bias + constexpr int BIAS_OFFSET = (1 << (BF16_EXPONENT - 1)) - (1 << (FP4_EXPONENT - 1)); + // Add 127 (float exponent bias) to BIAS_OFFSET and shift to float exponent + // position + constexpr uint32_t BIAS = (BIAS_OFFSET + 127) << 23; + const nv_bfloat162 bias_reg = __float2bfloat162_rn(*reinterpret_cast(&BIAS)); + + // Convert to half2 and apply bias + frag_b[1] = __hmul2(frag_b[1], bias_reg); + frag_b[0] = __hmul2(frag_b[0], bias_reg); +} + +template +__device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b); + +template<> +__device__ inline void dequant_fp8_scales(int q, half2* frag_b) { + int Out1 = (q & 0xFF00FF00) >> 1; + ; + q <<= 8; + int Out2 = (q & 0xFF00FF00) >> 1; + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +}; + +template<> +__device__ inline void dequant_fp8_scales(int q, nv_bfloat162* frag_b) { + constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8; + constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT; + constexpr int MASK = 0x7F007F00; + + // Extract and shift FP8 values to BF16 format + int Out1 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT); + q <<= 8; + int Out2 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT); + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +}; + +// New version with s_type_id parameter for marlin_moe_wna16_v2 +template +__device__ inline void dequant_fp8_scales(int q, scalar_t2* frag_b); + +template<> +__device__ inline void dequant_fp8_scales(int q, half2* frag_b) { + int Out1 = (q & 0xFF00FF00) >> 1; + ; + q <<= 8; + int Out2 = (q & 0xFF00FF00) >> 1; + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = 
*reinterpret_cast(&Out2); +}; + +template<> +__device__ inline void dequant_fp8_scales(int q, nv_bfloat162* frag_b) { + constexpr int FP8_EXPONENT = 4, BF16_EXPONENT = 8; + constexpr int RIGHT_SHIFT = BF16_EXPONENT - FP8_EXPONENT; + constexpr int MASK = 0x7F007F00; + + // Extract and shift FP8 values to BF16 format + int Out1 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT); + q <<= 8; + int Out2 = ((q & 0x80008000) >> 1) | ((q & MASK) >> RIGHT_SHIFT); + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +} + +template<> +__device__ inline void dequant_fp8_scales(int q, nv_bfloat162* frag_b) { + // In this conversion, 2 ** -127 in FP8E8M0 would become 0 in BF16, + // but we assume that such a extreme value would not occur in real models. + int Out1 = (q & 0xFF00FF00) >> 1; + q <<= 7; + int Out2 = q & 0x7F807F80; + + // Note: reverse indexing is intentional because weights are permuted + frag_b[1] = *reinterpret_cast(&Out1); + frag_b[0] = *reinterpret_cast(&Out2); +} + +#endif + +} // namespace device::marlin diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin.cuh new file mode 100644 index 000000000..02b3f5222 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin.cuh @@ -0,0 +1,1001 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#pragma once + +#include + +#include + +#include "kernel.h" +#include "marlin_template.h" + +namespace device::marlin { + +__global__ void MarlinDefault(MARLIN_KERNEL_PARAMS){}; + +using MarlinFuncPtr = void (*)(MARLIN_KERNEL_PARAMS); + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + int size_m, + int size_k, + int lda, + int block_rows) {} + +#else + +// For a given "a" of size [M,K] performs a permutation of the K columns based +// on the given "perm" indices. 
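+// Reference semantics (host-side sketch, in units of half elements):
+//   for (int m = 0; m < size_m; ++m)
+//     for (int k = 0; k < size_k; ++k)
+//       out[m * size_k + k] = a[m * lda + perm[k]];
+// The kernel assigns `block_rows` consecutive rows to each thread block and
+// strides the threads over K in chunks of `default_threads`.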
+__global__ void permute_cols_kernel( + int4 const* __restrict__ a_int4_ptr, + int const* __restrict__ perm_int_ptr, + int4* __restrict__ out_int4_ptr, + int size_m, + int size_k, + int lda, + int block_rows) { + auto start_row = block_rows * blockIdx.x; + int finish_row = start_row + block_rows; + if (finish_row > size_m) { + finish_row = size_m; + } + int cur_block_rows = finish_row - start_row; + + int input_row_stride = lda * sizeof(half) / 16; + int output_row_stride = size_k * sizeof(half) / 16; + + auto permute_row = [&](int row) { + int iters = size_k / default_threads; + int rest = size_k % default_threads; + + int input_offset = row * input_row_stride; + int output_offset = row * output_row_stride; + + half const* a_row_half = reinterpret_cast(a_int4_ptr + input_offset); + half* out_half = reinterpret_cast(out_int4_ptr + output_offset); + + int base_k = 0; + + for (int i = 0; i < iters; i++) { + auto cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + + base_k += default_threads; + } + + if (rest) { + if (threadIdx.x < rest) { + auto cur_k = base_k + threadIdx.x; + int src_pos = perm_int_ptr[cur_k]; + + out_half[cur_k] = a_row_half[src_pos]; + } + } + }; + + for (int i = 0; i < cur_block_rows; i++) { + int cur_row = start_row + i; + if (cur_row < size_m) { + permute_row(cur_row); + } + } +} + +typedef struct { + int thread_k; + int thread_n; + int num_threads; +} thread_config_t; + +thread_config_t small_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {128, 128, 256}, + {64, 128, 128}, + {128, 64, 128}}; + +thread_config_t large_batch_thread_configs[] = { + // Ordered by priority + + // thread_k, thread_n, num_threads + {64, 256, 256}, + {64, 128, 128}, + {128, 64, 128}}; + +typedef struct { + int blocks_per_sm; + thread_config_t tb_cfg; +} exec_config_t; + +int get_scales_cache_size( + thread_config_t const& th_config, + int prob_m, + int prob_n, + int prob_k, + int num_bits, + int group_size, + bool has_act_order, + bool is_k_full) { + bool cache_scales_chunk = has_act_order && !is_k_full; + + int tb_n = th_config.thread_n; + int tb_k = th_config.thread_k; + + // Get max scale groups per thread-block + int tb_groups; + if (group_size == -1) { + tb_groups = 1; + } else if (group_size == 0) { + tb_groups = div_ceil(tb_k, 32); // Worst case is 32 group size + } else { + tb_groups = div_ceil(tb_k, group_size); + } + + if (cache_scales_chunk) { + int load_groups = tb_groups * pipe_stages * 2; // Chunk size is 2x pipeline over dim K + load_groups = max(load_groups, 32); // We load at least 32 scale groups + return load_groups * tb_n * 2; + } else { + int tb_scales = tb_groups * tb_n * 2; + + return tb_scales * pipe_stages; + } +} + +int get_kernel_cache_size( + thread_config_t const& th_config, + int thread_m_blocks, + int prob_m, + int prob_n, + int prob_k, + int num_bits, + int group_size, + bool has_act_order, + bool is_k_full, + int has_zp, + int is_zp_float) { + int pack_factor = 32 / num_bits; + + // Get B size + int tb_k = th_config.thread_k; + int tb_n = th_config.thread_n; + int tb_m = thread_m_blocks * 16; + int sh_a_size = pipe_stages * (tb_m * tb_k) * 2; + int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4; + int sh_red_size = tb_m * (tb_n + 8); + int sh_s_size = + get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits, group_size, has_act_order, is_k_full); + int sh_g_idx_size = has_act_order && !is_k_full ? 
pipe_stages * tb_k / 4 : 0; + int sh_zp_size = 0; + if (has_zp) { + if (is_zp_float) + sh_zp_size = sh_s_size; + else if (num_bits == 4) + sh_zp_size = sh_s_size / 4; + else if (num_bits == 8) + sh_zp_size = sh_s_size / 2; + } + + int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size + sh_zp_size + sh_g_idx_size; + + return total_size; +} + +bool is_valid_config( + thread_config_t const& th_config, + int thread_m_blocks, + int prob_m, + int prob_n, + int prob_k, + int num_bits, + int group_size, + bool has_act_order, + bool is_k_full, + int has_zp, + int is_zp_float, + int max_shared_mem) { + // Sanity + if (th_config.thread_k == -1 || th_config.thread_n == -1 || th_config.num_threads == -1) { + return false; + } + + // Verify K/N are divisible by thread K/N + if (prob_k % th_config.thread_k != 0 || prob_n % th_config.thread_n != 0) { + return false; + } + + // Verify min for thread K/N + if (th_config.thread_n < min_thread_n || th_config.thread_k < min_thread_k) { + return false; + } + + // num_threads must be at least 128 (= 4 warps) + if (th_config.num_threads < 128) { + return false; + } + + // Check that pipeline fits into cache + int cache_size = get_kernel_cache_size( + th_config, + thread_m_blocks, + prob_m, + prob_n, + prob_k, + num_bits, + group_size, + has_act_order, + is_k_full, + has_zp, + is_zp_float); + return cache_size <= max_shared_mem; +} + +#define _GET_IF( \ + W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \ + else if ( \ + q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && thread_n_blocks == THREAD_N_BLOCKS && \ + thread_k_blocks == THREAD_K_BLOCKS && m_block_size_8 == M_BLOCK_SIZE_8 && group_blocks == GROUP_BLOCKS && \ + num_threads == NUM_THREADS && is_zp_float == IS_ZP_FLOAT) { \ + kernel = Marlin< \ + scalar_t, \ + W_TYPE.id(), \ + NUM_THREADS, \ + THREAD_M_BLOCKS, \ + THREAD_N_BLOCKS, \ + THREAD_K_BLOCKS, \ + M_BLOCK_SIZE_8, \ + pipe_stages, \ + GROUP_BLOCKS, \ + IS_ZP_FLOAT>; \ + } + +// COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false) +// this is the most common cases +// BIGGROUP: cases for big group size (group_blocks in [-1, 8]) +// FZP: cases for float-zero-point (is_zp_float = true) +// ACT: cases for act order case (group_blocks == 0) +// FP4: cases for nvfp4(e2m1) (group_blocks == 1) +#define COMMON_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + +#define COMMON_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + 
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + +#define COMMON_GET_IF(W_TYPE) \ + COMMON_GET_IF_M1(W_TYPE, 8, 8, 256) \ + COMMON_GET_IF_M1(W_TYPE, 8, 4, 128) \ + COMMON_GET_IF_M1(W_TYPE, 4, 8, 128) \ + COMMON_GET_IF_M234(W_TYPE, 16, 4, 256) \ + COMMON_GET_IF_M234(W_TYPE, 8, 4, 128) \ + COMMON_GET_IF_M234(W_TYPE, 4, 8, 128) + +#define BIGGROUP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 8, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + +#define BIGGROUP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) + +#define BIGGROUP_GET_IF(W_TYPE) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \ + BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \ + BIGGROUP_GET_IF_M1(W_TYPE, 4, 8, 128) \ + BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \ + BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128) \ + BIGGROUP_GET_IF_M234(W_TYPE, 4, 8, 128) + +#define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) + +#define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) + +#define FP4_GET_IF(W_TYPE) \ + FP4_GET_IF_M1(W_TYPE, 8, 8, 256) \ + FP4_GET_IF_M1(W_TYPE, 8, 4, 128) \ + FP4_GET_IF_M1(W_TYPE, 4, 8, 128) \ + FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \ + FP4_GET_IF_M234(W_TYPE, 8, 4, 128) \ + FP4_GET_IF_M234(W_TYPE, 4, 8, 128) + +// We currently have 4-bit models only with group_blocks == 4 +#define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) + +#define FZP_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 4, NUM_THREADS, true) + +#define FZP_GET_IF(W_TYPE) \ + FZP_GET_IF_M1(W_TYPE, 8, 8, 256) \ + FZP_GET_IF_M1(W_TYPE, 8, 4, 128) \ + FZP_GET_IF_M1(W_TYPE, 4, 8, 128) \ + FZP_GET_IF_M234(W_TYPE, 16, 4, 256) \ + FZP_GET_IF_M234(W_TYPE, 8, 4, 128) \ + FZP_GET_IF_M234(W_TYPE, 4, 8, 128) + +// We currently have 4-bit models only with group_blocks == 4 +#define 
ACT_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 0, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) + +#define ACT_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + _GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) \ + _GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 0, NUM_THREADS, false) + +#define ACT_GET_IF(W_TYPE) \ + ACT_GET_IF_M1(W_TYPE, 8, 8, 256) \ + ACT_GET_IF_M1(W_TYPE, 8, 4, 128) \ + ACT_GET_IF_M1(W_TYPE, 4, 8, 128) \ + ACT_GET_IF_M234(W_TYPE, 16, 4, 256) \ + ACT_GET_IF_M234(W_TYPE, 8, 4, 128) \ + ACT_GET_IF_M234(W_TYPE, 4, 8, 128) + +template +MarlinFuncPtr get_marlin_kernel( + const host::ScalarType q_type, + int thread_m_blocks, + int thread_n_blocks, + int thread_k_blocks, + bool m_block_size_8, + bool has_act_order, + bool has_zp, + int group_blocks, + int num_threads, + bool is_zp_float) { + int num_bits = q_type.size_bits(); + auto kernel = MarlinDefault; + if (false) { + } + + COMMON_GET_IF(host::kU4) + COMMON_GET_IF(host::kU4B8) + COMMON_GET_IF(host::kU8B128) + + FP4_GET_IF(host::kFE2M1f) + + BIGGROUP_GET_IF(host::kFE4M3fn) + + ACT_GET_IF(host::kU4B8) + ACT_GET_IF(host::kU8B128) + + if (std::is_same::value) { + if (false) { + } + FZP_GET_IF(host::kU4) + } + + return kernel; +} + +template +exec_config_t determine_exec_config( + const host::ScalarType& q_type, + int prob_m, + int prob_n, + int prob_k, + int thread_m_blocks, + bool m_block_size_8, + int num_bits, + int group_size, + bool has_act_order, + bool is_k_full, + bool has_zp, + bool is_zp_float, + int max_shared_mem, + int sms) { + exec_config_t exec_cfg = exec_config_t{1, thread_config_t{-1, -1, -1}}; + thread_config_t* thread_configs = thread_m_blocks > 1 ? large_batch_thread_configs : small_batch_thread_configs; + int thread_configs_size = thread_m_blocks > 1 ? sizeof(large_batch_thread_configs) / sizeof(thread_config_t) + : sizeof(small_batch_thread_configs) / sizeof(thread_config_t); + + for (int i = 0; i < thread_configs_size; i++) { + thread_config_t th_config = thread_configs[i]; + + if (!is_valid_config( + th_config, + thread_m_blocks, + prob_m, + prob_n, + prob_k, + num_bits, + group_size, + has_act_order, + is_k_full, + has_zp, + is_zp_float, + max_shared_mem)) { + continue; + } + + int cache_size = get_kernel_cache_size( + th_config, + thread_m_blocks, + prob_m, + prob_n, + prob_k, + num_bits, + group_size, + has_act_order, + is_k_full, + has_zp, + is_zp_float); + + int group_blocks = 0; + if (!has_act_order) { + group_blocks = group_size == -1 ? 
-1 : group_size / 16; + } + + auto kernel = get_marlin_kernel( + q_type, + thread_m_blocks, + th_config.thread_n / 16, + th_config.thread_k / 16, + m_block_size_8, + has_act_order, + has_zp, + group_blocks, + th_config.num_threads, + is_zp_float); + + if (kernel == MarlinDefault) continue; + + // int m_tiles = div_ceil(prob_m, thread_m_blocks * 16); + // int n_tiles = prob_n / th_config.thread_n; + // int k_tiles = prob_k / th_config.thread_k; + + return {1, th_config}; + } + + return exec_cfg; +} + +template +void marlin_mm( + const void* A, + const void* B, + void* C, + void* C_tmp, + void* s, + void* s2, + void* zp, + void* g_idx, + void* perm, + void* a_tmp, + int prob_m, + int prob_n, + int prob_k, + int lda, + void* workspace, + host::ScalarType const& q_type, + bool has_act_order, + bool is_k_full, + bool has_zp, + int num_groups, + int group_size, + int dev, + cudaStream_t stream, + int thread_k_init, + int thread_n_init, + int sms, + bool use_atomic_add, + bool use_fp32_reduce, + bool is_zp_float) { + if (has_zp) { + host::RuntimeCheck( + q_type == host::kU4 || q_type == host::kU8, "q_type must be u4 or u8 when has_zp = True. Got = ", q_type.str()); + } else { + host::RuntimeCheck( + q_type == host::kU4B8 || q_type == host::kU8B128 || q_type == host::kFE4M3fn || q_type == host::kFE2M1f, + "q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when " + "has_zp = False. Got = ", + q_type.str()); + } + + host::RuntimeCheck( + prob_m > 0 && prob_n > 0 && prob_k > 0, "Invalid MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]"); + + int group_blocks = 0; + if (has_act_order) { + if (is_k_full) { + host::RuntimeCheck(group_size != -1); + group_blocks = group_size / 16; + host::RuntimeCheck( + prob_k % group_blocks == 0, "prob_k = ", prob_k, " is not divisible by group_blocks = ", group_blocks); + } else { + host::RuntimeCheck(group_size == 0); + group_blocks = 0; + } + } else { + if (group_size == -1) { + group_blocks = -1; + } else { + group_blocks = group_size / 16; + host::RuntimeCheck( + prob_k % group_blocks == 0, "prob_k = ", prob_k, " is not divisible by group_blocks = ", group_blocks); + } + } + + int num_bits = q_type.size_bits(); + const int4* A_ptr = (const int4*)A; + const int4* B_ptr = (const int4*)B; + int4* C_ptr = (int4*)C; + int4* C_tmp_ptr = (int4*)C_tmp; + const int4* s_ptr = (const int4*)s; + const uint16_t* s2_ptr = (const uint16_t*)s2; + const int4* zp_ptr = (const int4*)zp; + const int* g_idx_ptr = (const int*)g_idx; + const int* perm_ptr = (const int*)perm; + int4* a_tmp_ptr = (int4*)a_tmp; + + int* locks = (int*)workspace; + + if (has_act_order) { + // Permute A columns + int block_rows = div_ceil(prob_m, sms); + host::LaunchKernel(sms, default_threads, stream)( + permute_cols_kernel, A_ptr, perm_ptr, a_tmp_ptr, prob_m, prob_k, lda, block_rows); + A_ptr = a_tmp_ptr; + lda = prob_k; + + // If we have a full K, then we can run the non-act-order version of Marlin + // (since the weight rows are reordered by increasing group ids, and by + // having a full K, we have full original groups) + if (is_k_full) has_act_order = false; + } + + int max_shared_mem = 0; + host::RuntimeDeviceCheck(cudaDeviceGetAttribute(&max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev)); + host::RuntimeCheck(max_shared_mem > 0); + + int max_par = 16; + if (prob_n <= 4096) max_par = 16 * 8; + int max_shared_mem_new = max_shared_mem; + int rest_m = prob_m; + int max_thread_m_blocks = 4; + while (rest_m) { + int par_count = rest_m / (max_thread_m_blocks * 16); + if (par_count 
> max_par) par_count = max_par; + int prob_m_split = par_count > 0 ? (par_count * (max_thread_m_blocks * 16)) : rest_m; + + int thread_k = thread_k_init; + int thread_n = thread_n_init; + + int thread_m_blocks = min(div_ceil(prob_m_split, 16), max_thread_m_blocks); + int m_block_size_8 = prob_m_split <= 8; + + // Set thread config + exec_config_t exec_cfg; + thread_config_t thread_tfg; + if (thread_k != -1 && thread_n != -1) { + thread_tfg = thread_config_t{thread_k, thread_n, default_threads}; + exec_cfg = exec_config_t{1, thread_tfg}; + host::RuntimeCheck(prob_n % thread_n == 0, "prob_n = ", prob_n, " is not divisible by thread_n = ", thread_n); + host::RuntimeCheck(prob_k % thread_k == 0, "prob_k = ", prob_k, " is not divisible by thread_k = ", thread_k); + } else { + // Auto config + exec_cfg = determine_exec_config( + q_type, + prob_m_split, + prob_n, + prob_k, + thread_m_blocks, + m_block_size_8, + num_bits, + group_size, + has_act_order, + is_k_full, + has_zp, + is_zp_float, + max_shared_mem, + sms); + thread_tfg = exec_cfg.tb_cfg; + if (thread_tfg.thread_k == -1 && max_thread_m_blocks > 1) { + max_thread_m_blocks--; + continue; + } + } + + int num_threads = thread_tfg.num_threads; + thread_k = thread_tfg.thread_k; + thread_n = thread_tfg.thread_n; + int blocks = sms * exec_cfg.blocks_per_sm; + if (exec_cfg.blocks_per_sm > 1) max_shared_mem_new = max_shared_mem / exec_cfg.blocks_per_sm - 1024; + + int thread_k_blocks = thread_k / 16; + int thread_n_blocks = thread_n / 16; + + host::RuntimeCheck( + is_valid_config( + thread_tfg, + thread_m_blocks, + prob_m_split, + prob_n, + prob_k, + num_bits, + group_size, + has_act_order, + is_k_full, + has_zp, + is_zp_float, + max_shared_mem_new), + "Invalid thread config: thread_m_blocks = ", + thread_m_blocks, + ", thread_k = ", + thread_tfg.thread_k, + ", thread_n = ", + thread_tfg.thread_n, + ", num_threads = ", + thread_tfg.num_threads, + " for MKN = [", + prob_m, + ", ", + prob_k, + ", ", + prob_n, + "] and num_bits = ", + num_bits, + ", prob_m_split = ", + prob_m_split, + ", group_size = ", + group_size, + ", has_act_order = ", + has_act_order, + ", is_k_full = ", + is_k_full, + ", has_zp = ", + has_zp, + ", is_zp_float = ", + is_zp_float, + ", max_shared_mem_new = ", + max_shared_mem_new); + + auto kernel = get_marlin_kernel( + q_type, + thread_m_blocks, + thread_n_blocks, + thread_k_blocks, + m_block_size_8, + has_act_order, + has_zp, + group_blocks, + num_threads, + is_zp_float); + + if (kernel == MarlinDefault) { + host::Panic( + "Unsupported shapes: MNK = [", + prob_m, + ", ", + prob_n, + ", ", + prob_k, + "]", + ", has_act_order = ", + has_act_order, + ", num_groups = ", + num_groups, + ", group_size = ", + group_size, + ", prob_m_split = ", + prob_m_split, + ", thread_m_blocks = ", + thread_m_blocks, + ", thread_n_blocks = ", + thread_n_blocks, + ", thread_k_blocks = ", + thread_k_blocks, + ", num_threads = ", + num_threads, + ", num_bits = ", + num_bits); + } + + host::RuntimeDeviceCheck( + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem_new)); + + bool part_use_atomic_add = use_atomic_add && div_ceil(prob_m_split, 64) * prob_n <= 2048; + + host::LaunchKernel(blocks, num_threads, stream, max_shared_mem_new)( + kernel, + A_ptr, + B_ptr, + C_ptr, + C_tmp_ptr, + s_ptr, + s2_ptr, + zp_ptr, + g_idx_ptr, + num_groups, + prob_m_split, + prob_n, + prob_k, + lda, + locks, + part_use_atomic_add, + use_fp32_reduce, + max_shared_mem_new); + + A_ptr += prob_m_split * (lda / 8); + C_ptr += prob_m_split * 
(prob_n / 8); + rest_m -= prob_m_split; + } +} + +#endif + +} // namespace device::marlin + +template +void gptq_marlin_gemm( + tvm::ffi::TensorView a, + tvm::ffi::TensorView b_q_weight, + tvm::ffi::TensorView b_scales, + tvm::ffi::TensorView global_scale, + tvm::ffi::TensorView b_zeros, + tvm::ffi::TensorView g_idx, + tvm::ffi::TensorView perm, + tvm::ffi::TensorView c, + tvm::ffi::TensorView c_tmp, + tvm::ffi::TensorView a_tmp, + tvm::ffi::TensorView workspace, + int64_t b_q_type_id, + bool is_k_full, + bool use_atomic_add, + bool use_fp32_reduce, + bool is_zp_float) { + using namespace host; + + ScalarType const b_q_type = ScalarType::from_id(b_q_type_id); + int pack_factor = 32 / b_q_type.size_bits(); + + // Bind symbolic sizes + auto M = SymbolicSize{"M"}; + auto K = SymbolicSize{"K"}; + auto N = SymbolicSize{"N"}; + auto device = SymbolicDevice{}; + device.set_options(); + + // Verify a: [M, K] + auto lda = SymbolicSize{"lda"}; + TensorMatcher({M, K}).with_strides({lda, 1}).with_dtype().with_device(device).verify(a); + + int64_t size_m = M.unwrap(); + int64_t size_k = K.unwrap(); + + // Verify b_q_weight: [K/tile_size, packed_N] + RuntimeCheck( + size_k % device::marlin::tile_size == 0, + "size_k = ", + size_k, + " is not divisible by tile_size = ", + device::marlin::tile_size); + int64_t expected_bqw_dim0 = size_k / device::marlin::tile_size; + auto bqw_dim0 = SymbolicSize{"bqw_dim0"}; + auto bqw_dim1 = SymbolicSize{"bqw_dim1"}; + bqw_dim0.set_value(expected_bqw_dim0); + TensorMatcher({bqw_dim0, bqw_dim1}).with_dtype().with_device(device).verify(b_q_weight); + + RuntimeCheck( + b_q_weight.size(1) % device::marlin::tile_size == 0, + "b_q_weight.size(1) = ", + b_q_weight.size(1), + " is not divisible by tile_size = ", + device::marlin::tile_size); + int64_t actual_size_n = (b_q_weight.size(1) / device::marlin::tile_size) * pack_factor; + N.set_value(actual_size_n); + int64_t size_n = N.unwrap(); + + // Verify stride alignment + int64_t a_stride0 = a.stride(0); + RuntimeCheck(a_stride0 % 8 == 0, "a.stride(0) must be divisible by 8"); + + // Verify b_scales: [num_groups, N] + auto num_groups_sym = SymbolicSize{"num_groups"}; + TensorMatcher({num_groups_sym, N}).with_device(device).verify(b_scales); + int num_groups = static_cast(num_groups_sym.unwrap()); + + // Verify c: [M, N] + TensorMatcher({M, N}).with_dtype().with_device(device).verify(c); + + // Early return for zero-size M + if (size_m == 0) return; + + // Determine has_act_order from g_idx/perm sizes + int64_t g_idx_size = g_idx.size(0); + int64_t perm_size = perm.size(0); + bool has_act_order = g_idx_size > 0 && perm_size > 0; + + if (has_act_order) { + RuntimeCheck( + (g_idx_size == size_k && perm_size == size_k), + "Unexpected g_idx.size(0) = ", + g_idx_size, + " and perm.size(0) = ", + perm_size, + ", where size_k = ", + size_k); + } + + // Determine has_zp from b_zeros size + int64_t b_zeros_size = b_zeros.size(0); + bool has_zp = b_zeros_size > 0; + + if (has_zp) { + RuntimeCheck( + b_q_type == kU4 || b_q_type == kU8, "b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str()); + } else { + RuntimeCheck( + b_q_type == kU4B8 || b_q_type == kU8B128 || b_q_type == kFE4M3fn || b_q_type == kFE2M1f, + "b_q_type must be uint4b8, uint8b128, float8_e4m3fn or float4_e2m1f when " + "has_zp = False. 
Got = ", + b_q_type.str()); + } + + if (has_zp && is_zp_float) { + RuntimeCheck( + std::is_same::value, "Computation type must be float16 (half) when using float zero points."); + } + + // Verify b_zeros shape + if (has_zp) { + RuntimeCheck(b_zeros.dim() == 2, "b_zeros rank = ", b_zeros.dim(), " is not 2"); + if (is_zp_float) { + RuntimeCheck(b_zeros.size(1) == size_n, "b_zeros dim 1 = ", b_zeros.size(1), " is not size_n = ", size_n); + RuntimeCheck( + num_groups == b_zeros.size(0), "b_zeros dim 0 = ", b_zeros.size(0), " is not num_groups = ", num_groups); + RuntimeCheck(num_groups != -1, "num_groups must be != -1"); + } else { + RuntimeCheck( + b_zeros.size(0) == num_groups, "b_zeros dim 0 = ", b_zeros.size(0), " is not num_groups = ", num_groups); + RuntimeCheck( + b_zeros.size(1) == size_n / pack_factor, + "b_zeros dim 1 = ", + b_zeros.size(1), + " is not size_n / pack_factor = ", + size_n / pack_factor); + } + } + + // Verify global_scale + int64_t global_scale_size = global_scale.size(0); + if (global_scale_size > 0) { + RuntimeCheck(b_q_type == kFE2M1f, "global_scale can only be used for float4_e2m1f."); + } else { + RuntimeCheck(!(b_q_type == kFE2M1f), "the global_scale parameter must be passed for float4_e2m1f."); + } + + // Derive group_size + int group_size = -1; + if (has_act_order) { + if (is_k_full) { + RuntimeCheck(num_groups > 1, "For act_order, num_groups must be > 1"); + RuntimeCheck(size_k % num_groups == 0, "size_k = ", size_k, ", is not divisible by num_groups = ", num_groups); + group_size = static_cast(size_k / num_groups); + } else { + group_size = 0; + } + } else { + if (num_groups > 1) { + RuntimeCheck(size_k % num_groups == 0, "size_k = ", size_k, ", is not divisible by num_groups = ", num_groups); + group_size = static_cast(size_k / num_groups); + } else { + group_size = -1; + } + } + + // Verify workspace and get device info + RuntimeCheck( + size_n % device::marlin::min_thread_n == 0, + "size_n = ", + size_n, + ", is not divisible by min_thread_n = ", + device::marlin::min_thread_n); + + DLDevice dl_device = device.unwrap(); + int dev = dl_device.device_id; + cudaStream_t stream = LaunchKernel::resolve_device(dl_device); + + int sms = -1; + RuntimeDeviceCheck(cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, dev)); + + RuntimeCheck( + workspace.size(0) >= sms, "workspace.size(0) = ", workspace.size(0), " is below min_workspace_size = ", sms); + + // Hardcoded defaults (auto config) + int thread_k_init = -1; + int thread_n_init = -1; + + // Compute c_tmp and a_tmp pointers + // c_tmp and a_tmp are pre-allocated by caller + + device::marlin::marlin_mm( + a.data_ptr(), + b_q_weight.data_ptr(), + c.data_ptr(), + c_tmp.data_ptr(), + b_scales.data_ptr(), + global_scale.data_ptr(), + b_zeros.data_ptr(), + g_idx.data_ptr(), + perm.data_ptr(), + a_tmp.data_ptr(), + static_cast(size_m), + static_cast(size_n), + static_cast(size_k), + static_cast(a_stride0), + workspace.data_ptr(), + b_q_type, + has_act_order, + is_k_full, + has_zp, + num_groups, + group_size, + dev, + stream, + thread_k_init, + thread_n_init, + sms, + use_atomic_add, + use_fp32_reduce, + is_zp_float); +} diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin_repack.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin_repack.cuh new file mode 100644 index 000000000..b869260c1 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/gptq_marlin_repack.cuh @@ -0,0 +1,362 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * 
Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ + +#pragma once + +#include + +#include + +#include "marlin.cuh" + +namespace device::marlin { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +template +__global__ void gptq_marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, + uint32_t const* __restrict__ perm_ptr, + uint32_t* __restrict__ out_ptr, + int size_k, + int size_n) { + return; +} +#else +template +__global__ void gptq_marlin_repack_kernel( + uint32_t const* __restrict__ b_q_weight_ptr, + uint32_t const* __restrict__ perm_ptr, + uint32_t* __restrict__ out_ptr, + int size_k, + int size_n) { + constexpr int pack_factor = 32 / num_bits; + + int k_tiles = size_k / tile_k_size; + int n_tiles = size_n / tile_n_size; + int block_k_tiles = div_ceil(k_tiles, gridDim.x); + + auto start_k_tile = blockIdx.x * block_k_tiles; + if (start_k_tile >= k_tiles) { + return; + } + + int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles); + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + extern __shared__ int4 sh[]; + + constexpr int perm_size = tile_k_size / 4; + + int4* sh_perm_ptr = sh; + int4* sh_pipe_ptr = sh_perm_ptr; + if constexpr (has_perm) { + sh_pipe_ptr += perm_size; + } + + constexpr int tile_ints = tile_k_size / pack_factor; + + constexpr int stage_n_threads = tile_n_size / 4; + constexpr int stage_k_threads = has_perm ? 
tile_k_size : tile_ints; + constexpr int stage_size = stage_k_threads * stage_n_threads; + + auto load_perm_to_shared = [&](int k_tile_id) { + int first_k_int4 = (k_tile_id * tile_k_size) / 4; + + int4 const* perm_int4_ptr = reinterpret_cast(perm_ptr); + + if (threadIdx.x < perm_size) { + sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x]; + } + __syncthreads(); + }; + + auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) { + if (n_tile_id >= n_tiles) { + cp_async_fence(); + return; + } + + int first_n = n_tile_id * tile_n_size; + + int4* sh_ptr = sh_pipe_ptr + stage_size * pipe; + + if constexpr (has_perm) { + if (threadIdx.x < stage_size) { + auto k_id = threadIdx.x / stage_n_threads; + auto n_id = threadIdx.x % stage_n_threads; + + uint32_t const* sh_perm_int_ptr = reinterpret_cast(sh_perm_ptr); + + int src_k = sh_perm_int_ptr[k_id]; + int src_k_packed = src_k / pack_factor; + + cp_async4( + &sh_ptr[k_id * stage_n_threads + n_id], + reinterpret_cast(&(b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)]))); + } + + } else { + if (threadIdx.x < stage_size) { + auto k_id = threadIdx.x / stage_n_threads; + auto n_id = threadIdx.x % stage_n_threads; + + int first_k = k_tile_id * tile_k_size; + int first_k_packed = first_k / pack_factor; + + cp_async4( + &sh_ptr[k_id * stage_n_threads + n_id], + reinterpret_cast(&(b_q_weight_ptr[(first_k_packed + k_id) * size_n + first_n + (n_id * 4)]))); + } + } + + cp_async_fence(); + }; + + auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) { + if (n_tile_id >= n_tiles) { + return; + } + + auto warp_id = threadIdx.x / 32; + auto th_id = threadIdx.x % 32; + + if (warp_id >= 4) { + return; + } + + int tc_col = th_id / 4; + int tc_row = (th_id % 4) * 2; + + constexpr int tc_offsets[4] = {0, 1, 8, 9}; + + int cur_n = warp_id * 16 + tc_col; + + constexpr int sh_stride = 64; + constexpr uint32_t mask = (1 << num_bits) - 1; + + int4* sh_stage_ptr = sh_pipe_ptr + stage_size * pipe; + uint32_t* sh_stage_int_ptr = reinterpret_cast(sh_stage_ptr); + + uint32_t* sh_perm_int_ptr = reinterpret_cast(sh_perm_ptr); + + uint32_t vals[8]; + + if constexpr (has_perm) { + for (int i = 0; i < 4; i++) { + int k_idx = tc_row + tc_offsets[i]; + + uint32_t src_k = sh_perm_int_ptr[k_idx]; + uint32_t src_k_pos = src_k % pack_factor; + + uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n]; + uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask; + + uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8]; + uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask; + + vals[i] = b1_cur_val; + vals[4 + i] = b2_cur_val; + } + + } else { + uint32_t b1_vals[tile_ints]; + uint32_t b2_vals[tile_ints]; + +#pragma unroll + for (int i = 0; i < tile_ints; i++) { + b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i]; + b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i]; + } + +#pragma unroll + for (int i = 0; i < 4; i++) { + int cur_elem = tc_row + tc_offsets[i]; + int cur_int = cur_elem / pack_factor; + int cur_pos = cur_elem % pack_factor; + + vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask; + vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask; + } + } + + constexpr int tile_size = tile_k_size * tile_n_size / pack_factor; + int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size; + + // Result of: + // https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h + if 
constexpr (num_bits == 4) { + constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + + uint32_t res = 0; +#pragma unroll + for (int i = 0; i < 8; i++) { + res |= vals[pack_idx[i]] << (i * 4); + } + + out_ptr[out_offset + th_id * 4 + warp_id] = res; + + } else { + constexpr int pack_idx[4] = {0, 2, 1, 3}; + + uint32_t res1 = 0; + uint32_t res2 = 0; +#pragma unroll + for (int i = 0; i < 4; i++) { + res1 |= vals[pack_idx[i]] << (i * 8); + res2 |= vals[4 + pack_idx[i]] << (i * 8); + } + + out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1; + out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2; + } + }; + + auto start_pipes = [&](int k_tile_id, int n_tile_id) { +#pragma unroll + for (int pipe = 0; pipe < repack_stages - 1; pipe++) { + fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe); + } + + wait_for_stage(); + }; +#pragma unroll + for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) { + int n_tile_id = 0; + + if constexpr (has_perm) { + load_perm_to_shared(k_tile_id); + } + + start_pipes(k_tile_id, n_tile_id); + + while (n_tile_id < n_tiles) { +#pragma unroll + for (int pipe = 0; pipe < repack_stages; pipe++) { + fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id, n_tile_id + pipe + repack_stages - 1); + repack_tile(pipe, k_tile_id, n_tile_id + pipe); + wait_for_stage(); + } + n_tile_id += repack_stages; + } + } +} +#endif + +} // namespace device::marlin + +#define CALL_IF_REPACK(NUM_BITS, HAS_PERM) \ + else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \ + host::RuntimeDeviceCheck(cudaFuncSetAttribute( \ + device::marlin::gptq_marlin_repack_kernel, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, \ + max_shared_mem)); \ + host::LaunchKernel(blocks, device::marlin::repack_threads, stream, static_cast(max_shared_mem))( \ + device::marlin::gptq_marlin_repack_kernel, \ + b_q_weight_ptr, \ + perm_ptr, \ + out_ptr, \ + size_k, \ + size_n); \ + } + +void gptq_marlin_repack( + tvm::ffi::TensorView b_q_weight, + tvm::ffi::TensorView perm, + tvm::ffi::TensorView out, + int64_t size_k, + int64_t size_n, + int64_t num_bits) { + using namespace host; + + // Validate num_bits + RuntimeCheck(num_bits == 4 || num_bits == 8, "num_bits must be 4 or 8. 
Got = ", num_bits); + int const pack_factor = 32 / static_cast(num_bits); + + // Validate size alignment + RuntimeCheck( + size_k % device::marlin::tile_k_size == 0, + "size_k = ", + size_k, + " is not divisible by tile_k_size = ", + device::marlin::tile_k_size); + RuntimeCheck( + size_n % device::marlin::tile_n_size == 0, + "size_n = ", + size_n, + " is not divisible by tile_n_size = ", + device::marlin::tile_n_size); + + // Validate b_q_weight + auto bqw_dim0 = SymbolicSize{"bqw_dim0"}; + auto bqw_dim1 = SymbolicSize{"bqw_dim1"}; + bqw_dim0.set_value(size_k / pack_factor); + bqw_dim1.set_value(size_n); + auto device_ = SymbolicDevice{}; + device_.set_options(); + TensorMatcher({bqw_dim0, bqw_dim1}).with_dtype().with_device(device_).verify(b_q_weight); + + // Validate out + auto out_dim0 = SymbolicSize{"out_dim0"}; + auto out_dim1 = SymbolicSize{"out_dim1"}; + out_dim0.set_value(size_k / device::marlin::tile_size); + out_dim1.set_value(size_n * device::marlin::tile_size / pack_factor); + TensorMatcher({out_dim0, out_dim1}).with_dtype().with_device(device_).verify(out); + + // Detect if there is act_order + bool has_perm = perm.size(0) != 0; + + // Get ptrs + uint32_t const* b_q_weight_ptr = reinterpret_cast(b_q_weight.data_ptr()); + uint32_t const* perm_ptr = reinterpret_cast(perm.data_ptr()); + uint32_t* out_ptr = reinterpret_cast(out.data_ptr()); + + // Get dev info + DLDevice dl_device = device_.unwrap(); + int dev = dl_device.device_id; + cudaStream_t stream = LaunchKernel::resolve_device(dl_device); + int blocks; + RuntimeDeviceCheck(cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev)); + + int max_shared_mem = 0; + RuntimeDeviceCheck(cudaDeviceGetAttribute(&max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev)); + RuntimeCheck(max_shared_mem > 0, "max_shared_mem must be > 0"); + + if (false) { + } + CALL_IF_REPACK(4, false) + CALL_IF_REPACK(4, true) + CALL_IF_REPACK(8, false) + CALL_IF_REPACK(8, true) + else { + Panic("Unsupported repack config: num_bits = ", num_bits, ", has_perm = ", has_perm); + } +} + +#undef CALL_IF_REPACK diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/kernel.h b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/kernel.h new file mode 100644 index 000000000..e54dd426f --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/kernel.h @@ -0,0 +1,32 @@ + +#include + +#include "marlin.cuh" +#include "marlin_dtypes.cuh" + +#define MARLIN_KERNEL_PARAMS \ + const int4 *__restrict__ A, const int4 *__restrict__ B, int4 *__restrict__ C, int4 *__restrict__ C_tmp, \ + const int4 *__restrict__ scales_ptr, const uint16_t *__restrict__ scale2_ptr, const int4 *__restrict__ zp_ptr, \ + const int *__restrict__ g_idx, int num_groups, int prob_m, int prob_n, int prob_k, int lda, int *locks, \ + bool use_atomic_add, bool use_fp32_reduce, int max_shared_mem + +namespace device::marlin { +templateshared + // fetch pipeline + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? 
+ > +__global__ void Marlin(MARLIN_KERNEL_PARAMS); + +} // namespace device::marlin diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin.cuh new file mode 100644 index 000000000..483ff5fc5 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin.cuh @@ -0,0 +1,89 @@ +#pragma once + +#include + +#include + +// Bridge the mllm_kernel::host namespace to the `host` namespace expected by +// Marlin code (originally from sglang). +namespace host = ::mllm_kernel::host; + +namespace device::marlin { +// Marlin params + +// 8 warps are a good choice since every SM has 4 schedulers and having more +// than 1 warp per schedule allows some more latency hiding. At the same time, +// we want relatively few warps to have many registers per warp and small tiles. +static constexpr int default_threads = 256; + +static constexpr int pipe_stages = 4; // 4 pipeline stages fit into shared memory + +static constexpr int min_thread_n = 64; +static constexpr int min_thread_k = 64; +static constexpr int max_thread_n = 256; + +static constexpr int tile_size = 16; +static constexpr int max_par = 16; + +// Repack params +static constexpr int repack_stages = 8; + +static constexpr int repack_threads = 256; + +static constexpr int tile_k_size = tile_size; +static constexpr int tile_n_size = tile_k_size * 4; + +// Helpers +template +struct Vec { + T elems[n]; + __device__ T& operator[](int i) { + return elems[i]; + } +}; + +using I4 = Vec; + +using host::div_ceil; + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 +// No support for async +#else + +__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred), + "r"(smem), + "l"(glob_ptr), + "n"(BYTES)); +} + +__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) { + const int BYTES = 16; + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + asm volatile( + "{\n" + " cp.async.cg.shared.global [%0], [%1], %2;\n" + "}\n" ::"r"(smem), + "l"(glob_ptr), + "n"(BYTES)); +} + +__device__ inline void cp_async_fence() { + asm volatile("cp.async.commit_group;\n" ::); +} + +template +__device__ inline void cp_async_wait() { + asm volatile("cp.async.wait_group %0;\n" ::"n"(n)); +} + +#endif + +} // namespace device::marlin diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_dtypes.cuh b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_dtypes.cuh new file mode 100644 index 000000000..40b538688 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_dtypes.cuh @@ -0,0 +1,77 @@ +#ifndef _data_types_cuh +#define _data_types_cuh +#include + +#include "marlin.cuh" + +namespace device::marlin { + +template +class ScalarType {}; + +template <> +class ScalarType { + public: + using scalar_t = fp16_t; + using scalar_t2 = fp16x2_t; + + // Matrix fragments for tensor core instructions; their precise layout is + // documented here: + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type + using FragA = Vec; + using FragB = Vec; + using FragC = Vec; + using FragS = Vec; + using FragZP = Vec; + + static __device__ float inline num2float(const fp16_t x) { + return __half2float(x); + } + + static 
__device__ fp16x2_t inline num2num2(const fp16_t x) { + return __half2half2(x); + } + + static __device__ fp16x2_t inline nums2num2(const fp16_t x1, const fp16_t x2) { + return __halves2half2(x1, x2); + } + + static __host__ __device__ fp16_t inline float2num(const float x) { + return __float2half(x); + } +}; + +template <> +class ScalarType { + public: + using scalar_t = bf16_t; + using scalar_t2 = bf16x2_t; + + using FragA = Vec; + using FragB = Vec; + using FragC = Vec; + using FragS = Vec; + using FragZP = Vec; + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 + static __device__ float inline num2float(const bf16_t x) { + return __bfloat162float(x); + } + + static __device__ bf16x2_t inline num2num2(const bf16_t x) { + return __bfloat162bfloat162(x); + } + + static __device__ bf16x2_t inline nums2num2(const bf16_t x1, const bf16_t x2) { + return __halves2bfloat162(x1, x2); + } + + static __host__ __device__ bf16_t inline float2num(const float x) { + return __float2bfloat16(x); + } +#endif +}; + +} // namespace device::marlin + +#endif diff --git a/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_template.h b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_template.h new file mode 100644 index 000000000..04052838c --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/csrc/gemm/marlin/marlin_template.h @@ -0,0 +1,1514 @@ +/* + * Modified by Neural Magic + * Copyright (C) Marlin.2024 Elias Frantar + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Adapted from https://github.com/IST-DASLab/marlin + */ +#include + +#include "dequant.h" +#include "marlin.cuh" +#include "marlin_dtypes.cuh" + +#define STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t) \ + static_assert(std::is_same::value || std::is_same::value, \ + "only float16 and bfloat16 is supported"); + +namespace device::marlin { + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 + +templateshared + // fetch pipeline + const bool has_act_order, // whether act_order is enabled + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? 
+ > +__global__ void Marlin(const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const int* __restrict__ g_idx, // int32 group indices of shape k + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int* locks, // extra global storage for barrier synchronization + bool use_fp32_reduce // whether to use fp32 global reduce +) {} + +} // namespace device::marlin + +#else + +// m16n8k16 tensor core mma instruction with fp16 inputs and fp32 +// output/accumulation. +template +__device__ inline void mma(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]), "r"(b[0]), "r"(b[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +template +__device__ inline void mma_trans(const typename ScalarType::FragA& a_frag, + const typename ScalarType::FragB& frag_b, + const typename ScalarType::FragB& frag_b2, + typename ScalarType::FragC& frag_c) { + const uint32_t* a = reinterpret_cast(&a_frag); + const uint32_t* b = reinterpret_cast(&frag_b); + const uint32_t* b2 = reinterpret_cast(&frag_b2); + float* c = reinterpret_cast(&frag_c); + if constexpr (std::is_same::value) { + asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); + } else if constexpr (std::is_same::value) { + asm volatile("mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(c[0]), "=f"(c[1]), "=f"(c[2]), "=f"(c[3]) + : "r"(b[0]), "r"(b2[0]), "r"(b[1]), "r"(b2[1]), "r"(a[0]), "r"(a[1]), "f"(c[0]), "f"(c[1]), "f"(c[2]), + "f"(c[3])); + } else { + STATIC_ASSERT_SCALAR_TYPE_VALID(scalar_t); + } +} + +// Instruction for loading a full 16x16 matrix fragment of operand A from shared +// memory, directly in tensor core layout. 
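+// The .x4 form of ldmatrix loads four 8x8 b16 tiles per warp in a single
+// instruction, i.e. exactly the 16x16 operand-A fragment consumed by one
+// m16n8k16 mma; the `count` parameter below selects the .x4/.x2/.x1 variants.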
+template +__device__ inline void ldsm(typename ScalarType::FragA& frag_a, const void* smem_ptr) { + uint32_t* a = reinterpret_cast(&frag_a); + uint32_t smem = static_cast(__cvta_generic_to_shared(smem_ptr)); + if constexpr (count == 4) { + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0,%1,%2,%3}, [%4];\n" + : "=r"(a[0]), "=r"(a[1]), "=r"(a[2]), "=r"(a[3]) + : "r"(smem)); + } else if constexpr (count == 2) { + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0,%1}, [%2];\n" : "=r"(a[0]), "=r"(a[1]) : "r"(smem)); + } else if constexpr (count == 1) { + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" : "=r"(a[0]) : "r"(smem)); + } else { + static_assert(count == 1 || count == 2 || count == 4, "invalid count"); + } +} + +// Multiply dequantized values by the corresponding quantization scale; used +// only for grouped quantization. +template +__device__ inline void scale(typename ScalarType::FragB& frag_b, typename ScalarType::FragS& frag_s, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s = ScalarType::num2num2(reinterpret_cast(&frag_s)[i]); + frag_b[0] = __hmul2(frag_b[0], s); + frag_b[1] = __hmul2(frag_b[1], s); +} + +template +__device__ inline void scale_and_sub(typename ScalarType::FragB& frag_b, scalar_t s, scalar_t zp) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s2 = ScalarType::num2num2(s); + scalar_t2 zp2 = ScalarType::num2num2(zp); + frag_b[0] = __hfma2(frag_b[0], s2, __hneg2(zp2)); + frag_b[1] = __hfma2(frag_b[1], s2, __hneg2(zp2)); +} + +template +__device__ inline void sub_zp(typename ScalarType::FragB& frag_b, typename ScalarType::scalar_t2& frag_zp, + int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 zp = ScalarType::num2num2(reinterpret_cast(&frag_zp)[i]); + frag_b[0] = __hsub2(frag_b[0], zp); + frag_b[1] = __hsub2(frag_b[1], zp); +} + +// Same as above, but for act_order (each K is multiplied individually) +template +__device__ inline void scale4(typename ScalarType::FragB& frag_b, typename ScalarType::FragS& frag_s_1, + typename ScalarType::FragS& frag_s_2, typename ScalarType::FragS& frag_s_3, + typename ScalarType::FragS& frag_s_4, int i) { + using scalar_t2 = typename ScalarType::scalar_t2; + scalar_t2 s_val_1_2; + s_val_1_2.x = reinterpret_cast(&frag_s_1)[i]; + s_val_1_2.y = reinterpret_cast(&frag_s_2)[i]; + + scalar_t2 s_val_3_4; + s_val_3_4.x = reinterpret_cast(&frag_s_3)[i]; + s_val_3_4.y = reinterpret_cast(&frag_s_4)[i]; + + frag_b[0] = __hmul2(frag_b[0], s_val_1_2); + frag_b[1] = __hmul2(frag_b[1], s_val_3_4); +} + +// Given 2 floats multiply by 2 scales (halves) +template +__device__ inline void scale_float(float* c, typename ScalarType::FragS& s) { + scalar_t* s_ptr = reinterpret_cast(&s); + c[0] = __fmul_rn(c[0], ScalarType::num2float(s_ptr[0])); + c[1] = __fmul_rn(c[1], ScalarType::num2float(s_ptr[1])); +} + +// Wait until barrier reaches `count`, then lock for current threadblock. +__device__ inline void barrier_acquire(int* lock, int count) { + if (threadIdx.x == 0) { + int state = -1; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); + while (state != count); + } + __syncthreads(); +} + +// Release barrier and increment visitation count. 
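+// The release path below pairs a `fence.acq_rel.gpu` with a relaxed `red.add`,
+// so all global writes made while holding the barrier become visible to the
+// next threadblock that passes barrier_acquire on the same lock.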
+__device__ inline void barrier_release(int* lock, bool reset = false) { + __syncthreads(); + if (threadIdx.x == 0) { + if (reset) { + lock[0] = 0; + return; + } + int val = 1; + // Make sure that all writes since acquiring this barrier are visible + // globally, while releasing the barrier. + asm volatile("fence.acq_rel.gpu;\n"); + asm volatile("red.relaxed.gpu.global.add.s32 [%0], %1;\n" : : "l"(lock), "r"(val)); + } +} + +// Wait until value of lock to be negative, and then add 1 +__device__ inline void wait_negative_and_add(int* lock) { + if (threadIdx.x == 0) { + int state = 0; + do + // Guarantee that subsequent writes by this threadblock will be visible + // globally. + asm volatile("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); + while (state >= 0); + atomicAdd(lock, 1); + } + __syncthreads(); +} + +templateshared + // fetch pipeline + const int group_blocks, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? + > +__global__ void Marlin(const int4* __restrict__ A, // fp16 input matrix of shape mxk + const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn + int4* __restrict__ C, // fp16 output buffer of shape mxn + int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce) + const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape + // (k/groupsize)xn + const uint16_t* __restrict__ scale2_ptr, // fp16 global scale (for nvfp4 + // only) + const int4* __restrict__ zp_ptr, // 4bit packed zero-points of shape + // (k/groupsize)x(n/pack_factor) + const int* __restrict__ g_idx, // int32 group indices of shape k + int num_groups, // number of scale groups per output channel + int prob_m, // batch dimension m + int prob_n, // output dimension n + int prob_k, // reduction dimension k + int lda, // A.stride(0), equal to prob_k is A is contiguous + int* locks, // extra global storage for barrier synchronization + bool use_atomic_add, // whether to use atomic add to reduce + bool use_fp32_reduce, // whether to use fp32 global reduce + int max_shared_mem) { + // Each threadblock processes one "stripe" of the B matrix with (roughly) the + // same size, which might involve multiple column "slices" (of width 16 * + // `thread_n_blocks`). Stripes are defined as shown in the 3x3 matrix 5 SM + // example: + // 0 1 3 + // 0 2 3 + // 1 2 4 + // While this kind of partitioning makes things somewhat more complicated, it + // ensures good utilization of all SMs for many kinds of shape and GPU + // configurations, while requiring as few slow global cross-threadblock + // reductions as possible. 
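+  // Concretely, each threadblock owns `iters` consecutive (k-tile, n-tile)
+  // positions, walked in column-major order over the tile grid (k fastest),
+  // so one block may finish the bottom of a column slice and then continue
+  // at the top of the next one.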
+ using Dtype = ScalarType; + using scalar_t2 = typename ScalarType::scalar_t2; + using FragA = typename ScalarType::FragA; + using FragB = typename ScalarType::FragB; + using FragC = typename ScalarType::FragC; + using FragS = typename ScalarType::FragS; + using FragZP = typename ScalarType::FragZP; + + static constexpr auto w_type = host::ScalarType::from_id(w_type_id); + constexpr bool has_zp = w_type == host::kU4 || w_type == host::kU8; + constexpr bool is_int_type = w_type == host::kU4 || w_type == host::kU8 || w_type == host::kU4B8 || w_type == host::kU8B128; + // see comments of dequant.h for more details + constexpr bool dequant_skip_flop = !is_int_type || has_zp && !is_zp_float && !std::is_same::value + || has_zp && !is_zp_float && !(w_type == host::kU8); + + scalar_t2 global_scale; + + if constexpr (w_type == host::kFE2M1f) { + uint16_t val = scale2_ptr[0]; + global_scale = Dtype::num2num2(*reinterpret_cast(&val)); + } + + constexpr bool has_act_order = group_blocks == 0; + constexpr int m_block_size = m_block_size_8 ? 8 : (16 * thread_m_blocks); + + constexpr int pack_factor = 32 / w_type.size_bits(); + static_assert(thread_m_blocks == 1 || !m_block_size_8); + + // For larger GEMMs we run multiple batchsize 64 versions in parallel for a + // better partitioning with less reductions + int parallel = 1; + if (prob_m > m_block_size) { + parallel = prob_m / m_block_size; + prob_m = m_block_size; + } + + int k_tiles = prob_k / 16 / thread_k_blocks; + int n_tiles = prob_n / 16 / thread_n_blocks; + int iters = div_ceil(k_tiles * n_tiles * parallel, gridDim.x); + + if constexpr (!has_act_order && group_blocks != -1) { + if (group_blocks >= thread_k_blocks) { + // Ensure that the number of tiles in each stripe is a multiple of the + // groupsize; this avoids an annoying special case where a stripe starts + // in the middle of group. + iters = (group_blocks / thread_k_blocks) * div_ceil(iters, (group_blocks / thread_k_blocks)); + } + } + + int slice_row = (iters * blockIdx.x) % k_tiles; + int slice_col_par = (iters * blockIdx.x) / k_tiles; + int slice_col = slice_col_par; + int slice_iters; // number of threadblock tiles in the current slice + int slice_count = 0; // total number of active threadblocks in the current slice + int slice_idx; // index of threadblock in current slice; numbered bottom to + // top + + int par_id = 0; + int locks_off = 0; + + // We can easily implement parallel problem execution by just remapping + // indices and advancing global pointers + if (slice_col_par >= n_tiles) { + A += (slice_col_par / n_tiles) * 16 * thread_m_blocks * lda / 8; + C += (slice_col_par / n_tiles) * 16 * thread_m_blocks * prob_n / 8; + slice_col = slice_col_par % n_tiles; + par_id = slice_col_par / n_tiles; + } + if (parallel * n_tiles >= gridDim.x) { + // when parallel * n_tiles >= sms + // then there are at most $sms$ conflict tile blocks + locks_off = blockIdx.x; + } else { + locks_off = (iters * blockIdx.x) / k_tiles - 1; + } + + // Compute all information about the current slice which is required for + // synchronization. 
+ auto init_slice = [&](bool first_init = false) { + slice_iters = iters * (blockIdx.x + 1) - (k_tiles * slice_col_par + slice_row); + if (slice_iters < 0 || slice_col_par >= n_tiles * parallel) slice_iters = 0; + if (slice_iters == 0) return; + if (slice_row + slice_iters > k_tiles) slice_iters = k_tiles - slice_row; + slice_count = 1; + slice_idx = 0; + int col_first = iters * div_ceil(k_tiles * slice_col_par, iters); + if (col_first <= k_tiles * (slice_col_par + 1)) { + int col_off = col_first - k_tiles * slice_col_par; + slice_count = div_ceil(k_tiles - col_off, iters); + if (col_off > 0) slice_count++; + int delta_first = iters * blockIdx.x - col_first; + if (delta_first < 0 || (col_off == 0 && delta_first == 0)) + slice_idx = slice_count - 1; + else { + slice_idx = slice_count - 1 - delta_first / iters; + if (col_off > 0) slice_idx--; + } + } + if (parallel * n_tiles >= gridDim.x) { + if (slice_count > 1 && slice_idx == slice_count - 1) { locks_off++; } + } else { + locks_off++; + } + + if (first_init && use_atomic_add && slice_count > 1 && slice_idx == 0) { + constexpr int threads_per_m = 16 * thread_n_blocks / 8; + int m_per_thread = div_ceil(thread_m_blocks * 16, threads / threads_per_m); + if (m_block_size_8) m_per_thread = div_ceil(8, threads / threads_per_m); + for (int i = 0; i < m_per_thread; i++) { + int row = threads / threads_per_m * i + threadIdx.x / threads_per_m; + if (row < prob_m) { + int col = slice_col * 16 * thread_n_blocks / 8 + threadIdx.x % threads_per_m; + C[row * prob_n / 8 + col] = {0, 0, 0, 0}; + } + } + // After write zero to output, write a negative value to lock. + // Every SM that processes the same slice would wait for + // the negative value, and then atomicAdd 1 to it. + // After all SMs are processed, the lock value would back to 0 again. + __syncthreads(); + if (threadIdx.x == 0) locks[locks_off] = 1 - slice_count; + } + + if (slice_col == n_tiles) { + A += 16 * thread_m_blocks * lda / 8; + C += 16 * thread_m_blocks * prob_n / 8; + slice_col = 0; + par_id++; + } + }; + init_slice(true); + + // A sizes/strides + + // stride of the A matrix in global memory + int a_gl_stride = lda / 8; + // stride of an A matrix tile in shared memory + constexpr int a_sh_stride = 16 * thread_k_blocks / 8; + // delta between subsequent A tiles in global memory + constexpr int a_gl_rd_delta_o = 16 * thread_k_blocks / 8; + // between subsequent accesses within a tile + int a_gl_rd_delta_i = a_gl_stride * (threads / a_gl_rd_delta_o); + // between shared memory writes + constexpr int a_sh_wr_delta = a_sh_stride * (threads / a_gl_rd_delta_o); + // between shared memory tile reads + constexpr int a_sh_rd_delta_o = 2 * ((threads / 32) / (thread_n_blocks / 4)); + // within a shared memory tile + constexpr int a_sh_rd_delta_i = a_sh_stride * 16; + // overall size of a tile + constexpr int a_sh_stage = a_sh_stride * m_block_size; + // number of shared write iterations for a tile + constexpr int a_sh_wr_iters = div_ceil(a_sh_stage, a_sh_wr_delta); + + // B sizes/strides + int b_gl_stride = 16 * prob_n / (pack_factor * 4); + constexpr int b_sh_stride = ((thread_n_blocks * 16) * 16 / pack_factor) / 4; + constexpr int b_thread_vecs = w_type.size_bits() == 4 ? 
1 : 2; + constexpr int b_sh_stride_threads = b_sh_stride / b_thread_vecs; + + int b_gl_rd_delta_o = b_gl_stride * thread_k_blocks; + int b_gl_rd_delta_i = b_gl_stride * (threads / b_sh_stride_threads); + constexpr int b_sh_wr_delta = threads * b_thread_vecs; + constexpr int b_sh_rd_delta = threads * b_thread_vecs; + constexpr int b_sh_stage = b_sh_stride * thread_k_blocks; + constexpr int b_sh_wr_iters = b_sh_stage / b_sh_wr_delta; + + // Scale sizes/strides without act_order + int s_gl_stride = prob_n / 8; + constexpr int s_sh_stride = 16 * thread_n_blocks / 8; + constexpr int s_tb_groups = !has_act_order && group_blocks != -1 && group_blocks < thread_k_blocks + ? thread_k_blocks / group_blocks / (w_type == host::kFE2M1f ? 2 : 1) + : 1; + constexpr int s_sh_stage = s_tb_groups * s_sh_stride; + int s_gl_rd_delta = s_gl_stride; + + // Scale size/strides with act_order + constexpr int tb_k = 16 * thread_k_blocks; + constexpr int g_idx_stage = has_act_order ? (tb_k * sizeof(int)) / 16 : 0; + // constexpr int act_s_row_stride = 1; + // int act_s_col_stride = act_s_row_stride * num_groups; + constexpr int act_s_max_num_groups = 32; + int act_s_col_stride = 1; + int act_s_col_warp_stride = act_s_col_stride * 8; + + int tb_n_warps = thread_n_blocks / 4; + int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; + + // Zero-points sizes/strides + int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4; + constexpr int zp_sh_stride = is_zp_float ? 16 * thread_n_blocks / 8 : ((16 * thread_n_blocks) / pack_factor) / 4; + constexpr int zp_tb_groups = s_tb_groups; + constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; + int zp_gl_rd_delta = zp_gl_stride; + + // Global A read index of current thread. + int a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o); + a_gl_rd += a_gl_rd_delta_o * slice_row; + // Shared write index of current thread. + int a_sh_wr = a_sh_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o); + // Shared read index. + int a_sh_rd = a_sh_stride * ((threadIdx.x % 32) % (16 / (m_block_size_8 ? 2 : 1))) + + (threadIdx.x % 32) / (16 / (m_block_size_8 ? 2 : 1)); + a_sh_rd += 2 * ((threadIdx.x / 32) / (thread_n_blocks / 4)); + + int b_gl_rd = b_gl_stride * (threadIdx.x / b_sh_stride_threads) + (threadIdx.x % b_sh_stride_threads) * b_thread_vecs; + b_gl_rd += b_sh_stride * slice_col; + b_gl_rd += b_gl_rd_delta_o * slice_row; + auto b_sh_wr = threadIdx.x * b_thread_vecs; + auto b_sh_rd = threadIdx.x * b_thread_vecs; + + // For act_order + constexpr int k_iter_size = tb_k / b_sh_wr_iters; + int slice_k_start = tb_k * slice_row; + int slice_k_finish = slice_k_start + tb_k * slice_iters; + int slice_k_start_shared_fetch = slice_k_start; + int slice_n_offset = act_s_col_tb_stride * slice_col; + + // No act_order + int s_gl_rd; + if constexpr (!has_act_order) { + if constexpr (group_blocks == -1) { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + } else { + s_gl_rd = s_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) / (w_type == host::kFE2M1f ? 
2 : 1) + + s_sh_stride * slice_col + threadIdx.x; + } + } + auto s_sh_wr = threadIdx.x; + bool s_sh_wr_pred = threadIdx.x < s_sh_stride; + + // Zero-points + int zp_gl_rd; + if constexpr (has_zp) { + if constexpr (group_blocks == -1) { + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } else { + zp_gl_rd = zp_gl_stride * ((thread_k_blocks * slice_row) / group_blocks) + zp_sh_stride * slice_col + threadIdx.x; + } + } + auto zp_sh_wr = threadIdx.x; + bool zp_sh_wr_pred = threadIdx.x < zp_sh_stride; + + // We use a different scale layout for grouped and column-wise quantization as + // we scale a `half2` tile in column-major layout in the former and in + // row-major in the latter case. + int s_sh_rd; + if constexpr (group_blocks != -1 && w_type == host::kFE2M1f) { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + int warp_row = warp_id / n_warps; + + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4; + s_sh_rd = s_sh_rd * 2 + warp_row % 2; + + } else if constexpr (group_blocks != -1) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4; + else if constexpr (group_blocks == -1 && (m_block_size_8 || (has_zp && !dequant_skip_flop))) + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 8; + else + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) % 4; + + // Zero-points have the same read layout as the scales + // (without column-wise case) + constexpr int num_col_threads = 8; + constexpr int num_row_threads = 4; + constexpr int num_ints_per_thread = 8 / pack_factor; + int zp_sh_rd; + if constexpr (has_zp) { + if constexpr (is_zp_float) { + if constexpr (group_blocks != -1) { + zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4; + } + } else { + zp_sh_rd = num_ints_per_thread * num_col_threads * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + } + } + + // Precompute which thread should not read memory in which iterations; this is + // needed if there are more threads than required for a certain tilesize or + // when the batchsize is not a multiple of 16. + bool a_sh_wr_pred[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) a_sh_wr_pred[i] = a_sh_wr_delta * i + a_sh_wr < a_sh_stride * prob_m; + + // To ensure that writing and reading A tiles to/from shared memory, the + // latter in fragment format, is fully bank conflict free, we need to use a + // rather fancy XOR-based layout. The key here is that neither reads nor + // writes of the 16-byte `int4` blocks of 8 consecutive threads involve the + // same shared memory banks. Further, it seems (based on NSight-Compute) that + // each warp must also write a consecutive memory segment? + auto transform_a = [&](int i) { + int row = i / a_gl_rd_delta_o; + return a_gl_rd_delta_o * row + (i % a_gl_rd_delta_o) ^ (row % 8); + }; + // Since the computation of this remapping is non-trivial and, due to our main + // loop unrolls, all shared memory accesses are static, we simply precompute + // both transformed reads and writes. 
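+  // The `^ (row % 8)` in transform_a XORs the low three bits of the int4
+  // column index with the row index mod 8; this swizzle is what keeps the
+  // 16-byte accesses of eight consecutive threads on distinct banks.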
+ int a_sh_wr_trans[a_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) a_sh_wr_trans[i] = transform_a(a_sh_wr_delta * i + a_sh_wr); + int a_sh_rd_trans[b_sh_wr_iters][thread_m_blocks]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { +#pragma unroll + for (int j = 0; j < thread_m_blocks; j++) + a_sh_rd_trans[i][j] = transform_a(a_sh_rd_delta_o * i + a_sh_rd_delta_i * j + a_sh_rd); + } + + // Since B-accesses have non-constant stride they have to be computed at + // runtime; we break dependencies between subsequent accesses with a tile by + // maintining multiple pointers (we have enough registers), a tiny + // optimization. + const int4* B_ptr[b_sh_wr_iters]; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] = B + b_gl_rd_delta_i * i + b_gl_rd; + + extern __shared__ int4 sh[]; + // Shared memory storage for global fetch pipelines. + constexpr int sh_red_size = (2 * thread_n_blocks + 1) * 16 * thread_m_blocks; + constexpr int sh_b_size = stages * b_sh_stage; + int4* sh_b = sh; + int4* sh_red = sh; + int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + int4* sh_zp = sh_g_idx + (stages * g_idx_stage); + constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride) : (stages * s_sh_stage); + int4* sh_s = sh_zp + (stages * zp_sh_stage); + // shared memory reused by reduction should be smaller than + // shared memory used by weight. + static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <= stages * b_sh_stage); + int4* sh_a = sh_s + sh_s_size; + // constexpr int shm_size_used = + // stages * (g_idx_stage + zp_sh_stage) + sh_s_size + + // (sh_red_size > sh_b_size ? sh_red_size : sh_b_size); + + // Register storage for double buffer of shared memory reads. + FragA frag_a[2][thread_m_blocks]; + I4 frag_b_quant[2][b_thread_vecs]; + FragC frag_c[thread_m_blocks][4][2]; + FragS frag_s[2][4]; // No act-order + FragS act_frag_s[2][4][4]; // For act-order + int frag_qzp[2][num_ints_per_thread]; // Zero-points + FragZP frag_zp; // Zero-points in fp16 + FragZP frag_zpf[2]; // Zero-points in fp16 in HQQ + + // Zero accumulators. + auto zero_accums = [&]() { +#pragma unroll + for (int i = 0; i < thread_m_blocks * 4 * 2 * 4; i++) reinterpret_cast(frag_c)[i] = 0; + }; + + int sh_first_group_id = -1; + int sh_num_groups = -1; + + auto fetch_act_order_scales_to_shared = [&](bool is_async, int first_group_id, int last_group_id) { + sh_first_group_id = first_group_id; + sh_num_groups = last_group_id - first_group_id + 1; + + if (sh_num_groups > act_s_max_num_groups) { sh_num_groups = act_s_max_num_groups; } + + if (sh_first_group_id + sh_num_groups > num_groups) { sh_num_groups = num_groups - sh_first_group_id; } + + int row_offset = first_group_id * s_gl_stride; + + if (is_async) { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + cp_async4_pred(&sh_s[(i * s_sh_stride) + threadIdx.x], + &scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + threadIdx.x]); + } + } + } else { + for (int i = 0; i < sh_num_groups; i++) { + if (threadIdx.x < s_sh_stride) { + sh_s[(i * s_sh_stride) + threadIdx.x] = scales_ptr[row_offset + (i * s_gl_stride) + slice_n_offset + threadIdx.x]; + } + } + } + }; + // Asynchronously fetch the next A, B and s tile from global to the next + // shared memory pipeline location. 
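+  // Every fetch targets one stage of the circular shared-memory buffer and
+  // ends with cp.async.commit_group; wait_for_stage() later waits until at
+  // most `stages - 2` of those groups remain in flight, which is what
+  // overlaps global loads with tensor-core compute.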
+ auto fetch_to_shared = [&](int pipe, int a_off, bool pred = true) { + if (pred) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < a_sh_wr_iters; i++) { + cp_async4_pred(&sh_a_stage[a_sh_wr_trans[i]], &A[a_gl_rd_delta_i * i + a_gl_rd + a_gl_rd_delta_o * a_off], + a_sh_wr_pred[i]); + } + int4* sh_b_stage = sh_b + b_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) { +#pragma unroll + for (int j = 0; j < b_thread_vecs; j++) { cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr + j], B_ptr[i] + j); } + + B_ptr[i] += b_gl_rd_delta_o; + } + + if constexpr (has_act_order) { + // Fetch g_idx thread-block portion + int full_pipe = a_off; + int cur_k = slice_k_start_shared_fetch + tb_k * full_pipe; + if (cur_k < prob_k && cur_k < slice_k_finish) { + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + + int4 const* cur_g_idx_stage_ptr = reinterpret_cast(&g_idx[cur_k]); + + if (threadIdx.x < g_idx_stage) { cp_async4_pred(&sh_g_idx_stage[threadIdx.x], &cur_g_idx_stage_ptr[threadIdx.x]); } + } + } else { + if constexpr (group_blocks != -1) { + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch scales if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (s_sh_wr_pred) { cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); } + s_gl_rd += s_gl_rd_delta; + } + } else { + for (int i = 0; i < s_tb_groups; i++) { + if (s_sh_wr_pred) { cp_async4(&sh_s_stage[i * s_sh_stride + s_sh_wr], &scales_ptr[s_gl_rd]); } + s_gl_rd += s_gl_rd_delta; + } + } + } + + if constexpr (has_zp && group_blocks != -1) { + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + if constexpr (group_blocks >= thread_k_blocks) { + // Only fetch zero-points if this tile starts a new group + if (pipe % (group_blocks / thread_k_blocks) == 0) { + if (zp_sh_wr_pred) { cp_async4(&sh_zp_stage[zp_sh_wr], &zp_ptr[zp_gl_rd]); } + zp_gl_rd += zp_gl_rd_delta; + } + } else { + for (int i = 0; i < zp_tb_groups; i++) { + if (zp_sh_wr_pred) { cp_async4(&sh_zp_stage[i * zp_sh_stride + zp_sh_wr], &zp_ptr[zp_gl_rd]); } + zp_gl_rd += zp_gl_rd_delta; + } + } + } + } + } + // Insert a fence even when we are winding down the pipeline to ensure that + // waiting is also correct at this point. + cp_async_fence(); + }; + + auto fetch_col_zp_to_shared = [&]() { + if (zp_sh_wr_pred) { cp_async4(&sh_zp[zp_sh_wr], &zp_ptr[zp_gl_rd]); } + }; + + auto fetch_col_scale_to_shared = [&]() { + if (s_sh_wr_pred) { cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); } + }; + + // Wait until the next thread tile has been loaded to shared memory. + auto wait_for_stage = [&]() { + // We only have `stages - 2` active fetches since we are double buffering + // and can only issue the next fetch when it is guaranteed that the previous + // shared memory load is fully complete (as it may otherwise be + // overwritten). + cp_async_wait(); + __syncthreads(); + }; + + // Load the next sub-tile from the current location in the shared memory pipe + // into the current register buffer. 
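+  // Register fragments are double buffered on `k % 2`, so the loads issued
+  // for iteration k + 1 overlap with the mma work of iteration k.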
+ auto fetch_to_registers = [&](int k, int pipe) { + int4* sh_a_stage = sh_a + a_sh_stage * pipe; +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) + ldsm(frag_a[k % 2][i], &sh_a_stage[a_sh_rd_trans[k % b_sh_wr_iters][i]]); + int4* sh_b_stage = sh_b + b_sh_stage * pipe; + +#pragma unroll + for (int i = 0; i < b_thread_vecs; i++) { + frag_b_quant[k % 2][i] = *reinterpret_cast(&sh_b_stage[b_sh_rd_delta * (k % b_sh_wr_iters) + b_sh_rd + i]); + } + }; + + bool is_same_group[stages]; + int same_group_id[stages]; + + auto init_same_group = [&](int pipe) { + if constexpr (!has_act_order) { return; } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + int group_id_1 = sh_g_idx_int_ptr[0]; + int group_id_2 = sh_g_idx_int_ptr[tb_k - 1]; + + is_same_group[pipe] = group_id_1 == group_id_2; + same_group_id[pipe] = group_id_1; + }; + + auto fetch_scales_to_registers = [&](int k, int full_pipe) { + int pipe = full_pipe % stages; + + if constexpr (!has_act_order) { + // No act-order case + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + } + } else if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + if (k % b_sh_wr_iters == 0) { + int4* sh_s_stage = + sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; + } else { + reinterpret_cast(&frag_s[1])[0] = reinterpret_cast(&frag_s[0])[0]; + } + } else { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = k_blocks / (group_blocks * (w_type == host::kFE2M1f ? 
2 : 1)); + + int4* sh_s_stage = sh_s + s_sh_stage * pipe; + + if constexpr (w_type_id != host::kFE2M1f.id()) { + reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride]; + } else { + reinterpret_cast(&frag_s[k % 2])[0] = + reinterpret_cast(sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)]; + } + } + } + + return; + } + + // Act-order case + + // Determine K of the "current" thread-block + int cur_k = slice_k_start + tb_k * full_pipe; + if (cur_k >= prob_k || cur_k >= slice_k_finish) { return; } + + // Reset (to current thread-block) since we read g_idx portion from the + // shared memory + cur_k = 0; + + // Progress to current iteration + cur_k += k_iter_size * (k % b_sh_wr_iters); + + // Determine "position" inside the thread-block (based on warp and + // thread-id) + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; // Each warp processes 4 16-size tiles over N + + int warp_row = warp_id / n_warps; + int warp_col = warp_id % n_warps; + + cur_k += warp_row * 16; + + auto th_id = threadIdx.x % 32; + cur_k += (th_id % 4) * 2; // Due to tensor-core layout for fp16 B matrix + + int s_col_shift = + /*slice_n_offset +*/ (act_s_col_warp_stride * warp_col) + (th_id / 4) * act_s_col_stride; + + if (is_same_group[pipe]) { + if (k % 2 == 0) { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = + sh_s[(same_group_id[pipe] - sh_first_group_id) * s_sh_stride + s_col_shift]; + } else { + *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))) = *(reinterpret_cast(&(act_frag_s[(k - 1) % 2][0][0]))); + } + + for (int i = 1; i < 4; i++) { + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = *(reinterpret_cast(&(act_frag_s[k % 2][0][0]))); + } + return; + } + + int4* sh_g_idx_stage = sh_g_idx + g_idx_stage * pipe; + int* sh_g_idx_int_ptr = reinterpret_cast(sh_g_idx_stage); + + constexpr int k_frag_offsets[4] = {0, 1, 8, 9}; // Tensor core offsets per thread + +#pragma unroll + for (int i = 0; i < 4; i++) { + int actual_k = cur_k + k_frag_offsets[i]; + + int group_id = sh_g_idx_int_ptr[actual_k]; + int rel_group_id = group_id - sh_first_group_id; + + *(reinterpret_cast(&(act_frag_s[k % 2][i][0]))) = sh_s[rel_group_id * s_sh_stride + s_col_shift]; + } + }; + + auto fetch_zp_to_registers = [&](int k, int full_pipe) { + // This code does not handle group_blocks == 0, + // which signifies act_order. 
+ // has_zp implies AWQ, which doesn't have act_order, + static_assert(!has_zp || group_blocks != 0); + + if constexpr (has_zp && !is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks == -1) { + // load only when starting a new slice + if (k == 0 && full_pipe == 0) { +#pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp))[zp_sh_rd + i]; } + } + + } else if constexpr (group_blocks >= thread_k_blocks) { + if (k % b_sh_wr_iters == 0) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks))); +#pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } + } else { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + int cur_group_id = 0; + + // Suppress bogus and persistent divide-by-zero warning +#pragma nv_diagnostic push +#pragma nv_diag_suppress divide_by_zero + cur_group_id = k_blocks / group_blocks; +#pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + sh_zp_stage += cur_group_id * zp_sh_stride; + +#pragma unroll + for (int i = 0; i < num_ints_per_thread; i++) { + frag_qzp[k % 2][i] = (reinterpret_cast(sh_zp_stage))[zp_sh_rd + i]; + } + } + } + + else if constexpr (has_zp && is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + if (k % b_sh_wr_iters == 0) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd]; + } + } else { + auto warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + // Suppress bogus and persistent divide-by-zero warning +#pragma nv_diagnostic push +#pragma nv_diag_suppress divide_by_zero + int cur_group_id = k_blocks / group_blocks; +#pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + reinterpret_cast(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride]; + } + } + } + }; + + auto dequant_data = [&](int q, scalar_t2* frag_b_ptr) { dequant(q, frag_b_ptr); }; + + // Execute the actual tensor core matmul of a sub-tile. 
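+  // For each of the four n sub-tiles, the lambda below dequantizes the packed
+  // weights into two FragB halves, applies whichever zero-point/scale path the
+  // template flags select, and accumulates into frag_c via m16n8k16 mma.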
+ bool is_first_matmul_in_slice = true; + auto matmul = [&](int k) { + int k2 = k % 2; + const bool is_new_zp = ((group_blocks != -1) && (group_blocks < thread_k_blocks || k == 0)) + || (group_blocks == -1 && is_first_matmul_in_slice); + if constexpr (has_zp && !is_zp_float) { + if (is_new_zp) { + if constexpr (group_blocks == -1) is_first_matmul_in_slice = false; + FragB frag_zp_0; + FragB frag_zp_1; + int zp_quant_0, zp_quant_1; + + if constexpr (w_type.size_bits() == 4) { + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = zp_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + zp_quant_0 = frag_qzp[k2][0]; + zp_quant_1 = frag_qzp[k2][1]; + } + + dequant_data(zp_quant_0, reinterpret_cast(&frag_zp)); + dequant_data(zp_quant_1, reinterpret_cast(&frag_zp) + 2); + } + } + if constexpr (!dequant_skip_flop && has_zp && is_zp_float) { + if (is_new_zp) { reinterpret_cast(&frag_zp)[0] = reinterpret_cast(&frag_zpf[k2])[0]; } + } + + if constexpr (w_type == host::kFE2M1f) { + int s_quant_0 = reinterpret_cast(frag_s[k2])[0]; + int s_quant_1 = reinterpret_cast(frag_s[k2])[1]; + + dequant_fp8_scales(s_quant_0, reinterpret_cast(&frag_s[k2])); + dequant_fp8_scales(s_quant_1, reinterpret_cast(&frag_s[k2]) + 2); + } + +// We have the m dimension as the inner loop in order to encourage overlapping +// dequantization and matmul operations. +#pragma unroll + for (int j = 0; j < 4; j++) { + FragB frag_b0; + FragB frag_b1; + int b_quant_0, b_quant_1; + + if constexpr (w_type_id == host::kFE2M1f.id()) { + b_quant_1 = frag_b_quant[k2][0][j]; + b_quant_0 = b_quant_1 << 8; + } else if constexpr (w_type.size_bits() == 4) { + b_quant_0 = frag_b_quant[k2][0][j]; + b_quant_1 = b_quant_0 >> 8; + } else { + static_assert(w_type.size_bits() == 8); + int* frag_b_quant_ptr = reinterpret_cast(frag_b_quant[k2]); + b_quant_0 = frag_b_quant_ptr[j * 2 + 0]; + b_quant_1 = frag_b_quant_ptr[j * 2 + 1]; + } + + dequant_data(b_quant_0, reinterpret_cast(&frag_b0)); + dequant_data(b_quant_1, reinterpret_cast(&frag_b1)); + + if constexpr (dequant_skip_flop && has_zp && !is_zp_float) { + sub_zp(frag_b0, frag_zp[j], 0); + sub_zp(frag_b1, frag_zp[j], 1); + } + + // Apply scale to frag_b0 + if constexpr (has_act_order) { + static_assert(group_blocks != -1); + scale4(frag_b0, act_frag_s[k2][0][j], act_frag_s[k2][1][j], act_frag_s[k2][2][j], act_frag_s[k2][3][j], 0); + scale4(frag_b1, act_frag_s[k2][0][j], act_frag_s[k2][1][j], act_frag_s[k2][2][j], act_frag_s[k2][3][j], 1); + } else if constexpr (!dequant_skip_flop && has_zp && !is_zp_float && group_blocks == -1) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2 s2 = Dtype::nums2num2(reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 0])[idx], + reinterpret_cast(&frag_s[j / 2][j % 2 * 2 + 1])[idx]); + if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], s2); + scale_and_sub(frag_b0, s2.x, frag_zp[j].x); + scale_and_sub(frag_b1, s2.y, frag_zp[j].y); + } else if constexpr (!dequant_skip_flop && has_zp && group_blocks != -1) { + if (is_new_zp) frag_zp[j] = __hmul2(frag_zp[j], *reinterpret_cast(&frag_s[k2][j])); + scale_and_sub(frag_b0, frag_s[k2][j][0].x, frag_zp[j].x); + scale_and_sub(frag_b1, frag_s[k2][j][0].y, frag_zp[j].y); + } else if constexpr (group_blocks != -1) { + scale(frag_b0, frag_s[k2][j], 0); + scale(frag_b1, frag_s[k2][j], 1); + } + +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { + if constexpr (m_block_size_8) { + mma_trans(frag_a[k2][i], frag_b0, frag_b1, frag_c[i][j][0]); + } else { + mma(frag_a[k2][i], frag_b0, frag_c[i][j][0]); + mma(frag_a[k2][i], frag_b1, 
frag_c[i][j][1]); + } + } + } + }; + + // Since we slice across the k dimension of a tile in order to increase the + // number of warps while keeping the n dimension of a tile reasonable, we have + // multiple warps that accumulate their partial sums of the same output + // location; which we have to reduce over in the end. We do in shared memory. + auto thread_block_reduce = [&]() { + constexpr int red_off = threads / b_sh_stride_threads / 2; + if (red_off >= 1) { + auto red_idx = threadIdx.x / b_sh_stride_threads; + constexpr int red_sh_stride = b_sh_stride_threads * 4 * 2; + constexpr int red_sh_delta = b_sh_stride_threads; + int red_sh_rd = red_sh_stride * (threadIdx.x / b_sh_stride_threads) + (threadIdx.x % b_sh_stride_threads); + + // Parallel logarithmic shared memory reduction. We make sure to avoid any + // unnecessary read or write iterations, e.g., for two warps we write only + // once by warp 1 and read only once by warp 0. + +#pragma unroll + for (int m_block = 0; m_block < thread_m_blocks; m_block++) { +#pragma unroll + for (int i = red_off; i > 0; i /= 2) { + if (i <= red_idx && red_idx < 2 * i) { +#pragma unroll + for (int j = 0; j < 4 * 2; j += (m_block_size_8 ? 2 : 1)) { + int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i); + if (i < red_off) { + float* c_rd = reinterpret_cast(&sh_red[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh_red[red_sh_wr]); +#pragma unroll + for (int k = 0; k < 4; k++) reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k]; + } + sh_red[red_sh_wr] = reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; + } + } + __syncthreads(); + } + if (red_idx == 0) { +#pragma unroll + for (int i = 0; i < 4 * 2; i += (m_block_size_8 ? 2 : 1)) { + float* c_rd = reinterpret_cast(&sh_red[red_sh_delta * i + red_sh_rd]); +#pragma unroll + for (int j = 0; j < 4; j++) reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += c_rd[j]; + } + } + __syncthreads(); + } + } + }; + + // Since multiple threadblocks may process parts of the same column slice, we + // finally have to globally reduce over the results. As the striped + // partitioning minimizes the number of such reductions and our outputs are + // usually rather small, we perform this reduction serially in L2 cache. + auto global_reduce_fp16 = [&](bool first = false, bool last = false) { + // We are very careful here to reduce directly in the output buffer to + // maximize L2 cache utilization in this step. To do this, we write out + // results in FP16 (but still reduce with FP32 compute). + constexpr int active_threads = 32 * thread_n_blocks / 4; + if (threadIdx.x < active_threads) { + int c_gl_stride = prob_n / 8; + int c_gl_wr_delta_o = 8 * c_gl_stride; + int c_gl_wr_delta_i = 4 * (active_threads / 32); + int c_gl_wr; + if constexpr (m_block_size_8) { + c_gl_wr = c_gl_stride * ((threadIdx.x % 4) * 2) + 4 * (threadIdx.x / 32) + (threadIdx.x % 32) / 8; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } else { + c_gl_wr = c_gl_stride * ((threadIdx.x % 32) / 4) + 4 * (threadIdx.x / 32) + threadIdx.x % 4; + c_gl_wr += (2 * thread_n_blocks) * slice_col; + } + constexpr int c_sh_wr_delta = active_threads; + auto c_sh_wr = threadIdx.x; + + int row = (threadIdx.x % 32) / 4; + + if (!first) { +// Interestingly, doing direct global accesses here really seems to mess up +// the compiler and lead to slowdowns, hence we also use async-copies even +// though these fetches are not actually asynchronous. +#pragma unroll + for (int i = 0; i < (m_block_size_8 ? 
2 : thread_m_blocks * 4); i++) { + if constexpr (m_block_size_8) { + cp_async4_pred(&sh_red[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + i * c_gl_stride + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i], + (threadIdx.x % 4) * 2 + i < prob_m); + } else { + cp_async4_pred(&sh_red[c_sh_wr + c_sh_wr_delta * i], + &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)], + i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); + } + } + cp_async_fence(); + cp_async_wait<0>(); + } + +#pragma unroll + for (int i = 0; i < (m_block_size_8 ? 2 : thread_m_blocks * 4); i++) { + bool mask = (!m_block_size_8) && (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) + || (m_block_size_8) && ((threadIdx.x % 4) * 2 + i < prob_m); + if (mask) { + if (!first) { + int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta]; +#pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { delta = j % 2 == 1 ? -2 : 0; } + reinterpret_cast(&frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta] += + Dtype::num2float(reinterpret_cast(&c_red)[j]); + } + } + if (!last) { + int4 c; +#pragma unroll + for (int j = 0; j < 2 * 4; j++) { + int delta = 0; + if constexpr (m_block_size_8) { delta = j % 2 == 1 ? -2 : 0; } + reinterpret_cast(&c)[j] = + Dtype::float2num(reinterpret_cast(&frag_c)[4 * 2 * 4 * (i / 4) + 4 * j + (i % 4) + delta]); + } + if constexpr (m_block_size_8) + C[c_gl_wr + i * c_gl_stride + (threadIdx.x % 8) / 4 * c_gl_wr_delta_i] = c; + else + C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)] = c; + } + } + } + } + }; + + // Globally reduce over threadblocks that compute the same column block. + // We use a tmp C buffer to reduce in full fp32 precision. + auto global_reduce_fp32 = [&](bool first = false, bool last = false) { + constexpr int tb_m = thread_m_blocks * 16; + constexpr int tb_n = thread_n_blocks * 16; + + constexpr int c_size = tb_m * tb_n * sizeof(float) / 16; + + constexpr int active_threads = 32 * thread_n_blocks / 4; + bool is_th_active = threadIdx.x < active_threads; + + constexpr int num_floats = thread_m_blocks * 4 * 2 * 4; + constexpr int th_size = num_floats * sizeof(float) / 16; + + int c_cur_offset = locks_off * c_size; + + if (!is_th_active) { return; } + + if (!first) { + float* frag_c_ptr = reinterpret_cast(&frag_c); +#pragma unroll + for (int k = 0; k < th_size; k += (m_block_size_8 ? 2 : 1)) { + sh_red[threadIdx.x] = C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; + + float* sh_c_ptr = reinterpret_cast(&sh_red[threadIdx.x]); +#pragma unroll + for (int f = 0; f < 4; f++) { frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; } + } + } + + if (!last) { + int4* frag_c_ptr = reinterpret_cast(&frag_c); +#pragma unroll + for (int k = 0; k < th_size; k += (m_block_size_8 ? 2 : 1)) { + C_tmp[c_cur_offset + active_threads * k + threadIdx.x] = frag_c_ptr[k]; + } + } + }; + + // Write out the reduce final result in the correct layout. We only actually + // reshuffle matrix fragments in this step, the reduction above is performed + // in fragment layout. 
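+  // Fragments are staged through sh_red first so that the final stores to C
+  // are coalesced int4 writes; per-column scales (and the nvfp4 global scale)
+  // are folded in here, just before the fp32 sums are converted back to
+  // scalar_t.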
+ auto write_result = [&]() { + int c_gl_stride = prob_n / 8; + constexpr int c_sh_stride = 2 * thread_n_blocks + 1; + int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks)); + constexpr int c_sh_rd_delta = c_sh_stride * (threads / (2 * thread_n_blocks)); + + int c_gl_wr = c_gl_stride * (threadIdx.x / (2 * thread_n_blocks)) + (threadIdx.x % (2 * thread_n_blocks)); + c_gl_wr += (2 * thread_n_blocks) * slice_col; + int c_sh_wr; + if constexpr (m_block_size_8) { + c_sh_wr = (8 * c_sh_stride) * ((threadIdx.x % 32) % 4 * 2) + (threadIdx.x % 32) / 4; + c_sh_wr += 64 * (threadIdx.x / 32); + } else { + c_sh_wr = (4 * c_sh_stride) * ((threadIdx.x % 32) / 4) + (threadIdx.x % 32) % 4; + c_sh_wr += 32 * (threadIdx.x / 32); + } + + int c_sh_rd = c_sh_stride * (threadIdx.x / (2 * thread_n_blocks)) + (threadIdx.x % (2 * thread_n_blocks)); + + int c_gl_wr_end = c_gl_stride * prob_m; + // We first reorder in shared memory to guarantee the most efficient final + // global write patterns + auto write = [&](int idx, float c0, float c1, FragS& s) { + scalar_t2 res = Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1)); + + // For per-column quantization we finally apply the scale here (only for + // 4-bit) + if constexpr (!has_act_order && group_blocks == -1 && w_type.size_bits() == 4 + && (has_zp && dequant_skip_flop || !has_zp)) { + res = __hmul2(res, s[0]); + } + + if constexpr (w_type == host::kFE2M1f) { res = __hmul2(res, global_scale); } + + if constexpr (m_block_size_8) { + ((scalar_t*)sh_red)[idx] = res.x; + ((scalar_t*)sh_red)[idx + 8 * c_sh_stride] = res.y; + } else { + ((scalar_t2*)sh_red)[idx] = res; + } + }; + + if (threadIdx.x / 32 < thread_n_blocks / 4) { +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { +#pragma unroll + for (int j = 0; j < 4; j++) { + if constexpr (m_block_size_8) { + int wr = c_sh_wr + 16 * j; + write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 1]); + } else { + int wr = c_sh_wr + 8 * j; + write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0], frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2], frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]); + write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0], frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]); + write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2], frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]); + } + } + c_sh_wr += 16 * (4 * c_sh_stride); + } + } + __syncthreads(); + +#pragma unroll + for (int i = 0; i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) { + if (c_gl_wr < c_gl_wr_end) { + if (use_atomic_add && slice_count > 1) { + scalar_t2* C_half2 = reinterpret_cast(&C[c_gl_wr]); + scalar_t2* sh_red_half2 = reinterpret_cast(&sh_red[c_sh_rd]); +#pragma unroll + for (int a = 0; a < 4; a++) { atomicAdd(&C_half2[a], sh_red_half2[a]); } + } else { + C[c_gl_wr] = sh_red[c_sh_rd]; + } + c_gl_wr += c_gl_wr_delta; + c_sh_rd += c_sh_rd_delta; + } + } + __syncthreads(); + }; + + // Start global fetch and register load pipelines. 
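+  // Prologue: issue the first `stages - 1` async fetches (plus the act-order
+  // scale / column zero-point preloads), zero the accumulators, then load the
+  // k = 0 fragments so the main loop can begin with useful compute.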
+ auto start_pipes = [&]() { + +#pragma unroll + for (int i = 0; i < stages - 1; i++) { + if (has_act_order && i == 0) { + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { last_g_idx = prob_k - 1; } + fetch_act_order_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); + } + + if constexpr (has_zp && !is_zp_float && group_blocks == -1) { + if (i == 0) { + fetch_col_zp_to_shared(); + if constexpr (!dequant_skip_flop) { fetch_col_scale_to_shared(); } + } + } + fetch_to_shared(i, i, i < slice_iters); + } + + zero_accums(); + wait_for_stage(); + init_same_group(0); + fetch_to_registers(0, 0); + fetch_scales_to_registers(0, 0); + fetch_zp_to_registers(0, 0); + a_gl_rd += a_gl_rd_delta_o * (stages - 1); + if constexpr (has_act_order) { slice_k_start_shared_fetch += tb_k * (stages - 1); } + }; + if (slice_iters) { start_pipes(); } + + // Main loop. + while (slice_iters) { + // We unroll over both the global fetch and the register load pipeline to + // ensure all shared memory accesses are static. Note that both pipelines + // have even length meaning that the next iteration will always start at + // index 0. + +#pragma unroll + for (int pipe = 0; pipe < stages;) { +#pragma unroll + for (int k = 0; k < b_sh_wr_iters; k++) { + fetch_to_registers(k + 1, pipe % stages); + fetch_scales_to_registers(k + 1, pipe); + fetch_zp_to_registers(k + 1, pipe); + if (k == b_sh_wr_iters - 2) { + fetch_to_shared((pipe + stages - 1) % stages, pipe, slice_iters >= stages); + pipe++; + wait_for_stage(); + init_same_group(pipe % stages); + } + matmul(k); + } + slice_iters--; + if (slice_iters == 0) { break; } + } + + a_gl_rd += a_gl_rd_delta_o * stages; + + if constexpr (has_act_order) { + slice_k_start += tb_k * stages; + + if (slice_k_start < prob_k) { + slice_k_start_shared_fetch += tb_k * stages; + int first_group_id = g_idx[slice_k_start]; + int last_g_idx = slice_k_start + stages * tb_k * 2; + if (last_g_idx >= prob_k) { last_g_idx = prob_k - 1; } + int last_group_id = g_idx[last_g_idx]; + if (last_group_id >= sh_first_group_id + sh_num_groups) { + fetch_act_order_scales_to_shared(false, first_group_id, last_group_id); + __syncthreads(); + } + } + } + + // Process results and, if necessary, proceed to the next column slice. + // While this pattern may not be the most readable, other ways of writing + // the loop seemed to noticeably worse performance after compilation. 
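+  // Slice epilogue: drain outstanding cp.async groups, apply per-column
+  // scales where required, run the intra-block and (when a slice spans
+  // several blocks) global reductions, write the result, then re-initialize
+  // the pipelines for the next column slice.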
+ if (slice_iters == 0) { + cp_async_wait<0>(); + bool last = slice_idx == slice_count - 1; + // For per-column scales, we only fetch them here in the final step before + // write-out + if constexpr (!has_act_order && group_blocks == -1 && (has_zp && dequant_skip_flop || !has_zp)) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + if (s_sh_wr_pred) { cp_async4(&sh_s[s_sh_wr], &scales_ptr[s_gl_rd]); } + cp_async_fence(); + } + } + + thread_block_reduce(); + if constexpr (!has_act_order && group_blocks == -1 && (has_zp && dequant_skip_flop || !has_zp)) { + if (w_type.size_bits() == 8 || (last || use_atomic_add)) { + cp_async_wait<0>(); + __syncthreads(); + if (threadIdx.x / 32 < thread_n_blocks / 4) { + reinterpret_cast(&frag_s)[0] = sh_s[s_sh_rd + 0]; + reinterpret_cast(&frag_s)[1] = sh_s[s_sh_rd + 4]; + if constexpr (m_block_size_8) { + int idx = (threadIdx.x / 4) % 2; + scalar_t2* frag_s_half2 = reinterpret_cast(frag_s); +#pragma unroll + for (int i = 0; i < 8; i++) { + frag_s_half2[i] = Dtype::num2num2(reinterpret_cast(&frag_s_half2[i])[idx]); + } + } + } + } + } + + // For 8-bit channelwise, we apply the scale before the global reduction + // that converts the fp32 results to fp16 (so that we avoid possible + // overflow in fp16) + if constexpr (!has_act_order && group_blocks == -1 && w_type.size_bits() == 8 + && (has_zp && dequant_skip_flop || !has_zp)) { + if (threadIdx.x / 32 < thread_n_blocks / 4) { +#pragma unroll + for (int i = 0; i < thread_m_blocks; i++) { +#pragma unroll + for (int j = 0; j < 4; j++) { + scale_float(reinterpret_cast(&frag_c[i][j][0][0]), frag_s[j / 2][2 * (j % 2) + 0]); + scale_float(reinterpret_cast(&frag_c[i][j][0][2]), + frag_s[j / 2][2 * (j % 2) + (m_block_size_8 ? 1 : 0)]); + + if constexpr (!m_block_size_8) { + scale_float(reinterpret_cast(&frag_c[i][j][1][0]), frag_s[j / 2][2 * (j % 2) + 1]); + scale_float(reinterpret_cast(&frag_c[i][j][1][2]), frag_s[j / 2][2 * (j % 2) + 1]); + } + } + } + } + } + + if (slice_count > 1 && !use_atomic_add) { + // only globally reduce if there is more than one block in a slice + barrier_acquire(&locks[locks_off], slice_idx); + if (use_fp32_reduce) { + global_reduce_fp32(slice_idx == 0, last); + } else { + global_reduce_fp16(slice_idx == 0, last); + } + barrier_release(&locks[locks_off], last); + } + if (use_atomic_add && slice_count > 1 && slice_idx != 0) wait_negative_and_add(&locks[locks_off]); + if (last || use_atomic_add) + // only the last block in a slice actually writes the result + write_result(); + slice_row = 0; + slice_col_par++; + slice_col++; + is_first_matmul_in_slice = true; + init_slice(); + + if (slice_iters) { + a_gl_rd = a_gl_stride * (threadIdx.x / a_gl_rd_delta_o) + (threadIdx.x % a_gl_rd_delta_o); +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] += b_sh_stride - b_gl_rd_delta_o * k_tiles; + if (slice_col == 0) { +#pragma unroll + for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride; + } + + // Update slice k/n for scales loading + if constexpr (has_act_order) { + slice_k_start = tb_k * slice_row; + slice_k_finish = slice_k_start + tb_k * slice_iters; + slice_k_start_shared_fetch = slice_k_start; + slice_n_offset = act_s_col_tb_stride * slice_col; + + } else { + s_gl_rd = s_sh_stride * slice_col + threadIdx.x; + zp_gl_rd = zp_sh_stride * slice_col + threadIdx.x; + } + + start_pipes(); + } + } + } +} + +} // namespace device::marlin + +#endif diff --git a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py index 
cc4ab667a..1fe41f560 100644 --- a/mllm-kernel/mllm_kernel/cuda/jit/__init__.py +++ b/mllm-kernel/mllm_kernel/cuda/jit/__init__.py @@ -1,5 +1,14 @@ from .add_constant import add_constant +from .awq_marlin_repack import awq_marlin_repack from .gdn_decode import gdn_decode +from .gptq_marlin import gptq_marlin_gemm from .store_cache import can_use_store_cache, store_cache -__all__ = ["add_constant", "can_use_store_cache", "gdn_decode", "store_cache"] +__all__ = [ + "add_constant", + "awq_marlin_repack", + "can_use_store_cache", + "gdn_decode", + "gptq_marlin_gemm", + "store_cache", +] diff --git a/mllm-kernel/mllm_kernel/cuda/jit/awq_marlin_repack.py b/mllm-kernel/mllm_kernel/cuda/jit/awq_marlin_repack.py new file mode 100644 index 000000000..f13f50475 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/awq_marlin_repack.py @@ -0,0 +1,78 @@ +"""AWQ Marlin weight repack CUDA JIT kernel. + +Repacks AWQ-format quantized weights into Marlin kernel layout. + +Usage:: + + from mllm_kernel.cuda.jit.awq_marlin_repack import awq_marlin_repack + + out = awq_marlin_repack(b_q_weight, size_k, size_n, num_bits) +""" + +from __future__ import annotations + +import torch + +from mllm_kernel.jit_utils import cache_once, jit + + +@cache_once +def _make_awq_marlin_repack_kernel(): + """JIT-compile the AWQ Marlin repack CUDA kernel.""" + + @jit( + args=[], + device="cuda", + cuda_files=["gemm/marlin/awq_marlin_repack.cuh"], + cuda_wrappers=[("awq_marlin_repack", "awq_marlin_repack")], + func_name="awq_marlin_repack", + ) + def _kernel( + compiled_module, + out: torch.Tensor, + b_q_weight: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, + ) -> None: + compiled_module.awq_marlin_repack(out, b_q_weight, size_k, size_n, num_bits) + + return _kernel + + +def awq_marlin_repack( + b_q_weight: torch.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> torch.Tensor: + """Repack AWQ-format quantized weights into Marlin kernel layout. + + Parameters + ---------- + b_q_weight : torch.Tensor + AWQ packed weight tensor, shape ``(size_k, size_n // pack_factor)``, + dtype ``int32``. + size_k : int + Number of input features (must be divisible by 16). + size_n : int + Number of output features (must be divisible by 64). + num_bits : int + Weight quantization bit-width (4 or 8). + + Returns + ------- + torch.Tensor + Repacked weight tensor in Marlin layout, shape + ``(size_k // 16, size_n * 16 // pack_factor)``, dtype ``int32``. + """ + tile_size = 16 + pack_factor = 32 // num_bits + out = torch.empty( + (size_k // tile_size, size_n * tile_size // pack_factor), + dtype=b_q_weight.dtype, + device=b_q_weight.device, + ) + kernel = _make_awq_marlin_repack_kernel() + kernel(out, b_q_weight, size_k, size_n, num_bits) + return out diff --git a/mllm-kernel/mllm_kernel/cuda/jit/gptq_marlin.py b/mllm-kernel/mllm_kernel/cuda/jit/gptq_marlin.py new file mode 100644 index 000000000..9eeefa765 --- /dev/null +++ b/mllm-kernel/mllm_kernel/cuda/jit/gptq_marlin.py @@ -0,0 +1,213 @@ +"""GPTQ Marlin GEMM CUDA JIT kernel. + +Performs quantized matrix multiplication using the Marlin kernel for +GPTQ/AWQ-style W4A16 or W8A16 quantized weights. 
+ +Usage:: + + from mllm_kernel.cuda.jit.gptq_marlin import gptq_marlin_gemm + + output = gptq_marlin_gemm( + a, c, b_q_weight, b_scales, global_scale, b_zeros, + g_idx, perm, workspace, b_q_type_id, + size_m, size_n, size_k, + ) +""" + +from __future__ import annotations + +from typing import Optional + +import torch + +from mllm_kernel.jit_utils import cache_once, jit, make_cpp_args + +# Constants matching device::marlin:: in marlin.cuh +_MAX_THREAD_N = 256 + + +@cache_once +def _make_gptq_marlin_gemm_kernel(dtype: torch.dtype): + """JIT-compile the GPTQ Marlin GEMM kernel for a specific dtype.""" + args = make_cpp_args(dtype) + + @jit( + args=args, + device="cuda", + cuda_files=["gemm/marlin/gptq_marlin.cuh"], + cuda_wrappers=[("gptq_marlin_gemm", f"gptq_marlin_gemm<{args}>")], + func_name="gptq_marlin_gemm", + ) + def _kernel( + compiled_module, + a: torch.Tensor, + b_q_weight: torch.Tensor, + b_scales: torch.Tensor, + global_scale: torch.Tensor, + b_zeros: torch.Tensor, + g_idx: torch.Tensor, + perm: torch.Tensor, + c: torch.Tensor, + c_tmp: torch.Tensor, + a_tmp: torch.Tensor, + workspace: torch.Tensor, + b_q_type_id: int, + is_k_full: bool, + use_atomic_add: bool, + use_fp32_reduce: bool, + is_zp_float: bool, + ) -> None: + compiled_module.gptq_marlin_gemm( + a, + b_q_weight, + b_scales, + global_scale, + b_zeros, + g_idx, + perm, + c, + c_tmp, + a_tmp, + workspace, + b_q_type_id, + is_k_full, + use_atomic_add, + use_fp32_reduce, + is_zp_float, + ) + + return _kernel + + +def _or_empty( + t: Optional[torch.Tensor], device: torch.device, dtype: torch.dtype +) -> torch.Tensor: + return t if t is not None else torch.empty(0, device=device, dtype=dtype) + + +def gptq_marlin_gemm( + a: torch.Tensor, + c: Optional[torch.Tensor], + b_q_weight: torch.Tensor, + b_scales: torch.Tensor, + global_scale: Optional[torch.Tensor], + b_zeros: Optional[torch.Tensor], + g_idx: Optional[torch.Tensor], + perm: Optional[torch.Tensor], + workspace: torch.Tensor, + b_q_type_id: int, + size_m: int, + size_n: int, + size_k: int, + is_k_full: bool = True, + use_atomic_add: bool = False, + use_fp32_reduce: bool = False, + is_zp_float: bool = False, +) -> torch.Tensor: + """Perform quantized GEMM using the Marlin kernel. + + Parameters + ---------- + a : torch.Tensor + Input activation tensor, shape ``(size_m, size_k)``, fp16 or bf16. + c : torch.Tensor or None + Output buffer, shape ``(size_m, size_n)``. Allocated if ``None``. + b_q_weight : torch.Tensor + Quantized weight in Marlin layout, int32. + b_scales : torch.Tensor + Per-group quantization scales. + global_scale : torch.Tensor or None + Global scale for FP8 quantization. + b_zeros : torch.Tensor or None + Per-group zero points (for AWQ-style asymmetric quantization). + g_idx : torch.Tensor or None + Group indices for activation reordering. + perm : torch.Tensor or None + Permutation indices for activation reordering. + workspace : torch.Tensor + Workspace buffer for synchronization. + b_q_type_id : int + ScalarType id for the quantized weight type. + size_m : int + Batch dimension. + size_n : int + Output dimension. + size_k : int + Reduction dimension. + is_k_full : bool + Whether the full K dimension is present (no TP split on K). + use_atomic_add : bool + Use atomic add for output reduction. + use_fp32_reduce : bool + Use fp32 for global reduction. + is_zp_float : bool + Whether zero points are float16 type. + + Returns + ------- + torch.Tensor + Output tensor, shape ``(size_m, size_n)``. 
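+
+    Example (illustrative sketch; ``marlin_qweight``, ``marlin_scales`` and
+    ``marlin_qzeros`` stand for AWQ tensors already repacked to Marlin layout,
+    and ``b_q_type_id`` comes from the owning quantization config, e.g.
+    ``AWQMarlinConfig.quant_type.id``)::
+
+        import torch
+
+        size_m, size_n, size_k = 8, 4096, 4096
+        a = torch.randn(size_m, size_k, dtype=torch.float16, device="cuda")
+        sms = torch.cuda.get_device_properties(a.device).multi_processor_count
+        workspace = torch.zeros(sms, dtype=torch.int, device=a.device)
+
+        out = gptq_marlin_gemm(
+            a=a, c=None,
+            b_q_weight=marlin_qweight, b_scales=marlin_scales,
+            global_scale=None, b_zeros=marlin_qzeros,
+            g_idx=None, perm=None,
+            workspace=workspace, b_q_type_id=b_q_type_id,
+            size_m=size_m, size_n=size_n, size_k=size_k,
+            use_fp32_reduce=True,
+        )
+        # out has shape (size_m, size_n) and the same dtype as `a`.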
+ """ + device = a.device + + # Allocate output if not provided + if c is None: + c = torch.empty((size_m, size_n), dtype=a.dtype, device=device) + + # Early return for zero-size M + if size_m == 0: + return c + + # Determine activation ordering + has_act_order = ( + g_idx is not None + and perm is not None + and g_idx.numel() > 0 + and perm.numel() > 0 + ) + + # Allocate c_tmp for fp32 reduce + if use_fp32_reduce: + sms = torch.cuda.get_device_properties(device).multi_processor_count + max_m_block = min(((size_m + 15) // 16) * 16, 64) + c_tmp = torch.empty( + sms * max_m_block * _MAX_THREAD_N, + dtype=torch.float32, + device=device, + ) + else: + c_tmp = torch.empty(0, dtype=torch.float32, device=device) + + # Allocate a_tmp for act_order column permutation + if has_act_order: + a_tmp = torch.empty((size_m, size_k), dtype=a.dtype, device=device) + else: + a_tmp = torch.empty(0, dtype=a.dtype, device=device) + + # Convert Optional tensors to empty tensors + global_scale_t = _or_empty(global_scale, device, a.dtype) + b_zeros_t = _or_empty(b_zeros, device, torch.int32) + g_idx_t = _or_empty(g_idx, device, torch.int32) + perm_t = _or_empty(perm, device, torch.int32) + + kernel = _make_gptq_marlin_gemm_kernel(a.dtype) + kernel( + a, + b_q_weight, + b_scales, + global_scale_t, + b_zeros_t, + g_idx_t, + perm_t, + c, + c_tmp, + a_tmp, + workspace, + b_q_type_id, + is_k_full, + use_atomic_add, + use_fp32_reduce, + is_zp_float, + ) + + return c diff --git a/pymllm/executor/model_runner.py b/pymllm/executor/model_runner.py index d60b9d899..45fe4e564 100644 --- a/pymllm/executor/model_runner.py +++ b/pymllm/executor/model_runner.py @@ -512,6 +512,15 @@ def load_model(self) -> None: finally: torch.set_default_dtype(old_dtype) self.model.load_weights(self._iter_weights(model_path)) + + # Post-load processing: let each quantization method repack/transform + # weights from checkpoint format to runtime format (e.g. AWQ → Marlin, + # GPTQ g_idx shuffling, FP8 calibration). + for _name, module in self.model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): + quant_method.process_weights_after_loading(module) + self.model.eval() after_mem = get_available_gpu_memory(self.device, self.gpu_id) diff --git a/pymllm/layers/base.py b/pymllm/layers/base.py index 3044e2064..3e762ae5a 100644 --- a/pymllm/layers/base.py +++ b/pymllm/layers/base.py @@ -2,14 +2,16 @@ from torch import nn from torch.nn import Parameter from pymllm.layers.utils import set_weight_attrs -from pymllm.quantization.quant_recipe import QuantRecipe -from typing import Optional +from typing import Optional, TYPE_CHECKING + +if TYPE_CHECKING: + from pymllm.layers.quantize_base import QuantizeMethodBase class MllmBaseLayer(nn.Module): def __init__(self): super().__init__() - self.quant_recipe: Optional[QuantRecipe] = None + self.quant_method: Optional["QuantizeMethodBase"] = None def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): """Load weights into a parameter. diff --git a/pymllm/layers/linear.py b/pymllm/layers/linear.py index dc583e931..b4058c2da 100644 --- a/pymllm/layers/linear.py +++ b/pymllm/layers/linear.py @@ -1,10 +1,34 @@ +"""Linear layers with quantization method dispatch. + +Every linear layer holds a ``quant_method`` attribute (an instance of +:class:`~pymllm.layers.quantize_base.LinearMethodBase`). 
When no +quantization is configured, :class:`UnquantizedLinearMethod` is used as the +default — it creates a standard FP weight and forwards via ``F.linear``. + +Quantized checkpoints plug in a different ``LinearMethodBase`` (e.g. +``AWQLinearMethod``) which creates packed int4 weights, scales, and +zero-points, and overrides :meth:`apply` with a fused dequant+matmul kernel. + +Usage in model definitions:: + + # Non-quantized (default) + layer = ColumnParallelLinear(4096, 4096) + + # Quantized — pass a quant_method from QuantizationConfig + qm = awq_config.get_quant_method(layer, prefix="model.layers.0.q_proj") + layer = ColumnParallelLinear(4096, 4096, quant_method=qm) +""" + from __future__ import annotations +from typing import Optional + import torch import torch.nn.functional as F from torch.nn import Parameter from pymllm.layers.base import MllmBaseLayer +from pymllm.layers.quantize_base import LinearMethodBase, UnquantizedLinearMethod from pymllm.layers.utils import set_weight_attrs from pymllm.orchestrator import ( divide, @@ -21,14 +45,20 @@ class ColumnParallelLinear(MllmBaseLayer): The weight matrix is split along the output dimension across TP ranks. Each rank holds ``out_features / tp_size`` rows of the weight. - Args: - in_features: Size of each input sample. - out_features: Size of each output sample (before sharding). - bias: If ``True``, adds a learnable bias. - gather_output: If ``True``, all-gather the output across TP ranks - so every rank gets the full ``out_features``. Set to ``False`` - when the next layer is a :class:`RowParallelLinear` that expects - a split input. + Parameters + ---------- + in_features + Size of each input sample. + out_features + Size of each output sample (before sharding). + bias + If ``True``, adds a learnable bias. + gather_output + If ``True``, all-gather the output across TP ranks so every rank + gets the full ``out_features``. Set to ``False`` when the next + layer is a :class:`RowParallelLinear` that expects a split input. + quant_method + Quantization method instance. ``None`` → :class:`UnquantizedLinearMethod`. """ def __init__( @@ -37,6 +67,7 @@ def __init__( out_features: int, bias: bool = True, gather_output: bool = True, + quant_method: Optional[LinearMethodBase] = None, ): super().__init__() @@ -57,16 +88,20 @@ def __init__( self.output_start_index = self.tp_rank * self.out_features_per_partition self.output_end_index = self.output_start_index + self.out_features_per_partition - self.weight = Parameter( - torch.empty(self.out_features_per_partition, in_features) - ) - set_weight_attrs( - self.weight, - { - "output_dim": 0, - "input_dim": 1, - "weight_loader": self.weight_loader, - }, + # --- Quantization method --- + # The quant_method creates the weight parameters on this layer via + # create_weights(). For UnquantizedLinearMethod this creates a + # standard FP Parameter named "weight". For quantized methods it + # may instead create qweight, scales, qzeros, etc. 
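+        # After load_weights(), ModelRunner calls
+        # quant_method.process_weights_after_loading(self) once, so quantized
+        # methods can repack checkpoint-layout tensors (e.g. AWQ → Marlin)
+        # into their runtime layout before the first forward pass.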
+ self.quant_method = quant_method or UnquantizedLinearMethod() + self.quant_method.create_weights( + layer=self, + input_size_per_partition=in_features, + output_partition_sizes=[self.out_features_per_partition], + input_size=in_features, + output_size=out_features, + params_dtype=torch.get_default_dtype(), + weight_loader=self.weight_loader, ) if bias: @@ -111,7 +146,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.data.copy_(shard_weight) def forward(self, x: torch.Tensor) -> torch.Tensor: - output = F.linear(x, self.weight, self.bias) + # Delegate computation to the quant_method. For unquantized layers + # this is F.linear; for quantized layers it's a fused dequant+matmul. + output = self.quant_method.apply(self, x, self.bias) if self.gather_output and self.tp_size > 1: output = tensor_model_parallel_all_gather(output, dim=-1) @@ -129,11 +166,18 @@ class RowParallelLinear(MllmBaseLayer): Typically placed after a :class:`ColumnParallelLinear` whose ``gather_output=False``, so the input is already split. - Args: - in_features: Size of each input sample (before sharding). - out_features: Size of each output sample. - bias: If ``True``, adds a learnable bias (applied after all-reduce). - reduce_output: If ``True``, all-reduce the output across TP ranks. + Parameters + ---------- + in_features + Size of each input sample (before sharding). + out_features + Size of each output sample. + bias + If ``True``, adds a learnable bias (applied after all-reduce). + reduce_output + If ``True``, all-reduce the output across TP ranks. + quant_method + Quantization method instance. ``None`` → :class:`UnquantizedLinearMethod`. """ def __init__( @@ -142,6 +186,7 @@ def __init__( out_features: int, bias: bool = True, reduce_output: bool = True, + quant_method: Optional[LinearMethodBase] = None, ): super().__init__() @@ -162,16 +207,16 @@ def __init__( self.input_start_index = self.tp_rank * self.in_features_per_partition self.input_end_index = self.input_start_index + self.in_features_per_partition - self.weight = Parameter( - torch.empty(out_features, self.in_features_per_partition) - ) - set_weight_attrs( - self.weight, - { - "output_dim": 0, - "input_dim": 1, - "weight_loader": self.weight_loader, - }, + # --- Quantization method --- + self.quant_method = quant_method or UnquantizedLinearMethod() + self.quant_method.create_weights( + layer=self, + input_size_per_partition=self.in_features_per_partition, + output_partition_sizes=[out_features], + input_size=in_features, + output_size=out_features, + params_dtype=torch.get_default_dtype(), + weight_loader=self.weight_loader, ) if bias: @@ -210,7 +255,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.data.copy_(shard_weight) def forward(self, x: torch.Tensor) -> torch.Tensor: - output = F.linear(x, self.weight) + # Delegate computation to the quant_method (no bias here; bias is + # added after the all-reduce below). + output = self.quant_method.apply(self, x) if self.reduce_output and self.tp_size > 1: output = tensor_model_parallel_all_reduce(output) @@ -222,26 +269,41 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Linear(MllmBaseLayer): - """Linear layer with simple quant dispatch.""" + """Non-parallel linear layer with quantization dispatch. + + Parameters + ---------- + in_features + Size of each input sample. + out_features + Size of each output sample. + bias + If ``True``, adds a learnable bias. + quant_method + Quantization method instance. 
``None`` → :class:`UnquantizedLinearMethod`. + """ def __init__( self, in_features: int, out_features: int, bias: bool = True, + quant_method: Optional[LinearMethodBase] = None, ): super().__init__() self.in_features = in_features self.out_features = out_features - self.weight = Parameter(torch.empty(out_features, in_features)) - set_weight_attrs( - self.weight, - { - "output_dim": 0, - "input_dim": 1, - "weight_loader": self.weight_loader, - }, + # --- Quantization method --- + self.quant_method = quant_method or UnquantizedLinearMethod() + self.quant_method.create_weights( + layer=self, + input_size_per_partition=in_features, + output_partition_sizes=[out_features], + input_size=in_features, + output_size=out_features, + params_dtype=torch.get_default_dtype(), + weight_loader=self.weight_loader, ) if bias: @@ -250,14 +312,5 @@ def __init__( else: self.register_parameter("bias", None) - def _forward_torch_linear(self, x: torch.Tensor) -> torch.Tensor: - return F.linear(x, self.weight, self.bias) - - def _forward_quant_linear(self, x: torch.Tensor) -> torch.Tensor: - # TODO(wch): Implement quantized linear path. - raise NotImplementedError("quant_linear is not implemented yet.") - def forward(self, x: torch.Tensor) -> torch.Tensor: - if self.quant_recipe is None: - return self._forward_torch_linear(x) - return self._forward_quant_linear(x) + return self.quant_method.apply(self, x, self.bias) diff --git a/pymllm/layers/quantize_base.py b/pymllm/layers/quantize_base.py new file mode 100644 index 000000000..951fc6115 --- /dev/null +++ b/pymllm/layers/quantize_base.py @@ -0,0 +1,275 @@ +"""Quantization method base classes for pymllm layers. + +This module defines the plugin interface that all quantization methods must +implement. The pattern follows sglang / vLLM's ``LinearMethodBase`` design: + +1. Each quantization algorithm (AWQ, GPTQ, FP8, ...) provides a concrete + subclass of :class:`LinearMethodBase`. +2. Linear layers hold a ``quant_method`` attribute (an instance of + :class:`LinearMethodBase`). +3. During ``__init__``, the linear layer calls + ``quant_method.create_weights(layer, ...)`` to register the appropriate + parameters (packed int weights, scales, zero-points, etc.) on itself. +4. During ``forward``, the linear layer calls + ``quant_method.apply(layer, x, bias)`` instead of ``F.linear``. +5. After checkpoint loading, :class:`~pymllm.executor.model_runner.ModelRunner` + iterates all modules and calls + ``quant_method.process_weights_after_loading(layer)`` for format conversion, + repacking (e.g. AWQ → Marlin), or calibration. + +Typical lifecycle:: + + # ---- model construction ---- + quant_method = SomeLinearMethod(bits=4, group_size=128) + layer = ColumnParallelLinear(4096, 4096, quant_method=quant_method) + # → calls quant_method.create_weights(layer, ...) + # → layer now has .qweight, .scales, .qzeros, etc. + + # ---- weight loading ---- + model.load_weights(iter_weights(...)) + # → checkpoint tensors are loaded into the parameters created above, + # using each parameter's ``weight_loader`` attribute. + + # ---- post-load processing ---- + for module in model.modules(): + qm = getattr(module, "quant_method", None) + if qm is not None: + qm.process_weights_after_loading(module) + # → AWQ repacks int4 → Marlin layout, GPTQ shuffles by g_idx, etc. 
+ + # ---- inference ---- + output = layer(x) + # → calls quant_method.apply(layer, x, bias) + # → dequant + matmul (or fused kernel) +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional + +import torch +import torch.nn.functional as F +from torch.nn import Parameter + +from pymllm.layers.utils import set_weight_attrs + + +# --------------------------------------------------------------------------- +# Base classes +# --------------------------------------------------------------------------- + + +class QuantizeMethodBase(ABC): + """Base class for all quantization methods (linear, embedding, MoE, ...). + + Every concrete quantization algorithm must implement at least + :meth:`create_weights` and :meth:`apply`. + + How to implement a new quantization method + ------------------------------------------- + 1. Subclass :class:`LinearMethodBase` (for linear layers). + 2. Override :meth:`create_weights` to register quantized parameters + (``qweight``, ``scales``, ``qzeros``, etc.) on the layer via + ``layer.register_parameter()``. + 3. Override :meth:`apply` to perform the quantized forward computation. + 4. Optionally override :meth:`process_weights_after_loading` if the + checkpoint format differs from the runtime format (e.g. repacking, + transposing, or calibrating scales). + """ + + def create_weights( + self, + layer: torch.nn.Module, + *args: Any, + **kwargs: Any, + ) -> None: + """Create and register quantized weight parameters on *layer*. + + Called once during layer construction (``__init__``). Implementations + should call ``layer.register_parameter(name, param)`` and attach + metadata via :func:`~pymllm.layers.utils.set_weight_attrs` so that + the weight-loading infrastructure knows how to shard and load them. + + Parameters + ---------- + layer + The ``nn.Module`` (e.g. ``ColumnParallelLinear``) that will own + the parameters. + """ + raise NotImplementedError + + @abstractmethod + def apply( + self, + layer: torch.nn.Module, + *args: Any, + **kwargs: Any, + ) -> torch.Tensor: + """Execute the quantized forward pass. + + Called by ``layer.forward()`` every inference step. The method should + read the parameters previously created by :meth:`create_weights` from + *layer* (e.g. ``layer.qweight``, ``layer.scales``), dequantize or + invoke a fused kernel, and return the output tensor. + + Parameters + ---------- + layer + The module that owns the quantized parameters. + """ + raise NotImplementedError + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """Post-process parameters after checkpoint loading. + + Called once by ``ModelRunner`` after all checkpoint tensors have been + loaded into the layer's parameters. Use this for: + + * **Repacking**: converting checkpoint layout to kernel-native layout + (e.g. AutoAWQ int4 → Marlin packed format). + * **Transposing**: rearranging dimensions for optimised GEMM kernels. + * **Calibration**: computing per-tensor or per-channel scales from + the loaded FP weights (e.g. dynamic FP8 quantisation). + * **Cleanup**: replacing custom parameter wrappers with plain + ``torch.nn.Parameter`` to avoid overhead during inference. + + The default implementation is a no-op. + """ + return + + +class LinearMethodBase(QuantizeMethodBase): + """Base class for quantization methods applied to linear layers. + + Narrows the :class:`QuantizeMethodBase` interface with concrete + signatures tailored to linear (matmul) operations. 
+ + Subclasses must implement :meth:`create_weights` and :meth:`apply`. + """ + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs: Any, + ) -> None: + """Create quantized weight tensors on *layer*. + + Parameters + ---------- + layer + The linear module that will own the parameters. + input_size_per_partition + Number of input features on this TP rank. + output_partition_sizes + Output sizes of each logical weight on this TP rank. For a + standard linear layer this is ``[out_features_per_partition]``. + For a merged QKV layer it might be ``[q_size, k_size, v_size]``. + input_size + Full (un-sharded) input dimension. + output_size + Full (un-sharded) output dimension. + params_dtype + Data type for full-precision parameters (e.g. ``torch.float16``). + **extra_weight_attrs + Additional metadata to attach to created parameters (e.g. + ``weight_loader``, ``packed_dim``, ``packed_factor``). + + Example (AWQ W4A16):: + + # Register packed 4-bit weights, scales, and zero-points + qweight = Parameter(torch.empty(..., dtype=torch.int32)) + layer.register_parameter("qweight", qweight) + + scales = Parameter(torch.empty(..., dtype=params_dtype)) + layer.register_parameter("scales", scales) + + qzeros = Parameter(torch.empty(..., dtype=torch.int32)) + layer.register_parameter("qzeros", qzeros) + """ + raise NotImplementedError + + @abstractmethod + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Compute the quantized linear forward. + + Parameters + ---------- + layer + The module that owns quantized parameters (set by + :meth:`create_weights`). + x + Input activation tensor, shape ``(*, input_size_per_partition)``. + bias + Optional bias vector. + + Returns + ------- + torch.Tensor + Output tensor, shape ``(*, sum(output_partition_sizes))``. + + Example (AWQ W4A16):: + + qweight = layer.qweight # packed int32 + scales = layer.scales # fp16 per-group scales + qzeros = layer.qzeros # packed int32 zero-points + # → invoke dequant + matmul kernel + """ + raise NotImplementedError + + +# --------------------------------------------------------------------------- +# Default unquantized implementation +# --------------------------------------------------------------------------- + + +class UnquantizedLinearMethod(LinearMethodBase): + """Default pass-through for non-quantized linear layers. + + Creates a standard FP weight ``(out_features, in_features)`` and + forwards via ``F.linear``. This is used when no quantization config + is specified so that every linear layer always has a ``quant_method`` + attribute with a uniform interface. 
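+
+    Example (minimal illustrative sketch; shapes are arbitrary and the
+    weight is left uninitialised)::
+
+        layer = torch.nn.Module()
+        method = UnquantizedLinearMethod()
+        method.create_weights(
+            layer,
+            input_size_per_partition=16,
+            output_partition_sizes=[32],
+            input_size=16,
+            output_size=32,
+            params_dtype=torch.float32,
+        )
+        y = method.apply(layer, torch.randn(2, 16))
+        assert y.shape == (2, 32)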
+ """ + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs: Any, + ) -> None: + """Create a standard full-precision weight parameter.""" + weight = Parameter( + torch.empty( + sum(output_partition_sizes), + input_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Standard ``F.linear`` forward.""" + return F.linear(x, layer.weight, bias) diff --git a/pymllm/quantization/QUANTIZATION.md b/pymllm/quantization/QUANTIZATION.md new file mode 100644 index 000000000..7e15e8e0c --- /dev/null +++ b/pymllm/quantization/QUANTIZATION.md @@ -0,0 +1,257 @@ +# pymllm Quantization Guide + +## Architecture + +pymllm uses a **plugin-based** quantization system. Each quantization +algorithm (AWQ, GPTQ, FP8, W8A8, ...) is a self-contained plugin that +implements three methods: **create weights**, **apply** (forward), and +**process weights after loading**. + +``` + QuantizationConfig + (parses checkpoint) + │ + │ get_quant_method(layer, prefix) + ▼ +┌─────────────────────────────────────────────────────┐ +│ LinearMethodBase │ +│ │ +│ create_weights() ← called during layer __init__ │ +│ apply() ← called during layer forward │ +│ process_weights_after_loading() ← called once │ +│ after checkpoint is loaded │ +└─────────────────────────────────────────────────────┘ + │ + │ registered on layer as + │ layer.quant_method + ▼ + Linear / ColumnParallelLinear / ... +``` + +### Key modules + +| Module | Purpose | +|--------|---------| +| `pymllm.layers.quantize_base` | `QuantizeMethodBase`, `LinearMethodBase`, `UnquantizedLinearMethod` | +| `pymllm.quantization.quant_config` | `QuantizationConfig` base class, registry, factory | +| `pymllm.quantization.methods/` | Concrete implementations (AWQ, GPTQ, FP8, ...) | + +## Lifecycle + +### 1. Model construction + +Each linear layer accepts an optional `quant_method` argument. If `None`, +`UnquantizedLinearMethod` is used (standard FP weight + `F.linear`). + +```python +from pymllm.layers.linear import ColumnParallelLinear + +# No quantization (default) +layer = ColumnParallelLinear(4096, 4096) + +# With quantization +layer = ColumnParallelLinear(4096, 4096, quant_method=my_quant_method) +``` + +During `__init__`, the layer calls: + +```python +self.quant_method.create_weights( + layer=self, + input_size_per_partition=in_features, + output_partition_sizes=[out_features_per_partition], + input_size=in_features, + output_size=out_features, + params_dtype=torch.get_default_dtype(), + weight_loader=self.weight_loader, +) +``` + +This registers the appropriate parameters on the layer. For unquantized +layers, this is a single `weight` parameter. For AWQ, it might be +`qweight` (packed int32), `scales` (fp16), and `qzeros` (packed int32). + +### 2. Weight loading + +The standard `model.load_weights(iter)` loop loads checkpoint tensors into +the parameters created above, using each parameter's `weight_loader` +attribute for tensor-parallel sharding. + +### 3. 
Post-load processing + +After all weights are loaded, `ModelRunner` calls: + +```python +for name, module in model.named_modules(): + quant_method = getattr(module, "quant_method", None) + if quant_method is not None: + quant_method.process_weights_after_loading(module) +``` + +This is where format conversions happen: +- **AWQ**: repack AutoAWQ int4 layout → Marlin kernel layout +- **GPTQ**: shuffle weights according to `g_idx` for exllama kernels +- **FP8**: quantize FP16 weights to FP8 and compute per-tensor scales + +### 4. Inference + +Every forward call goes through `quant_method.apply()`: + +```python +# Inside ColumnParallelLinear.forward(): +output = self.quant_method.apply(self, x, self.bias) +``` + +For unquantized layers this is just `F.linear`. For quantized layers it +invokes a fused dequant+matmul kernel. + +## How to add a new quantization method + +### Step 1: Implement `LinearMethodBase` + +Create a file in `pymllm/quantization/methods/`, e.g. `awq.py`: + +```python +from pymllm.layers.quantize_base import LinearMethodBase +from pymllm.layers.utils import set_weight_attrs + +class AWQLinearMethod(LinearMethodBase): + \"\"\"AWQ W4A16 quantized linear method.\"\"\" + + def __init__(self, weight_bits: int, group_size: int, zero_point: bool): + self.weight_bits = weight_bits + self.group_size = group_size + self.zero_point = zero_point + self.pack_factor = 32 // weight_bits # e.g. 8 for 4-bit + + def create_weights( + self, layer, input_size_per_partition, output_partition_sizes, + input_size, output_size, params_dtype, **extra_weight_attrs, + ): + output_size_per_partition = sum(output_partition_sizes) + + # Packed 4-bit weights: each int32 holds 8 x 4-bit values + qweight = Parameter( + torch.empty( + input_size_per_partition, + output_size_per_partition // self.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs(qweight, {"input_dim": 0, "output_dim": 1}) + layer.register_parameter("qweight", qweight) + set_weight_attrs(qweight, extra_weight_attrs) + + # Per-group scales + scales = Parameter( + torch.empty( + input_size_per_partition // self.group_size, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("scales", scales) + set_weight_attrs(scales, extra_weight_attrs) + + # Per-group zero-points (packed) + qzeros = Parameter( + torch.empty( + input_size_per_partition // self.group_size, + output_size_per_partition // self.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + layer.register_parameter("qzeros", qzeros) + set_weight_attrs(qzeros, extra_weight_attrs) + + def apply(self, layer, x, bias=None): + # Dequantize and compute matmul + # In practice, call a fused CUDA kernel here + out = awq_dequantize_and_gemm(x, layer.qweight, layer.scales, layer.qzeros) + if bias is not None: + out = out + bias + return out + + def process_weights_after_loading(self, layer): + # Optional: repack weights for a faster kernel layout + # e.g. 
convert AutoAWQ format → Marlin format + layer.qweight = Parameter(layer.qweight.data, requires_grad=False) + layer.scales = Parameter(layer.scales.data, requires_grad=False) + layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False) +``` + +### Step 2: Implement `QuantizationConfig` + +```python +from pymllm.quantization.quant_config import QuantizationConfig, register_quantization + +@register_quantization("awq") +class AWQConfig(QuantizationConfig): + def __init__(self, weight_bits, group_size, zero_point): + self.weight_bits = weight_bits + self.group_size = group_size + self.zero_point = zero_point + + def get_name(self) -> str: + return "awq" + + @classmethod + def from_config(cls, config: dict) -> "AWQConfig": + return cls( + weight_bits=config["bits"], + group_size=config["group_size"], + zero_point=config["zero_point"], + ) + + def get_quant_method(self, layer, prefix=""): + # Skip quantization for certain layers if needed + # if "lm_head" in prefix: + # return None + return AWQLinearMethod(self.weight_bits, self.group_size, self.zero_point) +``` + +### Step 3: Use it + +```python +from pymllm.quantization import get_quantization_config + +# Parse from checkpoint config +ConfigClass = get_quantization_config("awq") +config = ConfigClass.from_config({"bits": 4, "group_size": 128, "zero_point": True}) + +# Create layer with quantization +quant_method = config.get_quant_method(layer=None, prefix="model.layers.0.q_proj") +layer = ColumnParallelLinear(4096, 4096, quant_method=quant_method) +``` + +## API Reference + +### `QuantizeMethodBase` + +| Method | When called | Purpose | +|--------|-------------|---------| +| `create_weights(layer, ...)` | `layer.__init__` | Register parameters (weight, scales, etc.) on the layer | +| `apply(layer, x, bias)` | `layer.forward` | Quantized matmul computation | +| `process_weights_after_loading(layer)` | After `load_weights` | Repack / transform loaded checkpoint tensors | + +### `QuantizationConfig` + +| Method | Purpose | +|--------|---------| +| `get_name()` | Return method name (e.g. 
`"awq"`) | +| `from_config(config_dict)` | Class method: parse checkpoint JSON into config instance | +| `get_quant_method(layer, prefix)` | Return `LinearMethodBase` for a specific layer | +| `get_supported_act_dtypes()` | Activation dtypes this method supports | +| `get_min_capability()` | Minimum CUDA compute capability | +| `get_config_filenames()` | Checkpoint files to probe (default: `["quantize_config.json"]`) | + +### Registry functions + +| Function | Purpose | +|----------|---------| +| `@register_quantization("name")` | Decorator to register a config class | +| `get_quantization_config("name")` | Look up registered config class by name | +| `list_quantization_methods()` | List all registered method names | diff --git a/pymllm/quantization/__init__.py b/pymllm/quantization/__init__.py index e69de29bb..e4bf77025 100644 --- a/pymllm/quantization/__init__.py +++ b/pymllm/quantization/__init__.py @@ -0,0 +1,18 @@ +"""Quantization infrastructure for pymllm.""" + +from pymllm.quantization.quant_config import ( + QuantizationConfig, + get_quantization_config, + list_quantization_methods, + register_quantization, +) + +# Import methods module to trigger @register_quantization decorators +import pymllm.quantization.methods # noqa: F401 + +__all__ = [ + "QuantizationConfig", + "get_quantization_config", + "list_quantization_methods", + "register_quantization", +] diff --git a/pymllm/quantization/methods/__init__.py b/pymllm/quantization/methods/__init__.py index e69de29bb..90367f741 100644 --- a/pymllm/quantization/methods/__init__.py +++ b/pymllm/quantization/methods/__init__.py @@ -0,0 +1,15 @@ +"""Quantization method implementations. + +Importing this module triggers registration of all built-in quantization +methods via the ``@register_quantization`` decorator. +""" + +from pymllm.quantization.methods.awq_marlin import ( + AWQMarlinConfig, + AWQMarlinLinearMethod, +) + +__all__ = [ + "AWQMarlinConfig", + "AWQMarlinLinearMethod", +] diff --git a/pymllm/quantization/methods/awq_marlin.py b/pymllm/quantization/methods/awq_marlin.py new file mode 100644 index 000000000..e8f929aa3 --- /dev/null +++ b/pymllm/quantization/methods/awq_marlin.py @@ -0,0 +1,524 @@ +"""AWQ quantization with Marlin kernel acceleration. + +This module implements the AWQ Marlin quantization plugin for pymllm, +providing high-performance W4A16 inference via the Marlin GEMM kernel. + +Classes +------- +AWQMarlinConfig + Quantization configuration parsed from ``quantize_config.json``. +AWQMarlinLinearMethod + Linear method that uses AWQ weight format with Marlin kernel dispatch. 
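+
+Usage (illustrative sketch; assumes an AutoAWQ-quantized checkpoint and an
+SM80+ GPU, and the config dict below is only an example of what
+``quantize_config.json`` typically contains)::
+
+    from pymllm.quantization import get_quantization_config
+
+    cfg_cls = get_quantization_config("awq_marlin")
+    cfg = cfg_cls.from_config({"bits": 4, "group_size": 128, "zero_point": True})
+    quant_method = cfg.get_quant_method(layer=None, prefix="model.layers.0.q_proj")
+    # Hand `quant_method` to a linear layer: the layer calls create_weights()
+    # in __init__, the checkpoint loader fills qweight/scales/qzeros, the
+    # runner calls process_weights_after_loading() to repack to Marlin, and
+    # forward() dispatches to apply().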
+""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Optional + +import numpy +import torch +from torch.nn import Parameter + +from pymllm.layers.quantize_base import LinearMethodBase +from pymllm.layers.utils import set_weight_attrs +from pymllm.quantization.quant_config import QuantizationConfig, register_quantization + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Marlin constants +# --------------------------------------------------------------------------- + +MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128] +GPTQ_MARLIN_MIN_THREAD_N = 64 +GPTQ_MARLIN_MIN_THREAD_K = 128 +GPTQ_MARLIN_TILE = 16 + + +# --------------------------------------------------------------------------- +# ScalarType helpers (matching host::ScalarType in scalar_type.hpp) +# --------------------------------------------------------------------------- + +class _ScalarTypeInfo: + """Lightweight Python mirror of host::ScalarType for type id computation.""" + + def __init__(self, name: str, size_bits: int, type_id: int): + self.name = name + self.size_bits = size_bits + self.id = type_id + + def __repr__(self) -> str: + return f"ScalarType({self.name})" + + def __eq__(self, other: object) -> bool: + if isinstance(other, _ScalarTypeInfo): + return self.id == other.id + return NotImplemented + + def __hash__(self) -> int: + return hash(self.id) + + +def _compute_scalar_type_id( + exponent: int, mantissa: int, signed: bool, bias: int, + finite_values_only: bool = False, nan_repr: int = 1, +) -> int: + """Compute the ScalarType::Id matching the C++ implementation.""" + bit_offset = 0 + result = 0 + + for value, width in [ + (exponent, 8), + (mantissa, 8), + (signed, 1), + (bias, 32), + (finite_values_only, 1), + (nan_repr, 8), + ]: + int_val = int(value) + mask = (1 << width) - 1 + result |= (int_val & mask) << bit_offset + bit_offset += width + + return result + + +# Pre-compute the scalar type ids we need +_uint4_id = _compute_scalar_type_id(0, 4, False, 0) +_uint8_id = _compute_scalar_type_id(0, 8, False, 0) +_uint4b8_id = _compute_scalar_type_id(0, 4, False, 8) +_uint8b128_id = _compute_scalar_type_id(0, 8, False, 128) + +SCALAR_TYPE_UINT4 = _ScalarTypeInfo("uint4", 4, _uint4_id) +SCALAR_TYPE_UINT8 = _ScalarTypeInfo("uint8", 8, _uint8_id) + + +# num_bits -> ScalarType mapping +_TYPE_MAP: Dict[int, _ScalarTypeInfo] = { + 4: SCALAR_TYPE_UINT4, + 8: SCALAR_TYPE_UINT8, +} + + +# --------------------------------------------------------------------------- +# Marlin utility functions +# --------------------------------------------------------------------------- + +def verify_marlin_supported( + quant_type: _ScalarTypeInfo, group_size: int, has_zp: bool +) -> None: + """Verify that the Marlin kernel supports this configuration.""" + major, minor = torch.cuda.get_device_capability() + capability = major * 10 + minor + if capability < 80: + raise ValueError( + f"Marlin requires SM80+ (Ampere or newer). Got SM{capability}." + ) + if group_size not in MARLIN_SUPPORTED_GROUP_SIZES: + raise ValueError( + f"Marlin does not support group_size={group_size}. 
" + f"Supported: {MARLIN_SUPPORTED_GROUP_SIZES}" + ) + + +def verify_marlin_supports_shape( + output_size_per_partition: int, + input_size_per_partition: int, + input_size: int, + group_size: int, +) -> None: + """Verify that tensor dimensions are compatible with Marlin.""" + if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0: + raise ValueError( + f"output_size_per_partition={output_size_per_partition} is not " + f"divisible by min_thread_n={GPTQ_MARLIN_MIN_THREAD_N}." + ) + if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0: + raise ValueError( + f"input_size_per_partition={input_size_per_partition} is not " + f"divisible by min_thread_k={GPTQ_MARLIN_MIN_THREAD_K}." + ) + if group_size < input_size and input_size_per_partition % group_size != 0: + raise ValueError( + f"input_size_per_partition={input_size_per_partition} is not " + f"divisible by group_size={group_size}." + ) + + +def marlin_make_workspace(device: torch.device) -> torch.Tensor: + """Create Marlin workspace buffer for threadblock synchronization.""" + sms = torch.cuda.get_device_properties(device).multi_processor_count + return torch.zeros(sms, dtype=torch.int, device=device, requires_grad=False) + + +def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor: + """Create empty g_idx tensor (AWQ doesn't use activation reordering).""" + return torch.nn.Parameter( + torch.empty(0, dtype=torch.int, device=device), requires_grad=False + ) + + +def get_scale_perms(): + """Get the scale permutation indices for Marlin format.""" + scale_perm: list[int] = [] + for i in range(8): + scale_perm.extend([i + 8 * j for j in range(8)]) + scale_perm_single: list[int] = [] + for i in range(4): + scale_perm_single.extend( + [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]] + ) + return scale_perm, scale_perm_single + + +def marlin_permute_scales( + s: torch.Tensor, size_k: int, size_n: int, group_size: int +) -> torch.Tensor: + """Permute quantization scales from standard to Marlin layout.""" + scale_perm, scale_perm_single = get_scale_perms() + if group_size < size_k and group_size != -1: + s = s.reshape((-1, len(scale_perm)))[:, scale_perm] + else: + s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single] + s = s.reshape((-1, size_n)).contiguous() + return s + + +def pack_cols( + q_w: torch.Tensor, num_bits: int, size_k: int, size_n: int +) -> torch.Tensor: + """Pack quantized columns into int32 values.""" + pack_factor = 32 // num_bits + assert size_n % pack_factor == 0 + out = torch.zeros( + (size_k, size_n // pack_factor), dtype=torch.int32, device=q_w.device + ) + for i in range(pack_factor): + out.bitwise_or_(q_w[:, i::pack_factor].int() << (num_bits * i)) + return out + + +def unpack_cols( + packed: torch.Tensor, num_bits: int, size_k: int, size_n: int +) -> torch.Tensor: + """Unpack int32 packed columns into individual quantized values.""" + pack_factor = 32 // num_bits + mask = (1 << num_bits) - 1 + out = torch.zeros( + (size_k, size_n), dtype=torch.int32, device=packed.device + ) + for i in range(pack_factor): + out[:, i::pack_factor] = (packed >> (num_bits * i)) & mask + return out + + +def marlin_zero_points( + zp: torch.Tensor, size_k: int, size_n: int, num_bits: int +) -> torch.Tensor: + """Permute and pack zero points into Marlin format.""" + scale_perm, _ = get_scale_perms() + zp = zp.reshape((-1, len(scale_perm)))[:, scale_perm] + + # Interleave column dim (for the dequantize code) and pack to int32 + if num_bits == 4: + interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7]) + elif num_bits 
== 8: + interleave = numpy.array([0, 2, 1, 3]) + else: + raise ValueError(f"num_bits must be 4 or 8, got {num_bits}") + + zp = zp.reshape((-1, len(interleave)))[:, interleave].ravel() + zp = zp.reshape((-1, size_n)).contiguous() + zp = pack_cols(zp, num_bits, size_k, size_n) + return zp + + +def awq_to_marlin_zero_points( + q_zp_packed: torch.Tensor, size_k: int, size_n: int, num_bits: int +) -> torch.Tensor: + """Convert AWQ-format zero points to Marlin format. + + AWQ zero-points are quantized and packed on the column dim with a specific + interleaving. This function undoes the AWQ interleaving, then applies + Marlin permutation and repacks. + """ + q_zp = unpack_cols(q_zp_packed, num_bits, size_k, size_n) + + # Undo AWQ interleaving + if num_bits == 4: + undo_interleave = numpy.argsort(numpy.array([0, 2, 4, 6, 1, 3, 5, 7])) + elif num_bits == 8: + undo_interleave = numpy.argsort(numpy.array([0, 2, 1, 3])) + else: + raise ValueError(f"num_bits must be 4 or 8, got {num_bits}") + + q_zp = q_zp.reshape((-1, len(undo_interleave)))[:, undo_interleave].ravel() + q_zp = q_zp.reshape((-1, size_n)).contiguous() + + return marlin_zero_points(q_zp, size_k, size_n, num_bits) + + +def replace_parameter( + layer: torch.nn.Module, name: str, new_data: torch.Tensor +) -> None: + """Replace a parameter on a layer with new data.""" + param = torch.nn.Parameter(new_data, requires_grad=False) + layer.register_parameter(name, param) + + +# --------------------------------------------------------------------------- +# AWQMarlinLinearMethod +# --------------------------------------------------------------------------- + +class AWQMarlinLinearMethod(LinearMethodBase): + """Linear method for AWQ with Marlin kernel acceleration. + + Uses the Marlin W4A16 GEMM kernel for high-performance inference. + Weights are repacked from AWQ format to Marlin format after loading. 
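+
+    Checkpoint tensors registered by :meth:`create_weights` (AutoAWQ layout,
+    with ``pack_factor = 32 // weight_bits`` and
+    ``num_groups = input_size_per_partition // group_size``)::
+
+        qweight : int32,        (input_size_per_partition, output_size_per_partition // pack_factor)
+        qzeros  : int32,        (num_groups, output_size_per_partition // pack_factor)
+        scales  : params_dtype, (num_groups, output_size_per_partition)
+
+    :meth:`process_weights_after_loading` then repacks ``qweight`` via
+    ``awq_marlin_repack``, permutes ``scales``, and converts ``qzeros`` to
+    the Marlin zero-point layout.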
+ """ + + def __init__(self, quant_config: AWQMarlinConfig) -> None: + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs: Any, + ) -> None: + del output_size + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + + # Normalize group_size + if self.quant_config.group_size != -1: + group_size = self.quant_config.group_size + else: + group_size = input_size + + verify_marlin_supports_shape( + output_size_per_partition=output_size_per_partition, + input_size_per_partition=input_size_per_partition, + input_size=input_size, + group_size=group_size, + ) + + # Packed quantized weights: (input_size, output_size // pack_factor) + qweight = Parameter( + torch.empty( + input_size_per_partition, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs(qweight, { + "input_dim": 0, + "output_dim": 1, + }) + layer.register_parameter("qweight", qweight) + set_weight_attrs(qweight, extra_weight_attrs) + + num_groups = input_size_per_partition // group_size + + # Zero points: (num_groups, output_size // pack_factor) + qzeros = Parameter( + torch.empty( + num_groups, + output_size_per_partition // self.quant_config.pack_factor, + dtype=torch.int32, + ), + requires_grad=False, + ) + set_weight_attrs(qzeros, { + "input_dim": 0, + "output_dim": 1, + }) + layer.register_parameter("qzeros", qzeros) + set_weight_attrs(qzeros, extra_weight_attrs) + + # Scales: (num_groups, output_size) + scales = Parameter( + torch.empty( + num_groups, + output_size_per_partition, + dtype=params_dtype, + ), + requires_grad=False, + ) + set_weight_attrs(scales, { + "input_dim": 0, + "output_dim": 1, + }) + layer.register_parameter("scales", scales) + set_weight_attrs(scales, extra_weight_attrs) + + # Store dimensions for post-loading processing + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.num_groups = num_groups + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + """Repack AWQ weights to Marlin format after checkpoint loading.""" + from mllm_kernel.cuda.jit.awq_marlin_repack import awq_marlin_repack + + device = layer.qweight.device + + # Unwrap parameter data for processing + layer.qweight = Parameter(layer.qweight.data, requires_grad=False) + layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False) + layer.scales = Parameter(layer.scales.data, requires_grad=False) + + # Allocate marlin workspace + layer.workspace = marlin_make_workspace(device) + + # Repack weights from AWQ format to Marlin format + marlin_qweight = awq_marlin_repack( + layer.qweight, + size_k=layer.input_size_per_partition, + size_n=layer.output_size_per_partition, + num_bits=self.quant_config.quant_type.size_bits, + ) + replace_parameter(layer, "qweight", marlin_qweight) + + # Permute scales from AWQ format to Marlin format + marlin_scales = marlin_permute_scales( + layer.scales, + size_k=layer.input_size_per_partition, + size_n=layer.output_size_per_partition, + group_size=self.quant_config.group_size, + ) + replace_parameter(layer, "scales", marlin_scales) + + # Convert zero points from AWQ format to Marlin format + marlin_zp = awq_to_marlin_zero_points( + layer.qzeros, + size_k=layer.num_groups, + 
size_n=layer.output_size_per_partition, + num_bits=self.quant_config.quant_type.size_bits, + ) + replace_parameter(layer, "qzeros", marlin_zp) + + # AWQ doesn't use activation reordering + layer.g_idx = marlin_make_empty_g_idx(device) + layer.g_idx_sort_indices = marlin_make_empty_g_idx(device) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Perform quantized matmul using the Marlin GEMM kernel.""" + from mllm_kernel.cuda.jit.gptq_marlin import gptq_marlin_gemm + + reshaped_x = x.reshape(-1, x.shape[-1]) + out_shape = x.shape[:-1] + (layer.output_size_per_partition,) + + size_m = reshaped_x.shape[0] + size_n = layer.output_size_per_partition + size_k = layer.input_size_per_partition + + output = gptq_marlin_gemm( + a=reshaped_x, + c=None, + b_q_weight=layer.qweight, + b_scales=layer.scales, + global_scale=None, + b_zeros=layer.qzeros, + g_idx=layer.g_idx, + perm=layer.g_idx_sort_indices, + workspace=layer.workspace, + b_q_type_id=self.quant_config.quant_type.id, + size_m=size_m, + size_n=size_n, + size_k=size_k, + is_k_full=True, + use_fp32_reduce=True, + is_zp_float=False, + ) + + if bias is not None: + output.add_(bias) + + return output.reshape(out_shape) + + +# --------------------------------------------------------------------------- +# AWQMarlinConfig +# --------------------------------------------------------------------------- + +@register_quantization("awq_marlin") +class AWQMarlinConfig(QuantizationConfig): + """Configuration for AWQ quantization with Marlin kernel acceleration. + + This config is used when loading models quantized with AutoAWQ and + running inference with the high-performance Marlin W4A16 GEMM kernel. + + Registered as ``"awq_marlin"`` in the quantization registry. + """ + + def __init__( + self, + weight_bits: int, + group_size: int, + zero_point: bool, + ) -> None: + super().__init__() + self.weight_bits = weight_bits + self.group_size = group_size + self.zero_point = zero_point + self.pack_factor = 32 // weight_bits + + if weight_bits not in _TYPE_MAP: + raise ValueError( + f"Unsupported weight_bits={weight_bits}. 
" + f"Supported: {list(_TYPE_MAP.keys())}" + ) + self.quant_type = _TYPE_MAP[weight_bits] + + verify_marlin_supported( + self.quant_type, + group_size=self.group_size, + has_zp=self.zero_point, + ) + + def __repr__(self) -> str: + return ( + f"AWQMarlinConfig(weight_bits={self.weight_bits}, " + f"group_size={self.group_size}, " + f"zero_point={self.zero_point})" + ) + + def get_name(self) -> str: + return "awq_marlin" + + def get_supported_act_dtypes(self) -> List[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @staticmethod + def get_config_filenames() -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> AWQMarlinConfig: + weight_bits = config.get("bits", config.get("w_bit", 4)) + group_size = config.get("group_size", 128) + zero_point = config.get("zero_point", True) + return cls(weight_bits, group_size, zero_point) + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str = "", + ) -> Optional[AWQMarlinLinearMethod]: + return AWQMarlinLinearMethod(self) diff --git a/pymllm/quantization/methods/awq_w4a16.py b/pymllm/quantization/methods/awq_w4a16.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/pymllm/quantization/quant_config.py b/pymllm/quantization/quant_config.py new file mode 100644 index 000000000..8225f6d11 --- /dev/null +++ b/pymllm/quantization/quant_config.py @@ -0,0 +1,203 @@ +"""Quantization configuration base class and registry. + +This module provides the bridge between a model checkpoint's quantization +metadata (e.g. ``quantize_config.json``) and the runtime +:class:`~pymllm.layers.quantize_base.LinearMethodBase` instances used by +each linear layer. + +Architecture overview:: + + quantize_config.json ──parse──► QuantizationConfig subclass + │ + │ get_quant_method(layer, prefix) + ▼ + LinearMethodBase instance + (AWQLinearMethod, FP8LinearMethod, ...) + +How to add a new quantization method +------------------------------------- +1. Create a ``QuantizationConfig`` subclass (e.g. ``AWQConfig``). +2. Implement ``get_name()``, ``from_config()``, ``get_quant_method()``. +3. Register it:: + + from pymllm.quantization.quant_config import register_quantization + + @register_quantization("awq") + class AWQConfig(QuantizationConfig): + ... + +4. When the server starts with ``--quantization.method awq``, the loader + will call ``get_quantization_config("awq")`` to obtain the config class, + then ``from_config(hf_quant_config)`` to instantiate it, and finally + ``config.get_quant_method(layer, prefix)`` for each linear layer. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Type + +import torch + +from pymllm.layers.quantize_base import QuantizeMethodBase + + +# --------------------------------------------------------------------------- +# Registry +# --------------------------------------------------------------------------- + +# Maps method name (e.g. "awq", "gptq", "fp8") to config class. +_QUANTIZATION_REGISTRY: Dict[str, Type[QuantizationConfig]] = {} + + +def register_quantization( + name: str, +) -> "type[type[QuantizationConfig]]": + """Class decorator that registers a :class:`QuantizationConfig` subclass. + + Usage:: + + @register_quantization("awq") + class AWQConfig(QuantizationConfig): + ... 
+ """ + + def decorator(cls: Type[QuantizationConfig]) -> Type[QuantizationConfig]: + if name in _QUANTIZATION_REGISTRY: + raise ValueError( + f"Quantization method {name!r} is already registered " + f"by {_QUANTIZATION_REGISTRY[name].__name__}" + ) + _QUANTIZATION_REGISTRY[name] = cls + return cls + + return decorator # type: ignore[return-value] + + +def get_quantization_config(method: str) -> Type[QuantizationConfig]: + """Look up a registered :class:`QuantizationConfig` by name. + + Raises ``KeyError`` if the method is not registered. + """ + if method not in _QUANTIZATION_REGISTRY: + supported = ", ".join(sorted(_QUANTIZATION_REGISTRY)) or "(none)" + raise KeyError( + f"Unknown quantization method {method!r}. " + f"Registered methods: {supported}" + ) + return _QUANTIZATION_REGISTRY[method] + + +def list_quantization_methods() -> List[str]: + """Return sorted list of registered quantization method names.""" + return sorted(_QUANTIZATION_REGISTRY) + + +# --------------------------------------------------------------------------- +# Base config class +# --------------------------------------------------------------------------- + + +class QuantizationConfig(ABC): + """Base class for quantization configurations. + + A ``QuantizationConfig`` is instantiated once per model load. It reads + quantization metadata from the checkpoint (bit-width, group size, etc.) + and provides :class:`~pymllm.layers.quantize_base.QuantizeMethodBase` + instances to each layer. + + Subclass contract + ----------------- + * :meth:`get_name` — return the method name (e.g. ``"awq"``). + * :meth:`from_config` — class method that parses a dict from the + checkpoint's ``quantize_config.json``. + * :meth:`get_quant_method` — return the appropriate + ``LinearMethodBase`` (or ``None`` to skip quantization for a layer). + + Optional overrides + ------------------ + * :meth:`get_supported_act_dtypes` — restrict activation dtypes. + * :meth:`get_min_capability` — minimum GPU compute capability. + * :meth:`get_config_filenames` — files to probe in the checkpoint dir. + """ + + @abstractmethod + def get_name(self) -> str: + """Return the canonical name of this quantization method. + + Examples: ``"awq"``, ``"gptq"``, ``"fp8"``, ``"w8a8"``. + """ + raise NotImplementedError + + @classmethod + @abstractmethod + def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": + """Create an instance from a checkpoint's quantization config dict. + + Parameters + ---------- + config + Parsed JSON from the checkpoint's ``quantize_config.json`` or + the ``quantization_config`` section of ``config.json``. + + Example config dict (AWQ):: + + { + "quant_method": "awq", + "bits": 4, + "group_size": 128, + "zero_point": true + } + """ + raise NotImplementedError + + @abstractmethod + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str = "", + ) -> Optional[QuantizeMethodBase]: + """Return the quantization method for *layer*, or ``None`` to skip. + + Parameters + ---------- + layer + The ``nn.Module`` being constructed (e.g. ``ColumnParallelLinear``). + prefix + The layer's full dotted name in the model (e.g. + ``"model.layers.0.self_attn.q_proj"``). Can be used to + selectively skip quantization for certain layers. + + Returns + ------- + QuantizeMethodBase or None + The method instance. ``None`` means this layer should fall back + to the default :class:`~pymllm.layers.quantize_base.UnquantizedLinearMethod`. 
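+
+        Example (illustrative; ``AWQLinearMethod`` is the example method class
+        named in the module docstring, not a required name)::
+
+            def get_quant_method(self, layer, prefix=""):
+                if prefix.endswith("lm_head"):
+                    return None  # keep the LM head in full precision
+                return AWQLinearMethod(self)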
+ """ + raise NotImplementedError + + # -- Optional hooks (with sensible defaults) -- + + def get_supported_act_dtypes(self) -> List[torch.dtype]: + """Activation dtypes supported by this method. + + Override to restrict (e.g. FP8 only supports ``float16``). + Default: no restriction. + """ + return [torch.float16, torch.bfloat16, torch.float32] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum CUDA compute capability (e.g. 75 for Turing). + + Default: 0 (no restriction). + """ + return 0 + + @staticmethod + def get_config_filenames() -> List[str]: + """File names to look for in the checkpoint directory. + + Default: ``["quantize_config.json"]``. + """ + return ["quantize_config.json"] diff --git a/pymllm/quantization/quant_recipe.py b/pymllm/quantization/quant_recipe.py deleted file mode 100644 index a5b493bec..000000000 --- a/pymllm/quantization/quant_recipe.py +++ /dev/null @@ -1,3 +0,0 @@ -class QuantRecipe: - def __init__(self): - pass From 945313410ceb7a44d20a76b136a5496cdeb9d4d9 Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Mar 2026 08:35:59 +0000 Subject: [PATCH 40/42] feat(quantization): implement quantization configuration loading and integration - Added methods to load quantization configuration from model checkpoints, enhancing model flexibility. - Integrated quantization support into various layers, allowing for dynamic quantization method selection based on configuration. - Updated model classes to accept quantization parameters, ensuring compatibility with quantized models. - Enhanced documentation to include usage instructions for quantization in server settings and auto-detection mechanisms. Signed-off-by: chenghuaWang <2923277184@qq.com> --- pymllm/executor/model_runner.py | 104 +++++++++++++++++++++++++++- pymllm/layers/gated_delta_net.py | 36 ++++++++-- pymllm/layers/mlp.py | 33 ++++++++- pymllm/models/qwen3_5.py | 67 +++++++++++++----- pymllm/models/qwen3_vl.py | 94 ++++++++++++++++++++----- pymllm/quantization/QUANTIZATION.md | 68 ++++++++++++++++++ 6 files changed, 359 insertions(+), 43 deletions(-) diff --git a/pymllm/executor/model_runner.py b/pymllm/executor/model_runner.py index 45fe4e564..2178afa99 100644 --- a/pymllm/executor/model_runner.py +++ b/pymllm/executor/model_runner.py @@ -450,6 +450,102 @@ def _extract_model_metadata(self) -> None: *([self.num_gdn_layers] if self.num_gdn_layers > 0 else []), ) + # ------------------------------------------------------------------ + # Quantization config resolution + # ------------------------------------------------------------------ + + @staticmethod + def _load_quant_config_dict(model_path: str) -> dict: + """Probe checkpoint directory for quantization metadata. + + Checks files listed by each registered ``QuantizationConfig`` + (e.g. ``quantize_config.json``), then falls back to the + ``quantization_config`` section of ``config.json``. + + Returns an empty dict when no quantization metadata is found. 
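+
+        Illustrative result for an AutoAWQ-style checkpoint (the path and the
+        exact values are hypothetical and depend on the export)::
+
+            >>> ModelRunner._load_quant_config_dict("/models/Qwen3-VL-AWQ")  # hypothetical path
+            {'quant_method': 'awq', 'bits': 4, 'group_size': 128, 'zero_point': True}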
+ """ + import json + from pathlib import Path + + from pymllm.quantization import QuantizationConfig + + model_path = Path(model_path) + + # Collect candidate filenames from all registered config classes + filenames: list[str] = [] + for subcls in QuantizationConfig.__subclasses__(): + filenames.extend(subcls.get_config_filenames()) + # Deduplicate while preserving order + seen: set[str] = set() + unique: list[str] = [] + for f in filenames: + if f not in seen: + seen.add(f) + unique.append(f) + + for fname in unique: + fpath = model_path / fname + if fpath.exists(): + with open(fpath) as fp: + return json.load(fp) + + # Fallback: config.json → quantization_config section + config_path = model_path / "config.json" + if config_path.exists(): + with open(config_path) as fp: + cfg = json.load(fp) + if "quantization_config" in cfg: + return cfg["quantization_config"] + + return {} + + def _resolve_quant_config(self): + """Resolve the quantization configuration for this model. + + Priority: + 1. CLI value from ``GlobalConfig.quantization.method`` + 2. Auto-detect from checkpoint's ``quantize_config.json`` + or ``config.json`` → ``quantization_config.quant_method`` + 3. Auto-upgrade ``"awq"`` → ``"awq_marlin"`` on SM80+ GPUs + + Returns ``None`` when quantization is not requested / detected. + """ + from pymllm.quantization import get_quantization_config + + global_cfg = get_global_config() + method = global_cfg.quantization.method + model_path = self.server_config.model_path + + config_dict = self._load_quant_config_dict(model_path) + + # Auto-detect from checkpoint if CLI didn't specify a method + if method is None and config_dict: + method = config_dict.get("quant_method") + + if method is None: + return None + + # Auto-upgrade awq → awq_marlin on Ampere+ GPUs + if method == "awq": + cap = torch.cuda.get_device_capability(self.gpu_id) + sm = cap[0] * 10 + cap[1] + if sm >= 80: + logger.info( + "Auto-upgrading quantization: awq → awq_marlin (SM%d)", + sm, + ) + method = "awq_marlin" + + config_cls = get_quantization_config(method) + quant_config = config_cls.from_config(config_dict) + logger.info( + "Quantization: %s (bits=%s, group_size=%s)", + quant_config.get_name(), + getattr(quant_config, "weight_bits", "?"), + getattr(quant_config, "group_size", "?"), + ) + return quant_config + # ------------------------------------------------------------------ # Model loading # ------------------------------------------------------------------ @@ -500,6 +596,9 @@ def load_model(self) -> None: ) logger.info("Using pymllm model class: %s", model_cls.__name__) + + quant_config = self._resolve_quant_config() + device_str = f"cuda:{self.gpu_id}" if self.device == "cuda" else self.device # Use set_default_dtype so parameters created without explicit dtype # get the target dtype, while parameters with explicit dtype=torch.float32 @@ -508,7 +607,10 @@ def load_model(self) -> None: torch.set_default_dtype(self.dtype) try: with torch.device(device_str): - self.model = model_cls(hf_config) + if quant_config is not None: + self.model = model_cls(hf_config, quant_config=quant_config) + else: + self.model = model_cls(hf_config) finally: torch.set_default_dtype(old_dtype) self.model.load_weights(self._iter_weights(model_path)) diff --git a/pymllm/layers/gated_delta_net.py b/pymllm/layers/gated_delta_net.py index 3753734d9..5472371da 100644 --- a/pymllm/layers/gated_delta_net.py +++ b/pymllm/layers/gated_delta_net.py @@ -92,6 +92,8 @@ def __init__( layer_id: int = 0, gdn_layer_idx: int = 0, rms_norm_eps: float = 
1e-6, + quant_config=None, + prefix: str = "", ): super().__init__() self.hidden_size = hidden_size @@ -105,11 +107,32 @@ def __init__( self.layer_id = layer_id self.gdn_layer_idx = gdn_layer_idx + def _get_qm(suffix, out_features): + # Skip quantization for small projections — Marlin kernels + # require minimum thread tile sizes that exceed these dims. + if quant_config is None or out_features < 64: + return None + return quant_config.get_quant_method( + layer=None, prefix=f"{prefix}.{suffix}" if prefix else suffix, + ) + # Input projections - self.in_proj_qkv = Linear(hidden_size, self.key_dim * 2 + self.value_dim, bias=False) - self.in_proj_z = Linear(hidden_size, self.value_dim, bias=False) - self.in_proj_a = Linear(hidden_size, num_v_heads, bias=False) - self.in_proj_b = Linear(hidden_size, num_v_heads, bias=False) + self.in_proj_qkv = Linear( + hidden_size, self.key_dim * 2 + self.value_dim, bias=False, + quant_method=_get_qm("in_proj_qkv", self.key_dim * 2 + self.value_dim), + ) + self.in_proj_z = Linear( + hidden_size, self.value_dim, bias=False, + quant_method=_get_qm("in_proj_z", self.value_dim), + ) + self.in_proj_a = Linear( + hidden_size, num_v_heads, bias=False, + quant_method=_get_qm("in_proj_a", num_v_heads), + ) + self.in_proj_b = Linear( + hidden_size, num_v_heads, bias=False, + quant_method=_get_qm("in_proj_b", num_v_heads), + ) # Causal convolution (weight only — computation is in the backend) self.conv1d = GDNConv1d(self.key_dim * 2 + self.value_dim, conv_kernel_size) @@ -125,7 +148,10 @@ def __init__( self.norm = RMSNormGated(head_v_dim, eps=rms_norm_eps, norm_before_gate=True) # Output projection - self.out_proj = Linear(self.value_dim, hidden_size, bias=False) + self.out_proj = Linear( + self.value_dim, hidden_size, bias=False, + quant_method=_get_qm("out_proj", hidden_size), + ) # RadixLinearAttention — delegates to the attention backend from pymllm.layers.attention.radix_linear_attention import RadixLinearAttention diff --git a/pymllm/layers/mlp.py b/pymllm/layers/mlp.py index 1a40db92e..1894e23ca 100644 --- a/pymllm/layers/mlp.py +++ b/pymllm/layers/mlp.py @@ -67,17 +67,32 @@ def __init__( use_bias_gate_up: bool = False, use_bias_down: bool = False, enable_pdl: Optional[bool] = None, + quant_config=None, + prefix: str = "", ): super().__init__() _validate_mlp_args(hidden_size, intermediate_size, activation) + # Quantized checkpoints store gate_proj / up_proj separately; + # fusing them into a single packed-int32 parameter is impractical, + # so force the unfused path when quantisation is active. + if quant_config is not None: + use_fused_gate_up_proj = False + self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.activation = activation self.use_fused_gate_up_proj = use_fused_gate_up_proj self.enable_pdl = enable_pdl - if not use_fused_gate_up_proj: + def _get_qm(suffix): + if quant_config is None: + return None + return quant_config.get_quant_method( + layer=None, prefix=f"{prefix}.{suffix}" if prefix else suffix, + ) + + if not use_fused_gate_up_proj and quant_config is None: logger.warning( "MLP with use_fused_gate_up_proj=False uses a lower-efficiency path. 
" "Use use_fused_gate_up_proj=True for better performance.", @@ -86,6 +101,7 @@ def __init__( if use_fused_gate_up_proj: self.gate_up_proj = Linear( hidden_size, 2 * intermediate_size, bias=use_bias_gate_up, + quant_method=_get_qm("gate_up_proj"), ) self.gate_proj = None self.up_proj = None @@ -93,13 +109,16 @@ def __init__( self.gate_up_proj = None self.gate_proj = Linear( hidden_size, intermediate_size, bias=use_bias_gate_up, + quant_method=_get_qm("gate_proj"), ) self.up_proj = Linear( hidden_size, intermediate_size, bias=use_bias_gate_up, + quant_method=_get_qm("up_proj"), ) self.down_proj = Linear( intermediate_size, hidden_size, bias=use_bias_down, + quant_method=_get_qm("down_proj"), ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -160,6 +179,8 @@ def __init__( use_bias_gate_up: bool = False, use_bias_down: bool = False, enable_pdl: Optional[bool] = None, + quant_config=None, + prefix: str = "", ): super().__init__() _validate_mlp_args(hidden_size, intermediate_size, activation) @@ -169,18 +190,28 @@ def __init__( self.activation = activation self.enable_pdl = enable_pdl + def _get_qm(suffix): + if quant_config is None: + return None + return quant_config.get_quant_method( + layer=None, prefix=f"{prefix}.{suffix}" if prefix else suffix, + ) + self.gate_proj = ColumnParallelLinear( hidden_size, intermediate_size, bias=use_bias_gate_up, gather_output=False, + quant_method=_get_qm("gate_proj"), ) self.up_proj = ColumnParallelLinear( hidden_size, intermediate_size, bias=use_bias_gate_up, gather_output=False, + quant_method=_get_qm("up_proj"), ) self.down_proj = RowParallelLinear( intermediate_size, hidden_size, bias=use_bias_down, reduce_output=True, + quant_method=_get_qm("down_proj"), ) def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/pymllm/models/qwen3_5.py b/pymllm/models/qwen3_5.py index ca4dbe2ea..5b6bd558a 100644 --- a/pymllm/models/qwen3_5.py +++ b/pymllm/models/qwen3_5.py @@ -70,7 +70,7 @@ def _get_layer_types(config) -> List[str]: class Qwen3_5FullAttention(nn.Module): """Standard multi-head attention with RoPE, QK-norm, and optional output gate.""" - def __init__(self, config, layer_id: int): + def __init__(self, config, layer_id: int, quant_config=None, prefix: str = ""): super().__init__() tc = _get_text_config(config) self.hidden_size = tc.hidden_size @@ -91,10 +91,17 @@ def __init__(self, config, layer_id: int): else: q_proj_size = self.q_size - self.q_proj = Linear(self.hidden_size, q_proj_size, bias=False) - self.k_proj = Linear(self.hidden_size, self.kv_size, bias=False) - self.v_proj = Linear(self.hidden_size, self.kv_size, bias=False) - self.o_proj = Linear(self.q_size, self.hidden_size, bias=False) + def _get_qm(suffix): + if quant_config is None: + return None + return quant_config.get_quant_method( + layer=None, prefix=f"{prefix}.{suffix}" if prefix else suffix, + ) + + self.q_proj = Linear(self.hidden_size, q_proj_size, bias=False, quant_method=_get_qm("q_proj")) + self.k_proj = Linear(self.hidden_size, self.kv_size, bias=False, quant_method=_get_qm("k_proj")) + self.v_proj = Linear(self.hidden_size, self.kv_size, bias=False, quant_method=_get_qm("v_proj")) + self.o_proj = Linear(self.q_size, self.hidden_size, bias=False, quant_method=_get_qm("o_proj")) # QK normalization self.q_norm = GemmaRMSNorm(self.head_dim, eps=tc.rms_norm_eps) @@ -166,14 +173,20 @@ def forward( class Qwen3_5AttentionDecoderLayer(nn.Module): """Decoder layer with full attention + MLP.""" - def __init__(self, config, layer_id: int): + def __init__(self, config, 
layer_id: int, quant_config=None, prefix: str = ""): super().__init__() tc = _get_text_config(config) - self.self_attn = Qwen3_5FullAttention(config, layer_id) + self.self_attn = Qwen3_5FullAttention( + config, layer_id, + quant_config=quant_config, + prefix=f"{prefix}.self_attn" if prefix else "self_attn", + ) self.mlp = MLP( hidden_size=tc.hidden_size, intermediate_size=tc.intermediate_size, activation=tc.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp" if prefix else "mlp", ) self.input_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) self.post_attention_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) @@ -209,7 +222,8 @@ def forward( class Qwen3_5LinearDecoderLayer(nn.Module): """Decoder layer with GDN linear attention + MLP.""" - def __init__(self, config, layer_id: int, gdn_layer_idx: int = 0): + def __init__(self, config, layer_id: int, gdn_layer_idx: int = 0, + quant_config=None, prefix: str = ""): super().__init__() tc = _get_text_config(config) self.linear_attn = GatedDeltaNet( @@ -222,11 +236,15 @@ def __init__(self, config, layer_id: int, gdn_layer_idx: int = 0): layer_id=layer_id, gdn_layer_idx=gdn_layer_idx, rms_norm_eps=tc.rms_norm_eps, + quant_config=quant_config, + prefix=f"{prefix}.linear_attn" if prefix else "linear_attn", ) self.mlp = MLP( hidden_size=tc.hidden_size, intermediate_size=tc.intermediate_size, activation=tc.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp" if prefix else "mlp", ) self.input_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) self.post_attention_layernorm = GemmaRMSNorm(tc.hidden_size, eps=tc.rms_norm_eps) @@ -274,10 +292,11 @@ class Qwen3_5ForCausalLM(nn.Module): Dense (non-MoE) variant. """ - def __init__(self, config): + def __init__(self, config, quant_config=None): super().__init__() tc = _get_text_config(config) self.config = tc + self.quant_config = quant_config self.hidden_size = tc.hidden_size self.vocab_size = tc.vocab_size @@ -292,14 +311,21 @@ def __init__(self, config): self.full_attn_layer_ids = set() for idx in range(tc.num_hidden_layers): layer_type = layer_types[idx] + layer_prefix = f"layers.{idx}" if layer_type == "linear_attention": self.layers.append( - Qwen3_5LinearDecoderLayer(config, idx, gdn_layer_idx=gdn_count) + Qwen3_5LinearDecoderLayer( + config, idx, gdn_layer_idx=gdn_count, + quant_config=quant_config, prefix=layer_prefix, + ) ) gdn_count += 1 else: self.layers.append( - Qwen3_5AttentionDecoderLayer(config, idx) + Qwen3_5AttentionDecoderLayer( + config, idx, + quant_config=quant_config, prefix=layer_prefix, + ) ) self.full_attn_layer_ids.add(idx) self.num_gdn_layers = gdn_count @@ -346,10 +372,14 @@ def forward( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): """Load HuggingFace checkpoint weights with name remapping.""" - stacked_params_mapping = [ - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] + # When quantized, gate/up are separate projections — skip stacking. + if self.quant_config is not None: + stacked_params_mapping = [] + else: + stacked_params_mapping = [ + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] params_dict = dict(self.named_parameters()) loaded: Set[str] = set() @@ -417,16 +447,17 @@ class Qwen3_5ForConditionalGeneration(nn.Module): language model. 
""" - def __init__(self, config): + def __init__(self, config, quant_config=None): super().__init__() from pymllm.models.qwen3_vl import ( Qwen3VLVisionModel, ) self.config = config + self.quant_config = quant_config tc = _get_text_config(config) - # Vision encoder (reuse Qwen3VL's vision model) + # Vision encoder — NOT quantized vision_config = getattr(config, "vision_config", None) if vision_config is not None: self.visual = Qwen3VLVisionModel( @@ -452,7 +483,7 @@ def __init__(self, config): self.visual = None # Language model - self.model = Qwen3_5ForCausalLM(config) + self.model = Qwen3_5ForCausalLM(config, quant_config=quant_config) # Expose hybrid model metadata for ModelRunner self.num_gdn_layers = self.model.num_gdn_layers diff --git a/pymllm/models/qwen3_vl.py b/pymllm/models/qwen3_vl.py index ffa20f115..b253ad091 100644 --- a/pymllm/models/qwen3_vl.py +++ b/pymllm/models/qwen3_vl.py @@ -36,6 +36,7 @@ from pymllm.layers import RMSNorm, apply_mrope from pymllm.layers.attention.radix_attention import RadixAttention +from pymllm.layers.linear import Linear from pymllm.layers.mlp import MLP if TYPE_CHECKING: @@ -698,6 +699,8 @@ def __init__( mrope_section: Tuple[int, int, int] = (24, 20, 20), mrope_interleaved: bool = True, max_position_embeddings: int = 32768, + quant_config=None, + prefix: str = "", ): super().__init__() self.num_heads = num_heads @@ -709,13 +712,44 @@ def __init__( self.mrope_section = list(mrope_section) self.mrope_interleaved = mrope_interleaved - # Fused QKV projection - self.qkv_proj = nn.Linear( - hidden_size, self.q_size + 2 * self.kv_size, bias=False - ) + def _get_qm(suffix): + if quant_config is None: + return None + return quant_config.get_quant_method( + layer=None, prefix=f"{prefix}.{suffix}" if prefix else suffix, + ) + + # When quantized, AWQ checkpoints store q/k/v separately so we + # cannot fuse them into a single packed-int32 parameter. 
+ self.use_fused_qkv = quant_config is None + + if self.use_fused_qkv: + self.qkv_proj = Linear( + hidden_size, self.q_size + 2 * self.kv_size, bias=False, + ) + self.q_proj = None + self.k_proj = None + self.v_proj = None + else: + self.qkv_proj = None + self.q_proj = Linear( + hidden_size, self.q_size, bias=False, + quant_method=_get_qm("q_proj"), + ) + self.k_proj = Linear( + hidden_size, self.kv_size, bias=False, + quant_method=_get_qm("k_proj"), + ) + self.v_proj = Linear( + hidden_size, self.kv_size, bias=False, + quant_method=_get_qm("v_proj"), + ) # Output projection - self.o_proj = nn.Linear(num_heads * head_dim, hidden_size, bias=False) + self.o_proj = Linear( + num_heads * head_dim, hidden_size, bias=False, + quant_method=_get_qm("o_proj"), + ) # QK normalization self.q_norm = RMSNorm(head_dim, eps=rms_norm_eps) @@ -742,8 +776,13 @@ def forward( hidden_states: torch.Tensor, forward_batch: "ForwardBatch", ) -> torch.Tensor: - qkv = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + if self.use_fused_qkv: + qkv = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + else: + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) # Per-head QK normalization q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)) @@ -786,6 +825,8 @@ def __init__( mrope_section: Tuple[int, int, int] = (24, 20, 20), mrope_interleaved: bool = True, max_position_embeddings: int = 32768, + quant_config=None, + prefix: str = "", ): super().__init__() self.self_attn = Qwen3VLAttention( @@ -799,6 +840,8 @@ def __init__( mrope_section=mrope_section, mrope_interleaved=mrope_interleaved, max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + prefix=f"{prefix}.self_attn" if prefix else "self_attn", ) self.mlp = MLP( hidden_size=hidden_size, @@ -807,6 +850,8 @@ def __init__( use_fused_gate_up_proj=True, use_bias_gate_up=False, use_bias_down=False, + quant_config=quant_config, + prefix=f"{prefix}.mlp" if prefix else "mlp", ) self.input_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) self.post_attention_layernorm = RMSNorm(hidden_size, eps=rms_norm_eps) @@ -854,6 +899,7 @@ def __init__( mrope_section: Tuple[int, int, int] = (24, 20, 20), mrope_interleaved: bool = True, max_position_embeddings: int = 32768, + quant_config=None, ): super().__init__() self.hidden_size = hidden_size @@ -875,6 +921,8 @@ def __init__( mrope_section=mrope_section, mrope_interleaved=mrope_interleaved, max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + prefix=f"model.layers.{layer_id}", ) for layer_id in range(num_hidden_layers) ] @@ -941,14 +989,15 @@ class Qwen3VLForConditionalGeneration(nn.Module): logits = model.forward(input_ids, positions, forward_batch) """ - def __init__(self, config) -> None: + def __init__(self, config, quant_config=None) -> None: super().__init__() self.config = config + self.quant_config = quant_config text_config = getattr(config, "text_config", config) vision_config = getattr(config, "vision_config", None) - # Vision encoder + # Vision encoder — NOT quantized if vision_config is not None: self.visual = Qwen3VLVisionModel( depth=getattr(vision_config, "depth", 27), @@ -1000,6 +1049,7 @@ def __init__(self, config) -> None: mrope_section=tuple(mrope_section), mrope_interleaved=bool(mrope_interleaved), max_position_embeddings=max_position_embeddings, + quant_config=quant_config, ) # LM head — following sglang's 
pattern: always use lm_head.weight @@ -1190,14 +1240,19 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None: Handles weight name remapping between HuggingFace Qwen3-VL checkpoints and this model's parameter names. """ - stacked_params_mapping = [ - # (param_name, weight_name, shard_id) - (".qkv_proj", ".q_proj", "q"), - (".qkv_proj", ".k_proj", "k"), - (".qkv_proj", ".v_proj", "v"), - (".gate_up_proj", ".up_proj", 1), - (".gate_up_proj", ".gate_proj", 0), - ] + # When quantized, the model has separate q/k/v and gate/up projections + # (no fused qkv_proj / gate_up_proj), so skip the stacking logic. + if self.quant_config is not None: + stacked_params_mapping = [] + else: + stacked_params_mapping = [ + # (param_name, weight_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".up_proj", 1), + (".gate_up_proj", ".gate_proj", 0), + ] params_dict = dict(self.named_parameters()) @@ -1249,7 +1304,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None: # Direct parameter loading if name in params_dict: param = params_dict[name] - if param.data.shape == loaded_weight.shape: + loader = getattr(param, "weight_loader", None) + if loader is not None: + loader(param, loaded_weight) + elif param.data.shape == loaded_weight.shape: param.data.copy_(loaded_weight) else: logger.warning( diff --git a/pymllm/quantization/QUANTIZATION.md b/pymllm/quantization/QUANTIZATION.md index 7e15e8e0c..edb49dcc7 100644 --- a/pymllm/quantization/QUANTIZATION.md +++ b/pymllm/quantization/QUANTIZATION.md @@ -105,6 +105,74 @@ output = self.quant_method.apply(self, x, self.bias) For unquantized layers this is just `F.linear`. For quantized layers it invokes a fused dequant+matmul kernel. +## Server Usage + +### CLI flag + +```bash +python -m pymllm.server --model_path /path/to/model --quantization.method awq_marlin +``` + +### Auto-detection + +If `--quantization.method` is not specified, pymllm probes the checkpoint +directory for `quantize_config.json` (or the `quantization_config` section +of `config.json`). When found, the `quant_method` field is used +automatically. + +### Auto-upgrade: awq → awq_marlin + +On Ampere+ GPUs (SM80+), `"awq"` is automatically upgraded to +`"awq_marlin"` for significantly faster inference via the Marlin GEMM +kernel. No user action required. 
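+
+To check which config class a given checkpoint resolves to, the same decision
+can be reproduced with the registry helpers from `quant_config.py`. A minimal
+sketch (the config dict values and the capability check are illustrative;
+`get_quantization_config` and `from_config` are the actual entry points):
+
+```python
+import torch
+
+from pymllm.quantization import get_quantization_config
+
+# Pretend this dict was read from quantize_config.json (illustrative values).
+cfg_dict = {"quant_method": "awq", "bits": 4, "group_size": 128, "zero_point": True}
+
+method = cfg_dict["quant_method"]
+# Same auto-upgrade rule the ModelRunner applies on SM80+ GPUs.
+if method == "awq" and torch.cuda.get_device_capability()[0] >= 8:
+    method = "awq_marlin"
+
+quant_config = get_quantization_config(method).from_config(cfg_dict)
+print(quant_config)  # e.g. AWQMarlinConfig(weight_bits=4, group_size=128, zero_point=True)
+```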
+ +### Supported models + +| Model | Status | +|-------|--------| +| Qwen3VL (`Qwen3VLForConditionalGeneration`) | Supported | +| Qwen3.5 (`Qwen3_5ForConditionalGeneration`, `Qwen3_5ForCausalLM`) | Supported | + +### End-to-end pipeline + +``` +CLI: --quantization.method awq_marlin (or auto-detected) + │ + ▼ +ModelRunner._resolve_quant_config() + reads quantize_config.json / config.json + auto-upgrades "awq" → "awq_marlin" on SM80+ + │ + ▼ +model_cls(hf_config, quant_config=AWQMarlinConfig(...)) + │ + ▼ propagates quant_config through sub-modules +Qwen3VLForConditionalGeneration → Qwen3VLTextModel → decoder layers + │ + ▼ each Linear() call gets quant_method +quant_config.get_quant_method(layer, prefix) → AWQMarlinLinearMethod + │ + ▼ Linear.__init__ calls quant_method.create_weights() +registers qweight, scales, qzeros (instead of weight) + │ + ▼ model.load_weights() loads checkpoint tensors + │ + ▼ process_weights_after_loading() +repacks AWQ int4 → Marlin kernel layout + │ + ▼ inference via quant_method.apply() +calls gptq_marlin_gemm kernel +``` + +### Notes + +- Vision encoder is **never quantized** — only text decoder layers +- Fused QKV and gate_up projections are automatically split into separate + projections when quantized (AWQ checkpoints store them separately) +- Embedding, layer norms, and lm_head remain in full precision + +--- + ## How to add a new quantization method ### Step 1: Implement `LinearMethodBase` From 556009695cc1a566a1e08db13b993c7f1b85a54e Mon Sep 17 00:00:00 2001 From: chenghuaWang <2923277184@qq.com> Date: Wed, 18 Mar 2026 13:47:11 +0000 Subject: [PATCH 41/42] feat(docs): update README files with latest news and model integration details - Added a new entry for pymllm supporting CUDA on Jetson Orin and Jetson Thor devices (experimental). - Included information about the newly supported Qwen3-4B model in the model integration table. Signed-off-by: chenghuaWang <2923277184@qq.com> --- README-ZH.md | 2 ++ README.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/README-ZH.md b/README-ZH.md index e33b718d2..b5592d1e9 100644 --- a/README-ZH.md +++ b/README-ZH.md @@ -17,6 +17,7 @@ mllm ## 最新动态 +- [2026 年 3 月 18 日] 🔥🔥🔥 `pymllm` 已支持在 Jetson Orin 和 Jetson Thor 设备上使用 CUDA(实验特性,仍在持续开发中)。 - [2026 年 2 月 3 日] 🔥🔥🔥 MLLM Qnn AOT 已支持在 NPU 上全图执行![快速开始](https://ubiquitouslearning.github.io/mllm/qnn_backend/aot_execute.html), [技术报告](https://chenghuawang.github.io/News/2026-01-29-mllm-qnn-aot-support/) - [2025 年 11 月 27 日] Android Demo 更新:通过一种全新的 In-App Go 服务架构,在 Android 上实现了 Qwen3 和 DeepSeek-OCR 的稳定流式推理。 - [2025 年 11 月 23 日] MLLM v2 发布!
@@ -78,6 +79,7 @@ mllm 框架可以与主流社区框架的模型检查点无缝集成。通过 ml |-----------------------------------------------------------------------------|------|-----------------------| | [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | | [Qwen3-1.7B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-1.7B-w4a8-i8mm-kai) | [W4A16-SM8650](https://modelscope.cn/models/mllmTeam/Qwen3-1.7B-Qnn-AOT-SM8650/summary) | +| [Qwen3-4B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai) | | | [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai) | | | [SmolLM3](https://huggingface.co/blog/smollm3)| [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/SmolLM3-3B-w4a8-i8mm-kai) | | | [Qwen2-VL-2B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-2B-Instruct-w4a32kai) || diff --git a/README.md b/README.md index 88666692c..decfbf68a 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ mllm ## Latest News +- [2026 Mar 18] 🔥🔥🔥 `pymllm` now supports CUDA on Jetson Orin and Jetson Thor devices (experimental; still under active development). - [2026 Feb 03] 🔥🔥🔥 MLLM Qnn AOT Support for Full Graph Execution on NPU! [Quick Start](https://ubiquitouslearning.github.io/mllm/qnn_backend/aot_execute.html), [Technical Report](https://chenghuawang.github.io/News/2026-01-29-mllm-qnn-aot-support-en/) - [2025 Nov 27] Android Demo Update: Enabled stable Qwen3 and DeepSeek-OCR streaming on Android via a novel In-App Go Server Architecture. - [2025 Nov 23] MLLM v2 released! @@ -76,6 +77,7 @@ The mllm framework integrates seamlessly with popular community frameworks' chec |-----------------------------------------------------------------------------|------|-----------------------| | [Qwen3-0.6B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-0.6B-w4a32kai) | | | [Qwen3-1.7B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-1.7B-w4a8-i8mm-kai) | [W4A16-SM8650](https://modelscope.cn/models/mllmTeam/Qwen3-1.7B-Qnn-AOT-SM8650/) | +| [Qwen3-4B](https://github.com/QwenLM/Qwen3) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen3-4B-w4a8-i8mm-kai) | | | [DeepSeek-OCR](https://github.com/deepseek-ai/DeepSeek-OCR) | [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/DeepSeek-OCR-w4a8-i8mm-kai) | | | [SmolLM3](https://huggingface.co/blog/smollm3)| [✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/SmolLM3-3B-w4a8-i8mm-kai) | | | [Qwen2-VL-2B-Instruct](https://qwenlm.github.io/zh/blog/qwen2-vl/)|[✔️ w4a8](https://www.modelscope.cn/models/mllmTeam/Qwen2-VL-2B-Instruct-w4a32kai) || From e78ea117e6754b73326b9f230b77c0229272f582 Mon Sep 17 00:00:00 2001 From: KKkai <1640576073@qq.com> Date: Wed, 25 Mar 2026 13:42:53 +0800 Subject: [PATCH 42/42] fix --- mllm/ffi/vendors/tvm-ffi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllm/ffi/vendors/tvm-ffi b/mllm/ffi/vendors/tvm-ffi index dcd07cfe2..46f735807 160000 --- a/mllm/ffi/vendors/tvm-ffi +++ b/mllm/ffi/vendors/tvm-ffi @@ -1 +1 @@ -Subproject commit dcd07cfe27465287ee5b203b742e85dcfb99606a +Subproject commit 46f73580780f2973e6ea3afb6d3a9d6f6ffd02cc