From bef6c2d065d72be53885e3e3c913b5e60e13843f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 19 Jul 2025 11:36:26 +0800 Subject: [PATCH 01/12] Add support for Llada-8b: diffusion model llama: fix llama-model fixup working --- common/arg.cpp | 63 ++- common/common.h | 18 +- convert_hf_to_gguf.py | 153 ++++++ examples/diffusion/CMakeLists.txt | 10 +- ...fusion-cli.cpp => diffusion-dream-cli.cpp} | 41 +- examples/diffusion/diffusion-llada-cli.cpp | 505 ++++++++++++++++++ gguf-py/gguf/constants.py | 17 + gguf-py/gguf/tensor_mapping.py | 21 + include/llama.h | 3 + src/llama-arch.cpp | 22 + src/llama-arch.h | 1 + src/llama-model.cpp | 173 ++++++ 12 files changed, 984 insertions(+), 43 deletions(-) rename examples/diffusion/{diffusion-cli.cpp => diffusion-dream-cli.cpp} (92%) create mode 100644 examples/diffusion/diffusion-llada-cli.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 060053595dbfd..8f32517e56b6e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3438,34 +3438,59 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); - // diffusion parameters + // shared diffusion parameters add_opt(common_arg( { "--diffusion-steps" }, "N", - string_format("number of diffusion steps (default: %d)", params.diffusion.steps), - [](common_params & params, int value) { params.diffusion.steps = value; } - ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + string_format("number of diffusion steps (default: %d)", params.diffusion_dream.steps), + [](common_params & params, int value) { + params.diffusion_dream.steps = value; + params.diffusion_llada.steps = value; + } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM, LLAMA_EXAMPLE_DIFFUSION_LLADA })); + add_opt(common_arg( + { "--diffusion-visual" }, + string_format("enable visual diffusion mode (show progressive generation) (default: %s)", + params.diffusion_dream.visual_mode ? 
"true" : "false"), + [](common_params & params) { + params.diffusion_dream.visual_mode = true; + params.diffusion_llada.visual_mode = true; + } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM, LLAMA_EXAMPLE_DIFFUSION_LLADA })); + + // DREAM-specific diffusion parameters add_opt(common_arg( { "--diffusion-eps" }, "F", - string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps), - [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion_dream.eps), + [](common_params & params, const std::string & value) { params.diffusion_dream.eps = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM })); add_opt(common_arg( { "--diffusion-algorithm" }, "N", string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)", - params.diffusion.algorithm), - [](common_params & params, int value) { params.diffusion.algorithm = value; } - ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + params.diffusion_dream.algorithm), + [](common_params & params, int value) { params.diffusion_dream.algorithm = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM })); add_opt(common_arg( { "--diffusion-alg-temp" }, "F", - string_format("algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp), - [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); } - ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); - add_opt(common_arg( - { "--diffusion-visual" }, - string_format("enable visual diffusion mode (show progressive generation) (default: %s)", - params.diffusion.visual_mode ? 
"true" : "false"), - [](common_params & params) { params.diffusion.visual_mode = true; } - ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + string_format("algorithm temperature (default: %.3f)", (double) params.diffusion_dream.alg_temp), + [](common_params & params, const std::string & value) { params.diffusion_dream.alg_temp = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM })); + + // LLADA-specific diffusion parameters + add_opt(common_arg( + { "--diffusion-block-length" }, "N", + string_format("block length for generation (default: %d)", params.diffusion_llada.block_length), + [](common_params & params, int value) { params.diffusion_llada.block_length = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA })); + add_opt(common_arg( + { "--diffusion-cfg-scale" }, "F", + string_format("classifier-free guidance scale (default: %.3f)", (double) params.diffusion_llada.cfg_scale), + [](common_params & params, const std::string & value) { params.diffusion_llada.cfg_scale = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA })); + add_opt(common_arg( + { "--diffusion-remasking-alg" }, "N", + string_format("remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion_llada.remasking), + [](common_params & params, int value) { params.diffusion_llada.remasking = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA })); return ctx_arg; } diff --git a/common/common.h b/common/common.h index 00f42694eafa8..3a53387074e7d 100644 --- a/common/common.h +++ b/common/common.h @@ -81,7 +81,8 @@ enum llama_example { LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_TTS, - LLAMA_EXAMPLE_DIFFUSION, + LLAMA_EXAMPLE_DIFFUSION_DREAM, + LLAMA_EXAMPLE_DIFFUSION_LLADA, LLAMA_EXAMPLE_COUNT, }; @@ -219,7 +220,7 @@ struct common_params_vocoder { bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT }; -struct common_params_diffusion { +struct common_params_diffusion_dream { int32_t steps = 64; // number of diffusion steps float eps = 1e-3f; // epsilon for timesteps int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY) @@ -227,6 +228,14 @@ struct common_params_diffusion { bool visual_mode = false; // show progressive diffusion on screen }; +struct common_params_diffusion_llada { + int32_t steps = 64; // number of diffusion steps + int32_t block_length = 32; // block length for generation + float cfg_scale = 0.2f; // classifier-free guidance scale + int32_t remasking = 0; // remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM + bool visual_mode = false; // show progressive diffusion on screen +}; + enum common_reasoning_format { COMMON_REASONING_FORMAT_NONE, COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in tags in stream mode @@ -277,8 +286,9 @@ struct common_params { struct common_params_sampling sampling; struct common_params_speculative speculative; - struct common_params_vocoder vocoder; - struct common_params_diffusion diffusion; + struct common_params_vocoder vocoder; + struct common_params_diffusion_dream diffusion_dream; + struct common_params_diffusion_llada diffusion_llada; struct common_params_model model; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3f5cefe007cca..2733544113fc6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2904,6 +2904,159 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from 
super().modify_tensors(data_torch, name, bid) +@ModelBase.register("LLaDAModelLM") +class LLaDAModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA + undo_permute = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hf_arch == "VLlama3ForCausalLM": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + + vocab_dict = tokenizer.get_vocab() + vocab_size = self.hparams.get("vocab_size", len(vocab_dict)) + assert max(vocab_dict.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab_dict.items()} + added_vocab = tokenizer.get_added_vocab() + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + elif reverse_vocab[i] in added_vocab: + tokens.append(reverse_vocab[i]) + # Check if it's a special token - treat special tokens as CONTROL tokens + if hasattr(tokenizer, 'added_tokens_decoder') and i in tokenizer.added_tokens_decoder: + if tokenizer.added_tokens_decoder[i].special: + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + # Fallback: treat all added vocab as control tokens for special tokens like <|im_start|> + toktypes.append(gguf.TokenType.CONTROL) + else: + tokens.append(reverse_vocab[i]) + toktypes.append(gguf.TokenType.NORMAL) + + return tokens, toktypes, tokpre + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self._try_set_pooling_type() + + # Add parameters similar to LlamaModel + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if (rope_dim := hparams.get("head_dim")) is None: + n_heads = hparams.get("num_attention_heads", hparams.get("n_heads")) + rope_dim = hparams.get("hidden_size", hparams.get("d_model")) // n_heads + self.gguf_writer.add_rope_dimension_count(rope_dim) + + # Set context length for LLaDA + context_length = 
self.hparams.get("max_sequence_length") + self.gguf_writer.add_context_length(context_length) + + # Set embedding length (dimension size) + embedding_length = self.hparams.get("d_model") + self.gguf_writer.add_embedding_length(embedding_length) + + # Set feed forward length (MLP hidden size) + feed_forward_length = self.hparams.get("mlp_hidden_size") + self.gguf_writer.add_feed_forward_length(feed_forward_length) + + # Set RoPE parameters + if "rope_theta" in self.hparams: + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + + # Set RMS norm epsilon + if "rms_norm_eps" in self.hparams: + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + # LLaDA models use non-causal attention for diffusion, similar to Dream + self.gguf_writer.add_causal_attention(False) + # Handle RoPE scaling similar to LlamaModel and Dream + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + elif rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + # Add LLaDA-specific parameters + mask_token_id = self.hparams.get("mask_token_id") + if mask_token_id is not None: + self.gguf_writer.add_mask_token_id(mask_token_id) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams.get("num_attention_heads", self.hparams.get("n_heads")) + n_kv_head = self.hparams.get("num_key_value_heads", self.hparams.get("n_kv_heads")) + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LLaDAModel.permute(data_torch, n_head, n_kv_head) + + # LLaDA model tensors should be mapped directly since it's the base model + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Ernie4_5_ForCausalLM") class Ernie4_5Model(TextModel): model_arch = gguf.MODEL_ARCH.ERNIE4_5 diff --git a/examples/diffusion/CMakeLists.txt b/examples/diffusion/CMakeLists.txt index 396549c8029d9..459beec5f8011 100644 --- a/examples/diffusion/CMakeLists.txt +++ b/examples/diffusion/CMakeLists.txt @@ -1,5 +1,11 @@ -set(TARGET llama-diffusion-cli) -add_executable(${TARGET} diffusion-cli.cpp) +set(TARGET llama-diffusion-dream-cli) +add_executable(${TARGET} diffusion-dream-cli.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) + +set(TARGET llama-diffusion-llada-cli) +add_executable(${TARGET} diffusion-llada-cli.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} 
PRIVATE cxx_std_17) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-dream-cli.cpp similarity index 92% rename from examples/diffusion/diffusion-cli.cpp rename to examples/diffusion/diffusion-dream-cli.cpp index 3e11ce1160b05..308024263b451 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-dream-cli.cpp @@ -332,9 +332,9 @@ static std::string format_input_text(const std::string & prompt, bool use_chat_t } struct callback_data { - const common_params_diffusion * diff_params; - const llama_vocab * vocab; - int32_t n_input; + const common_params_diffusion_dream * diff_params; + const llama_vocab * vocab; + int32_t n_input; }; static bool diffusion_step_callback(int32_t step, @@ -396,13 +396,13 @@ int main(int argc, char ** argv) { common_params params; - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) { + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION_DREAM)) { return 1; } const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" }; - const char * alg_name = (params.diffusion.algorithm >= 0 && params.diffusion.algorithm <= 3) ? - alg_names[params.diffusion.algorithm] : + const char * alg_name = (params.diffusion_dream.algorithm >= 0 && params.diffusion_dream.algorithm <= 3) ? + alg_names[params.diffusion_dream.algorithm] : "UNKNOWN"; common_init(); @@ -421,6 +421,11 @@ int main(int argc, char ** argv) { return 1; } + // Check if the model architecture is Dream + char arch_str[128]; + GGML_ASSERT(llama_model_meta_val_str(model, "general.architecture", arch_str, 128) >= 0 && + std::string(arch_str) == "dream"); + llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = params.n_ctx; ctx_params.n_batch = params.n_batch; @@ -445,7 +450,7 @@ int main(int argc, char ** argv) { std::vector input_tokens = common_tokenize(vocab, formatted_prompt, /*add special tokens*/ true, /*parse special*/ true); - int n_input = input_tokens.size(); + int n_input = input_tokens.size(); if (n_input >= params.n_ctx) { LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx); @@ -455,28 +460,28 @@ int main(int argc, char ** argv) { } struct diffusion_params ldiff_params = diffusion_default_params(); - ldiff_params.steps = params.diffusion.steps; - ldiff_params.eps = params.diffusion.eps; + ldiff_params.steps = params.diffusion_dream.steps; + ldiff_params.eps = params.diffusion_dream.eps; ldiff_params.temperature = params.sampling.temp; ldiff_params.top_p = params.sampling.top_p; ldiff_params.top_k = params.sampling.top_k; - ldiff_params.algorithm = static_cast(params.diffusion.algorithm); - ldiff_params.alg_temp = params.diffusion.alg_temp; + ldiff_params.algorithm = static_cast(params.diffusion_dream.algorithm); + ldiff_params.alg_temp = params.diffusion_dream.alg_temp; ldiff_params.seed = params.sampling.seed; llama_token mask_token_id = llama_vocab_mask(vocab); GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); - LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); - LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps); - LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps); - LOG_INF("diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion.algorithm, + LOG_INF("dream_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); + LOG_INF("dream_diffusion_params: - %-25s u32 = %d\n", "steps", 
params.diffusion_dream.steps); + LOG_INF("dream_diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion_dream.eps); + LOG_INF("dream_diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion_dream.algorithm, alg_name); - LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp); + LOG_INF("dream_diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion_dream.alg_temp); ldiff_params.mask_token_id = mask_token_id; - callback_data cb_data = { ¶ms.diffusion, vocab, n_input }; + callback_data cb_data = { ¶ms.diffusion_dream, vocab, n_input }; ldiff_params.step_callback = diffusion_step_callback; ldiff_params.step_callback_user_data = &cb_data; @@ -488,7 +493,7 @@ int main(int argc, char ** argv) { ldiff_params, n_generated); if (n_generated > 0) { - if (params.diffusion.visual_mode) { + if (params.diffusion_dream.visual_mode) { //clear screen and move cursor to top-left LOG_INF("\033[2J\033[H"); } diff --git a/examples/diffusion/diffusion-llada-cli.cpp b/examples/diffusion/diffusion-llada-cli.cpp new file mode 100644 index 0000000000000..770f2cae7af42 --- /dev/null +++ b/examples/diffusion/diffusion-llada-cli.cpp @@ -0,0 +1,505 @@ +#include "arg.h" +#include "chat.h" +#include "common.h" +#include "llama.h" +#include "log.h" + + +#include +#include +#include +#include +#include +#include +#include + +enum remasking_type { + REMASKING_LOW_CONFIDENCE = 0, + REMASKING_RANDOM = 1, +}; + +struct diffusion_params_llada { + int32_t steps; + int32_t max_length; + int32_t block_length; + float temperature; + float cfg_scale; + llama_token mask_token_id; + enum remasking_type remasking; + bool (*step_callback)(int32_t step, int32_t total_steps, const llama_token * tokens, int32_t n_tokens, void * user_data); + void * step_callback_user_data; + int32_t seed; +}; + +static diffusion_params_llada diffusion_default_params_llada() { + diffusion_params_llada params = {}; + params.steps = 128; + params.max_length = 256; + params.block_length = 32; + params.temperature = 0.0f; + params.cfg_scale = 0.2f; + params.mask_token_id = LLAMA_TOKEN_NULL; + params.remasking = REMASKING_LOW_CONFIDENCE; + params.step_callback = nullptr; + params.step_callback_user_data = nullptr; + params.seed = 0; + return params; +} + +static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) { + if (temperature == 0.0f) { + return; + } + + std::uniform_real_distribution uniform(0.0, 1.0); + for (int32_t i = 0; i < n_vocab; i++) { + double noise = uniform(rng); + // Prevent log(0) + noise = std::max(noise, 1e-20); + double gumbel_noise = std::pow(-std::log(noise), temperature); + logits[i] = std::exp(logits[i]) / gumbel_noise; + } +} + +static std::vector get_num_transfer_tokens(int32_t mask_count, int32_t steps) { + std::vector num_transfer_tokens(steps); + + int32_t base = mask_count / steps; + int32_t remainder = mask_count % steps; + + for (int32_t i = 0; i < steps; i++) { + num_transfer_tokens[i] = base + (i < remainder ? 
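The add_gumbel_noise transform above is the Gumbel-max trick in disguise: dividing exp(l_i) by (-ln u_i)^T and taking an argmax later is equivalent to perturbing the temperature-scaled logits with Gumbel noise. A short derivation (not part of the patch, shown only to make the equivalence explicit):

    argmax_i  exp(l_i) / (-ln u_i)^T  =  argmax_i ( l_i / T - ln(-ln u_i) ),    u_i ~ Uniform(0, 1)

Since -ln(-ln u_i) is a standard Gumbel(0, 1) sample, the argmax draws a token from softmax(l / T); with temperature == 0.0f the transform is skipped entirely and the later argmax stays greedy.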
1 : 0); + } + + return num_transfer_tokens; +} + +static void diffusion_generate_llada(llama_context * ctx, + const llama_token * input_tokens, + llama_token * output_tokens, + int32_t n_input, + struct diffusion_params_llada params, + int32_t & n_generated) { + n_generated = 0; + if (!ctx || !input_tokens || !output_tokens || n_input <= 0) { + return; + } + + const llama_model * model = llama_get_model(ctx); + + std::vector in(params.max_length, params.mask_token_id); + std::copy(input_tokens, input_tokens + n_input, in.begin()); + + GGML_ASSERT(params.max_length % params.block_length == 0); + int num_blocks = params.max_length / params.block_length; + + GGML_ASSERT(params.steps % num_blocks == 0); + + int steps = params.steps / num_blocks; + + int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); + llama_set_causal_attn(ctx, false); + + // Pre-allocate buffers for Classifier-Free Guidance + int32_t logits_size = n_vocab * params.max_length; + std::vector cond_logits_buffer; + std::vector un_x_buffer; + if (params.cfg_scale > 0.0f) { + cond_logits_buffer.resize(logits_size); + un_x_buffer.resize(params.max_length); + } + + llama_batch batch = llama_batch_init(params.max_length, 0, 1); + batch.n_tokens = params.max_length; + + std::vector argmax; + std::mt19937 rng(params.seed); + + int64_t total_sampling_time = 0; + int64_t total_time = 0; + + std::vector confidence(params.max_length); + + int64_t time_start = ggml_time_us(); + for (int block_num = 0; block_num < num_blocks; block_num++) { + // Get number of tokens to transfer for this step + int32_t block_start = n_input + block_num * params.block_length; + int32_t block_end = std::min(n_input + (block_num + 1) * params.block_length, params.max_length); + + // Count masked tokens in current block + int32_t block_mask_count = 0; + for (int i = block_start; i < block_end; i++) { + if (in[i] == params.mask_token_id) { + block_mask_count++; + } + } + auto num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps); + + for (int step = 0; step < steps; step++) { + if (params.step_callback) { + if (!params.step_callback(step + block_num * steps, + params.steps, in.data(), + params.max_length, + params.step_callback_user_data)) { + break; + } + } + + float * logits = nullptr; + + if (params.cfg_scale > 0.0f) { + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = in[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate conditional"); + } + float * cond_logits_ptr = llama_get_logits(ctx); + std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float)); + + std::copy(in.begin(), in.end(), un_x_buffer.begin()); + for (int32_t i = 0; i < n_input; i++) { + un_x_buffer[i] = params.mask_token_id; + } + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = un_x_buffer[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + ret = llama_decode(ctx, batch); + GGML_ASSERT(ret == 0); + float * uncond_logits = llama_get_logits(ctx); + for (int32_t i = 0; i < logits_size; i++) { + cond_logits_buffer[i] = + uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]); + } + + logits = cond_logits_buffer.data(); + } else { + // Standard generation without CFG + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = in[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 
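In the guidance branch above, two full forward passes are run per step: one on the current sequence and one on a copy whose prompt positions have been replaced by the mask token (un_x_buffer). Writing s for cfg_scale, the mixed logits stored back into cond_logits_buffer are

    l_guided = l_uncond + (s + 1) * (l_cond - l_uncond)

so s = 0 reduces to the plain conditional logits, which is why the branch is gated on cfg_scale > 0.0f and the else path below does a single pass; larger s amplifies the direction in which the prompt shifts the distribution.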
1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate"); + } + logits = llama_get_logits(ctx); + } + + int64_t time_start_sampling = ggml_time_us(); + + if (params.temperature > 0.0f) { + add_gumbel_noise(logits, n_vocab, params.temperature, rng); + } + + argmax.clear(); + + for (int i = 0; i < params.max_length; ++i) { + float max_value = std::numeric_limits::min(); + llama_token tok = LLAMA_TOKEN_NULL; + for (int vob = 0; vob < n_vocab; vob++) { + if (logits[n_vocab * i + vob] > max_value) { + max_value = logits[n_vocab * i + vob]; + tok = vob; + } + } + argmax.push_back(tok); + } + + // Create mask index to track which positions are masked + std::vector mask_index(params.max_length); + for (int i = 0; i < params.max_length; i++) { + mask_index[i] = (in[i] == params.mask_token_id); + } + + if (params.remasking == REMASKING_LOW_CONFIDENCE) { + // inplace softmax + argmax calculation. TODO: check why llama_sampler is so slow here + for (int i = block_start; i < block_end; i++) { + if (mask_index[i]) { + float * pos_logits = logits + i * n_vocab; + + llama_token best_token = 0; + float max_logit = pos_logits[0]; + for (int32_t j = 1; j < n_vocab; j++) { + if (pos_logits[j] > max_logit) { + max_logit = pos_logits[j]; + best_token = j; + } + } + + float sum_exp = 0.0f; + for (int32_t j = 0; j < n_vocab; j++) { + sum_exp += std::exp(pos_logits[j] - max_logit); + } + + float prob = std::exp(pos_logits[best_token] - max_logit) / sum_exp; + confidence[i] = prob; + + argmax[i] = best_token; + } else { + confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions + } + } + } else if (params.remasking == REMASKING_RANDOM) { + // Random remasking: assign random values for masked positions + std::uniform_real_distribution uniform(0.0f, 1.0f); + for (int i = 0; i < params.max_length; i++) { + if (mask_index[i]) { + confidence[i] = uniform(rng); + } else { + confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions + } + } + } + + for (int i = n_input + (block_num + 1) * params.block_length; i < params.max_length; i++) { + confidence[i] = -std::numeric_limits::infinity(); + } + + int32_t transfer_count = num_transfer_tokens[step]; + + std::vector> conf_pairs; + for (int i = n_input; i < params.max_length; i++) { + if (mask_index[i] && confidence[i] > -std::numeric_limits::infinity()) { + conf_pairs.push_back({ confidence[i], i }); + } + } + + std::partial_sort( + conf_pairs.begin(), conf_pairs.begin() + std::min(transfer_count, (int32_t) conf_pairs.size()), + conf_pairs.end(), [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + }); + + for (int i = 0; i < std::min(transfer_count, (int32_t) conf_pairs.size()); i++) { + int32_t pos = conf_pairs[i].second; + in[pos] = argmax[pos]; + } + + int64_t time_end_sampling = ggml_time_us(); + total_sampling_time += time_end_sampling - time_start_sampling; + } + } + int64_t time_end = ggml_time_us(); + total_time += time_end - time_start; + + LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", total_time / 1000.0, + total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps); + + llama_batch_free(batch); + + memcpy(output_tokens, in.data(), in.size() * sizeof(llama_token)); + + n_generated = params.max_length; +} + +static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { + if (!use_chat_template) { + return 
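To make the schedule concrete: with the defaults from diffusion_default_params_llada (steps = 128, max_length = 256, block_length = 32) the sequence is split into 256 / 32 = 8 blocks and each block receives 128 / 8 = 16 denoising steps. Within a block, get_num_transfer_tokens spreads the currently masked positions evenly across those steps, for example

    block_mask_count = 10, steps = 4
    base = 10 / 4 = 2, remainder = 10 % 4 = 2
    num_transfer_tokens = { 3, 3, 2, 2 }

and at every step the transfer_count masked positions with the highest confidence (softmax probability of the argmax token for LOW_CONFIDENCE, a uniform draw for RANDOM) are committed to their argmax tokens; everything else stays masked for the next step.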
prompt; + } + + auto chat_templates = common_chat_templates_init(model, ""); + + common_chat_templates_inputs inputs; + common_chat_msg user_msg; + user_msg.role = "user"; + user_msg.content = prompt; + inputs.add_generation_prompt = true; + inputs.messages.push_back(user_msg); + + auto result = common_chat_templates_apply(chat_templates.get(), inputs); + + return result.prompt; +} + +struct callback_data { + const common_params_diffusion_llada * diff_params; + const llama_vocab * vocab; + int32_t n_input; +}; + +static bool diffusion_step_callback(int32_t step, + int32_t total_steps, + const llama_token * tokens, + int32_t n_tokens, + void * user_data) { + callback_data * data = static_cast(user_data); + + auto print_progress_bar = [](int32_t step, int32_t total_steps) { + int progress_percent = (step * 100) / total_steps; + int progress_bars = (step * 50) / total_steps; + LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", + step, + total_steps, + std::string(progress_bars, '=').c_str(), + std::string(50 - progress_bars, ' ').c_str(), + progress_percent); + }; + + if (data->diff_params->visual_mode) { + // Visual mode: clear + LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left + + print_progress_bar(step, total_steps); + + LOG_INF("\n"); + + std::string current_text = " "; + + for (int32_t i = data->n_input; i < n_tokens; i++) { + std::string token_str; + if (tokens[i] != llama_vocab_mask(data->vocab)) { + char piece[256]; + int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); + if (n_chars > 0) { + piece[n_chars] = '\0'; + token_str = piece; + } + } else { + token_str = " "; + } + + current_text += token_str; + } + + LOG_INF("%s\n", current_text.c_str()); + } else { + print_progress_bar(step, total_steps); + } + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION_LLADA)) { + return 1; + } + + common_init(); + llama_backend_init(); + + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = params.n_gpu_layers; + model_params.devices = params.devices.data(); + model_params.use_mmap = params.use_mmap; + model_params.use_mlock = params.use_mlock; + model_params.check_tensors = params.check_tensors; + + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); + if (!model) { + LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str()); + return 1; + } + + char arch_str[128]; + GGML_ASSERT(llama_model_meta_val_str(model, "general.architecture", arch_str, 128) >= 0 && + std::string(arch_str) == "llada"); + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = params.n_ctx; + ctx_params.n_batch = params.n_batch; + ctx_params.n_ubatch = params.n_ubatch; + ctx_params.flash_attn = params.flash_attn; + ctx_params.no_perf = params.no_perf; + ctx_params.type_k = params.cache_type_k; + ctx_params.type_v = params.cache_type_v; + + llama_context * ctx = llama_init_from_model(model, ctx_params); + if (!ctx) { + LOG_ERR("error: failed to create context\n"); + llama_model_free(model); + return 1; + } + + llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); + + const llama_vocab * vocab = llama_model_get_vocab(model); + std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); + + std::vector input_tokens = common_tokenize(vocab, 
formatted_prompt, + /*add special tokens*/ true, + /*parse special*/ true); + + // For LLaDA models, forcefully add BOS token at the beginning. TODO: check why this is needed vs HF + llama_token bos_token = llama_vocab_bos(vocab); + if (bos_token != LLAMA_TOKEN_NULL && (input_tokens.empty() || input_tokens[0] != bos_token)) { + input_tokens.insert(input_tokens.begin(), bos_token); + } + + int n_input = input_tokens.size(); + + if (n_input >= params.n_ctx) { + LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx); + llama_free(ctx); + llama_model_free(model); + return 1; + } + + llama_token mask_token_id = llama_vocab_mask(vocab); + GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); + + diffusion_params_llada llada_params = diffusion_default_params_llada(); + llada_params.steps = params.diffusion_llada.steps; + llada_params.block_length = params.diffusion_llada.block_length; + llada_params.temperature = params.sampling.temp; + llada_params.cfg_scale = params.diffusion_llada.cfg_scale; + llada_params.remasking = static_cast(params.diffusion_llada.remasking); + llada_params.mask_token_id = mask_token_id; + llada_params.seed = params.sampling.seed; + llada_params.max_length = params.n_ubatch; + + callback_data cb_data = { ¶ms.diffusion_llada, vocab, n_input }; + llada_params.step_callback = diffusion_step_callback; + llada_params.step_callback_user_data = &cb_data; + + LOG_INF("Using LLaDA diffusion generation\n"); + LOG_INF("llada_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); + LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "steps", llada_params.steps); + LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "max_length", llada_params.max_length); + LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "block_length", llada_params.block_length); + LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "temperature", llada_params.temperature); + LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", llada_params.cfg_scale); + + int32_t n_generated = 0; + std::vector output_tokens(params.n_ubatch); + + diffusion_generate_llada(ctx, input_tokens.data(), output_tokens.data(), n_input, llada_params, n_generated); + + if (n_generated > 0) { + if (params.diffusion_llada.visual_mode) { + //clear screen and move cursor to top-left + LOG_INF("\033[2J\033[H"); + } + + output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input); + std::string output_data = common_detokenize(vocab, output_tokens, false); + LOG_INF("\n%s\n", output_data.c_str()); + } else { + LOG_INF("Error: diffusion generation failed\n"); + } + + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + + return 0; +} \ No newline at end of file diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c97b61d09c711..2a6a22d82f186 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -377,6 +377,7 @@ class MODEL_ARCH(IntEnum): LFM2 = auto() DREAM = auto() SMALLTHINKER = auto() + LLADA = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -697,6 +698,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.LFM2: "lfm2", MODEL_ARCH.DREAM: "dream", MODEL_ARCH.SMALLTHINKER: "smallthinker", + MODEL_ARCH.LLADA: "llada", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1318,6 +1320,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.LLADA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + 
MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.QWEN2VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bfd4fd37a3f68..16abe7e0e8e2c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -32,6 +32,7 @@ class TensorNameMap: "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 "encoder", # neobert + "model.transformer.wte", # llada ), # Token type embeddings @@ -71,6 +72,7 @@ class TensorNameMap: "head", # rwkv "head.out", # wavtokenizer "lm_head", # llama4 + "model.transformer.ff_out", # llada ), # Output norm @@ -94,6 +96,7 @@ class TensorNameMap: "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 + "model.transformer.ln_f", # llada ), # Rope frequencies @@ -139,6 +142,7 @@ class TensorNameMap: "model.layers.{bid}.input_layernorm", # llama4 "transformer_encoder.{bid}.attention_norm", # neobert "model.layers.{bid}.operator_norm", # lfm2 + "model.transformer.blocks.{bid}.attn_norm", # llada ), # Attention norm 2 @@ -183,6 +187,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone "model.layers.{bid}.self_attn.q_proj", # llama4 + "model.transformer.blocks.{bid}.q_proj", # llada ), # Attention key @@ -199,6 +204,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone "model.layers.{bid}.self_attn.k_proj", # llama4 + "model.transformer.blocks.{bid}.k_proj", # llada ), # Attention value @@ -214,6 +220,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone "model.layers.{bid}.self_attn.v_proj", # llama4 + "model.transformer.blocks.{bid}.v_proj", # llada ), # Attention output @@ -246,6 +253,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 "transformer_encoder.{bid}.wo", # neobert + "model.transformer.blocks.{bid}.attn_out", # llada ), # Attention output norm @@ -291,6 +299,7 @@ class TensorNameMap: "model.layers.{bid}.post_attention_layernorm", # llama4 "transformer_encoder.{bid}.ffn_norm", # neobert "model.layers.layers.{bid}.pre_mlp_norm", # plamo2 + "model.transformer.blocks.{bid}.ff_norm", # llada ), # Post feed-forward norm @@ -363,7 +372,11 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc_1", # exaone "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w12", # neobert +<<<<<<< HEAD "model.layers.{bid}.block_sparse_moe.up", # smallthinker +======= + "model.transformer.blocks.{bid}.up_proj", # llada +>>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -404,7 +417,11 @@ class TensorNameMap: "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid +<<<<<<< HEAD "model.layers.{bid}.block_sparse_moe.gate", # smallthinker +======= + "model.transformer.blocks.{bid}.ff_proj", # llada +>>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ 
-453,7 +470,11 @@ class TensorNameMap: "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w3", # neobert +<<<<<<< HEAD "model.layers.{bid}.block_sparse_moe.down", # smallthinker +======= + "model.transformer.blocks.{bid}.ff_out", # llada +>>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/include/llama.h b/include/llama.h index 6f454a508a06c..1a51e74a8d63f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -537,6 +537,9 @@ extern "C" { // Returns true if the model is recurrent (like Mamba, RWKV, etc.) LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model); + // Returns true if the model is diffusion-based (like LLaDA, Dream, etc.) + LLAMA_API bool llama_model_is_diffusion(const struct llama_model * model); + // Returns 0 on success LLAMA_API uint32_t llama_model_quantize( const char * fname_inp, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index dbf977443ae85..71143cfd2fb08 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -88,7 +88,11 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_SMOLLM3, "smollm3" }, { LLM_ARCH_LFM2, "lfm2" }, { LLM_ARCH_DREAM, "dream" }, +<<<<<<< HEAD { LLM_ARCH_SMALLTHINKER, "smallthinker" }, +======= + { LLM_ARCH_LLADA, "llada" }, +>>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1972,6 +1976,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_LLADA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2224,6 +2245,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { bool llm_arch_is_diffusion(const llm_arch & arch) { switch (arch) { case LLM_ARCH_DREAM: + case LLM_ARCH_LLADA: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index 8267a8d3aa491..8ea80806c9c8d 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -93,6 +93,7 @@ enum llm_arch { LLM_ARCH_LFM2, LLM_ARCH_DREAM, LLM_ARCH_SMALLTHINKER, + LLM_ARCH_LLADA, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e3aa9e6f91af9..ba5ef9b0fb74f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -869,6 +869,21 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.causal_attn = false; } break; + case LLM_ARCH_LLADA: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion + switch (hparams.n_layer) { + case 32: + type = LLM_TYPE_8B; + break; + default: + type = LLM_TYPE_UNKNOWN; + } + // Set non-causal attention for diffusion models + hparams.causal_attn = false; + } + break; case LLM_ARCH_QWEN2MOE: { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); @@ -2149,6 +2164,53 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } } break; + case LLM_ARCH_LLADA: + { + tok_embd = 
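Taken together, these registration points are what carry a LLaDA tensor from the Hugging Face checkpoint into the loader: for example, model.transformer.blocks.3.attn_out matches the new "model.transformer.blocks.{bid}.attn_out" pattern in tensor_mapping.py, is classified as MODEL_TENSOR.ATTN_OUT, lands in the GGUF as blk.3.attn_output.weight, and that is the name the LLM_ARCH_LLADA entry in LLM_TENSOR_NAMES ("blk.%d.attn_output") resolves when llama-model.cpp creates the tensor.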
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = + create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + + // Use separate Q, K, V projections without bias, matching LLaDALlamaBlock + layer.wq = + create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, 0); + // No bias for QKV projections as per config: include_bias=false, include_qkv_bias=false + layer.wo = + create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot / 2 }, + TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + + // optional MLP bias + layer.ffn_gate_b = + create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = + create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), { n_embd }, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), { n_ff }, TENSOR_NOT_REQUIRED); + } + } + break; case LLM_ARCH_LLAMA4: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -8042,6 +8104,106 @@ struct llm_build_dream : public llm_graph_context { } }; +struct llm_build_llada : public llm_graph_context { + llm_build_llada(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : + llm_graph_context(params) { + // LLaDA is similar to LLaMA but uses non-causal attention for diffusion + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // Non-causal attention for diffusion + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute separate Q, K, V projections without bias, matching LLaDALlamaBlock + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, 
cur); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, + 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + struct llm_build_qwen2vl : public llm_graph_context { llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -17201,6 +17363,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_NEO_BERT: case LLM_ARCH_WAVTOKENIZER_DEC: case LLM_ARCH_DREAM: + case LLM_ARCH_LLADA: { res = nullptr; } break; @@ -17367,6 +17530,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_LLADA: + { + llm = std::make_unique(*this, params, gf); + } + break; case LLM_ARCH_QWEN2VL: { llm = std::make_unique(*this, params); @@ -17765,6 +17933,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: + case LLM_ARCH_LLADA: case LLM_ARCH_LLAMA4: case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: @@ -17943,6 +18112,10 @@ bool llama_model_is_recurrent(const llama_model * model) { return llm_arch_is_recurrent(model->arch); } +bool llama_model_is_diffusion(const llama_model * model) { + return llm_arch_is_diffusion(model->arch); +} + const std::vector> & llama_internal_get_tensor_map(const llama_model * model) { return model->tensors_by_name; } From 0fa8b866a651f6af4cb26be5387ce83b8714eff7 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 19 Jul 2025 17:31:05 +0800 Subject: [PATCH 02/12] Add README --- common/arg.cpp | 2 +- convert_hf_to_gguf.py | 6 ++-- examples/diffusion/README.md | 39 ++++++++++++++++++++++ examples/diffusion/diffusion-llada-cli.cpp | 4 +-- 
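Patch 01 also exposes the new public helper llama_model_is_diffusion() in include/llama.h. A minimal caller-side sketch of how a front end might branch on it (hypothetical: the model path, error handling and the printed hint are placeholders, not part of this PR):

```cpp
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (!model) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);
    if (!ctx) {
        llama_model_free(model);
        return 1;
    }

    if (llama_model_is_diffusion(model)) {
        // Dream and LLaDA denoise the whole sequence at once, so attention must be non-causal
        llama_set_causal_attn(ctx, false);
        printf("diffusion model detected: drive it with an iterative denoising loop, not autoregressive decoding\n");
    }

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```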
src/llama-model.cpp | 6 ++-- 5 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 examples/diffusion/README.md diff --git a/common/arg.cpp b/common/arg.cpp index 8f32517e56b6e..39a5d835b4a89 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3487,7 +3487,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.diffusion_llada.cfg_scale = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA })); add_opt(common_arg( - { "--diffusion-remasking-alg" }, "N", + { "--diffusion-alg" }, "N", string_format("remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion_llada.remasking), [](common_params & params, int value) { params.diffusion_llada.remasking = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA })); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2733544113fc6..0e9894ab7f87f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2999,15 +2999,15 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(rope_dim) # Set context length for LLaDA - context_length = self.hparams.get("max_sequence_length") + context_length = self.hparams.get("max_sequence_length", 4096) self.gguf_writer.add_context_length(context_length) # Set embedding length (dimension size) - embedding_length = self.hparams.get("d_model") + embedding_length = self.hparams.get("d_model", 4096) self.gguf_writer.add_embedding_length(embedding_length) # Set feed forward length (MLP hidden size) - feed_forward_length = self.hparams.get("mlp_hidden_size") + feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) self.gguf_writer.add_feed_forward_length(feed_forward_length) # Set RoPE parameters diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md new file mode 100644 index 0000000000000..53ddb9f59d2e0 --- /dev/null +++ b/examples/diffusion/README.md @@ -0,0 +1,39 @@ +# Diffusion Text Generation Examples + +This directory contains implementations for diffusion-based text generation using two different model architectures: **Dream** and **LLaDA-8B**. Both models use iterative denoising processes to generate text, but employ different sampling strategies and algorithms. + +## Supported Models + +### 1. Dream Model (`llama-diffusion-dream-cli`) + +- https://huggingface.co/Dream-org/Dream-v0-Base-7B +- Original PR - https://github.com/ggml-org/llama.cpp/pull/14644 + +The Dream model supports four different sampling algorithms controlled by the `--diffusion-alg` parameter: + +1. **ORIGIN (0)** - Original diffusion algorithm + - Uses probability transfer based on timestep ratios + - Default algorithm with standard confidence-based token selection + +2. **MASKGIT_PLUS (1)** - Enhanced MaskGIT sampling + - Improved version of the MaskGIT algorithm + +3. **TOPK_MARGIN (2)** - Top-K margin-based sampling + - Confidence calculated as the margin between top-1 and top-2 probabilities + +4. **ENTROPY (3)** - Entropy-based sampling (recommended) + - Uses entropy calculation for confidence estimation + +### 2. LLaDA-8B Model (`llama-diffusion-llada-cli`) + +- https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct + +### LLaDA Model Remasking Strategies + +The LLaDA model uses two remasking approaches controlled by the `--diffusion-alg` parameter: + +1. **REMASKING_LOW_CONFIDENCE (0)** - Default strategy + - Remasks tokens with lowest confidence scores + - Uses softmax probabilities to determine confidence + +2. 
**REMASKING_RANDOM (1)** - Random remasking diff --git a/examples/diffusion/diffusion-llada-cli.cpp b/examples/diffusion/diffusion-llada-cli.cpp index 770f2cae7af42..cab53eef2174a 100644 --- a/examples/diffusion/diffusion-llada-cli.cpp +++ b/examples/diffusion/diffusion-llada-cli.cpp @@ -489,7 +489,7 @@ int main(int argc, char ** argv) { //clear screen and move cursor to top-left LOG_INF("\033[2J\033[H"); } - + output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input); std::string output_data = common_detokenize(vocab, output_tokens, false); LOG_INF("\n%s\n", output_data.c_str()); @@ -502,4 +502,4 @@ int main(int argc, char ** argv) { llama_backend_free(); return 0; -} \ No newline at end of file +} diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ba5ef9b0fb74f..92a7efed3dab3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -8105,7 +8105,7 @@ struct llm_build_dream : public llm_graph_context { }; struct llm_build_llada : public llm_graph_context { - llm_build_llada(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : + llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // LLaDA is similar to LLaMA but uses non-causal attention for diffusion const int64_t n_embd_head = hparams.n_embd_head_v; @@ -8158,7 +8158,7 @@ struct llm_build_llada : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, + cur = build_attn(inp_attn, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } @@ -17532,7 +17532,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { break; case LLM_ARCH_LLADA: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_QWEN2VL: From 267a09dfaf96970e08dc4f4082cf6e8bf7f95f8f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 20 Jul 2025 10:40:50 +0800 Subject: [PATCH 03/12] Fix README and convert_hf_to_gguf --- common/arg.cpp | 2 +- convert_hf_to_gguf.py | 26 -------------------------- examples/diffusion/README.md | 4 ++-- 3 files changed, 3 insertions(+), 29 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 39a5d835b4a89..d8824cb47cde4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3487,7 +3487,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.diffusion_llada.cfg_scale = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA })); add_opt(common_arg( - { "--diffusion-alg" }, "N", + { "--diffusion-algorithm" }, "N", string_format("remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion_llada.remasking), [](common_params & params, int value) { params.diffusion_llada.remasking = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA })); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0e9894ab7f87f..9079185cac9ac 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2911,9 +2911,6 @@ class LLaDAModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # fix for SmolVLM2, missing `num_attention_heads` in config.json - if self.hf_arch == "VLlama3ForCausalLM": - self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) def get_vocab_base(self) -> tuple[list[str], list[int], str]: 
tokens: list[str] = [] @@ -2962,29 +2959,6 @@ def set_vocab(self): # Llama 3 self._set_vocab_gpt2() - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) - if self.hparams.get("vocab_size", 32000) == 32016: - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] - ) - special_vocab._set_special_token("prefix", 32007) - special_vocab._set_special_token("suffix", 32008) - special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) - special_vocab.add_to_gguf(self.gguf_writer) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md index 53ddb9f59d2e0..410127df6593d 100644 --- a/examples/diffusion/README.md +++ b/examples/diffusion/README.md @@ -9,7 +9,7 @@ This directory contains implementations for diffusion-based text generation usin - https://huggingface.co/Dream-org/Dream-v0-Base-7B - Original PR - https://github.com/ggml-org/llama.cpp/pull/14644 -The Dream model supports four different sampling algorithms controlled by the `--diffusion-alg` parameter: +The Dream model supports four different sampling algorithms controlled by the `--diffusion-algorithm` parameter: 1. **ORIGIN (0)** - Original diffusion algorithm - Uses probability transfer based on timestep ratios @@ -30,7 +30,7 @@ The Dream model supports four different sampling algorithms controlled by the `- ### LLaDA Model Remasking Strategies -The LLaDA model uses two remasking approaches controlled by the `--diffusion-alg` parameter: +The LLaDA model uses two remasking approaches controlled by the `--diffusion-algorithm` parameter: 1. 
**REMASKING_LOW_CONFIDENCE (0)** - Default strategy - Remasks tokens with lowest confidence scores From 812bc383e3aba40f2fa929e42d8c046dc8a6a46e Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 22 Jul 2025 17:40:48 +0800 Subject: [PATCH 04/12] convert_hf_to_gguf.py: address review comments --- convert_hf_to_gguf.py | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9079185cac9ac..f2465220e94a5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2909,9 +2909,6 @@ class LLaDAModel(TextModel): model_arch = gguf.MODEL_ARCH.LLADA undo_permute = True - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] toktypes: list[int] = [] @@ -2950,14 +2947,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: return tokens, toktypes, tokpre def set_vocab(self): - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() + self._set_vocab_gpt2() def set_gguf_parameters(self): super().set_gguf_parameters() @@ -2995,14 +2985,6 @@ def set_gguf_parameters(self): # LLaDA models use non-causal attention for diffusion, similar to Dream self.gguf_writer.add_causal_attention(False) # Handle RoPE scaling similar to LlamaModel and Dream - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - elif rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) # Add LLaDA-specific parameters mask_token_id = self.hparams.get("mask_token_id") From 6bb0093664dc0d3af2dc5af1b106f7825376b67f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 26 Jul 2025 15:04:14 +0800 Subject: [PATCH 05/12] Make everything in a single example --- common/arg.cpp | 79 +- common/common.h | 28 +- examples/diffusion/CMakeLists.txt | 10 +- examples/diffusion/README.md | 18 +- examples/diffusion/diffusion-cli.cpp | 876 +++++++++++++++++++++ examples/diffusion/diffusion-dream-cli.cpp | 512 ------------ examples/diffusion/diffusion-llada-cli.cpp | 505 ------------ 7 files changed, 936 insertions(+), 1092 deletions(-) create mode 100644 examples/diffusion/diffusion-cli.cpp delete mode 100644 examples/diffusion/diffusion-dream-cli.cpp delete mode 100644 examples/diffusion/diffusion-llada-cli.cpp diff --git a/common/arg.cpp b/common/arg.cpp index d8824cb47cde4..97902f41edb9f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3438,59 +3438,50 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER})); - // shared diffusion parameters add_opt(common_arg( { "--diffusion-steps" }, "N", - string_format("number of diffusion steps (default: %d)", params.diffusion_dream.steps), - [](common_params & params, int value) { - params.diffusion_dream.steps = value; - params.diffusion_llada.steps = value; - } - ).set_examples({ 
LLAMA_EXAMPLE_DIFFUSION_DREAM, LLAMA_EXAMPLE_DIFFUSION_LLADA }));
+        string_format("number of diffusion steps (default: %d)", params.diffusion.steps),
+        [](common_params & params, int value) { params.diffusion.steps = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
         { "--diffusion-visual" },
         string_format("enable visual diffusion mode (show progressive generation) (default: %s)",
-                      params.diffusion_dream.visual_mode ? "true" : "false"),
-        [](common_params & params) {
-            params.diffusion_dream.visual_mode = true;
-            params.diffusion_llada.visual_mode = true;
-        }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM, LLAMA_EXAMPLE_DIFFUSION_LLADA }));
+                      params.diffusion.visual_mode ? "true" : "false"),
+        [](common_params & params) { params.diffusion.visual_mode = true; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
 
-    // DREAM-specific diffusion parameters
     add_opt(common_arg(
-        { "--diffusion-eps" }, "F",
-        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion_dream.eps),
-        [](common_params & params, const std::string & value) { params.diffusion_dream.eps = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM }));
+        { "--diffusion-dream-eps" }, "F",
+        string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps),
+        [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
     add_opt(common_arg(
-        { "--diffusion-algorithm" }, "N",
+        { "--diffusion-dream-algorithm" }, "N",
         string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)",
-                      params.diffusion_dream.algorithm),
-        [](common_params & params, int value) { params.diffusion_dream.algorithm = value; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM }));
-    add_opt(common_arg(
-        { "--diffusion-alg-temp" }, "F",
-        string_format("algorithm temperature (default: %.3f)", (double) params.diffusion_dream.alg_temp),
-        [](common_params & params, const std::string & value) { params.diffusion_dream.alg_temp = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_DREAM }));
-
-    // LLADA-specific diffusion parameters
-    add_opt(common_arg(
-        { "--diffusion-block-length" }, "N",
-        string_format("block length for generation (default: %d)", params.diffusion_llada.block_length),
-        [](common_params & params, int value) { params.diffusion_llada.block_length = value; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA }));
-    add_opt(common_arg(
-        { "--diffusion-cfg-scale" }, "F",
-        string_format("classifier-free guidance scale (default: %.3f)", (double) params.diffusion_llada.cfg_scale),
-        [](common_params & params, const std::string & value) { params.diffusion_llada.cfg_scale = std::stof(value); }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA }));
-    add_opt(common_arg(
-        { "--diffusion-algorithm" }, "N",
-        string_format("remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion_llada.remasking),
-        [](common_params & params, int value) { params.diffusion_llada.remasking = value; }
-    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION_LLADA }));
+                      params.diffusion.algorithm),
+        [](common_params & params, int value) { params.diffusion.algorithm = value; }
+    ).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
+    add_opt(common_arg(
+        { "--diffusion-dream-alg-temp" }, "F",
+        string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp),
+        [](common_params & params, const std::string & value) {
params.diffusion.alg_temp = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + + add_opt(common_arg( + { "--diffusion-llada-block-length" }, "N", + string_format("llada block length for generation (default: %d)", params.diffusion.block_length), + [](common_params & params, int value) { params.diffusion.block_length = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-llada-cfg-scale" }, "F", + string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale), + [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + { "--diffusion-llada-algorithm" }, "N", + string_format("llada remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion.remasking), + [](common_params & params, int value) { params.diffusion.remasking = value; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); return ctx_arg; } diff --git a/common/common.h b/common/common.h index 3a53387074e7d..9da5954c787bd 100644 --- a/common/common.h +++ b/common/common.h @@ -81,8 +81,7 @@ enum llama_example { LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, LLAMA_EXAMPLE_TTS, - LLAMA_EXAMPLE_DIFFUSION_DREAM, - LLAMA_EXAMPLE_DIFFUSION_LLADA, + LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_COUNT, }; @@ -220,20 +219,20 @@ struct common_params_vocoder { bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT }; -struct common_params_diffusion_dream { - int32_t steps = 64; // number of diffusion steps - float eps = 1e-3f; // epsilon for timesteps - int32_t algorithm = 0; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY) - float alg_temp = 0.0f; // algorithm temperature - bool visual_mode = false; // show progressive diffusion on screen -}; +struct common_params_diffusion { + // Common parameters + int32_t steps = 128; // number of diffusion steps + bool visual_mode = false; // show progressive diffusion on screen -struct common_params_diffusion_llada { - int32_t steps = 64; // number of diffusion steps + // Dream-specific parameters + float eps = 1e-3f; // epsilon for timesteps + int32_t algorithm = 3; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY) + float alg_temp = 0.0f; // algorithm temperature + + // LLaDA-specific parameters int32_t block_length = 32; // block length for generation float cfg_scale = 0.2f; // classifier-free guidance scale - int32_t remasking = 0; // remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM - bool visual_mode = false; // show progressive diffusion on screen + int32_t remasking = 1; // remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM }; enum common_reasoning_format { @@ -287,8 +286,7 @@ struct common_params { struct common_params_sampling sampling; struct common_params_speculative speculative; struct common_params_vocoder vocoder; - struct common_params_diffusion_dream diffusion_dream; - struct common_params_diffusion_llada diffusion_llada; + struct common_params_diffusion diffusion; struct common_params_model model; diff --git a/examples/diffusion/CMakeLists.txt b/examples/diffusion/CMakeLists.txt index 459beec5f8011..396549c8029d9 100644 --- a/examples/diffusion/CMakeLists.txt +++ b/examples/diffusion/CMakeLists.txt @@ -1,11 +1,5 @@ -set(TARGET llama-diffusion-dream-cli) -add_executable(${TARGET} diffusion-dream-cli.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE 
llama common ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
-
-set(TARGET llama-diffusion-llada-cli)
-add_executable(${TARGET} diffusion-llada-cli.cpp)
+set(TARGET llama-diffusion-cli)
+add_executable(${TARGET} diffusion-cli.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md
index 410127df6593d..c497c39b0fe8b 100644
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -2,18 +2,18 @@ This directory contains implementations for diffusion-based text generation usin
 
-## Supported Models
+## Supported Architectures
 
-### 1. Dream Model (`llama-diffusion-dream-cli`)
+### 1. Dream
 
+Example models:
 - https://huggingface.co/Dream-org/Dream-v0-Base-7B
-- Original PR - https://github.com/ggml-org/llama.cpp/pull/14644
+- PR - https://github.com/ggml-org/llama.cpp/pull/14644
 
-The Dream model supports four different sampling algorithms controlled by the `--diffusion-algorithm` parameter:
+The Dream model supports four different sampling algorithms controlled by the `--diffusion-dream-algorithm` parameter:
 
 1. **ORIGIN (0)** - Original diffusion algorithm
    - Uses probability transfer based on timestep ratios
-   - Default algorithm with standard confidence-based token selection
 
 2. **MASKGIT_PLUS (1)** - Enhanced MaskGIT sampling
    - Improved version of the MaskGIT algorithm
@@ -21,16 +21,18 @@ The Dream model supports four different sampling algorithms controlled by the `-
 3. **TOPK_MARGIN (2)** - Top-K margin-based sampling
    - Confidence calculated as the margin between top-1 and top-2 probabilities
 
-4. **ENTROPY (3)** - Entropy-based sampling (recommended)
+4. **ENTROPY (3)** - Entropy-based sampling (default, recommended)
    - Uses entropy calculation for confidence estimation
 
-### 2. LLaDA-8B Model (`llama-diffusion-llada-cli`)
+### 2. LLaDA
 
+Example models:
 - https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct
+- PR: https://github.com/ggml-org/llama.cpp/pull/14771
 
 ### LLaDA Model Remasking Strategies
 
-The LLaDA model uses two remasking approaches controlled by the `--diffusion-algorithm` parameter:
+The LLaDA model uses two remasking approaches controlled by the `--diffusion-llada-algorithm` parameter:
 
 1.
**REMASKING_LOW_CONFIDENCE (0)** - Default strategy - Remasks tokens with lowest confidence scores diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp new file mode 100644 index 0000000000000..a9ebaf0213846 --- /dev/null +++ b/examples/diffusion/diffusion-cli.cpp @@ -0,0 +1,876 @@ +#include "arg.h" +#include "chat.h" +#include "common.h" +#include "llama.h" +#include "log.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +// Dream remasking algorithms +enum diffusion_algorithm_dream { + ORIGIN = 0, + MASKGIT_PLUS = 1, + TOPK_MARGIN = 2, + ENTROPY = 3, +}; + +// LLaDA remasking types +enum diffusion_algorithm_llada { + LOW_CONFIDENCE = 0, + RANDOM = 1, +}; + +typedef bool (*diffusion_step_callback_t)(int32_t step, + int32_t total_steps, + const llama_token * tokens, + int32_t n_tokens, + void * user_data); + +// Unified diffusion parameters structure +struct diffusion_params { + diffusion_params() { + steps = 128; + temperature = 0.2f; + mask_token_id = LLAMA_TOKEN_NULL; + step_callback = nullptr; + step_callback_user_data = nullptr; + seed = 0; + } + + int32_t steps; + float temperature; + llama_token mask_token_id; + diffusion_step_callback_t step_callback; + void * step_callback_user_data; + int32_t seed; + bool visual_mode; +}; + +struct dream_diffusion_params : diffusion_params { + float eps; + float top_p; + int32_t top_k; + enum diffusion_algorithm_dream algorithm; + float alg_temp; +}; + +struct llada_diffusion_params : diffusion_params { + int32_t max_length; + int32_t block_length; + float cfg_scale; + enum diffusion_algorithm_llada algorithm; +}; + +static dream_diffusion_params default_params_dream() { + dream_diffusion_params params = {}; + + // Dream defaults + params.eps = 1e-3f; + params.top_p = 0.95f; + params.top_k = 0; + params.algorithm = diffusion_algorithm_dream::ENTROPY; + params.alg_temp = 0.0f; + + return params; +} + +static llada_diffusion_params default_params_llada() { + llada_diffusion_params params = {}; + + params.max_length = 128; + params.block_length = 32; + params.cfg_scale = 0; + params.algorithm = diffusion_algorithm_llada::LOW_CONFIDENCE; + + return params; +} + +struct callback_data { + diffusion_params * diff_params; + const llama_vocab * vocab; + int32_t n_input; +}; + +static bool diffusion_step_callback(int32_t step, + int32_t total_steps, + const llama_token * tokens, + int32_t n_tokens, + void * user_data) { + (void) user_data; + + callback_data * data = static_cast(user_data); + + auto print_progress_bar = [](int32_t step, int32_t total_steps) { + int progress_percent = (step * 100) / total_steps; + int progress_bars = (step * 50) / total_steps; + LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", + step, + total_steps, + std::string(progress_bars, '=').c_str(), + std::string(50 - progress_bars, ' ').c_str(), + progress_percent); + }; + + if (data->diff_params->visual_mode) { + // Visual mode: clear + LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left + + print_progress_bar(step, total_steps); + + LOG_INF("\n"); + + std::string current_text = " "; + + for (int32_t i = data->n_input; i < n_tokens; i++) { + std::string token_str; + if (tokens[i] != llama_vocab_mask(data->vocab)) { + char piece[256]; + int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); + if (n_chars > 0) { + piece[n_chars] = '\0'; + token_str = piece; + } + } else { + token_str = " "; + } + + current_text += token_str; + } + + LOG_INF("%s\n", 
current_text.c_str()); + } else { + print_progress_bar(step, total_steps); + } + + return true; +} + +// Helper functions for LLaDA diffusion +static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) { + if (temperature == 0.0f) { + return; + } + + std::uniform_real_distribution uniform(0.0, 1.0); + for (int32_t i = 0; i < n_vocab; i++) { + double noise = uniform(rng); + // Prevent log(0) + noise = std::max(noise, 1e-20); + double gumbel_noise = std::pow(-std::log(noise), temperature); + logits[i] = std::exp(logits[i]) / gumbel_noise; + } +} + +static std::vector get_num_transfer_tokens(int32_t mask_count, int32_t steps) { + std::vector num_transfer_tokens(steps); + + int32_t base = mask_count / steps; + int32_t remainder = mask_count % steps; + + for (int32_t i = 0; i < steps; i++) { + num_transfer_tokens[i] = base + (i < remainder ? 1 : 0); + } + + return num_transfer_tokens; +} + +//End helper functions for LLaDA diffusion + +static void diffusion_generate_dream(llama_context * ctx, + const llama_token * input_tokens, + llama_token * output_tokens, + int32_t n_input, + int32_t max_length, + const dream_diffusion_params & params, + int32_t & n_generated) { + n_generated = 0; + if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) { + return; + } + + const llama_model * model = llama_get_model(ctx); + + // Initialize with input and pad with mask tokens + std::copy(input_tokens, input_tokens + n_input, output_tokens); + std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id); + + std::mt19937 rng(params.seed); + + std::vector timesteps(params.steps + 1); + for (int32_t i = 0; i <= params.steps; i++) { + timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps); + } + + llama_set_causal_attn(ctx, false); + + int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); + + std::vector candidates(n_vocab); + + std::vector conf_candidates; + conf_candidates.reserve(max_length); + + std::vector mask_positions; + mask_positions.reserve(max_length); + + struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params()); + if (params.top_k > 0) { + llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k)); + } + if (params.top_p < 1.0f) { + llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1)); + } + if (params.temperature > 0.0f) { + llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature)); + } + llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed)); + + struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed); + + llama_batch batch = llama_batch_init(max_length, 0, 1); + batch.n_tokens = max_length; + + int64_t total_sampling_time = 0; + int64_t total_time = 0; + + int64_t time_start = ggml_time_us(); + for (int32_t step = 0; step < params.steps; step++) { + if (params.step_callback) { + if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) { + break; + } + } + + for (int32_t i = 0; i < max_length; i++) { + batch.token[i] = output_tokens[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret); + break; + } + + float * raw_logits = llama_get_logits(ctx); + if (!raw_logits) { + LOG_ERR("%s: failed to get logits at step %d\n", 
__func__, step); + break; + } + + auto get_logits_for_pos = [&](int32_t pos) -> const float * { + return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab; + }; + + int64_t time_start_sampling = ggml_time_us(); + + mask_positions.clear(); + for (int32_t i = 0; i < max_length; i++) { + if (output_tokens[i] == params.mask_token_id) { + mask_positions.push_back(i); + } + } + + if (mask_positions.empty()) { + break; + } + + float t = timesteps[step]; + float s = timesteps[step + 1]; + + if (params.algorithm == diffusion_algorithm_dream::ORIGIN) { + float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f; + + for (int32_t pos : mask_positions) { + if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { + const float * pos_logits = get_logits_for_pos(pos); + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].id = token_id; + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; + } + + llama_token_data_array cur_p = { + /* .data = */ candidates.data(), + /* .size = */ (size_t) n_vocab, // Reset size to full vocab + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(sampler, &cur_p); + output_tokens[pos] = cur_p.data[cur_p.selected].id; + } + } + } else { + std::vector> confidences; + std::vector sampled_tokens(mask_positions.size()); + + for (size_t i = 0; i < mask_positions.size(); i++) { + int32_t pos = mask_positions[i]; + const float * pos_logits = get_logits_for_pos(pos); + + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; + candidates[token_id].id = token_id; + } + + llama_token_data_array cur_p = { + /* .data = */ candidates.data(), + /* .size = */ candidates.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + llama_sampler_apply(sampler, &cur_p); + + llama_token sampled_token = cur_p.data[cur_p.selected].id; + + float confidence = 0.0f; + if (params.algorithm == diffusion_algorithm_dream::ENTROPY) { + const float epsilon = 1e-10f; + for (size_t j = 0; j < cur_p.size; j++) { + float prob = cur_p.data[j].p; + confidence += prob * logf(prob + epsilon); + } + } else if (params.algorithm == diffusion_algorithm_dream::TOPK_MARGIN) { + confidence = cur_p.data[0].p - cur_p.data[1].p; + } else { + confidence = cur_p.data[cur_p.selected].p; + } + + sampled_tokens[i] = sampled_token; + confidences.emplace_back(confidence, i); + } + + int32_t num_transfer = + (step < params.steps - 1) ? 
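+            // Editor's note on the expression below: on every step except the last, only a
+            // (1 - s/t) fraction of the currently masked positions is committed this iteration;
+            // the final step commits all remaining masked positions.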
(int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size(); + + if (num_transfer > 0) { + if (params.alg_temp == 0.0f) { + std::partial_sort(confidences.begin(), + confidences.begin() + num_transfer, + confidences.end(), + [](const std::pair & a, const std::pair & b) { + if (a.first != b.first) { + return a.first > b.first; + } + return a.second < b.second; + }); + } else { + conf_candidates.clear(); + + for (int32_t pos = 0; pos < max_length; pos++) { + float conf_logit = -std::numeric_limits::infinity(); + + auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); + if (it != mask_positions.end()) { + size_t mask_idx = std::distance(mask_positions.begin(), it); + conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling + } + + conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f }); + } + + llama_token_data_array conf_array = { + /* .data = */ conf_candidates.data(), + /* .size = */ conf_candidates.size(), + /* .selected = */ -1, + /* .sorted = */ false, + }; + + for (int32_t i = 0; i < num_transfer; i++) { + // Apply distribution sampler to get selected index + llama_sampler_apply(dist_sampler, &conf_array); + int selected_idx = conf_array.selected; + confidences[i].second = conf_candidates[selected_idx].id; + + conf_candidates[selected_idx].p = 0.0f; + conf_array.selected = -1; + } + } + + if (params.alg_temp == 0.0f) { + // Deterministic - use confidence order + for (int32_t i = 0; i < num_transfer; i++) { + int32_t mask_idx = confidences[i].second; + int32_t pos = mask_positions[mask_idx]; + llama_token token = sampled_tokens[mask_idx]; + output_tokens[pos] = token; + } + } else { + for (int32_t i = 0; i < num_transfer; i++) { + int32_t pos = confidences[i].second; + auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); + if (it != mask_positions.end()) { + int32_t mask_idx = std::distance(mask_positions.begin(), it); + output_tokens[pos] = sampled_tokens[mask_idx]; + } + } + } + } + } + int64_t time_end_sampling = ggml_time_us(); + total_sampling_time += time_end_sampling - time_start_sampling; + } + int64_t time_end = ggml_time_us(); + total_time += time_end - time_start; + + LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", + total_time / 1000.0, + total_time / 1000.0 / params.steps, + total_sampling_time / 1000.0 / params.steps); + + llama_batch_free(batch); + llama_sampler_free(sampler); + llama_sampler_free(dist_sampler); + + n_generated = max_length; +} + +static void diffusion_generate_llada(llama_context * ctx, + const llama_token * input_tokens, + llama_token * output_tokens, + int32_t n_input, + const llada_diffusion_params & params, + int32_t & n_generated) { + n_generated = 0; + if (!ctx || !input_tokens || !output_tokens || n_input <= 0) { + return; + } + + const llama_model * model = llama_get_model(ctx); + + std::vector in(params.max_length, params.mask_token_id); + std::copy(input_tokens, input_tokens + n_input, in.begin()); + + GGML_ASSERT(params.max_length % params.block_length == 0); + int num_blocks = params.max_length / params.block_length; + + GGML_ASSERT(params.steps % num_blocks == 0); + + int steps = params.steps / num_blocks; + + int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); + llama_set_causal_attn(ctx, false); + + // Pre-allocate buffers for Classifier-Free Guidance + int32_t logits_size = n_vocab * params.max_length; + std::vector cond_logits_buffer; + std::vector un_x_buffer; + if 
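+    // Editor's note: classifier-free guidance performs a second, unconditional decode per step
+    // (the prompt is replaced by mask tokens) and blends the results as
+    // uncond + (cfg_scale + 1) * (cond - uncond), so a full-size logits copy and a scratch
+    // token buffer are allocated once up front.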
(params.cfg_scale > 0.0f) { + cond_logits_buffer.resize(logits_size); + un_x_buffer.resize(params.max_length); + } + + llama_batch batch = llama_batch_init(params.max_length, 0, 1); + batch.n_tokens = params.max_length; + + std::vector argmax; + std::mt19937 rng(params.seed); + + int64_t total_sampling_time = 0; + int64_t total_time = 0; + + std::vector confidence(params.max_length); + + int64_t time_start = ggml_time_us(); + for (int block_num = 0; block_num < num_blocks; block_num++) { + // Get number of tokens to transfer for this step + int32_t block_start = n_input + block_num * params.block_length; + int32_t block_end = std::min(n_input + (block_num + 1) * params.block_length, params.max_length); + + // Count masked tokens in current block + int32_t block_mask_count = 0; + for (int i = block_start; i < block_end; i++) { + if (in[i] == params.mask_token_id) { + block_mask_count++; + } + } + auto num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps); + + for (int step = 0; step < steps; step++) { + if (params.step_callback) { + if (!params.step_callback(step + block_num * steps, + params.steps, + in.data(), + params.max_length, + params.step_callback_user_data)) { + break; + } + } + + float * logits = nullptr; + + if (params.cfg_scale > 0.0f) { + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = in[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate conditional"); + } + float * cond_logits_ptr = llama_get_logits(ctx); + std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float)); + + std::copy(in.begin(), in.end(), un_x_buffer.begin()); + for (int32_t i = 0; i < n_input; i++) { + un_x_buffer[i] = params.mask_token_id; + } + + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = un_x_buffer[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + ret = llama_decode(ctx, batch); + GGML_ASSERT(ret == 0); + float * uncond_logits = llama_get_logits(ctx); + for (int32_t i = 0; i < logits_size; i++) { + cond_logits_buffer[i] = + uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]); + } + + logits = cond_logits_buffer.data(); + } else { + // Standard generation without CFG + for (int32_t i = 0; i < batch.n_tokens; i++) { + batch.token[i] = in[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + + int ret = llama_decode(ctx, batch); + if (ret != 0) { + LOG_ERR("Failed to generate"); + } + logits = llama_get_logits(ctx); + } + + int64_t time_start_sampling = ggml_time_us(); + + if (params.temperature > 0.0f) { + add_gumbel_noise(logits, n_vocab, params.temperature, rng); + } + + argmax.clear(); + + for (int i = 0; i < params.max_length; ++i) { + float max_value = std::numeric_limits::min(); + llama_token tok = LLAMA_TOKEN_NULL; + for (int vob = 0; vob < n_vocab; vob++) { + if (logits[n_vocab * i + vob] > max_value) { + max_value = logits[n_vocab * i + vob]; + tok = vob; + } + } + argmax.push_back(tok); + } + + // Create mask index to track which positions are masked + std::vector mask_index(params.max_length); + for (int i = 0; i < params.max_length; i++) { + mask_index[i] = (in[i] == params.mask_token_id); + } + + if (params.algorithm == diffusion_algorithm_llada::LOW_CONFIDENCE) { + // inplace softmax + argmax calculation. 
TODO: check why llama_sampler is so slow here + for (int i = block_start; i < block_end; i++) { + if (mask_index[i]) { + float * pos_logits = logits + i * n_vocab; + + llama_token best_token = 0; + float max_logit = pos_logits[0]; + for (int32_t j = 1; j < n_vocab; j++) { + if (pos_logits[j] > max_logit) { + max_logit = pos_logits[j]; + best_token = j; + } + } + + float sum_exp = 0.0f; + for (int32_t j = 0; j < n_vocab; j++) { + sum_exp += std::exp(pos_logits[j] - max_logit); + } + + float prob = std::exp(pos_logits[best_token] - max_logit) / sum_exp; + confidence[i] = prob; + + argmax[i] = best_token; + } else { + confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions + } + } + } else if (params.algorithm == diffusion_algorithm_llada::RANDOM) { + // Random remasking: assign random values for masked positions + std::uniform_real_distribution uniform(0.0f, 1.0f); + for (int i = 0; i < params.max_length; i++) { + if (mask_index[i]) { + confidence[i] = uniform(rng); + } else { + confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions + } + } + } + + for (int i = n_input + (block_num + 1) * params.block_length; i < params.max_length; i++) { + confidence[i] = -std::numeric_limits::infinity(); + } + + int32_t transfer_count = num_transfer_tokens[step]; + + std::vector> conf_pairs; + for (int i = n_input; i < params.max_length; i++) { + if (mask_index[i] && confidence[i] > -std::numeric_limits::infinity()) { + conf_pairs.push_back({ confidence[i], i }); + } + } + + std::partial_sort(conf_pairs.begin(), + conf_pairs.begin() + std::min(transfer_count, (int32_t) conf_pairs.size()), + conf_pairs.end(), + [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + }); + + for (int i = 0; i < std::min(transfer_count, (int32_t) conf_pairs.size()); i++) { + int32_t pos = conf_pairs[i].second; + in[pos] = argmax[pos]; + } + + int64_t time_end_sampling = ggml_time_us(); + total_sampling_time += time_end_sampling - time_start_sampling; + } + } + int64_t time_end = ggml_time_us(); + total_time += time_end - time_start; + + LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", + total_time / 1000.0, + total_time / 1000.0 / params.steps, + total_sampling_time / 1000.0 / params.steps); + + llama_batch_free(batch); + + memcpy(output_tokens, in.data(), in.size() * sizeof(llama_token)); + + n_generated = params.max_length; +} + +static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { + if (!use_chat_template) { + return prompt; + } + + auto chat_templates = common_chat_templates_init(model, ""); + + common_chat_templates_inputs inputs; + common_chat_msg user_msg; + user_msg.role = "user"; + user_msg.content = prompt; + inputs.add_generation_prompt = true; + inputs.messages.push_back(user_msg); + + auto result = common_chat_templates_apply(chat_templates.get(), inputs); + + return result.prompt; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + common_params params; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION)) { + return 1; + } + + common_init(); + llama_backend_init(); + + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = params.n_gpu_layers; + model_params.devices = params.devices.data(); + model_params.use_mmap = params.use_mmap; + model_params.use_mlock = params.use_mlock; + model_params.check_tensors = params.check_tensors; + + llama_model * model = 
llama_model_load_from_file(params.model.path.c_str(), model_params); + if (!model) { + LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str()); + return 1; + } + + char arch_str[128]; + GGML_ASSERT(llama_model_meta_val_str(model, "general.architecture", arch_str, 128) >= 0); + + std::string arch = std::string(arch_str); + + if (arch != "dream" && arch != "llada") { + LOG_ERR("error: unsupported model architecture '%s' for diffusion. Expected 'dream' or 'llada'\n", arch_str); + llama_model_free(model); + return 1; + } + + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = params.n_ctx; + ctx_params.n_batch = params.n_batch; + ctx_params.n_ubatch = params.n_ubatch; + ctx_params.flash_attn = params.flash_attn; + ctx_params.no_perf = params.no_perf; + ctx_params.type_k = params.cache_type_k; + ctx_params.type_v = params.cache_type_v; + + llama_context * ctx = llama_init_from_model(model, ctx_params); + if (!ctx) { + LOG_ERR("error: failed to create context\n"); + llama_model_free(model); + return 1; + } + + llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); + + const llama_vocab * vocab = llama_model_get_vocab(model); + std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); + + std::vector input_tokens = common_tokenize(vocab, + formatted_prompt, + /*add special tokens*/ true, + /*parse special*/ true); + + // For LLaDA models, forcefully add BOS token at the beginning. TODO: check why + if (arch == "llada") { + llama_token bos_token = llama_vocab_bos(vocab); + if (bos_token != LLAMA_TOKEN_NULL && (input_tokens.empty() || input_tokens[0] != bos_token)) { + input_tokens.insert(input_tokens.begin(), bos_token); + } + } + + int n_input = input_tokens.size(); + + if (n_input >= params.n_ctx) { + LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx); + llama_free(ctx); + llama_model_free(model); + return 1; + } + + llama_token mask_token_id = llama_vocab_mask(vocab); + GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); + + bool visual_mode = params.diffusion.visual_mode; + + int32_t n_generated = 0; + std::vector output_tokens(params.n_ubatch); + + if (arch == "dream") { + struct dream_diffusion_params diff_params = default_params_dream(); + diff_params.mask_token_id = mask_token_id; + diff_params.seed = params.sampling.seed; + diff_params.temperature = params.sampling.temp; + + diff_params.steps = params.diffusion.steps; + diff_params.eps = params.diffusion.eps; + diff_params.top_p = params.sampling.top_p; + diff_params.top_k = params.sampling.top_k; + diff_params.algorithm = static_cast(params.diffusion.algorithm); + diff_params.alg_temp = params.diffusion.alg_temp; + diff_params.visual_mode = params.diffusion.visual_mode; + + diff_params.step_callback = diffusion_step_callback; + callback_data cb_data = { &diff_params, vocab, n_input }; + diff_params.step_callback_user_data = &cb_data; + + GGML_ASSERT(diff_params.algorithm >= 0 && diff_params.algorithm <= 3); + + const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" }; + const char * alg_name = alg_names[params.diffusion.algorithm]; + + LOG_INF("dream_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); + LOG_INF("dream_diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps); + LOG_INF("dream_diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps); + LOG_INF("dream_diffusion_params: - %-25s u32 = %d 
(%s)\n", + "algorithm", + params.diffusion.algorithm, + alg_name); + LOG_INF("dream_diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp); + + diffusion_generate_dream( + ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch, diff_params, n_generated); + } else { + // Use LLaDA parameters + struct llada_diffusion_params diff_params = default_params_llada(); + + diff_params.mask_token_id = mask_token_id; + diff_params.seed = params.sampling.seed; + diff_params.temperature = params.sampling.temp; + + diff_params.steps = params.diffusion.steps; + diff_params.max_length = params.n_ubatch; + diff_params.block_length = params.diffusion.block_length; + diff_params.cfg_scale = params.diffusion.cfg_scale; + diff_params.algorithm = static_cast(params.diffusion.remasking); + diff_params.visual_mode = params.diffusion.visual_mode; + + GGML_ASSERT(diff_params.algorithm >= 0 && diff_params.algorithm <= 1); + + const char * alg_names[] = { "LOW_CONFIDENCE", "RANDOM" }; + const char * alg_name = alg_names[diff_params.algorithm]; + + diff_params.step_callback = diffusion_step_callback; + callback_data cb_data = { &diff_params, vocab, n_input }; + diff_params.step_callback_user_data = &cb_data; + + LOG_INF("llada_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); + LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps); + LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length); + LOG_INF("llada_diffusion_params: - %-25s u32 = %d (%s)\n", + "algorithm", + params.diffusion.algorithm, + alg_name); + LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length); + LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature); + LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale); + + diffusion_generate_llada(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated); + } + + if (n_generated > 0) { + if (visual_mode) { + //clear screen and move cursor to top-left + LOG_INF("\033[2J\033[H"); + } + + output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input); + std::string output_data = common_detokenize(vocab, output_tokens, false); + LOG_INF("\n%s\n", output_data.c_str()); + } else { + LOG_INF("Error: diffusion generation failed\n"); + } + + llama_free(ctx); + llama_model_free(model); + llama_backend_free(); + + return 0; +} diff --git a/examples/diffusion/diffusion-dream-cli.cpp b/examples/diffusion/diffusion-dream-cli.cpp deleted file mode 100644 index 308024263b451..0000000000000 --- a/examples/diffusion/diffusion-dream-cli.cpp +++ /dev/null @@ -1,512 +0,0 @@ -#include "arg.h" -#include "chat.h" -#include "common.h" -#include "llama.h" -#include "log.h" - -#include -#include -#include -#include -#include -#include -#include - -typedef bool (*diffusion_step_callback_t)(int32_t step, - int32_t total_steps, - const llama_token * tokens, - int32_t n_tokens, - void * user_data); - -enum diffusion_alg { - DIFFUSION_ALG_ORIGIN = 0, - DIFFUSION_ALG_MASKGIT_PLUS = 1, - DIFFUSION_ALG_TOPK_MARGIN = 2, - DIFFUSION_ALG_ENTROPY = 3, -}; - -struct diffusion_params { - int32_t steps; - float eps; - float temperature; - float top_p; - int32_t top_k; - llama_token mask_token_id; - enum diffusion_alg algorithm; - float alg_temp; - diffusion_step_callback_t step_callback; - void * step_callback_user_data; - int32_t seed; -}; - - -static 
diffusion_params diffusion_default_params() { - diffusion_params params = {}; - params.steps = 64; - params.eps = 1e-3f; - params.temperature = 0.2f; - params.top_p = 0.95f; - params.top_k = 0; - params.mask_token_id = LLAMA_TOKEN_NULL; - params.algorithm = DIFFUSION_ALG_ORIGIN; - params.alg_temp = 0.0f; - params.step_callback = nullptr; - params.step_callback_user_data = nullptr; - params.seed = 0; - return params; -} - -static void diffusion_generate(llama_context * ctx, - const llama_token * input_tokens, - llama_token * output_tokens, - int32_t n_input, - int32_t max_length, - struct diffusion_params params, - int32_t & n_generated) { - - n_generated = 0; - if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) { - return; - } - - const llama_model * model = llama_get_model(ctx); - - // Initialize with input and pad with mask tokens - std::copy(input_tokens, input_tokens + n_input, output_tokens); - std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id); - - std::mt19937 rng(params.seed); - - std::vector timesteps(params.steps + 1); - for (int32_t i = 0; i <= params.steps; i++) { - timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps); - } - - llama_set_causal_attn(ctx, false); - - int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); - - std::vector candidates(n_vocab); - - std::vector conf_candidates; - conf_candidates.reserve(max_length); - - std::vector mask_positions; - mask_positions.reserve(max_length); - - struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params()); - if (params.top_k > 0) { - llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k)); - } - if (params.top_p < 1.0f) { - llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1)); - } - if (params.temperature > 0.0f) { - llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature)); - } - llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed)); - - struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed); - - llama_batch batch = llama_batch_init(max_length, 0, 1); - batch.n_tokens = max_length; - - int64_t total_sampling_time = 0; - int64_t total_time = 0; - - int64_t time_start = ggml_time_us(); - for (int32_t step = 0; step < params.steps; step++) { - if (params.step_callback) { - if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) { - break; - } - } - - for (int32_t i = 0; i < max_length; i++) { - batch.token[i] = output_tokens[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } - - int ret = llama_decode(ctx, batch); - if (ret != 0) { - LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret); - break; - } - - float * raw_logits = llama_get_logits(ctx); - if (!raw_logits) { - LOG_ERR("%s: failed to get logits at step %d\n", __func__, step); - break; - } - - auto get_logits_for_pos = [&](int32_t pos) -> const float * { - return pos == 0 ? 
raw_logits : raw_logits + (pos - 1) * n_vocab; - }; - - int64_t time_start_sampling = ggml_time_us(); - - mask_positions.clear(); - for (int32_t i = 0; i < max_length; i++) { - if (output_tokens[i] == params.mask_token_id) { - mask_positions.push_back(i); - } - } - - if (mask_positions.empty()) { - break; - } - - float t = timesteps[step]; - float s = timesteps[step + 1]; - - if (params.algorithm == DIFFUSION_ALG_ORIGIN) { - float p_transfer = (step < params.steps - 1) ? (1.0f - s / t) : 1.0f; - - for (int32_t pos : mask_positions) { - if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { - const float * pos_logits = get_logits_for_pos(pos); - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].id = token_id; - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - } - - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ (size_t) n_vocab, // Reset size to full vocab - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(sampler, &cur_p); - output_tokens[pos] = cur_p.data[cur_p.selected].id; - } - } - } else { - std::vector> confidences; - std::vector sampled_tokens(mask_positions.size()); - - for (size_t i = 0; i < mask_positions.size(); i++) { - int32_t pos = mask_positions[i]; - const float * pos_logits = get_logits_for_pos(pos); - - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - candidates[token_id].id = token_id; - } - - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(sampler, &cur_p); - - llama_token sampled_token = cur_p.data[cur_p.selected].id; - - float confidence = 0.0f; - if (params.algorithm == DIFFUSION_ALG_ENTROPY) { - const float epsilon = 1e-10f; - for (size_t j = 0; j < cur_p.size; j++) { - float prob = cur_p.data[j].p; - confidence += prob * logf(prob + epsilon); - } - } else if (params.algorithm == DIFFUSION_ALG_TOPK_MARGIN) { - confidence = cur_p.data[0].p - cur_p.data[1].p; - } else { - confidence = cur_p.data[cur_p.selected].p; - } - - sampled_tokens[i] = sampled_token; - confidences.emplace_back(confidence, i); - } - - int32_t num_transfer = - (step < params.steps - 1) ? 
(int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size(); - - if (num_transfer > 0) { - if (params.alg_temp == 0.0f) { - std::partial_sort(confidences.begin(), confidences.begin() + num_transfer, confidences.end(), - [](const std::pair & a, const std::pair & b) { - if (a.first != b.first) { - return a.first > b.first; - } - return a.second < b.second; - }); - } else { - conf_candidates.clear(); - - for (int32_t pos = 0; pos < max_length; pos++) { - float conf_logit = -std::numeric_limits::infinity(); - - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - size_t mask_idx = std::distance(mask_positions.begin(), it); - conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling - } - - conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f }); - } - - llama_token_data_array conf_array = { - /* .data = */ conf_candidates.data(), - /* .size = */ conf_candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - for (int32_t i = 0; i < num_transfer; i++) { - // Apply distribution sampler to get selected index - llama_sampler_apply(dist_sampler, &conf_array); - int selected_idx = conf_array.selected; - confidences[i].second = conf_candidates[selected_idx].id; - - conf_candidates[selected_idx].p = 0.0f; - conf_array.selected = -1; - } - } - - if (params.alg_temp == 0.0f) { - // Deterministic - use confidence order - for (int32_t i = 0; i < num_transfer; i++) { - int32_t mask_idx = confidences[i].second; - int32_t pos = mask_positions[mask_idx]; - llama_token token = sampled_tokens[mask_idx]; - output_tokens[pos] = token; - } - } else { - for (int32_t i = 0; i < num_transfer; i++) { - int32_t pos = confidences[i].second; - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - int32_t mask_idx = std::distance(mask_positions.begin(), it); - output_tokens[pos] = sampled_tokens[mask_idx]; - } - } - } - } - } - int64_t time_end_sampling = ggml_time_us(); - total_sampling_time += time_end_sampling - time_start_sampling; - } - int64_t time_end = ggml_time_us(); - total_time += time_end - time_start; - - LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", - total_time / 1000.0, total_time / 1000.0 / params.steps, total_sampling_time / 1000.0 / params.steps); - - - llama_batch_free(batch); - llama_sampler_free(sampler); - llama_sampler_free(dist_sampler); - - n_generated = max_length; -} - - - - -static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { - if (!use_chat_template) { - return prompt; - } - - auto chat_templates = common_chat_templates_init(model, ""); - - common_chat_templates_inputs inputs; - common_chat_msg user_msg; - user_msg.role = "user"; - user_msg.content = prompt; - inputs.add_generation_prompt = true; - inputs.messages.push_back(user_msg); - - auto result = common_chat_templates_apply(chat_templates.get(), inputs); - - return result.prompt; -} - -struct callback_data { - const common_params_diffusion_dream * diff_params; - const llama_vocab * vocab; - int32_t n_input; -}; - -static bool diffusion_step_callback(int32_t step, - int32_t total_steps, - const llama_token * tokens, - int32_t n_tokens, - void * user_data) { - (void)user_data; - - callback_data * data = static_cast(user_data); - - auto print_progress_bar = [](int32_t step, int32_t total_steps) { - int progress_percent = (step * 100) / 
total_steps; - int progress_bars = (step * 50) / total_steps; - LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", - step, - total_steps, - std::string(progress_bars, '=').c_str(), - std::string(50 - progress_bars, ' ').c_str(), - progress_percent); - }; - - if (data->diff_params->visual_mode) { - // Visual mode: clear - LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left - - print_progress_bar(step, total_steps); - - LOG_INF("\n"); - - std::string current_text = " "; - - for (int32_t i = data->n_input; i < n_tokens; i++) { - std::string token_str; - if (tokens[i] != llama_vocab_mask(data->vocab)) { - char piece[256]; - int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); - if (n_chars > 0) { - piece[n_chars] = '\0'; - token_str = piece; - } - } else { - token_str = " "; - } - - current_text += token_str; - } - - LOG_INF("%s\n", current_text.c_str()); - } else { - print_progress_bar(step, total_steps); - } - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION_DREAM)) { - return 1; - } - - const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" }; - const char * alg_name = (params.diffusion_dream.algorithm >= 0 && params.diffusion_dream.algorithm <= 3) ? - alg_names[params.diffusion_dream.algorithm] : - "UNKNOWN"; - - common_init(); - llama_backend_init(); - - llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = params.n_gpu_layers; - model_params.devices = params.devices.data(); - model_params.use_mmap = params.use_mmap; - model_params.use_mlock = params.use_mlock; - model_params.check_tensors = params.check_tensors; - - llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); - if (!model) { - LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str()); - return 1; - } - - // Check if the model architecture is Dream - char arch_str[128]; - GGML_ASSERT(llama_model_meta_val_str(model, "general.architecture", arch_str, 128) >= 0 && - std::string(arch_str) == "dream"); - - llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = params.n_ctx; - ctx_params.n_batch = params.n_batch; - ctx_params.n_ubatch = params.n_ubatch; - ctx_params.flash_attn = params.flash_attn; - ctx_params.no_perf = params.no_perf; - ctx_params.type_k = params.cache_type_k; - ctx_params.type_v = params.cache_type_v; - - llama_context * ctx = llama_init_from_model(model, ctx_params); - if (!ctx) { - LOG_ERR("error: failed to create context\n"); - llama_model_free(model); - return 1; - } - - llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); - - const llama_vocab * vocab = llama_model_get_vocab(model); - std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); - - std::vector input_tokens = common_tokenize(vocab, formatted_prompt, - /*add special tokens*/ true, - /*parse special*/ true); - int n_input = input_tokens.size(); - - if (n_input >= params.n_ctx) { - LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx); - llama_free(ctx); - llama_model_free(model); - return 1; - } - - struct diffusion_params ldiff_params = diffusion_default_params(); - ldiff_params.steps = params.diffusion_dream.steps; - ldiff_params.eps = params.diffusion_dream.eps; - ldiff_params.temperature = params.sampling.temp; - 
ldiff_params.top_p = params.sampling.top_p; - ldiff_params.top_k = params.sampling.top_k; - ldiff_params.algorithm = static_cast(params.diffusion_dream.algorithm); - ldiff_params.alg_temp = params.diffusion_dream.alg_temp; - ldiff_params.seed = params.sampling.seed; - - llama_token mask_token_id = llama_vocab_mask(vocab); - GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); - - LOG_INF("dream_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); - LOG_INF("dream_diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion_dream.steps); - LOG_INF("dream_diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion_dream.eps); - LOG_INF("dream_diffusion_params: - %-25s u32 = %d (%s)\n", "algorithm", params.diffusion_dream.algorithm, - alg_name); - LOG_INF("dream_diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion_dream.alg_temp); - - ldiff_params.mask_token_id = mask_token_id; - - callback_data cb_data = { ¶ms.diffusion_dream, vocab, n_input }; - - ldiff_params.step_callback = diffusion_step_callback; - ldiff_params.step_callback_user_data = &cb_data; - - int32_t n_generated = 0; - - std::vector output_tokens(params.n_ubatch); - diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch, - ldiff_params, n_generated); - - if (n_generated > 0) { - if (params.diffusion_dream.visual_mode) { - //clear screen and move cursor to top-left - LOG_INF("\033[2J\033[H"); - } - output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input); - std::string output_data = common_detokenize(vocab, output_tokens, false); - LOG_INF("\n%s\n", output_data.c_str()); - } else { - LOG_INF("Error: diffusion generation failed\n"); - } - - llama_free(ctx); - llama_model_free(model); - llama_backend_free(); - - return 0; -} diff --git a/examples/diffusion/diffusion-llada-cli.cpp b/examples/diffusion/diffusion-llada-cli.cpp deleted file mode 100644 index cab53eef2174a..0000000000000 --- a/examples/diffusion/diffusion-llada-cli.cpp +++ /dev/null @@ -1,505 +0,0 @@ -#include "arg.h" -#include "chat.h" -#include "common.h" -#include "llama.h" -#include "log.h" - - -#include -#include -#include -#include -#include -#include -#include - -enum remasking_type { - REMASKING_LOW_CONFIDENCE = 0, - REMASKING_RANDOM = 1, -}; - -struct diffusion_params_llada { - int32_t steps; - int32_t max_length; - int32_t block_length; - float temperature; - float cfg_scale; - llama_token mask_token_id; - enum remasking_type remasking; - bool (*step_callback)(int32_t step, int32_t total_steps, const llama_token * tokens, int32_t n_tokens, void * user_data); - void * step_callback_user_data; - int32_t seed; -}; - -static diffusion_params_llada diffusion_default_params_llada() { - diffusion_params_llada params = {}; - params.steps = 128; - params.max_length = 256; - params.block_length = 32; - params.temperature = 0.0f; - params.cfg_scale = 0.2f; - params.mask_token_id = LLAMA_TOKEN_NULL; - params.remasking = REMASKING_LOW_CONFIDENCE; - params.step_callback = nullptr; - params.step_callback_user_data = nullptr; - params.seed = 0; - return params; -} - -static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) { - if (temperature == 0.0f) { - return; - } - - std::uniform_real_distribution uniform(0.0, 1.0); - for (int32_t i = 0; i < n_vocab; i++) { - double noise = uniform(rng); - // Prevent log(0) - noise = std::max(noise, 1e-20); - double gumbel_noise = std::pow(-std::log(noise), temperature); - logits[i] = 
std::exp(logits[i]) / gumbel_noise; - } -} - -static std::vector get_num_transfer_tokens(int32_t mask_count, int32_t steps) { - std::vector num_transfer_tokens(steps); - - int32_t base = mask_count / steps; - int32_t remainder = mask_count % steps; - - for (int32_t i = 0; i < steps; i++) { - num_transfer_tokens[i] = base + (i < remainder ? 1 : 0); - } - - return num_transfer_tokens; -} - -static void diffusion_generate_llada(llama_context * ctx, - const llama_token * input_tokens, - llama_token * output_tokens, - int32_t n_input, - struct diffusion_params_llada params, - int32_t & n_generated) { - n_generated = 0; - if (!ctx || !input_tokens || !output_tokens || n_input <= 0) { - return; - } - - const llama_model * model = llama_get_model(ctx); - - std::vector in(params.max_length, params.mask_token_id); - std::copy(input_tokens, input_tokens + n_input, in.begin()); - - GGML_ASSERT(params.max_length % params.block_length == 0); - int num_blocks = params.max_length / params.block_length; - - GGML_ASSERT(params.steps % num_blocks == 0); - - int steps = params.steps / num_blocks; - - int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); - llama_set_causal_attn(ctx, false); - - // Pre-allocate buffers for Classifier-Free Guidance - int32_t logits_size = n_vocab * params.max_length; - std::vector cond_logits_buffer; - std::vector un_x_buffer; - if (params.cfg_scale > 0.0f) { - cond_logits_buffer.resize(logits_size); - un_x_buffer.resize(params.max_length); - } - - llama_batch batch = llama_batch_init(params.max_length, 0, 1); - batch.n_tokens = params.max_length; - - std::vector argmax; - std::mt19937 rng(params.seed); - - int64_t total_sampling_time = 0; - int64_t total_time = 0; - - std::vector confidence(params.max_length); - - int64_t time_start = ggml_time_us(); - for (int block_num = 0; block_num < num_blocks; block_num++) { - // Get number of tokens to transfer for this step - int32_t block_start = n_input + block_num * params.block_length; - int32_t block_end = std::min(n_input + (block_num + 1) * params.block_length, params.max_length); - - // Count masked tokens in current block - int32_t block_mask_count = 0; - for (int i = block_start; i < block_end; i++) { - if (in[i] == params.mask_token_id) { - block_mask_count++; - } - } - auto num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps); - - for (int step = 0; step < steps; step++) { - if (params.step_callback) { - if (!params.step_callback(step + block_num * steps, - params.steps, in.data(), - params.max_length, - params.step_callback_user_data)) { - break; - } - } - - float * logits = nullptr; - - if (params.cfg_scale > 0.0f) { - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = in[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } - - int ret = llama_decode(ctx, batch); - if (ret != 0) { - LOG_ERR("Failed to generate conditional"); - } - float * cond_logits_ptr = llama_get_logits(ctx); - std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float)); - - std::copy(in.begin(), in.end(), un_x_buffer.begin()); - for (int32_t i = 0; i < n_input; i++) { - un_x_buffer[i] = params.mask_token_id; - } - - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = un_x_buffer[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } - ret = llama_decode(ctx, batch); - GGML_ASSERT(ret == 0); - float * uncond_logits = llama_get_logits(ctx); - for (int32_t i = 0; i < 
logits_size; i++) { - cond_logits_buffer[i] = - uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]); - } - - logits = cond_logits_buffer.data(); - } else { - // Standard generation without CFG - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = in[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } - - int ret = llama_decode(ctx, batch); - if (ret != 0) { - LOG_ERR("Failed to generate"); - } - logits = llama_get_logits(ctx); - } - - int64_t time_start_sampling = ggml_time_us(); - - if (params.temperature > 0.0f) { - add_gumbel_noise(logits, n_vocab, params.temperature, rng); - } - - argmax.clear(); - - for (int i = 0; i < params.max_length; ++i) { - float max_value = std::numeric_limits::min(); - llama_token tok = LLAMA_TOKEN_NULL; - for (int vob = 0; vob < n_vocab; vob++) { - if (logits[n_vocab * i + vob] > max_value) { - max_value = logits[n_vocab * i + vob]; - tok = vob; - } - } - argmax.push_back(tok); - } - - // Create mask index to track which positions are masked - std::vector mask_index(params.max_length); - for (int i = 0; i < params.max_length; i++) { - mask_index[i] = (in[i] == params.mask_token_id); - } - - if (params.remasking == REMASKING_LOW_CONFIDENCE) { - // inplace softmax + argmax calculation. TODO: check why llama_sampler is so slow here - for (int i = block_start; i < block_end; i++) { - if (mask_index[i]) { - float * pos_logits = logits + i * n_vocab; - - llama_token best_token = 0; - float max_logit = pos_logits[0]; - for (int32_t j = 1; j < n_vocab; j++) { - if (pos_logits[j] > max_logit) { - max_logit = pos_logits[j]; - best_token = j; - } - } - - float sum_exp = 0.0f; - for (int32_t j = 0; j < n_vocab; j++) { - sum_exp += std::exp(pos_logits[j] - max_logit); - } - - float prob = std::exp(pos_logits[best_token] - max_logit) / sum_exp; - confidence[i] = prob; - - argmax[i] = best_token; - } else { - confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions - } - } - } else if (params.remasking == REMASKING_RANDOM) { - // Random remasking: assign random values for masked positions - std::uniform_real_distribution uniform(0.0f, 1.0f); - for (int i = 0; i < params.max_length; i++) { - if (mask_index[i]) { - confidence[i] = uniform(rng); - } else { - confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions - } - } - } - - for (int i = n_input + (block_num + 1) * params.block_length; i < params.max_length; i++) { - confidence[i] = -std::numeric_limits::infinity(); - } - - int32_t transfer_count = num_transfer_tokens[step]; - - std::vector> conf_pairs; - for (int i = n_input; i < params.max_length; i++) { - if (mask_index[i] && confidence[i] > -std::numeric_limits::infinity()) { - conf_pairs.push_back({ confidence[i], i }); - } - } - - std::partial_sort( - conf_pairs.begin(), conf_pairs.begin() + std::min(transfer_count, (int32_t) conf_pairs.size()), - conf_pairs.end(), [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); - - for (int i = 0; i < std::min(transfer_count, (int32_t) conf_pairs.size()); i++) { - int32_t pos = conf_pairs[i].second; - in[pos] = argmax[pos]; - } - - int64_t time_end_sampling = ggml_time_us(); - total_sampling_time += time_end_sampling - time_start_sampling; - } - } - int64_t time_end = ggml_time_us(); - total_time += time_end - time_start; - - LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", total_time / 1000.0, - total_time / 1000.0 / 
params.steps, total_sampling_time / 1000.0 / params.steps); - - llama_batch_free(batch); - - memcpy(output_tokens, in.data(), in.size() * sizeof(llama_token)); - - n_generated = params.max_length; -} - -static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) { - if (!use_chat_template) { - return prompt; - } - - auto chat_templates = common_chat_templates_init(model, ""); - - common_chat_templates_inputs inputs; - common_chat_msg user_msg; - user_msg.role = "user"; - user_msg.content = prompt; - inputs.add_generation_prompt = true; - inputs.messages.push_back(user_msg); - - auto result = common_chat_templates_apply(chat_templates.get(), inputs); - - return result.prompt; -} - -struct callback_data { - const common_params_diffusion_llada * diff_params; - const llama_vocab * vocab; - int32_t n_input; -}; - -static bool diffusion_step_callback(int32_t step, - int32_t total_steps, - const llama_token * tokens, - int32_t n_tokens, - void * user_data) { - callback_data * data = static_cast(user_data); - - auto print_progress_bar = [](int32_t step, int32_t total_steps) { - int progress_percent = (step * 100) / total_steps; - int progress_bars = (step * 50) / total_steps; - LOG_INF("\rdiffusion step: %d/%d [%s%s] %d%%", - step, - total_steps, - std::string(progress_bars, '=').c_str(), - std::string(50 - progress_bars, ' ').c_str(), - progress_percent); - }; - - if (data->diff_params->visual_mode) { - // Visual mode: clear - LOG_INF("\033[2J\033[H"); // Clear screen and move cursor to top-left - - print_progress_bar(step, total_steps); - - LOG_INF("\n"); - - std::string current_text = " "; - - for (int32_t i = data->n_input; i < n_tokens; i++) { - std::string token_str; - if (tokens[i] != llama_vocab_mask(data->vocab)) { - char piece[256]; - int n_chars = llama_token_to_piece(data->vocab, tokens[i], piece, sizeof(piece), 0, false); - if (n_chars > 0) { - piece[n_chars] = '\0'; - token_str = piece; - } - } else { - token_str = " "; - } - - current_text += token_str; - } - - LOG_INF("%s\n", current_text.c_str()); - } else { - print_progress_bar(step, total_steps); - } - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DIFFUSION_LLADA)) { - return 1; - } - - common_init(); - llama_backend_init(); - - llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = params.n_gpu_layers; - model_params.devices = params.devices.data(); - model_params.use_mmap = params.use_mmap; - model_params.use_mlock = params.use_mlock; - model_params.check_tensors = params.check_tensors; - - llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); - if (!model) { - LOG_ERR("error: failed to load model '%s'\n", params.model.path.c_str()); - return 1; - } - - char arch_str[128]; - GGML_ASSERT(llama_model_meta_val_str(model, "general.architecture", arch_str, 128) >= 0 && - std::string(arch_str) == "llada"); - - llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = params.n_ctx; - ctx_params.n_batch = params.n_batch; - ctx_params.n_ubatch = params.n_ubatch; - ctx_params.flash_attn = params.flash_attn; - ctx_params.no_perf = params.no_perf; - ctx_params.type_k = params.cache_type_k; - ctx_params.type_v = params.cache_type_v; - - llama_context * ctx = llama_init_from_model(model, ctx_params); - if (!ctx) { - LOG_ERR("error: failed to create context\n"); - 
llama_model_free(model); - return 1; - } - - llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads); - - const llama_vocab * vocab = llama_model_get_vocab(model); - std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model); - - std::vector input_tokens = common_tokenize(vocab, formatted_prompt, - /*add special tokens*/ true, - /*parse special*/ true); - - // For LLaDA models, forcefully add BOS token at the beginning. TODO: check why this is needed vs HF - llama_token bos_token = llama_vocab_bos(vocab); - if (bos_token != LLAMA_TOKEN_NULL && (input_tokens.empty() || input_tokens[0] != bos_token)) { - input_tokens.insert(input_tokens.begin(), bos_token); - } - - int n_input = input_tokens.size(); - - if (n_input >= params.n_ctx) { - LOG_ERR("error: input too long (%d tokens), max context is %d\n", n_input, params.n_ctx); - llama_free(ctx); - llama_model_free(model); - return 1; - } - - llama_token mask_token_id = llama_vocab_mask(vocab); - GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL); - - diffusion_params_llada llada_params = diffusion_default_params_llada(); - llada_params.steps = params.diffusion_llada.steps; - llada_params.block_length = params.diffusion_llada.block_length; - llada_params.temperature = params.sampling.temp; - llada_params.cfg_scale = params.diffusion_llada.cfg_scale; - llada_params.remasking = static_cast(params.diffusion_llada.remasking); - llada_params.mask_token_id = mask_token_id; - llada_params.seed = params.sampling.seed; - llada_params.max_length = params.n_ubatch; - - callback_data cb_data = { ¶ms.diffusion_llada, vocab, n_input }; - llada_params.step_callback = diffusion_step_callback; - llada_params.step_callback_user_data = &cb_data; - - LOG_INF("Using LLaDA diffusion generation\n"); - LOG_INF("llada_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); - LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "steps", llada_params.steps); - LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "max_length", llada_params.max_length); - LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "block_length", llada_params.block_length); - LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "temperature", llada_params.temperature); - LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", llada_params.cfg_scale); - - int32_t n_generated = 0; - std::vector output_tokens(params.n_ubatch); - - diffusion_generate_llada(ctx, input_tokens.data(), output_tokens.data(), n_input, llada_params, n_generated); - - if (n_generated > 0) { - if (params.diffusion_llada.visual_mode) { - //clear screen and move cursor to top-left - LOG_INF("\033[2J\033[H"); - } - - output_tokens.erase(output_tokens.begin(), output_tokens.begin() + n_input); - std::string output_data = common_detokenize(vocab, output_tokens, false); - LOG_INF("\n%s\n", output_data.c_str()); - } else { - LOG_INF("Error: diffusion generation failed\n"); - } - - llama_free(ctx); - llama_model_free(model); - llama_backend_free(); - - return 0; -} From 3e7efcba8adbb66151933cbe4dab9069371b2097 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 28 Jul 2025 14:20:44 +0800 Subject: [PATCH 06/12] Remove model-specific sampling --- common/arg.cpp | 19 +- common/common.h | 24 +- convert_hf_to_gguf.py | 6 + examples/diffusion/README.md | 44 +- examples/diffusion/diffusion-cli.cpp | 832 +++++++++++---------------- gguf-py/gguf/constants.py | 3 + gguf-py/gguf/gguf_writer.py | 5 + gguf-py/gguf/tensor_mapping.py | 9 - 
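For reference, the classifier-free guidance path in the generation loops above blends conditional and unconditional logits before sampling. Below is a minimal standalone sketch of that blend, written in plain C++ without the llama.cpp API; the logit values and the cfg_scale used here are illustrative only.

#include <cstdio>
#include <vector>

// Blend conditional and unconditional logits:
//   out[i] = uncond[i] + (cfg_scale + 1) * (cond[i] - uncond[i])
// cfg_scale == 0 leaves the conditional logits unchanged.
static void cfg_blend(const std::vector<float> & cond,
                      const std::vector<float> & uncond,
                      float cfg_scale,
                      std::vector<float> & out) {
    out.resize(cond.size());
    for (size_t i = 0; i < cond.size(); ++i) {
        out[i] = uncond[i] + (cfg_scale + 1.0f) * (cond[i] - uncond[i]);
    }
}

int main() {
    std::vector<float> cond   = { 2.0f, 0.5f, -1.0f };  // logits computed with the real prompt
    std::vector<float> uncond = { 1.0f, 0.5f, -0.5f };  // logits computed with the prompt masked out
    std::vector<float> blended;
    cfg_blend(cond, uncond, 0.2f, blended);             // cfg_scale = 0.2, the LLaDA default above
    for (float v : blended) {
        printf("%.2f\n", v);
    }
    return 0;
}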
src/llama-arch.cpp | 3 - 9 files changed, 360 insertions(+), 585 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 97902f41edb9f..74137d2db959d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3451,37 +3451,38 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( - { "--diffusion--dream-eps" }, "F", + { "--diffusion-eps" }, "F", string_format("epsilon for timesteps (default: %.6f)", (double) params.diffusion.eps), [](common_params & params, const std::string & value) { params.diffusion.eps = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( - { "--diffusion-dream-algorithm" }, "N", - string_format("diffusion algorithm: 0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY (default: %d)", + { "--diffusion-algorithm" }, "N", + string_format("diffusion algorithm: 0=ORIGIN, 1=ENTROPY_BASED, 2=MARGIN_BASED, 3=RANDOM, 4=LOW_CONFIDENCE (default: %d)", params.diffusion.algorithm), [](common_params & params, int value) { params.diffusion.algorithm = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( - { "--diffusion-dream-alg-temp" }, "F", + { "--diffusion-alg-temp" }, "F", string_format("dream algorithm temperature (default: %.3f)", (double) params.diffusion.alg_temp), [](common_params & params, const std::string & value) { params.diffusion.alg_temp = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( - { "--diffusion-llada-block-length" }, "N", + { "--diffusion-block-length" }, "N", string_format("llada block length for generation (default: %d)", params.diffusion.block_length), [](common_params & params, int value) { params.diffusion.block_length = value; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( - { "--diffusion-llada-cfg-scale" }, "F", + { "--diffusion-cfg-scale" }, "F", string_format("llada classifier-free guidance scale (default: %.3f)", (double) params.diffusion.cfg_scale), [](common_params & params, const std::string & value) { params.diffusion.cfg_scale = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( - { "--diffusion-llada-algorithm" }, "N", - string_format("llada remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM (default: %d)", params.diffusion.remasking), - [](common_params & params, int value) { params.diffusion.remasking = value; } + { "--diffusion-add-gumbel-noise" }, "F", + string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? 
"true" : "false"), + [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + return ctx_arg; } diff --git a/common/common.h b/common/common.h index 9da5954c787bd..a69a1d31b6d92 100644 --- a/common/common.h +++ b/common/common.h @@ -220,19 +220,17 @@ struct common_params_vocoder { }; struct common_params_diffusion { - // Common parameters - int32_t steps = 128; // number of diffusion steps - bool visual_mode = false; // show progressive diffusion on screen - - // Dream-specific parameters - float eps = 1e-3f; // epsilon for timesteps - int32_t algorithm = 3; // diffusion algorithm (0=ORIGIN, 1=MASKGIT_PLUS, 2=TOPK_MARGIN, 3=ENTROPY) - float alg_temp = 0.0f; // algorithm temperature - - // LLaDA-specific parameters - int32_t block_length = 32; // block length for generation - float cfg_scale = 0.2f; // classifier-free guidance scale - int32_t remasking = 1; // remasking algorithm: 0=LOW_CONFIDENCE, 1=RANDOM + int32_t steps = 128; + bool visual_mode = false; + + float eps = 0; // epsilon for timesteps + int32_t block_length = 32; // block length for generation + + int32_t algorithm = 4; // default algorithm: low-confidence + float alg_temp = 0.0f; // algorithm temperature + + float cfg_scale = 0; // classifier-free guidance scale + bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0 }; enum common_reasoning_format { diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f2465220e94a5..db3fc31d1a1e8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2988,9 +2988,15 @@ def set_gguf_parameters(self): # Add LLaDA-specific parameters mask_token_id = self.hparams.get("mask_token_id") + if mask_token_id is not None: self.gguf_writer.add_mask_token_id(mask_token_id) + self.gguf_writer.add_add_bos_token(True) + + logging.info("Adding diffusion shift logits to False") + self.gguf_writer.add_diffusion_shift_logits(False) + @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md index c497c39b0fe8b..38c4ad8d3809b 100644 --- a/examples/diffusion/README.md +++ b/examples/diffusion/README.md @@ -1,41 +1,7 @@ -# Diffusion Text Generation Examples +# Diffusion Text Generation -This directory contains implementations for diffusion-based text generation using two different model architectures: **Dream** and **LLaDA-8B**. Both models use iterative denoising processes to generate text, but employ different sampling strategies and algorithms. +This directory contains implementations for Diffusion LLMs (DLLMs) -## Supported Architechtures - -### 1. Dream - -Example models: -- https://huggingface.co/Dream-org/Dream-v0-Base-7B -- PR - https://github.com/ggml-org/llama.cpp/pull/14644 - -The Dream model supports four different sampling algorithms controlled by the `--diffusion-dream-algorithm` parameter: - -1. **ORIGIN (0)** - Original diffusion algorithm - - Uses probability transfer based on timestep ratios - -2. **MASKGIT_PLUS (1)** - Enhanced MaskGIT sampling - - Improved version of the MaskGIT algorithm - -3. **TOPK_MARGIN (2)** - Top-K margin-based sampling - - Confidence calculated as the margin between top-1 and top-2 probabilities - -4. **ENTROPY (3)** - Entropy-based sampling (default, recommended) - - Uses entropy calculation for confidence estimation - -### 2. 
LLaDA - -Example models: -- https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct -- PR: https://github.com/ggml-org/llama.cpp/pull/14771 - -### LLaDA Model Remasking Strategies - -The LLaDA model uses two remasking approaches controlled by the `--diffusion-llada-algorithm` parameter: - -1. **REMASKING_LOW_CONFIDENCE (0)** - Default strategy - - Remasks tokens with lowest confidence scores - - Uses softmax probabilities to determine confidence - -2. **REMASKING_RANDOM (1)** - Random remasking +More Info: +- https://github.com/ggml-org/llama.cpp/pull/14644 +- https://github.com/ggml-org/llama.cpp/pull/14771 diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index a9ebaf0213846..9a32b39e8b8e1 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -14,18 +14,12 @@ #include #include -// Dream remasking algorithms -enum diffusion_algorithm_dream { - ORIGIN = 0, - MASKGIT_PLUS = 1, - TOPK_MARGIN = 2, - ENTROPY = 3, -}; +enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 }; -// LLaDA remasking types -enum diffusion_algorithm_llada { - LOW_CONFIDENCE = 0, - RANDOM = 1, +// Unified transfer scheduling methods +enum transfer_schedule { + TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining + BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens }; typedef bool (*diffusion_step_callback_t)(int32_t step, @@ -34,71 +28,99 @@ typedef bool (*diffusion_step_callback_t)(int32_t step, int32_t n_tokens, void * user_data); -// Unified diffusion parameters structure struct diffusion_params { - diffusion_params() { - steps = 128; - temperature = 0.2f; - mask_token_id = LLAMA_TOKEN_NULL; - step_callback = nullptr; - step_callback_user_data = nullptr; - seed = 0; - } - - int32_t steps; - float temperature; - llama_token mask_token_id; - diffusion_step_callback_t step_callback; - void * step_callback_user_data; - int32_t seed; - bool visual_mode; + int32_t steps{}; + float temperature{}; + llama_token mask_token_id{}; + diffusion_step_callback_t step_callback{}; + void * step_callback_user_data{}; + int32_t seed{}; + bool visual_mode{}; + bool shift_logits{}; // Shift logits by -1 after decode + + float top_p{ 0 }; + int32_t top_k{ 0 }; + + diffusion_algorithm algorithm{ CONFIDENCE_BASED }; + transfer_schedule schedule{ TIMESTEP_BASED }; + + float cfg_scale{ 0 }; // Config scale for classifier-free guidance + float eps{ 0 }; // Timestep scheduling + int32_t block_length{ 0 }; // Block size (for block scheduling) + float alg_temp{ 0 }; // algorithm temperature (0.0 = deterministic) + bool add_gumbel_noise{ false }; // Add gumbel noise to the logits if temp > 0.0 + + int32_t max_length{}; // Maximum sequence length }; -struct dream_diffusion_params : diffusion_params { - float eps; - float top_p; - int32_t top_k; - enum diffusion_algorithm_dream algorithm; - float alg_temp; +struct callback_data { + diffusion_params * diff_params; + const llama_vocab * vocab; + int32_t n_input; }; -struct llada_diffusion_params : diffusion_params { - int32_t max_length; - int32_t block_length; - float cfg_scale; - enum diffusion_algorithm_llada algorithm; -}; +static float calculate_confidence(const llama_token_data_array & cur_p, + diffusion_algorithm algorithm, + std::mt19937 & rng) { + switch (algorithm) { + case CONFIDENCE_BASED: + return cur_p.data[cur_p.selected].p; // Selected token probability + + case ENTROPY_BASED: + { + float entropy = 0.0f; + const float 
epsilon = 1e-10f; + for (size_t i = 0; i < cur_p.size; i++) { + float prob = cur_p.data[i].p; + entropy += prob * logf(prob + epsilon); + } + return -entropy; // Higher entropy = lower confidence + } + + case MARGIN_BASED: + return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p; -static dream_diffusion_params default_params_dream() { - dream_diffusion_params params = {}; + case RANDOM: + { + std::uniform_real_distribution uniform(0.0f, 1.0f); + return uniform(rng); // Random confidence + } - // Dream defaults - params.eps = 1e-3f; - params.top_p = 0.95f; - params.top_k = 0; - params.algorithm = diffusion_algorithm_dream::ENTROPY; - params.alg_temp = 0.0f; + case ORIGIN: + return cur_p.data[cur_p.selected].p; - return params; + default: + return 0.0f; + } } -static llada_diffusion_params default_params_llada() { - llada_diffusion_params params = {}; +// Unified transfer count calculation function +static int32_t calculate_transfer_count(int32_t step, + int32_t total_steps, + int32_t remaining_masked, + transfer_schedule schedule, + float eps, + const std::vector & num_transfer_tokens = {}) { + switch (schedule) { + case TIMESTEP_BASED: + { + float t = 1.0f - (float) step / total_steps * (1.0f - eps); + float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps); + float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f; + return (int32_t) (remaining_masked * p_transfer); + } - params.max_length = 128; - params.block_length = 32; - params.cfg_scale = 0; - params.algorithm = diffusion_algorithm_llada::LOW_CONFIDENCE; + case BLOCK_BASED: + if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) { + return num_transfer_tokens[step]; + } + return remaining_masked / (total_steps - step); // Fallback - return params; + default: + return remaining_masked / (total_steps - step); + } } -struct callback_data { - diffusion_params * diff_params; - const llama_vocab * vocab; - int32_t n_input; -}; - static bool diffusion_step_callback(int32_t step, int32_t total_steps, const llama_token * tokens, @@ -153,7 +175,6 @@ static bool diffusion_step_callback(int32_t step, return true; } -// Helper functions for LLaDA diffusion static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) { if (temperature == 0.0f) { return; @@ -182,17 +203,14 @@ static std::vector get_num_transfer_tokens(int32_t mask_count, int32_t return num_transfer_tokens; } -//End helper functions for LLaDA diffusion - -static void diffusion_generate_dream(llama_context * ctx, - const llama_token * input_tokens, - llama_token * output_tokens, - int32_t n_input, - int32_t max_length, - const dream_diffusion_params & params, - int32_t & n_generated) { +static void diffusion_generate(llama_context * ctx, + const llama_token * input_tokens, + llama_token * output_tokens, + int32_t n_input, + const diffusion_params & params, + int32_t & n_generated) { n_generated = 0; - if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || max_length <= n_input) { + if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) { return; } @@ -200,27 +218,21 @@ static void diffusion_generate_dream(llama_context * ctx, // Initialize with input and pad with mask tokens std::copy(input_tokens, input_tokens + n_input, output_tokens); - std::fill(output_tokens + n_input, output_tokens + max_length, params.mask_token_id); + std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id); 
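The two schedules handled by calculate_transfer_count above decide how many masked positions get committed at each step. Below is a small self-contained sketch of both rules; the step counts, mask counts and eps value are made-up inputs for illustration, and in the real loop the remaining mask count shrinks as tokens are committed.

#include <cstdio>
#include <vector>

// TIMESTEP_BASED (Dream-style): commit a (1 - s/t) fraction of the remaining masks,
// where t and s are the current and next timestep on a linear schedule ending at eps.
static int timestep_transfer(int step, int total_steps, int remaining_masked, float eps) {
    float t = 1.0f - (float) step / total_steps * (1.0f - eps);
    float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
    float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
    return (int) (remaining_masked * p_transfer);
}

// BLOCK_BASED (LLaDA-style): split the block's masked tokens evenly over the steps,
// giving the first (mask_count % steps) steps one extra token.
static std::vector<int> block_transfer(int mask_count, int steps) {
    std::vector<int> out(steps, mask_count / steps);
    for (int i = 0; i < mask_count % steps; ++i) {
        out[i] += 1;
    }
    return out;
}

int main() {
    // 32 masked tokens, 8 steps, eps = 0.001; a constant 32 is passed only to show the per-step fraction
    for (int step = 0; step < 8; ++step) {
        printf("timestep step %d -> %d tokens\n", step, timestep_transfer(step, 8, 32, 0.001f));
    }
    for (int n : block_transfer(30, 8)) {  // 30 masked tokens over 8 steps -> 4,4,4,4,4,4,3,3
        printf("block step -> %d tokens\n", n);
    }
    return 0;
}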
std::mt19937 rng(params.seed); - std::vector timesteps(params.steps + 1); - for (int32_t i = 0; i <= params.steps; i++) { - timesteps[i] = 1.0f - (float) i / params.steps * (1.0f - params.eps); - } - llama_set_causal_attn(ctx, false); int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); std::vector candidates(n_vocab); - std::vector conf_candidates; - conf_candidates.reserve(max_length); - + conf_candidates.reserve(params.max_length); std::vector mask_positions; - mask_positions.reserve(max_length); + mask_positions.reserve(params.max_length); + // Setup sampler chain struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params()); if (params.top_k > 0) { llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k)); @@ -235,237 +247,10 @@ static void diffusion_generate_dream(llama_context * ctx, struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed); - llama_batch batch = llama_batch_init(max_length, 0, 1); - batch.n_tokens = max_length; - - int64_t total_sampling_time = 0; - int64_t total_time = 0; - - int64_t time_start = ggml_time_us(); - for (int32_t step = 0; step < params.steps; step++) { - if (params.step_callback) { - if (!params.step_callback(step, params.steps, output_tokens, max_length, params.step_callback_user_data)) { - break; - } - } - - for (int32_t i = 0; i < max_length; i++) { - batch.token[i] = output_tokens[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } - - int ret = llama_decode(ctx, batch); - if (ret != 0) { - LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, step, ret); - break; - } - - float * raw_logits = llama_get_logits(ctx); - if (!raw_logits) { - LOG_ERR("%s: failed to get logits at step %d\n", __func__, step); - break; - } - - auto get_logits_for_pos = [&](int32_t pos) -> const float * { - return pos == 0 ? raw_logits : raw_logits + (pos - 1) * n_vocab; - }; - - int64_t time_start_sampling = ggml_time_us(); - - mask_positions.clear(); - for (int32_t i = 0; i < max_length; i++) { - if (output_tokens[i] == params.mask_token_id) { - mask_positions.push_back(i); - } - } - - if (mask_positions.empty()) { - break; - } - - float t = timesteps[step]; - float s = timesteps[step + 1]; - - if (params.algorithm == diffusion_algorithm_dream::ORIGIN) { - float p_transfer = (step < params.steps - 1) ? 
(1.0f - s / t) : 1.0f; - - for (int32_t pos : mask_positions) { - if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { - const float * pos_logits = get_logits_for_pos(pos); - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].id = token_id; - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - } - - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ (size_t) n_vocab, // Reset size to full vocab - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(sampler, &cur_p); - output_tokens[pos] = cur_p.data[cur_p.selected].id; - } - } - } else { - std::vector> confidences; - std::vector sampled_tokens(mask_positions.size()); - - for (size_t i = 0; i < mask_positions.size(); i++) { - int32_t pos = mask_positions[i]; - const float * pos_logits = get_logits_for_pos(pos); - - for (int32_t token_id = 0; token_id < n_vocab; token_id++) { - candidates[token_id].logit = pos_logits[token_id]; - candidates[token_id].p = 0.0f; - candidates[token_id].id = token_id; - } - - llama_token_data_array cur_p = { - /* .data = */ candidates.data(), - /* .size = */ candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - llama_sampler_apply(sampler, &cur_p); - - llama_token sampled_token = cur_p.data[cur_p.selected].id; - - float confidence = 0.0f; - if (params.algorithm == diffusion_algorithm_dream::ENTROPY) { - const float epsilon = 1e-10f; - for (size_t j = 0; j < cur_p.size; j++) { - float prob = cur_p.data[j].p; - confidence += prob * logf(prob + epsilon); - } - } else if (params.algorithm == diffusion_algorithm_dream::TOPK_MARGIN) { - confidence = cur_p.data[0].p - cur_p.data[1].p; - } else { - confidence = cur_p.data[cur_p.selected].p; - } - - sampled_tokens[i] = sampled_token; - confidences.emplace_back(confidence, i); - } - - int32_t num_transfer = - (step < params.steps - 1) ? 
(int32_t) (mask_positions.size() * (1.0f - s / t)) : mask_positions.size(); - - if (num_transfer > 0) { - if (params.alg_temp == 0.0f) { - std::partial_sort(confidences.begin(), - confidences.begin() + num_transfer, - confidences.end(), - [](const std::pair & a, const std::pair & b) { - if (a.first != b.first) { - return a.first > b.first; - } - return a.second < b.second; - }); - } else { - conf_candidates.clear(); - - for (int32_t pos = 0; pos < max_length; pos++) { - float conf_logit = -std::numeric_limits::infinity(); - - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - size_t mask_idx = std::distance(mask_positions.begin(), it); - conf_logit = confidences[mask_idx].first / params.alg_temp; // Apply temperature scaling - } - - conf_candidates.emplace_back(llama_token_data{ pos, conf_logit, 0.0f }); - } - - llama_token_data_array conf_array = { - /* .data = */ conf_candidates.data(), - /* .size = */ conf_candidates.size(), - /* .selected = */ -1, - /* .sorted = */ false, - }; - - for (int32_t i = 0; i < num_transfer; i++) { - // Apply distribution sampler to get selected index - llama_sampler_apply(dist_sampler, &conf_array); - int selected_idx = conf_array.selected; - confidences[i].second = conf_candidates[selected_idx].id; - - conf_candidates[selected_idx].p = 0.0f; - conf_array.selected = -1; - } - } - - if (params.alg_temp == 0.0f) { - // Deterministic - use confidence order - for (int32_t i = 0; i < num_transfer; i++) { - int32_t mask_idx = confidences[i].second; - int32_t pos = mask_positions[mask_idx]; - llama_token token = sampled_tokens[mask_idx]; - output_tokens[pos] = token; - } - } else { - for (int32_t i = 0; i < num_transfer; i++) { - int32_t pos = confidences[i].second; - auto it = std::find(mask_positions.begin(), mask_positions.end(), pos); - if (it != mask_positions.end()) { - int32_t mask_idx = std::distance(mask_positions.begin(), it); - output_tokens[pos] = sampled_tokens[mask_idx]; - } - } - } - } - } - int64_t time_end_sampling = ggml_time_us(); - total_sampling_time += time_end_sampling - time_start_sampling; - } - int64_t time_end = ggml_time_us(); - total_time += time_end - time_start; - - LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n", - total_time / 1000.0, - total_time / 1000.0 / params.steps, - total_sampling_time / 1000.0 / params.steps); - - llama_batch_free(batch); - llama_sampler_free(sampler); - llama_sampler_free(dist_sampler); - - n_generated = max_length; -} - -static void diffusion_generate_llada(llama_context * ctx, - const llama_token * input_tokens, - llama_token * output_tokens, - int32_t n_input, - const llada_diffusion_params & params, - int32_t & n_generated) { - n_generated = 0; - if (!ctx || !input_tokens || !output_tokens || n_input <= 0) { - return; - } - - const llama_model * model = llama_get_model(ctx); - - std::vector in(params.max_length, params.mask_token_id); - std::copy(input_tokens, input_tokens + n_input, in.begin()); - - GGML_ASSERT(params.max_length % params.block_length == 0); - int num_blocks = params.max_length / params.block_length; - - GGML_ASSERT(params.steps % num_blocks == 0); - - int steps = params.steps / num_blocks; - - int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model)); - llama_set_causal_attn(ctx, false); + llama_batch batch = llama_batch_init(params.max_length, 0, 1); + batch.n_tokens = params.max_length; - // Pre-allocate buffers for Classifier-Free Guidance + // Pre-allocate buffers for 
CFG if needed int32_t logits_size = n_vocab * params.max_length; std::vector cond_logits_buffer; std::vector un_x_buffer; @@ -474,194 +259,243 @@ static void diffusion_generate_llada(llama_context * ctx, un_x_buffer.resize(params.max_length); } - llama_batch batch = llama_batch_init(params.max_length, 0, 1); - batch.n_tokens = params.max_length; + // For block-based processing + std::vector num_transfer_tokens; + int32_t num_blocks = 1; + int32_t steps_per_block = params.steps; + + if (params.schedule == BLOCK_BASED) { + GGML_ASSERT(params.max_length % params.block_length == 0); + num_blocks = params.max_length / params.block_length; + GGML_ASSERT(params.steps % num_blocks == 0); + steps_per_block = params.steps / num_blocks; + } + std::vector confidence(params.max_length); std::vector argmax; - std::mt19937 rng(params.seed); int64_t total_sampling_time = 0; int64_t total_time = 0; + int64_t time_start = ggml_time_us(); - std::vector confidence(params.max_length); - - int64_t time_start = ggml_time_us(); for (int block_num = 0; block_num < num_blocks; block_num++) { - // Get number of tokens to transfer for this step - int32_t block_start = n_input + block_num * params.block_length; - int32_t block_end = std::min(n_input + (block_num + 1) * params.block_length, params.max_length); - - // Count masked tokens in current block - int32_t block_mask_count = 0; - for (int i = block_start; i < block_end; i++) { - if (in[i] == params.mask_token_id) { - block_mask_count++; + int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; + int32_t block_end = (params.schedule == BLOCK_BASED) ? + std::min(n_input + (block_num + 1) * params.block_length, params.max_length) : + params.max_length; + + // Count masked tokens in current block for block-based processing + if (params.schedule == BLOCK_BASED) { + int32_t block_mask_count = 0; + for (int i = block_start; i < block_end; i++) { + if (output_tokens[i] == params.mask_token_id) { + block_mask_count++; + } } + num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block); } - auto num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps); - for (int step = 0; step < steps; step++) { + for (int32_t step = 0; step < steps_per_block; step++) { + int32_t global_step = block_num * steps_per_block + step; + if (params.step_callback) { - if (!params.step_callback(step + block_num * steps, - params.steps, - in.data(), - params.max_length, - params.step_callback_user_data)) { + if (!params.step_callback( + global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) { break; } } + // Setup batch + for (int32_t i = 0; i < params.max_length; i++) { + batch.token[i] = output_tokens[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = 1; + } + float * logits = nullptr; if (params.cfg_scale > 0.0f) { - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = in[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } - int ret = llama_decode(ctx, batch); if (ret != 0) { LOG_ERR("Failed to generate conditional"); + break; } float * cond_logits_ptr = llama_get_logits(ctx); std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float)); - std::copy(in.begin(), in.end(), un_x_buffer.begin()); + // Unconditional generation (mask input) + std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin()); for (int32_t i = 0; i < 
n_input; i++) { un_x_buffer[i] = params.mask_token_id; } - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = un_x_buffer[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; + for (int32_t i = 0; i < params.max_length; i++) { + batch.token[i] = un_x_buffer[i]; } ret = llama_decode(ctx, batch); - GGML_ASSERT(ret == 0); + if (ret != 0) { + LOG_ERR("Failed to generate unconditional"); + break; + } float * uncond_logits = llama_get_logits(ctx); + + // Apply CFG for (int32_t i = 0; i < logits_size; i++) { cond_logits_buffer[i] = uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]); } - logits = cond_logits_buffer.data(); } else { - // Standard generation without CFG - for (int32_t i = 0; i < batch.n_tokens; i++) { - batch.token[i] = in[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i][0] = 0; - batch.logits[i] = 1; - } - int ret = llama_decode(ctx, batch); if (ret != 0) { - LOG_ERR("Failed to generate"); + LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret); + break; } logits = llama_get_logits(ctx); } - int64_t time_start_sampling = ggml_time_us(); - - if (params.temperature > 0.0f) { - add_gumbel_noise(logits, n_vocab, params.temperature, rng); + if (!logits) { + LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step); + break; } - argmax.clear(); + auto get_logits_for_pos = [&](int32_t pos) -> const float * { + if (params.shift_logits) { + return pos == 0 ? logits : logits + (pos - 1) * n_vocab; + } + return logits + (pos) *n_vocab; + }; + + int64_t time_start_sampling = ggml_time_us(); - for (int i = 0; i < params.max_length; ++i) { - float max_value = std::numeric_limits::min(); - llama_token tok = LLAMA_TOKEN_NULL; - for (int vob = 0; vob < n_vocab; vob++) { - if (logits[n_vocab * i + vob] > max_value) { - max_value = logits[n_vocab * i + vob]; - tok = vob; + mask_positions.clear(); + for (int32_t i = 0; i < params.max_length; i++) { + if (output_tokens[i] == params.mask_token_id) { + // For block-based, only consider current block + if (params.schedule != BLOCK_BASED || (i >= block_start && i < block_end)) { + mask_positions.push_back(i); } } - argmax.push_back(tok); } - // Create mask index to track which positions are masked - std::vector mask_index(params.max_length); - for (int i = 0; i < params.max_length; i++) { - mask_index[i] = (in[i] == params.mask_token_id); + if (mask_positions.empty()) { + break; } - if (params.algorithm == diffusion_algorithm_llada::LOW_CONFIDENCE) { - // inplace softmax + argmax calculation. 
TODO: check why llama_sampler is so slow here - for (int i = block_start; i < block_end; i++) { - if (mask_index[i]) { - float * pos_logits = logits + i * n_vocab; - - llama_token best_token = 0; - float max_logit = pos_logits[0]; - for (int32_t j = 1; j < n_vocab; j++) { - if (pos_logits[j] > max_logit) { - max_logit = pos_logits[j]; - best_token = j; - } - } + if (params.add_gumbel_noise && params.temperature > 0.0f) { + add_gumbel_noise(logits, n_vocab, params.temperature, rng); + } - float sum_exp = 0.0f; - for (int32_t j = 0; j < n_vocab; j++) { - sum_exp += std::exp(pos_logits[j] - max_logit); + if (params.algorithm == ORIGIN) { + int32_t transfer_count = calculate_transfer_count( + step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); + float p_transfer = (float) transfer_count / mask_positions.size(); + + for (int32_t pos : mask_positions) { + if (std::uniform_real_distribution(0.0f, 1.0f)(rng) < p_transfer) { + const float * pos_logits = get_logits_for_pos(pos); + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].id = token_id; + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; } - float prob = std::exp(pos_logits[best_token] - max_logit) / sum_exp; - confidence[i] = prob; + llama_token_data_array cur_p = { + candidates.data(), + (size_t) n_vocab, + -1, + false, + }; - argmax[i] = best_token; - } else { - confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions + llama_sampler_apply(sampler, &cur_p); + output_tokens[pos] = cur_p.data[cur_p.selected].id; } } - } else if (params.algorithm == diffusion_algorithm_llada::RANDOM) { - // Random remasking: assign random values for masked positions - std::uniform_real_distribution uniform(0.0f, 1.0f); - for (int i = 0; i < params.max_length; i++) { - if (mask_index[i]) { - confidence[i] = uniform(rng); - } else { - confidence[i] = -std::numeric_limits::infinity(); // Non-masked positions + } else { + std::vector> confidences; + std::vector sampled_tokens(mask_positions.size()); + + for (size_t i = 0; i < mask_positions.size(); i++) { + int32_t pos = mask_positions[i]; + const float * pos_logits = get_logits_for_pos(pos); + + for (int32_t token_id = 0; token_id < n_vocab; token_id++) { + candidates[token_id].logit = pos_logits[token_id]; + candidates[token_id].p = 0.0f; + candidates[token_id].id = token_id; } - } - } - for (int i = n_input + (block_num + 1) * params.block_length; i < params.max_length; i++) { - confidence[i] = -std::numeric_limits::infinity(); - } + llama_token_data_array cur_p = { + candidates.data(), + candidates.size(), + -1, + false, + }; + + llama_sampler_apply(sampler, &cur_p); + llama_token sampled_token = cur_p.data[cur_p.selected].id; - int32_t transfer_count = num_transfer_tokens[step]; + float conf = calculate_confidence(cur_p, params.algorithm, rng); - std::vector> conf_pairs; - for (int i = n_input; i < params.max_length; i++) { - if (mask_index[i] && confidence[i] > -std::numeric_limits::infinity()) { - conf_pairs.push_back({ confidence[i], i }); + sampled_tokens[i] = sampled_token; + confidences.emplace_back(conf, i); } - } - std::partial_sort(conf_pairs.begin(), - conf_pairs.begin() + std::min(transfer_count, (int32_t) conf_pairs.size()), - conf_pairs.end(), - [](const std::pair & a, const std::pair & b) { - return a.first > b.first; - }); + int32_t transfer_count = calculate_transfer_count( + step, steps_per_block, mask_positions.size(), params.schedule, params.eps, 
num_transfer_tokens); + + if (transfer_count > 0) { + if (params.alg_temp == 0.0f) { + std::partial_sort(confidences.begin(), + confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()), + confidences.end(), + [](const std::pair & a, const std::pair & b) { + if (a.first != b.first) { + return a.first > b.first; + } + return a.second < b.second; + }); + + for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { + int32_t mask_idx = confidences[i].second; + int32_t pos = mask_positions[mask_idx]; + output_tokens[pos] = sampled_tokens[mask_idx]; + } + } else { + conf_candidates.clear(); + for (size_t i = 0; i < confidences.size(); i++) { + float conf_logit = confidences[i].first / params.alg_temp; + conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f }); + } - for (int i = 0; i < std::min(transfer_count, (int32_t) conf_pairs.size()); i++) { - int32_t pos = conf_pairs[i].second; - in[pos] = argmax[pos]; + llama_token_data_array conf_array = { + conf_candidates.data(), + conf_candidates.size(), + -1, + false, + }; + + for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { + llama_sampler_apply(dist_sampler, &conf_array); + int32_t selected_idx = conf_array.selected; + int32_t mask_idx = selected_idx; + int32_t pos = mask_positions[mask_idx]; + output_tokens[pos] = sampled_tokens[mask_idx]; + + conf_candidates[selected_idx].p = 0.0f; + conf_array.selected = -1; + } + } + } } int64_t time_end_sampling = ggml_time_us(); total_sampling_time += time_end_sampling - time_start_sampling; } } + int64_t time_end = ggml_time_us(); total_time += time_end - time_start; @@ -671,8 +505,8 @@ static void diffusion_generate_llada(llama_context * ctx, total_sampling_time / 1000.0 / params.steps); llama_batch_free(batch); - - memcpy(output_tokens, in.data(), in.size() * sizeof(llama_token)); + llama_sampler_free(sampler); + llama_sampler_free(dist_sampler); n_generated = params.max_length; } @@ -721,13 +555,8 @@ int main(int argc, char ** argv) { return 1; } - char arch_str[128]; - GGML_ASSERT(llama_model_meta_val_str(model, "general.architecture", arch_str, 128) >= 0); - - std::string arch = std::string(arch_str); - - if (arch != "dream" && arch != "llada") { - LOG_ERR("error: unsupported model architecture '%s' for diffusion. Expected 'dream' or 'llada'\n", arch_str); + if (!llama_model_is_diffusion(model)) { + LOG_ERR("error: unsupported model for diffusion"); llama_model_free(model); return 1; } @@ -758,14 +587,6 @@ int main(int argc, char ** argv) { /*add special tokens*/ true, /*parse special*/ true); - // For LLaDA models, forcefully add BOS token at the beginning. 
TODO: check why - if (arch == "llada") { - llama_token bos_token = llama_vocab_bos(vocab); - if (bos_token != LLAMA_TOKEN_NULL && (input_tokens.empty() || input_tokens[0] != bos_token)) { - input_tokens.insert(input_tokens.begin(), bos_token); - } - } - int n_input = input_tokens.size(); if (n_input >= params.n_ctx) { @@ -783,78 +604,65 @@ int main(int argc, char ** argv) { int32_t n_generated = 0; std::vector output_tokens(params.n_ubatch); - if (arch == "dream") { - struct dream_diffusion_params diff_params = default_params_dream(); - diff_params.mask_token_id = mask_token_id; - diff_params.seed = params.sampling.seed; - diff_params.temperature = params.sampling.temp; - - diff_params.steps = params.diffusion.steps; - diff_params.eps = params.diffusion.eps; - diff_params.top_p = params.sampling.top_p; - diff_params.top_k = params.sampling.top_k; - diff_params.algorithm = static_cast(params.diffusion.algorithm); - diff_params.alg_temp = params.diffusion.alg_temp; - diff_params.visual_mode = params.diffusion.visual_mode; - - diff_params.step_callback = diffusion_step_callback; - callback_data cb_data = { &diff_params, vocab, n_input }; - diff_params.step_callback_user_data = &cb_data; - - GGML_ASSERT(diff_params.algorithm >= 0 && diff_params.algorithm <= 3); - - const char * alg_names[] = { "ORIGIN", "MASKGIT_PLUS", "TOPK_MARGIN", "ENTROPY" }; - const char * alg_name = alg_names[params.diffusion.algorithm]; - - LOG_INF("dream_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); - LOG_INF("dream_diffusion_params: - %-25s u32 = %d\n", "steps", params.diffusion.steps); - LOG_INF("dream_diffusion_params: - %-25s f32 = %.6f\n", "eps", params.diffusion.eps); - LOG_INF("dream_diffusion_params: - %-25s u32 = %d (%s)\n", - "algorithm", - params.diffusion.algorithm, - alg_name); - LOG_INF("dream_diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", params.diffusion.alg_temp); - - diffusion_generate_dream( - ctx, input_tokens.data(), output_tokens.data(), n_input, params.n_ubatch, diff_params, n_generated); + struct diffusion_params diff_params; + + char shift_logits_str[8]; + if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) { + diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0); } else { - // Use LLaDA parameters - struct llada_diffusion_params diff_params = default_params_llada(); + diff_params.shift_logits = true; + } - diff_params.mask_token_id = mask_token_id; - diff_params.seed = params.sampling.seed; - diff_params.temperature = params.sampling.temp; + //Use either eps or block length, but not both + GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0)); - diff_params.steps = params.diffusion.steps; - diff_params.max_length = params.n_ubatch; + if (params.diffusion.eps) { + diff_params.schedule = TIMESTEP_BASED; + diff_params.eps = params.diffusion.eps; + } else if (params.diffusion.block_length) { + diff_params.schedule = BLOCK_BASED; diff_params.block_length = params.diffusion.block_length; - diff_params.cfg_scale = params.diffusion.cfg_scale; - diff_params.algorithm = static_cast(params.diffusion.remasking); - diff_params.visual_mode = params.diffusion.visual_mode; - - GGML_ASSERT(diff_params.algorithm >= 0 && diff_params.algorithm <= 1); - - const char * alg_names[] = { "LOW_CONFIDENCE", "RANDOM" }; - const char * alg_name = alg_names[diff_params.algorithm]; - - diff_params.step_callback = diffusion_step_callback; - callback_data cb_data = { &diff_params, vocab, 
n_input }; - diff_params.step_callback_user_data = &cb_data; - - LOG_INF("llada_diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); - LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps); - LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length); - LOG_INF("llada_diffusion_params: - %-25s u32 = %d (%s)\n", - "algorithm", - params.diffusion.algorithm, - alg_name); - LOG_INF("llada_diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length); - LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature); - LOG_INF("llada_diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale); - - diffusion_generate_llada(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated); } + diff_params.mask_token_id = mask_token_id; + diff_params.seed = params.sampling.seed; + diff_params.temperature = params.sampling.temp; + diff_params.steps = params.diffusion.steps; + diff_params.algorithm = static_cast(params.diffusion.algorithm); + diff_params.max_length = params.n_ubatch; + diff_params.top_p = params.sampling.top_p; + diff_params.top_k = params.sampling.top_k; + diff_params.visual_mode = params.diffusion.visual_mode; + diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise; + + diff_params.step_callback = diffusion_step_callback; + callback_data cb_data = { &diff_params, vocab, n_input }; + diff_params.step_callback_user_data = &cb_data; + + const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" }; + const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" }; + const char * alg_name = + (diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN"; + const char * sched_name = + (diff_params.schedule >= 0 && diff_params.schedule <= 1) ? 
sched_names[diff_params.schedule] : "UNKNOWN"; + + LOG_INF("diffusion_params: - %-25s llama_token = %d\n", "mask_token_id", mask_token_id); + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "steps", diff_params.steps); + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "max_length", diff_params.max_length); + LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name); + LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature); + if (diff_params.schedule == TIMESTEP_BASED) { + LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp); + } + if (diff_params.schedule == BLOCK_BASED) { + LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length); + LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale); + } + + diffusion_generate(ctx, input_tokens.data(), output_tokens.data(), n_input, diff_params, n_generated); + if (n_generated > 0) { if (visual_mode) { //clear screen and move cursor to top-left diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2a6a22d82f186..ef47ea7359eda 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -279,6 +279,9 @@ class Attention: class Projector: STACK_FACTOR = "clip.audio.projector.stack_factor" + class Diffusion: + SHIFT_LOGITS = "diffusion.shift_logits" + # # recommended mapping of model tensor names for storage in gguf # diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 4f23f9b024619..7d027e413d2c1 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1047,6 +1047,11 @@ def add_audio_num_mel_bins(self, value: int) -> None: def add_audio_stack_factor(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value) + # diffusion models + + def add_diffusion_shift_logits(self, value: int) -> None: + self.add_uint32(Keys.Diffusion.SHIFT_LOGITS, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 16abe7e0e8e2c..15adbfa781845 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -372,11 +372,8 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc_1", # exaone "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w12", # neobert -<<<<<<< HEAD "model.layers.{bid}.block_sparse_moe.up", # smallthinker -======= "model.transformer.blocks.{bid}.up_proj", # llada ->>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -417,11 +414,8 @@ class TensorNameMap: "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid -<<<<<<< HEAD "model.layers.{bid}.block_sparse_moe.gate", # smallthinker -======= "model.transformer.blocks.{bid}.ff_proj", # llada ->>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -470,11 +464,8 @@ class TensorNameMap: "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba granite-hybrid "transformer_encoder.{bid}.ffn.w3", # neobert 
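To make the confidence-ranked transfer step of diffusion_generate above concrete: with alg_temp == 0 the highest-confidence masked positions are committed first. Below is a minimal sketch of that deterministic path with toy confidence values; the stochastic alg_temp > 0 path, which samples positions from confidence / alg_temp, is omitted.

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    // (confidence, index into mask_positions) pairs, as built in the sampling loop
    std::vector<std::pair<float, int>> confidences = {
        { 0.41f, 0 }, { 0.93f, 1 }, { 0.12f, 2 }, { 0.77f, 3 },
    };
    int transfer_count = 2;  // e.g. the result of calculate_transfer_count for this step

    // Deterministic selection: highest confidence first, ties broken by position
    std::partial_sort(confidences.begin(), confidences.begin() + transfer_count, confidences.end(),
                      [](const std::pair<float, int> & a, const std::pair<float, int> & b) {
                          if (a.first != b.first) {
                              return a.first > b.first;
                          }
                          return a.second < b.second;
                      });

    for (int i = 0; i < transfer_count; ++i) {
        // in the real loop: output_tokens[mask_positions[confidences[i].second]] = sampled_tokens[...]
        printf("commit mask position %d (confidence %.2f)\n", confidences[i].second, confidences[i].first);
    }
    return 0;
}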
-<<<<<<< HEAD "model.layers.{bid}.block_sparse_moe.down", # smallthinker -======= "model.transformer.blocks.{bid}.ff_out", # llada ->>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 71143cfd2fb08..15fb9d0b50809 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -88,11 +88,8 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_SMOLLM3, "smollm3" }, { LLM_ARCH_LFM2, "lfm2" }, { LLM_ARCH_DREAM, "dream" }, -<<<<<<< HEAD { LLM_ARCH_SMALLTHINKER, "smallthinker" }, -======= { LLM_ARCH_LLADA, "llada" }, ->>>>>>> c56f1b02 (Add support for Llada-8b: diffusion model) { LLM_ARCH_UNKNOWN, "(unknown)" }, }; From a50547c9895b6a899b2204a32d5aea0727aee295 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 31 Jul 2025 11:52:01 +0800 Subject: [PATCH 07/12] Remove unused argmax --- examples/diffusion/diffusion-cli.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 9a32b39e8b8e1..d2e889fc9878f 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -272,7 +272,6 @@ static void diffusion_generate(llama_context * ctx, } std::vector confidence(params.max_length); - std::vector argmax; int64_t total_sampling_time = 0; int64_t total_time = 0; From e864a49686420cf418cb3a58af4ce9368689d08e Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 31 Jul 2025 15:31:16 +0800 Subject: [PATCH 08/12] Remove braced initializers, improve README.md a bit --- examples/diffusion/README.md | 6 ++++ examples/diffusion/diffusion-cli.cpp | 46 ++++++++++++++-------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md index 38c4ad8d3809b..26de5668aa8e6 100644 --- a/examples/diffusion/README.md +++ b/examples/diffusion/README.md @@ -5,3 +5,9 @@ This directory contains implementations for Diffusion LLMs (DLLMs) More Info: - https://github.com/ggml-org/llama.cpp/pull/14644 - https://github.com/ggml-org/llama.cpp/pull/14771 + + +Example of using Dream architechture: `llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual` + +Example of using LLaDA architechture: `llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual` + diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index d2e889fc9878f..8431dcea8fe2a 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -29,28 +29,28 @@ typedef bool (*diffusion_step_callback_t)(int32_t step, void * user_data); struct diffusion_params { - int32_t steps{}; - float temperature{}; - llama_token mask_token_id{}; - diffusion_step_callback_t step_callback{}; - void * step_callback_user_data{}; - int32_t seed{}; - bool visual_mode{}; - bool shift_logits{}; // Shift logits by -1 after decode - - float top_p{ 0 }; - int32_t top_k{ 0 }; - - diffusion_algorithm algorithm{ CONFIDENCE_BASED }; - transfer_schedule schedule{ TIMESTEP_BASED }; - - float cfg_scale{ 0 }; // Config scale for classifier-free guidance - float eps{ 0 }; // Timestep scheduling - int32_t block_length{ 0 }; // Block size (for block scheduling) - float alg_temp{ 0 }; // algorithm temperature (0.0 = deterministic) - bool add_gumbel_noise{ false 
}; // Add gumbel noise to the logits if temp > 0.0 - - int32_t max_length{}; // Maximum sequence length + int32_t steps = 0; + float temperature = 0; + llama_token mask_token_id = LLAMA_TOKEN_NULL; + diffusion_step_callback_t step_callback = nullptr; + void * step_callback_user_data = nullptr; + int32_t seed = 0; + bool visual_mode = false; + bool shift_logits = false; // Shift logits by -1 after decode + + float top_p = 0.; + int32_t top_k = 0.; + + diffusion_algorithm algorithm = CONFIDENCE_BASED; + transfer_schedule schedule = TIMESTEP_BASED; + + float cfg_scale = 0.; // Config scale for classifier-free guidance + float eps = 0.; // Timestep scheduling + int32_t block_length = 0; // Block size (for block scheduling) + float alg_temp = 0; // algorithm temperature (0.0 = deterministic) + bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0 + + int32_t max_length = 0; // Maximum sequence length }; struct callback_data { @@ -271,7 +271,7 @@ static void diffusion_generate(llama_context * ctx, steps_per_block = params.steps / num_blocks; } - std::vector confidence(params.max_length); + std::vector confidence(params.max_length); int64_t total_sampling_time = 0; int64_t total_time = 0; From 9691f4edd5ea2bb51b9a7283dd48a40432a34f37 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 31 Jul 2025 16:52:21 +0800 Subject: [PATCH 09/12] Add diffusion specific gguf params in set_vocab, remove setting rope_theta and rms_norm_eps --- convert_hf_to_gguf.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index db3fc31d1a1e8..28fb0185a5350 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2949,6 +2949,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: def set_vocab(self): self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_diffusion_shift_logits(False) + def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() @@ -2974,14 +2977,6 @@ def set_gguf_parameters(self): feed_forward_length = self.hparams.get("mlp_hidden_size", 12288) self.gguf_writer.add_feed_forward_length(feed_forward_length) - # Set RoPE parameters - if "rope_theta" in self.hparams: - self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) - - # Set RMS norm epsilon - if "rms_norm_eps" in self.hparams: - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - # LLaDA models use non-causal attention for diffusion, similar to Dream self.gguf_writer.add_causal_attention(False) # Handle RoPE scaling similar to LlamaModel and Dream @@ -2992,11 +2987,6 @@ def set_gguf_parameters(self): if mask_token_id is not None: self.gguf_writer.add_mask_token_id(mask_token_id) - self.gguf_writer.add_add_bos_token(True) - - logging.info("Adding diffusion shift logits to False") - self.gguf_writer.add_diffusion_shift_logits(False) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: From 57201ccb07c3763fbbc9bf70bc3ded75086221a9 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 31 Jul 2025 17:14:03 +0800 Subject: [PATCH 10/12] Remove adding the mask token --- common/common.h | 4 ++-- convert_hf_to_gguf.py | 12 +++--------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/common/common.h b/common/common.h index a69a1d31b6d92..38129b99d511f 100644 --- a/common/common.h +++ b/common/common.h @@ -283,8 +283,8 @@ struct common_params { struct 
common_params_sampling sampling; struct common_params_speculative speculative; - struct common_params_vocoder vocoder; - struct common_params_diffusion diffusion; + struct common_params_vocoder vocoder; + struct common_params_diffusion diffusion; struct common_params_model model; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 28fb0185a5350..b75df8f57352e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2949,9 +2949,6 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: def set_vocab(self): self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_diffusion_shift_logits(False) - def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() @@ -2979,13 +2976,10 @@ def set_gguf_parameters(self): # LLaDA models use non-causal attention for diffusion, similar to Dream self.gguf_writer.add_causal_attention(False) - # Handle RoPE scaling similar to LlamaModel and Dream - - # Add LLaDA-specific parameters - mask_token_id = self.hparams.get("mask_token_id") - if mask_token_id is not None: - self.gguf_writer.add_mask_token_id(mask_token_id) + # LLaDA specific parameters + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_diffusion_shift_logits(False) @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): From a326b130be779314f997e4167cfc0329a44281c3 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 31 Jul 2025 17:32:17 +0800 Subject: [PATCH 11/12] Move add_add_bos_token to set_vocab --- convert_hf_to_gguf.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b75df8f57352e..db4112318d487 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2949,6 +2949,9 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: def set_vocab(self): self._set_vocab_gpt2() + # LLaDA specific parameters + self.gguf_writer.add_add_bos_token(True) + def set_gguf_parameters(self): super().set_gguf_parameters() self._try_set_pooling_type() @@ -2977,8 +2980,7 @@ def set_gguf_parameters(self): # LLaDA models use non-causal attention for diffusion, similar to Dream self.gguf_writer.add_causal_attention(False) - # LLaDA specific parameters - self.gguf_writer.add_add_bos_token(True) + # LLaDA models don't shift their logits self.gguf_writer.add_diffusion_shift_logits(False) @staticmethod From ac3f91fe65363d7b8830870f6ef7ca445441813a Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 31 Jul 2025 18:11:30 +0800 Subject: [PATCH 12/12] use add_bool in gguf_writer.py --- gguf-py/gguf/gguf_writer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7d027e413d2c1..f4fd64ad822fa 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1049,8 +1049,8 @@ def add_audio_stack_factor(self, value: int) -> None: # diffusion models - def add_diffusion_shift_logits(self, value: int) -> None: - self.add_uint32(Keys.Diffusion.SHIFT_LOGITS, value) + def add_diffusion_shift_logits(self, value: bool) -> None: + self.add_bool(Keys.Diffusion.SHIFT_LOGITS, value) def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = ''
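The diffusion.shift_logits flag written by the converter and stored with add_bool above is what the CLI reads to decide whether the logits for position pos come from row pos or row pos - 1. Below is a small sketch of that indexing rule, independent of the gguf and llama APIs; the vocab size and logit values are placeholders.

#include <cstdio>
#include <vector>

// Dream-style models predict the token at position pos from the logits of position pos - 1
// (shift_logits == true); LLaDA models do not shift and use the logits at pos itself.
static const float * logits_for_pos(const std::vector<float> & logits, int n_vocab, int pos, bool shift_logits) {
    if (shift_logits) {
        return pos == 0 ? logits.data() : logits.data() + (pos - 1) * n_vocab;
    }
    return logits.data() + pos * n_vocab;
}

int main() {
    const int n_vocab = 4;
    std::vector<float> logits(3 * n_vocab, 0.0f);  // three positions worth of fake logits
    logits[1 * n_vocab + 2] = 5.0f;                // mark one entry in row 1

    const float * shifted   = logits_for_pos(logits, n_vocab, 2, /*shift_logits=*/true);   // reads row 1
    const float * unshifted = logits_for_pos(logits, n_vocab, 2, /*shift_logits=*/false);  // reads row 2
    printf("shifted[2]=%.1f unshifted[2]=%.1f\n", shifted[2], unshifted[2]);
    return 0;
}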