Chengyue Wu committed on
Commit · da56081
1 Parent(s): 0417cd5
update
Browse files:
- README.md +0 -1
- modeling.py +1 -3
README.md
CHANGED
@@ -41,7 +41,6 @@ Our approach introduces a novel decoding recipe incorporating a complementary at
 - **Params**: 1.54B (non-embedding: 1.31B)
 - **Layers**: 28
 - **Attention Heads**: 12 (Q), 2 (KV, GQA)
-- **Context Window**: 32,768 tokens (generation length: 8,192)
 - **Key Feature**: Parallel **block-wise decoding** + **hierarchical caching**
 
 ---
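The head counts in the list above describe grouped-query attention: 12 query heads share 2 KV heads, so 6 query heads attend through each cached KV head and the KV cache shrinks 6x versus a one-KV-head-per-query layout. A back-of-envelope sketch of that saving (`head_dim = 128` and the helper below are assumptions for illustration, not values read from this repo):

```python
# Rough KV-cache estimate for the spec list above. head_dim=128 is an
# assumption (typical at this model scale), not taken from this repo.
NUM_LAYERS, HEAD_DIM, DTYPE_BYTES = 28, 128, 2  # fp16/bf16

def kv_cache_bytes(seq_len: int, num_kv_heads: int) -> int:
    # The leading 2 counts the K and V tensors, one pair per layer per position.
    return 2 * NUM_LAYERS * seq_len * num_kv_heads * HEAD_DIM * DTYPE_BYTES

full_mha = kv_cache_bytes(32_768, 12)  # if every query head kept its own KV head
gqa      = kv_cache_bytes(32_768, 2)   # the 2-KV-head GQA layout above
print(f"MHA: {full_mha / 2**20:.0f} MiB  GQA: {gqa / 2**20:.0f} MiB  ({full_mha // gqa}x smaller)")
```

The ratio holds at any sequence length, since both layouts scale linearly in `seq_len`.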
modeling.py
CHANGED
@@ -581,7 +581,6 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
         x_init = torch.cat([input_ids, x_init], dim=1)
 
         x_t = x_init.clone()
-        step = 0
         block_past_key_values = None
         while True:
             if stop_token in x_t[:, prompt_length:]:
@@ -612,7 +611,7 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
                 break
 
             if use_block_cache:
-                if
+                if block_past_key_values is None or (x_t[:, -block_size+small_block_start_idx] == mask_id).any():
                     output = self.forward(input_ids=x_t[:, -block_size:], use_cache=True, past_key_values=past_key_values, update_past_key_values=False, use_block_cache=True)
                     logits, block_past_key_values = output.logits, output.block_past_key_values
                     logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
@@ -638,7 +637,6 @@ class Fast_dLLM_QwenForCausalLM(Fast_dLLM_QwenPreTrainedModel, GenerationMixin):
 
             x_t[:, start:end][unmask_idx] = x_1[unmask_idx]
 
-            step += 1
             input_ids = x_t
             # Truncate stop_token
             if stop_token in input_ids[:, original_input_length:]:
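The one functional change is the guard added at new line 614: the block-level KV cache is recomputed when no cache exists yet, or when the token at offset `small_block_start_idx` inside the trailing block is still the mask token for any sequence; otherwise the cached block is reused. A self-contained sketch of just that test (toy tensors and a hypothetical `mask_id`, not the repo's real decode state):

```python
import torch

# Stand-alone re-creation of the refresh test from new line 614.
# Variable names mirror the diff; the values are toy stand-ins.
mask_id = 151666                  # hypothetical mask-token id
block_size = 8
small_block_start_idx = 4         # offset of the current small block

# One sequence: 8 decoded tokens, then a block whose first 4 positions
# are decoded and whose last 4 are still masked.
x_t = torch.tensor([[11, 12, 13, 14, 15, 16, 17, 18,
                     21, 22, 23, 24, mask_id, mask_id, mask_id, mask_id]])
block_past_key_values = object()  # pretend a cached block exists

# x_t[:, -block_size + small_block_start_idx] selects the single column at
# offset `small_block_start_idx` inside the trailing block (column 12 here).
needs_refresh = block_past_key_values is None or \
    (x_t[:, -block_size + small_block_start_idx] == mask_id).any()
print(bool(needs_refresh))  # True: that position is still masked, so the
                            # cached block KVs were computed from mask tokens
                            # and cannot be reused.
```

The `step` counter removed in the first and third hunks appears to have been write-only state, so dropping it is pure cleanup and does not affect decoding.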