Make DeepSeek-OCR compatible with MPS and the latest transformers

#96
by hebangwen - opened
Files changed (2) hide show
  1. modeling_deepseekocr.py +22 -14
  2. modeling_deepseekv2.py +6 -7
modeling_deepseekocr.py CHANGED
@@ -24,6 +24,13 @@ import numpy as np
24
  import time
25
 
26
 
 
 
 
 
 
 
 
27
  def load_image(image_path):
28
 
29
  try:
@@ -502,7 +509,7 @@ class DeepseekOCRModel(DeepseekV2Model):
502
  images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
503
  # exit()
504
 
505
- inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
506
 
507
  idx += 1
508
 
@@ -622,8 +629,8 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
622
  if past_key_values is not None:
623
  if isinstance(past_key_values, Cache):
624
  cache_length = past_key_values.get_seq_length()
625
- past_length = past_key_values.seen_tokens
626
- max_cache_length = past_key_values.get_max_length()
627
  else:
628
  cache_length = past_length = past_key_values[0][0].shape[2]
629
  max_cache_length = None
@@ -645,6 +652,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
645
  max_cache_length is not None
646
  and attention_mask is not None
647
  and cache_length + input_ids.shape[1] > max_cache_length
 
648
  ):
649
  attention_mask = attention_mask[:, -max_cache_length:]
650
 
@@ -911,12 +919,12 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
911
 
912
  if not eval_mode:
913
  streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
914
- with torch.autocast("cuda", dtype=torch.bfloat16):
915
  with torch.no_grad():
916
  output_ids = self.generate(
917
- input_ids.unsqueeze(0).cuda(),
918
- images=[(images_crop.cuda(), images_ori.cuda())],
919
- images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
920
  images_spatial_crop = images_spatial_crop,
921
  # do_sample=False,
922
  # num_beams = 1,
@@ -929,12 +937,12 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
929
  )
930
 
931
  else:
932
- with torch.autocast("cuda", dtype=torch.bfloat16):
933
  with torch.no_grad():
934
  output_ids = self.generate(
935
- input_ids.unsqueeze(0).cuda(),
936
- images=[(images_crop.cuda(), images_ori.cuda())],
937
- images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
938
  images_spatial_crop = images_spatial_crop,
939
  # do_sample=False,
940
  # num_beams = 1,
@@ -947,7 +955,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
947
 
948
 
949
  if '<image>' in conversation[0]['content'] and eval_mode:
950
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
951
  stop_str = '<|end▁of▁sentence|>'
952
  if outputs.endswith(stop_str):
953
  outputs = outputs[:-len(stop_str)]
@@ -957,7 +965,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
957
  return outputs
958
 
959
  if '<image>' in conversation[0]['content'] and test_compress:
960
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
961
  pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
962
  print('='*50)
963
  print('image size: ', (w, h))
@@ -968,7 +976,7 @@ class DeepseekOCRForCausalLM(DeepseekV2ForCausalLM):
968
 
969
 
970
  if '<image>' in conversation[0]['content'] and save_results:
971
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
972
  stop_str = '<|end▁of▁sentence|>'
973
 
974
  print('='*15 + 'save results:' + '='*15)
 
24
  import time
25
 
26
 
27
+ DEVICE = "cpu"
28
+ if torch.mps.is_available():
29
+ DEVICE = "mps"
30
+ elif torch.cuda.is_available():
31
+ DEVICE = "cuda"
32
+
33
+
34
  def load_image(image_path):
35
 
36
  try:
 
509
  images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
510
  # exit()
511
 
512
+ inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).to(DEVICE), images_in_this_batch)
513
 
514
  idx += 1
515
 
 
629
  if past_key_values is not None:
630
  if isinstance(past_key_values, Cache):
631
  cache_length = past_key_values.get_seq_length()
632
+ past_length = cache_length
633
+ max_cache_length = past_key_values.get_max_cache_shape()
634
  else:
635
  cache_length = past_length = past_key_values[0][0].shape[2]
636
  max_cache_length = None
 
652
  max_cache_length is not None
653
  and attention_mask is not None
654
  and cache_length + input_ids.shape[1] > max_cache_length
655
+ and max_cache_length > -1
656
  ):
657
  attention_mask = attention_mask[:, -max_cache_length:]
658
 
 
919
 
920
  if not eval_mode:
921
  streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
922
+ with torch.autocast(DEVICE, dtype=torch.bfloat16):
923
  with torch.no_grad():
924
  output_ids = self.generate(
925
+ input_ids.unsqueeze(0).to(DEVICE),
926
+ images=[(images_crop.to(DEVICE), images_ori.to(DEVICE))],
927
+ images_seq_mask = images_seq_mask.unsqueeze(0).to(DEVICE),
928
  images_spatial_crop = images_spatial_crop,
929
  # do_sample=False,
930
  # num_beams = 1,
 
937
  )
938
 
939
  else:
940
+ with torch.autocast(DEVICE, dtype=torch.bfloat16):
941
  with torch.no_grad():
942
  output_ids = self.generate(
943
+ input_ids.unsqueeze(0).to(DEVICE),
944
+ images=[(images_crop.to(DEVICE), images_ori.to(DEVICE))],
945
+ images_seq_mask = images_seq_mask.unsqueeze(0).to(DEVICE),
946
  images_spatial_crop = images_spatial_crop,
947
  # do_sample=False,
948
  # num_beams = 1,
 
955
 
956
 
957
  if '<image>' in conversation[0]['content'] and eval_mode:
958
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(DEVICE).shape[1]:])
959
  stop_str = '<|end▁of▁sentence|>'
960
  if outputs.endswith(stop_str):
961
  outputs = outputs[:-len(stop_str)]
 
965
  return outputs
966
 
967
  if '<image>' in conversation[0]['content'] and test_compress:
968
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(DEVICE).shape[1]:])
969
  pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
970
  print('='*50)
971
  print('image size: ', (w, h))
 
976
 
977
 
978
  if '<image>' in conversation[0]['content'] and save_results:
979
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(DEVICE).shape[1]:])
980
  stop_str = '<|end▁of▁sentence|>'
981
 
982
  print('='*15 + 'save results:' + '='*15)
modeling_deepseekv2.py CHANGED
@@ -36,7 +36,6 @@ from transformers.cache_utils import Cache, DynamicCache
36
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
37
  from transformers.models.llama.modeling_llama import (
38
  LlamaAttention,
39
- LlamaFlashAttention2
40
  )
41
  from transformers.modeling_outputs import (
42
  BaseModelOutputWithPast,
@@ -889,7 +888,7 @@ class DeepseekV2Attention(nn.Module):
889
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
890
  "with a layer index."
891
  )
892
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
893
 
894
  cos, sin = self.rotary_emb(q_pe, seq_len=kv_seq_len)
895
  q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
@@ -1018,7 +1017,7 @@ class DeepseekV2FlashAttention2(DeepseekV2Attention):
1018
 
1019
  kv_seq_len = value_states.shape[-2]
1020
  if past_key_value is not None:
1021
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
1022
 
1023
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
1024
  q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
@@ -1235,7 +1234,7 @@ ATTENTION_CLASSES = {
1235
  "mla_flash_attention_2": DeepseekV2FlashAttention2,
1236
 
1237
  "mha_eager": LlamaAttention,
1238
- "mha_flash_attention_2": LlamaFlashAttention2
1239
  }
1240
 
1241
 
@@ -1539,7 +1538,7 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
1539
  use_legacy_cache = not isinstance(past_key_values, Cache)
1540
  if use_legacy_cache:
1541
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1542
- past_key_values_length = past_key_values.get_usable_length(seq_length)
1543
 
1544
  if position_ids is None:
1545
  device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -1779,8 +1778,8 @@ class DeepseekV2ForCausalLM(DeepseekV2PreTrainedModel):
1779
  if past_key_values is not None:
1780
  if isinstance(past_key_values, Cache):
1781
  cache_length = past_key_values.get_seq_length()
1782
- past_length = past_key_values.seen_tokens
1783
- max_cache_length = past_key_values.get_max_length()
1784
  else:
1785
  cache_length = past_length = past_key_values[0][0].shape[2]
1786
  max_cache_length = None
 
36
  from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
37
  from transformers.models.llama.modeling_llama import (
38
  LlamaAttention,
 
39
  )
40
  from transformers.modeling_outputs import (
41
  BaseModelOutputWithPast,
 
888
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
889
  "with a layer index."
890
  )
891
+ kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
892
 
893
  cos, sin = self.rotary_emb(q_pe, seq_len=kv_seq_len)
894
  q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
 
1017
 
1018
  kv_seq_len = value_states.shape[-2]
1019
  if past_key_value is not None:
1020
+ kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
1021
 
1022
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
1023
  q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
 
1234
  "mla_flash_attention_2": DeepseekV2FlashAttention2,
1235
 
1236
  "mha_eager": LlamaAttention,
1237
+ "mha_flash_attention_2": None
1238
  }
1239
 
1240
 
 
1538
  use_legacy_cache = not isinstance(past_key_values, Cache)
1539
  if use_legacy_cache:
1540
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1541
+ past_key_values_length = past_key_values.get_seq_length()
1542
 
1543
  if position_ids is None:
1544
  device = input_ids.device if input_ids is not None else inputs_embeds.device
 
1778
  if past_key_values is not None:
1779
  if isinstance(past_key_values, Cache):
1780
  cache_length = past_key_values.get_seq_length()
1781
+ past_length = cache_length
1782
+ max_cache_length = past_key_values.get_max_cache_shape()
1783
  else:
1784
  cache_length = past_length = past_key_values[0][0].shape[2]
1785
  max_cache_length = None