xvla log fix

2026-07-07 18:11:50 +00:00 · 2026-02-26 09:55:25 +00:00
parent 59b33c0ea3
commit 0bda187268
2 changed files with 0 additions and 22 deletions
@@ -13,12 +13,9 @@
 import warnings

 from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging

 """ Florence-2 configuration"""

-logger = logging.get_logger(__name__)
-

 class Florence2VisionConfig(PretrainedConfig):
    r"""
@@ -46,7 +46,6 @@ from transformers.utils import (
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
-    logging,
    replace_return_docstrings,
 )

@@ -57,8 +56,6 @@ if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

-logger = logging.get_logger(__name__)
-
 _CONFIG_FOR_DOC = "Florence2Config"


@@ -992,12 +989,6 @@ class Florence2FlashAttention2(Florence2Attention):
            else:
                target_dtype = self.q_proj.weight.dtype

-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-
            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)
@@ -1135,11 +1126,6 @@ class Florence2SdpaAttention(Florence2Attention):
    ) -> tuple[torch.Tensor, torch.Tensor | None, tuple[torch.Tensor] | None]:
        """Input shape: Batch x Time x Channel"""
        if output_attentions or layer_head_mask is not None:
-            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
-            logger.warning_once(
-                "Florence2Model is using Florence2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
-                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
            return super().forward(
                hidden_states,
                key_value_states=key_value_states,
@@ -1860,9 +1846,6 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-            )
            use_cache = False

        # decoder layers
@@ -2160,8 +2143,6 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
-            if use_cache:
-                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(