From 2424d20b74e11fdc6c6457f19e237899361a7b6c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 15:27:48 +0200 Subject: [PATCH 01/58] initial commit --- docs/source/en/_toctree.yml | 2 + docs/source/en/model_doc/pixtral.md | 45 ++ src/transformers/__init__.py | 18 + src/transformers/modeling_utils.py | 42 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/pixtral/__init__.py | 53 ++ .../models/pixtral/configuration_pixtral.py | 107 +++ .../pixtral/convert_pixtral_weights_to_hf.py | 103 +++ .../models/pixtral/modeling_pixtral.py | 622 ++++++++++++++++++ tests/models/pixtral/__init__.py | 0 tests/models/pixtral/test_modeling_pixtral.py | 570 ++++++++++++++++ 15 files changed, 1549 insertions(+), 20 deletions(-) create mode 100644 docs/source/en/model_doc/pixtral.md create mode 100644 src/transformers/models/pixtral/__init__.py create mode 100644 src/transformers/models/pixtral/configuration_pixtral.py create mode 100644 src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py create mode 100644 src/transformers/models/pixtral/modeling_pixtral.py create mode 100644 tests/models/pixtral/__init__.py create mode 100644 tests/models/pixtral/test_modeling_pixtral.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 1c7f62ec6ea7b8..235ea81a7f1ea6 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -862,6 +862,8 @@ title: Perceiver - local: model_doc/pix2struct title: Pix2Struct + - local: model_doc/pixtral + title: Pixtral - local: model_doc/sam title: Segment Anything - local: model_doc/siglip diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md new file mode 100644 index 00000000000000..30c62af3df3467 --- /dev/null +++ b/docs/source/en/model_doc/pixtral.md @@ -0,0 +1,45 @@ + + +# Pixtral + +# Pixtral + +## Overview + +The Pixtral model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
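
A minimal usage sketch (not part of the original patch): it builds a tiny, randomly initialized `PixtralModel` just to show the intended interface of the classes added here. The configuration values are small illustrative numbers, not the hyperparameters of any released checkpoint, and the forward call is left as a comment because this initial commit is still wiring up the encoder's forward pass.

```python
import torch
from transformers import PixtralConfig, PixtralModel

# Small, illustrative configuration values -- not a released checkpoint's hyperparameters.
config = PixtralConfig(
    hidden_size=64,
    intermediate_size=256,
    num_hidden_layers=2,
    num_attention_heads=4,
    image_size=64,
    patch_size=16,
)
model = PixtralModel(config)

# Per the forward signature in modeling_pixtral.py, the encoder consumes a list of
# variable-sized images, each of shape (C, H, W), and returns one feature row per
# image patch:
images = [torch.randn(3, 64, 48), torch.randn(3, 32, 64)]
# features = model(images)  # expected shape: (total_num_patches, hidden_size)
```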
+ + +## PixtralConfig + +[[autodoc]] PixtralConfig + +## PixtralModel + +[[autodoc]] PixtralModel + - forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 00cc67915f3664..94bfae8ebcdeff 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -526,6 +526,10 @@ "LlavaConfig", "LlavaProcessor", ], + "models.pixtral": [ + "PixtralConfig", + + ], "models.llava_next": [ "LlavaNextConfig", "LlavaNextProcessor", @@ -2524,6 +2528,12 @@ "LlavaPreTrainedModel", ] ) + _import_structure["models.pixtral"].extend( + [ + "PixtralModel", + "PixtralPreTrainedModel", + ] + ) _import_structure["models.llava_next"].extend( [ "LlavaNextForConditionalGeneration", @@ -5290,6 +5300,10 @@ LlavaConfig, LlavaProcessor, ) + from .models.pixtral import ( + PixtralConfig, + + ) from .models.llava_next import ( LlavaNextConfig, LlavaNextProcessor, @@ -7097,6 +7111,10 @@ LlavaForConditionalGeneration, LlavaPreTrainedModel, ) + from .models.pixtral import ( + PixtralModel, + PixtralPreTrainedModel, + ) from .models.llava_next import ( LlavaNextForConditionalGeneration, LlavaNextPreTrainedModel, diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 359509f469a703..d0a13da512156b 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -548,11 +548,12 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], is_quantized: bool # Check format of the archive with safe_open(checkpoint_file, framework="pt") as f: metadata = f.metadata() - if metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: - raise OSError( - f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " - "you save your model with the `save_pretrained` method." - ) + if metadata is not None: + if metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) return safe_load_file(checkpoint_file) try: if ( @@ -3751,21 +3752,22 @@ def from_pretrained( with safe_open(resolved_archive_file, framework="pt") as f: metadata = f.metadata() - if metadata.get("format") == "pt": - pass - elif metadata.get("format") == "tf": - from_tf = True - logger.info("A TensorFlow safetensors file is being loaded in a PyTorch model.") - elif metadata.get("format") == "flax": - from_flax = True - logger.info("A Flax safetensors file is being loaded in a PyTorch model.") - elif metadata.get("format") == "mlx": - # This is a mlx file, we assume weights are compatible with pt - pass - else: - raise ValueError( - f"Incompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}" - ) + if metadata is not None: + if metadata.get("format") == "pt": + pass + elif metadata.get("format") == "tf": + from_tf = True + logger.info("A TensorFlow safetensors file is being loaded in a PyTorch model.") + elif metadata.get("format") == "flax": + from_flax = True + logger.info("A Flax safetensors file is being loaded in a PyTorch model.") + elif metadata.get("format") == "mlx": + # This is a mlx file, we assume weights are compatible with pt + pass + else: + raise ValueError( + f"Incompatible safetensors file. 
File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}" + ) from_pt = not (from_tf | from_flax) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 26b96def67d992..0ecadd8d22167b 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -130,6 +130,7 @@ lilt, llama, llava, + pixtral, llava_next, llava_next_video, llava_onevision, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index fa1a7fb88eafa8..ac2b3ca34949ce 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -147,6 +147,7 @@ ("lilt", "LiltConfig"), ("llama", "LlamaConfig"), ("llava", "LlavaConfig"), + ("pixtral", "PixtralConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), ("llava_onevision", "LlavaOnevisionConfig"), @@ -443,6 +444,7 @@ ("llama2", "Llama2"), ("llama3", "Llama3"), ("llava", "LLaVa"), + ("pixtral", "Pixtral"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), ("llava_onevision", "LLaVA-Onevision"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 45a9c4d0d078b7..1f860c25e8dbcc 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -277,6 +277,7 @@ ("xmod", "XmodModel"), ("yolos", "YolosModel"), ("yoso", "YosoModel"), + ("pixtral", "PixtralModel"), ] ) @@ -728,6 +729,7 @@ ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), + ("pixtral", "PixtralModel"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 7f49e0e8d99730..d055693316df18 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -71,6 +71,7 @@ ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("llava", "LlavaProcessor"), + ("pixtral", "PixtralProcessor"), ("llava_next", "LlavaNextProcessor"), ("llava_next_video", "LlavaNextVideoProcessor"), ("llava_onevision", "LlavaOnevisionProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index c8eb06db04a098..00bbef64d99aa9 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -257,6 +257,7 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("pixtral", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava-onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py new file mode 100644 index 00000000000000..cb43777e2d5a2e --- /dev/null +++ b/src/transformers/models/pixtral/__init__.py @@ -0,0 +1,53 @@ +# 
Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available + + +_import_structure = { + "configuration_pixtral": ["PixtralConfig"], +} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_pixtral"] = [ + "PixtralModel", + "PixtralPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_pixtral import PixtralConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_pixtral import ( + PixtralModel, + PixtralPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py new file mode 100644 index 00000000000000..0484233d389245 --- /dev/null +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -0,0 +1,107 @@ +# coding=utf-8 +# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Pixtral model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + + +class PixtralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PixtralModel`]. It is used to instantiate an + Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Pixtral-9B. + + e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. 
+ ignore_index (`int`, *optional*, defaults to -100): + The ignore index for the loss function. + image_token_index (`int`, *optional*, defaults to 32000): + The image token index to encode the image prompt. + projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): + The activation function used by the multimodal projector. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + vision_feature_layer (`int`, *optional*, defaults to -2): + The index of the layer to select the vision feature. + image_seq_length (`int`, *optional*, defaults to 576): + Sequence length of one image embedding. + + Example: + + ```python + >>> from transformers import PixtralModel, PixtralConfig, CLIPVisionConfig, LlamaConfig + + >>> # Initializing a CLIP-vision config + >>> vision_config = CLIPVisionConfig() + + >>> # Initializing a Llama config + >>> text_config = LlamaConfig() + + >>> # Initializing a Pixtral pixtral-1.5-7b style configuration + >>> configuration = PixtralConfig(vision_config, text_config) + + >>> # Initializing a model from the pixtral-1.5-7b style configuration + >>> model = PixtralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "pixtral" + + def __init__( + self, + hidden_size=1024, + intermediate_size=4096, + num_hidden_layers=24, + num_attention_heads=16, + num_channels=3, + image_size=1024, + patch_size=16, + hidden_act="gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + rope_theta=10000.0, + tie_word_embeddings=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.rope_theta = rope_theta + self.tie_word_embeddings = tie_word_embeddings + self.head_dim = hidden_size // num_attention_heads diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py new file mode 100644 index 00000000000000..6706907e1e0708 --- /dev/null +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -0,0 +1,103 @@ +from transformers import LlavaConfig, LlavaForConditionalGeneration, AutoTokenizer, MistralConfig, PixtralConfig + +import torch +from safetensors.torch import load_file as safe_load_file +import regex as re + +tokenizer = AutoTokenizer.from_pretrained("leafspark/Pixtral-12B-2409-hf", ) + + +text_config = MistralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + head_dim=128, + hidden_act="silu", + hidden_size=5120, + initializer_range=0.02, + intermediate_size=14336, + max_position_embeddings=1024000, + model_type="mistral", + num_attention_heads=32, + num_hidden_layers=40, + num_key_value_heads=8, + rms_norm_eps=1e-05, + rope_theta=1000000000.0, + sliding_window=None, + tie_word_embeddings=False, + vocab_size=131072 +) + +vision_config = PixtralConfig() +config = LlavaConfig(vision_config, text_config) +config.architectures = ["LlavaForConditionalGeneration"] +config.save_pretrained("/Users/arthurzucker/Work/pixtral") + + +original_state_dict = 
safe_load_file("/Users/arthurzucker/Work/pixtral/model.safetensors") + + +OLD_KEY_TO_NEW_KEY_MAPPING = { + # Layer Normalization Weights + r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", + r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.post_attention_layernorm.weight", + + # Self Attention Projections + r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.self_attn.q_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": r"vision_tower.transformer.layers.\1.self_attn.k_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.self_attn.v_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.self_attn.o_proj.weight", + + # MLP Projections + r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.mlp.gate_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.mlp.down_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.mlp.up_proj.weight", + + # Additional mappings + r"vision_encoder": r"vision_tower", + r"vision_language_adapter.w_in": r"multimodal_projector.linear_1", + r"vision_language_adapter.w_out": r"multimodal_projector.linear_2", + r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight", + r"layers.(\d+).attention.wk.weight": r"language_model.model.layers.\1.self_attn.k_proj.weight", + r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight", + r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight", + r"layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.mlp.gate_proj.weight", + r"layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.mlp.down_proj.weight", + r"layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.mlp.up_proj.weight", +} + +new_state_dict = {} +all_keys = "\n".join(original_state_dict.keys()) +old_keys = all_keys +for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items(): + all_keys = re.sub(old,new,all_keys) + +OLD_TO_NEW = dict(zip(old_keys.split("\n"), all_keys.split("\n"))) + +new_dict={} + +def permute_for_rope(value,n_heads, config): + dim1 = config.head_dim + dim2 = config.hidden_size + return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + +for key, value in original_state_dict.items(): + + if "vision_encoder" in key: + _config = vision_config + else: + _config = text_config + # convert the text model (basically mistral model) + + if "q_proj" in key: + value = permute_for_rope(value,_config.num_attention_heads) + if "k_proj" in key: + value = permute_for_rope(value,_config.num_key_value_heads) + + new_key = OLD_TO_NEW[key] + new_dict[new_key] = value + +with torch.device("meta"): + model = LlavaForConditionalGeneration(config) + +model.load_state_dict(new_dict, strict=True, assign=True) \ No newline at end of file diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py new file mode 100644 index 00000000000000..1ce923b253d8d9 --- /dev/null +++ 
b/src/transformers/models/pixtral/modeling_pixtral.py @@ -0,0 +1,622 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Pixtral model.""" + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ... import PreTrainedModel +from ...activations import ACT2FN +from ...modeling_outputs import ModelOutput, BaseModelOutput +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ..auto import AutoModelForCausalLM +from .configuration_pixtral import PixtralConfig +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "PixtralConfig" + + +@dataclass +# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Pixtral +class PixtralCausalLMOutputWithPast(ModelOutput): + """ + Base class for Pixtral causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. 
+ + image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +class PixtralRotaryEmbedding(nn.Module): + """ + The key with pixtral embedding is just that you have a frequency for each pixel positions. + If you have height x width pixels (or embedding pixels) + + then the frequency used for ROPE is given by indexing the pre_computed frequency on the + width and height. + + What you output is of dimension batch, height * width, dim with dim the embed dim. + + This simply means that for each image hidden states, you are going to add + a corresponding positional embedding, based on it's index in the grid. + """ + def __init__(self, config, device): + super().__init__() + + self.dim = config.head_dim + self.base = config.rope_theta + max_patches_per_side = config.image_size // config.patch_size + freqs = 1.0 / (self.base**(torch.arange(0, self.dim, 2).float() / self.dim)) + + h = torch.arange(max_patches_per_side, device=freqs.device) + w = torch.arange(max_patches_per_side, device=freqs.device) + + freqs_h = torch.outer(h, freqs[::2]).float() + freqs_w = torch.outer(w, freqs[1::2]).float() + inv_freq = torch.cat( + [ + freqs_h[:, None, :].repeat(1, max_patches_per_side, 1), + freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1), + ], + dim=-1, + ).reshape(-1, self.dim) # we reshape to only index on the position indexes, not tuple of indexes + + # TODO maybe make it torch compatible later on. 
We can also just slice + self.register_buffer("inv_freq", inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + freqs = self.inv_freq[position_ids] + # position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = freqs + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
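
    Example (illustrative only, not part of the original function): a quick sanity check with random tensors that follows the shape convention described above and verifies that the rotation leaves per-head vector norms unchanged. The import path assumes this module is importable as added by this patch.

    ```python
    import torch
    from transformers.models.pixtral.modeling_pixtral import apply_rotary_pos_emb

    batch, heads, seq, head_dim = 1, 2, 5, 8
    q = torch.randn(batch, heads, seq, head_dim)
    k = torch.randn(batch, heads, seq, head_dim)

    # cos/sin as a rotary embedding module would emit them: the same angles are
    # duplicated over both halves of the head dimension, shape (batch, seq, head_dim).
    angles = torch.rand(batch, seq, head_dim // 2)
    cos = torch.cat((angles, angles), dim=-1).cos()
    sin = torch.cat((angles, angles), dim=-1).sin()

    q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin)
    # A pure rotation preserves the norm of every head vector.
    assert torch.allclose(q_rot.norm(dim=-1), q.norm(dim=-1), atol=1e-5)
    assert torch.allclose(k_rot.norm(dim=-1), k.norm(dim=-1), atol=1e-5)
    ```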
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + +class PixtralAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, patches, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, patches, -1) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + +# Copied from gemma2 +class PixtralMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_activation] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class PixtralRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, 
keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +def position_ids_in_meshgrid(patch_embeds_list: list[torch.Tensor]) -> torch.Tensor: + positions = [] + for patch in patch_embeds_list: + height, width = patch.shape[-2:] + mesh = torch.meshgrid(torch.arange(height*width), indexing="ij") + h_grid, v_grid = mesh + ids = h_grid * width + v_grid + positions.append(ids) + return torch.cat(positions) + + + +class PixtralAttentionLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) + self.feed_forward = PixtralMLP + self.attention = PixtralAttention(config) + self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) + + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + return outputs + + + +class PixtralTransformer(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = torch.nn.ModuleList() + for _ in range(config.num_hidden_layers): + self.layers.append(PixtralAttentionLayer(config)) + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + position_embeddings: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + position_embeddings, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + position_embeddings=position_embeddings, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + + + +PIXTRAL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PixtralConfig`] or [`PixtralVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + PIXTRAL_START_DOCSTRING, +) +# Copied from transformers.models.llava.modeling_llava.LlavaPreTrainedModel with Llava->Pixtral,llava->pixtral +class PixtralPreTrainedModel(PreTrainedModel): + config_class = PixtralConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PixtralVisionAttention"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_cache_class = True + + def _init_weights(self, module): + # important: this ported version of Pixtral isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/LLaVA/tree/main/pixtral should serve for that purpose + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if hasattr(module, "class_embedding"): + module.class_embedding.data.normal_(mean=0.0, std=std) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + +PIXTRAL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses + [`CLIPImageProcessor`] for processing images). + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. 
[What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + vision_feature_layer (`int`, *optional*, defaults to -2): + The index of the layer to select the vision feature. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + +@add_start_docstrings( + """The PIXTRAL model which consists of a vision backbone and a language model.""", + PIXTRAL_START_DOCSTRING, +) +class PixtralModel(PixtralPreTrainedModel): + base_model_prefix = "vision_encoder" + def __init__(self, config): + super().__init__(config) + self.config = config + self.patch_conv = nn.Conv2d( + in_channels=config.num_channels, + out_channels=config.hidden_size, + kernel_size=config.patch_size, + stride=config.patch_size, + bias=False, + ) + self.ln_pre = PixtralRMSNorm(config.hidden_size, eps=1e-5) + self.transformer = PixtralTransformer(config) + self.patch_positional_embedding = PixtralRotaryEmbedding(config, device=self.device) + + @property + def max_patches_per_side(self) -> int: + return self.args.image_size // self.args.patch_size + + @property + def device(self) -> torch.device: + return next(self.parameters()).device + + @property + def dtype(self) -> torch.device: + return next(self.parameters()).dtype + + def forward(self, images: List[torch.Tensor]) -> torch.Tensor: + """ + Args: + images: list of N_img images of variable sizes, + each of shape (C, H, W) + Returns: + image_features: tensor of token features for + all tokens of all images of shape (N_toks, D) + """ + # pass images through initial convolution independently + patch_embeds_list = [ + self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in images + ] + + # flatten to a single sequence + patch_embeds = torch.cat( + [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) + patch_embeds = self.ln_pre(patch_embeds) + + # positional embeddings + position_ids = position_ids_in_meshgrid(patch_embeds_list).to(self.device) + + position_embedding = self.patch_positional_embedding(position_ids) + attention_mask = None + out = self.transformer(patch_embeds, attention_mask, position_embedding) + + # remove batch dimension of the single sequence + return out.squeeze(0) diff --git a/tests/models/pixtral/__init__.py b/tests/models/pixtral/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py new file mode 100644 index 00000000000000..0c8a82e0b0b787 --- /dev/null +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -0,0 +1,570 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Testing suite for the PyTorch Pixtral model.""" + +import gc +import unittest + +import requests + +from transformers import ( + AutoProcessor, + AutoTokenizer, + PixtralConfig, + PixtralModel, + is_torch_available, + is_vision_available, +) +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + require_torch_gpu, + require_vision, + slow, + torch_device, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch +else: + is_torch_greater_or_equal_than_2_0 = False + +if is_vision_available(): + from PIL import Image + + +class PixtralVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=0, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config={ + "model_type": "llama", + "seq_length": 7, + "is_training": True, + "use_input_mask": True, + "use_token_type_ids": False, + "use_labels": True, + "vocab_size": 99, + "hidden_size": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 16, + "type_sequence_label_size": 2, + "initializer_range": 0.02, + "num_labels": 3, + "num_choices": 4, + "pad_token_id": 0, + }, + is_training=True, + vision_config={ + "image_size": 30, + "patch_size": 2, + "num_channels": 3, + "is_training": True, + "hidden_size": 32, + "projection_dim": 32, + "num_hidden_layers": 2, + "num_attention_heads": 4, + "intermediate_size": 37, + "dropout": 0.1, + "attention_dropout": 0.1, + "initializer_range": 0.02, + }, + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.seq_length = seq_length + + self.num_hidden_layers = text_config["num_hidden_layers"] + self.vocab_size = text_config["vocab_size"] + self.hidden_size = text_config["hidden_size"] + self.num_attention_heads = text_config["num_attention_heads"] + self.is_training = is_training + + self.batch_size = 3 + self.num_channels = 3 + self.image_size = 336 + self.encoder_seq_length = 231 + + def get_config(self): + return PixtralConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.vision_config["num_channels"], + self.vision_config["image_size"], + self.vision_config["image_size"], + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 + attention_mask = input_ids.ne(1).to(torch_device) + # we are giving 3 images let's make sure we pass in 
3 image tokens + input_ids[:, 1] = config.image_token_index + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + def create_and_check_pixtral_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask): + model = PixtralModel(config=config) + model.to(torch_device) + model.eval() + with torch.autocast(device_type="cuda", dtype=torch.float16): + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values.to(torch.bfloat16), + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + +@require_torch +class PixtralModelModelTest(ModelTesterMixin, unittest.TestCase): + """ + Model tester for `PixtralModel`. + """ + + all_model_classes = (PixtralModel,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + + def setUp(self): + self.model_tester = PixtralVisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=PixtralConfig, has_text_modality=False) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + self.assertTrue(torch.allclose(out_embeds, out_ids)) + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_compile_dynamic(self): + pass + + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_dispatch_on_flash(self): + pass + + +@require_torch +class PixtralModelIntegrationTest(unittest.TestCase): + 
def setUp(self): + self.processor = AutoProcessor.from_pretrained("pixtral-hf/bakPixtral-v1-hf") + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + @slow + @require_bitsandbytes + def test_small_model_integration_test(self): + # Let' s make sure we test the preprocessing to replace what is used + model = PixtralModel.from_pretrained("pixtral-hf/bakPixtral-v1-hf", load_in_4bit=True) + + prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" + image_file = "https://pixtral-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = self.processor(prompt, raw_image, return_tensors="pt") + + EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip + self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS)) + + output = model.generate(**inputs, max_new_tokens=20) + EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_single(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "mistral-community/pixtral" + + model = PixtralModel.from_pretrained("mistral-community/pixtral", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" + image_file = "https://pixtral-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + + output = model.generate(**inputs, max_new_tokens=900, do_sample=False) + EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip + + self.assertEqual( + processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "mistral-community/pixtral" + + model = PixtralModel.from_pretrained("mistral-community/pixtral", load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:", + "USER: \nWhat is this? 
ASSISTANT:", + ] + image1 = Image.open(requests.get("https://pixtral-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(prompts, images=[image1, image2], return_tensors="pt", padding=True) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch(self): + # Let' s make sure we test the preprocessing to replace what is used + model = PixtralModel.from_pretrained("pixtral-hf/bakPixtral-v1-hf", load_in_4bit=True) + # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://pixtral-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = self.processor(prompts, images=[image1, image2], return_tensors="pt", padding=True) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = [ + 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', + 'USER: \nWhat is this?\nASSISTANT: Cats' + ] # fmt: skip + self.assertEqual( + self.processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched_regression(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "mistral-community/pixtral" + + # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) + model = PixtralModel.from_pretrained( + "mistral-community/pixtral", load_in_4bit=True, attn_implementation="eager" + ) + processor = AutoProcessor.from_pretrained(model_id, pad_token="") + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://pixtral-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_torch + @require_vision + def test_batched_generation(self): + model = PixtralModel.from_pretrained("mistral-community/pixtral", load_in_4bit=True) + + processor = AutoProcessor.from_pretrained("mistral-community/pixtral") + + prompt1 = "\n\nUSER: What's the the difference of two images?\nASSISTANT:" + prompt2 = "\nUSER: Describe the image.\nASSISTANT:" + prompt3 = "\nUSER: Describe the image.\nASSISTANT:" + url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + image1 = Image.open(requests.get(url1, stream=True).raw) + image2 = Image.open(requests.get(url2, stream=True).raw) + + inputs = processor( + text=[prompt1, prompt2, prompt3], + images=[image1, image2, image1, image2], + return_tensors="pt", + padding=True, + ).to(torch_device) + + model = model.eval() + + EXPECTED_OUTPUT = [ + "\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", + "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", + "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. 
The llama is the", + ] + + generate_ids = model.generate(**inputs, max_new_tokens=20) + outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertEqual(outputs, EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_pixtral_index_error_bug(self): + # This is a reproducer of https://github.com/huggingface/transformers/pull/28032 and makes sure it does not happen anymore + # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043 for + # more details + model_id = "mistral-community/pixtral" + model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) + + processor = AutoProcessor.from_pretrained(model_id) + + # Simulate a super long prompt + user_prompt = "Describe the image:?\n" * 200 + prompt = f"USER: \n{user_prompt}ASSISTANT:" + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + + # Make sure that `generate` works + _ = model.generate(**inputs, max_new_tokens=20) + + @slow + @require_torch_gpu + def test_pixtral_merge_inputs_error_bug(self): + # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore + model_id = "mistral-community/pixtral" + model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) + + # Simulate some user inputs + pixel_values = torch.randn( + (1, 3, 336, 336), + dtype=torch.float, + device=torch_device, + ) + input_ids = torch.tensor( + [ + [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900], + ], + dtype=torch.long, + device=torch_device, + ) + attention_mask = torch.tensor( + [[0, 0, 1, 1, 1, 1, 1, 1, 1]], + dtype=torch.long, + device=torch_device, + ) + + # Make sure that the loss is properly computed + loss = model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + labels=input_ids, + ).loss + loss.backward() + + def test_tokenizer_integration(self): + slow_tokenizer = AutoTokenizer.from_pretrained("liuhaotian/pixtral-v1.6-34b", use_fast=False) + slow_tokenizer.add_tokens("", True) + + fast_tokenizer = AutoTokenizer.from_pretrained( + "liuhaotian/pixtral-v1.6-34b", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + from_slow=True, + legacy=False, + ) + fast_tokenizer.add_tokens("", True) + + prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip + self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_generation_no_images(self): + model_id = "mistral-community/pixtral" + model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + # Prepare inputs with no images + inputs = processor("Hello, I am", return_tensors="pt").to(torch_device) + + # Make sure that `generate` works + _ = model.generate(**inputs, max_new_tokens=20) + + @slow + @require_bitsandbytes + def 
test_generation_siglip_backbone(self): + model_id = "pixtral-hf/pixtral-interleave-qwen-0.5b-hf" + model = PixtralModel.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + # check processing with expansion of inputs (w/o expansion should work with any backbone) + processor.vision_feature_select_strategy = "default" + processor.patch_size = 14 + + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor( + text="<|im_start|>user\n\nWhat are these?<|im_end|>\n<|im_start|>assistant", + images=raw_image, + return_tensors="pt", + ).to(torch_device, torch.float16) + + # Make sure that `generate` works + output = model.generate(**inputs, max_new_tokens=30) + + EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat" + self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT) + + @slow + @require_bitsandbytes + def test_expansion_in_processing(self): + model_id = "mistral-community/pixtral" + model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompt = "USER: \nDescribe the image:\nASSISTANT:" + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + + # check processing with expansion of inputs + processor.vision_feature_select_strategy = "default" + processor.patch_size = 14 + inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593) + + # check processing without expansion of inputs (legacy behavior) + processor.vision_feature_select_strategy = None + processor.patch_size = None + inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) + self.assertTrue(inputs.input_ids.shape[-1] == 18) + + # generate exactly 20 tokens + output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20) + output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20) + + # check that both inputs are handled correctly and generate the same output + self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist()) From 4780f37bccd7b52f4a9a0a6b9c8cf9cc39b238af Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 15:43:33 +0200 Subject: [PATCH 02/58] gloups --- src/transformers/models/pixtral/modeling_pixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 1ce923b253d8d9..afb78ac6765d60 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -210,7 +210,7 @@ def __init__(self, config): self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) def forward( self, @@ -247,7 +247,7 @@ def forward( attn_output = 
attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(batch_size, patches, -1) - attn_output = self.out_proj(attn_output) + attn_output = self.o_proj(attn_output) return attn_output, attn_weights From 1b897b3841e395846e514c62f176ab47c9c90551 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 15:59:18 +0200 Subject: [PATCH 03/58] updates --- .../models/pixtral/configuration_pixtral.py | 4 +- .../pixtral/convert_pixtral_weights_to_hf.py | 46 +++++++++++-------- .../models/pixtral/modeling_pixtral.py | 2 +- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 0484233d389245..90ce7785f0408a 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -83,7 +83,7 @@ def __init__( num_channels=3, image_size=1024, patch_size=16, - hidden_act="gelu", + hidden_activation="gelu", layer_norm_eps=1e-5, attention_dropout=0.0, rope_theta=10000.0, @@ -101,7 +101,7 @@ def __init__( self.image_size = image_size self.attention_dropout = attention_dropout self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act + self.hidden_activation = hidden_activation self.rope_theta = rope_theta self.tie_word_embeddings = tie_word_embeddings self.head_dim = hidden_size // num_attention_heads diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 6706907e1e0708..d6d74700383548 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -39,38 +39,44 @@ OLD_KEY_TO_NEW_KEY_MAPPING = { # Layer Normalization Weights - r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", - r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.post_attention_layernorm.weight", + r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", + r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.ffn_norm.weight", # Self Attention Projections - r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.self_attn.q_proj.weight", - r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": r"vision_tower.transformer.layers.\1.self_attn.k_proj.weight", - r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.self_attn.v_proj.weight", - r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.self_attn.o_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.attention.q_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": r"vision_tower.transformer.layers.\1.attention.k_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.attention.v_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.attention.o_proj.weight", # MLP Projections - r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": 
r"vision_tower.transformer.layers.\1.mlp.gate_proj.weight", - r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.mlp.down_proj.weight", - r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.mlp.up_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.feed_forward.down_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.feed_forward.up_proj.weight", # Additional mappings - r"vision_encoder": r"vision_tower", - r"vision_language_adapter.w_in": r"multimodal_projector.linear_1", - r"vision_language_adapter.w_out": r"multimodal_projector.linear_2", - r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight", - r"layers.(\d+).attention.wk.weight": r"language_model.model.layers.\1.self_attn.k_proj.weight", - r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight", - r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight", - r"layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.mlp.gate_proj.weight", - r"layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.mlp.down_proj.weight", - r"layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.mlp.up_proj.weight", + r"vision_encoder": r"vision_tower", + r"vision_language_adapter.w_in": r"multi_modal_projector.linear_1", + r"vision_language_adapter.w_out": r"multi_modal_projector.linear_2", + r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight", + r"layers.(\d+).attention.wk.weight": r"language_model.model.layers.\1.self_attn.k_proj.weight", + r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight", + r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight", + r"layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.mlp.gate_proj.weight", + r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.mlp.down_proj.weight", + r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.mlp.up_proj.weight", + r"layers.(\d+).ffn_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight", + r"layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight", + r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight", + r"output.weight": r"language_model.lm_head.weight", + r"norm.weight": r"language_model.model.norm.weight" + } new_state_dict = {} all_keys = "\n".join(original_state_dict.keys()) old_keys = all_keys for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items(): - all_keys = re.sub(old,new,all_keys) + all_keys = re.sub(r"\n"+ old,r"\n"+new,all_keys) OLD_TO_NEW = dict(zip(old_keys.split("\n"), all_keys.split("\n"))) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index afb78ac6765d60..fb69d6f5656c1c 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -303,7 +303,7 @@ class PixtralAttentionLayer(nn.Module): def __init__(self, config): super().__init__() 
self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) - self.feed_forward = PixtralMLP + self.feed_forward = PixtralMLP(config) self.attention = PixtralAttention(config) self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) From 1e9752745894c42ea7df3ce96bbcfa9757dba271 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 16:02:39 +0200 Subject: [PATCH 04/58] work --- .../pixtral/convert_pixtral_weights_to_hf.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index d6d74700383548..4127300803816b 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -73,7 +73,7 @@ } new_state_dict = {} -all_keys = "\n".join(original_state_dict.keys()) +all_keys = "\n"+ "\n".join(original_state_dict.keys()) old_keys = all_keys for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items(): all_keys = re.sub(r"\n"+ old,r"\n"+new,all_keys) @@ -82,7 +82,7 @@ new_dict={} -def permute_for_rope(value,n_heads, config): +def permute_for_rope(value, n_heads, config): dim1 = config.head_dim dim2 = config.hidden_size return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) @@ -95,12 +95,12 @@ def permute_for_rope(value,n_heads, config): _config = text_config # convert the text model (basically mistral model) - if "q_proj" in key: - value = permute_for_rope(value,_config.num_attention_heads) - if "k_proj" in key: - value = permute_for_rope(value,_config.num_key_value_heads) - new_key = OLD_TO_NEW[key] + if "q_proj" in new_key: + value = permute_for_rope(value,_config.num_attention_heads, _config) + if "k_proj" in new_key: + value = permute_for_rope(value,_config.num_key_value_heads, _config) + new_dict[new_key] = value with torch.device("meta"): From fb0e78c02aa6ca95a074403c0ccd6a9cabca9420 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 16:10:52 +0200 Subject: [PATCH 05/58] weights match --- .../pixtral/convert_pixtral_weights_to_hf.py | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 4127300803816b..d20fab06bba8dd 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -4,6 +4,9 @@ from safetensors.torch import load_file as safe_load_file import regex as re +from PIL import Image +import requests +from transformers import AutoProcessor tokenizer = AutoTokenizer.from_pretrained("leafspark/Pixtral-12B-2409-hf", ) @@ -83,27 +86,45 @@ new_dict={} def permute_for_rope(value, n_heads, config): - dim1 = config.head_dim + dim1 = value.shape[0] dim2 = config.hidden_size return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) for key, value in original_state_dict.items(): + new_key = OLD_TO_NEW[key] if "vision_encoder" in key: _config = vision_config + num_attention_heads = _config.num_attention_heads else: _config = text_config + if "q_proj" in new_key: + num_attention_heads = _config.num_attention_heads + if "k_proj" in new_key: + num_attention_heads = _config.num_key_value_heads # convert the text model (basically mistral model) - new_key = OLD_TO_NEW[key] - if "q_proj" in new_key: - 
value = permute_for_rope(value,_config.num_attention_heads, _config) - if "k_proj" in new_key: - value = permute_for_rope(value,_config.num_key_value_heads, _config) + + if "q_proj" in new_key or "k_proj" in new_key: + value = permute_for_rope(value,num_attention_heads, _config) new_dict[new_key] = value with torch.device("meta"): model = LlavaForConditionalGeneration(config) -model.load_state_dict(new_dict, strict=True, assign=True) \ No newline at end of file +model.load_state_dict(new_dict, strict=True, assign=True) + + + +processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") +processor.tokenizer = tokenizer +prompt = "USER: \nWhat's the content of the image? ASSISTANT:" +url = "https://www.ilankelman.org/stopsigns/australia.jpg" +image = Image.open(requests.get(url, stream=True).raw) + +inputs = processor(text=prompt, images=image, return_tensors="pt") + +# Generate +generate_ids = model.generate(**inputs, max_new_tokens=15) +processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] From eb76b0caae5e6963c2f2bd1d19fa72515878c099 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 16:57:35 +0200 Subject: [PATCH 06/58] nits --- .../pixtral/convert_pixtral_weights_to_hf.py | 4 +-- .../models/pixtral/modeling_pixtral.py | 26 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index d20fab06bba8dd..f376e8b4b991c3 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -34,10 +34,10 @@ vision_config = PixtralConfig() config = LlavaConfig(vision_config, text_config) config.architectures = ["LlavaForConditionalGeneration"] -config.save_pretrained("/Users/arthurzucker/Work/pixtral") +config.save_pretrained("../pixtral") -original_state_dict = safe_load_file("/Users/arthurzucker/Work/pixtral/model.safetensors") +original_state_dict = safe_load_file("../pixtral/consolidated.safetensors") OLD_KEY_TO_NEW_KEY_MAPPING = { diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index fb69d6f5656c1c..4e78b8a171587b 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -97,7 +97,7 @@ class PixtralRotaryEmbedding(nn.Module): """ def __init__(self, config, device): super().__init__() - + self.rope_type = "default" self.dim = config.head_dim self.base = config.rope_theta max_patches_per_side = config.image_size // config.patch_size @@ -115,6 +115,7 @@ def __init__(self, config, device): ], dim=-1, ).reshape(-1, self.dim) # we reshape to only index on the position indexes, not tuple of indexes + # Different from paper, but it uses a different permutation in order to obtain the same calculation # TODO maybe make it torch compatible later on. 
We can also just slice self.register_buffer("inv_freq", inv_freq, persistent=False) @@ -131,8 +132,7 @@ def forward(self, x, position_ids): device_type = x.device.type device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): - freqs = freqs - emb = torch.cat((freqs, freqs), dim=-1) + emb = freqs cos = emb.cos() sin = emb.sin() return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) @@ -167,7 +167,7 @@ def rotate_half(x): # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=0): """Applies Rotary Position Embedding to the query and key tensors. Args: @@ -287,14 +287,14 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -def position_ids_in_meshgrid(patch_embeds_list: list[torch.Tensor]) -> torch.Tensor: +def position_ids_in_meshgrid(patch_embeds_list): positions = [] for patch in patch_embeds_list: height, width = patch.shape[-2:] - mesh = torch.meshgrid(torch.arange(height*width), indexing="ij") - h_grid, v_grid = mesh - ids = h_grid * width + v_grid - positions.append(ids) + mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij") + h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2,-1) + ids = h_grid * height + v_grid + positions.append(ids[:,0]) return torch.cat(positions) @@ -328,7 +328,7 @@ def forward( residual = hidden_states hidden_states = self.attention_norm(hidden_states) - hidden_states, attn_weights = self.self_attn( + hidden_states, attn_weights = self.attention( hidden_states=hidden_states, attention_mask=attention_mask, position_embeddings=position_embeddings, @@ -352,9 +352,11 @@ def forward( class PixtralTransformer(nn.Module): def __init__(self, config): super().__init__() + self.config = config self.layers = torch.nn.ModuleList() for _ in range(config.num_hidden_layers): self.layers.append(PixtralAttentionLayer(config)) + self.gradient_checkpointing = False def forward( self, @@ -592,7 +594,7 @@ def device(self) -> torch.device: def dtype(self) -> torch.device: return next(self.parameters()).dtype - def forward(self, images: List[torch.Tensor]) -> torch.Tensor: + def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwargs) -> torch.Tensor: """ Args: images: list of N_img images of variable sizes, @@ -614,7 +616,7 @@ def forward(self, images: List[torch.Tensor]) -> torch.Tensor: # positional embeddings position_ids = position_ids_in_meshgrid(patch_embeds_list).to(self.device) - position_embedding = self.patch_positional_embedding(position_ids) + position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) attention_mask = None out = self.transformer(patch_embeds, attention_mask, position_embedding) From 334d7a9917c348edb481551c65922a1df451b1d4 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 17:12:04 +0200 Subject: [PATCH 07/58] nits --- .../pixtral/convert_pixtral_weights_to_hf.py | 14 ++++++++------ .../models/pixtral/modeling_pixtral.py | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index f376e8b4b991c3..c824062e297e47 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ 
b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -34,6 +34,7 @@ vision_config = PixtralConfig() config = LlavaConfig(vision_config, text_config) config.architectures = ["LlavaForConditionalGeneration"] +config.text_config.head_dim = 128 config.save_pretrained("../pixtral") @@ -110,20 +111,21 @@ def permute_for_rope(value, n_heads, config): new_dict[new_key] = value -with torch.device("meta"): - model = LlavaForConditionalGeneration(config) - -model.load_state_dict(new_dict, strict=True, assign=True) - +config.text_config.head_dim = 128 +# with torch.device("meta"): +# model = LlavaForConditionalGeneration(config) +# model.load_state_dict(new_dict, strict=True, assign=True) +# model.save_pretrained("../pixtral") +model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config).to("cuda") processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") processor.tokenizer = tokenizer prompt = "USER: \nWhat's the content of the image? ASSISTANT:" url = "https://www.ilankelman.org/stopsigns/australia.jpg" image = Image.open(requests.get(url, stream=True).raw) -inputs = processor(text=prompt, images=image, return_tensors="pt") +inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") # Generate generate_ids = model.generate(**inputs, max_new_tokens=15) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 4e78b8a171587b..d72d8a13fc9384 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -621,4 +621,4 @@ def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwarg out = self.transformer(patch_embeds, attention_mask, position_embedding) # remove batch dimension of the single sequence - return out.squeeze(0) + return out From 30439a1633698344c3af2b884483552ba44600b0 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 18:11:01 +0200 Subject: [PATCH 08/58] updates to support the tokenizer :) --- .../pixtral/convert_pixtral_weights_to_hf.py | 106 +++++++++++++++++- .../models/pixtral/modeling_pixtral.py | 2 +- 2 files changed, 103 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index c824062e297e47..513cd50bc24c85 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -1,4 +1,4 @@ -from transformers import LlavaConfig, LlavaForConditionalGeneration, AutoTokenizer, MistralConfig, PixtralConfig +from transformers import LlavaConfig, LlavaForConditionalGeneration, AutoTokenizer, MistralConfig, PixtralConfig, PreTrainedTokenizerFast import torch from safetensors.torch import load_file as safe_load_file @@ -7,7 +7,100 @@ from PIL import Image import requests from transformers import AutoProcessor -tokenizer = AutoTokenizer.from_pretrained("leafspark/Pixtral-12B-2409-hf", ) + + + +from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + +# Load Mistral tokenizer + +model_name = "mistralai/Pixtral-12B-2409" + +tokenizer = MistralTokenizer.from_model(model_name) + +vocab = tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial +all_special = [token.value if hasattr(token,"value") else token for token in tokenizer.instruct_tokenizer.tokenizer._all_special_tokens] +specials_tokens = {token : all_special.index(token) for 
token in all_special} +specials_tokens.update(vocab) +vocab = specials_tokens +from transformers.convert_slow_tokenizer import * +class MistralConverter: + """ + A general tiktoken converter. + """ + + def __init__( + self, + vocab=None, + pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", + add_prefix_space=False, + additional_special_tokens=None, + *args, + **kwargs, + ): + super().__init__(*args) + self.vocab = vocab + self.pattern = pattern + self.add_prefix_space = add_prefix_space + self.additional_special_tokens = additional_special_tokens + + def extract_vocab_merges_from_model(self, vocab: str): + try: + from tiktoken.load import load_tiktoken_bpe + except Exception: + raise ValueError( + "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`." + ) + + bpe_ranks = vocab + byte_encoder = bytes_to_unicode() + + def token_bytes_to_string(b): + return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) + + merges = [] + vocab = {} + for idx, (token, rank) in enumerate(bpe_ranks.items()): + if token not in all_special: + vocab[token_bytes_to_string(token)] = idx + if len(token) == 1: + continue + local = [] + for index in range(1, len(token)): + piece_l, piece_r = token[:index], token[index:] + if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks: + local.append((piece_l, piece_r, rank)) + local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False) + merges.extend(local) + else: + vocab[token] = idx + merges = sorted(merges, key=lambda val: val[2], reverse=False) + merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges] + return vocab, merges + + def tokenizer(self): + vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab) + tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False)) + if hasattr(tokenizer.model, "ignore_merges"): + tokenizer.model.ignore_merges = True + return tokenizer + + def converted(self) -> Tokenizer: + tokenizer = self.tokenizer() + tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False), + pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False), + ] + ) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.add_special_tokens(self.additional_special_tokens) + + tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) + + return tokenizer + +tokenizer = PreTrainedTokenizerFast(tokenizer_object = MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted()) text_config = MistralConfig( @@ -37,7 +130,7 @@ config.text_config.head_dim = 128 config.save_pretrained("../pixtral") - +tokenizer.model_input_names = ['input_ids', 'attention_mask'] original_state_dict = safe_load_file("../pixtral/consolidated.safetensors") @@ -117,7 +210,9 @@ def permute_for_rope(value, n_heads, config): # model.load_state_dict(new_dict, strict=True, assign=True) # model.save_pretrained("../pixtral") - +config.vision_feature_layer = -1 +config.image_token_index = 10 +config.vision_feature_select_strategy = "full" model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config).to("cuda") processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") processor.tokenizer = tokenizer @@ -125,8 +220,11 @@ def permute_for_rope(value, n_heads, config): url = 
"https://www.ilankelman.org/stopsigns/australia.jpg" image = Image.open(requests.get(url, stream=True).raw) +prompt = '[INST][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_END]Describe this image in one sentence.[/INST]' +input_ids_ = torch.tensor([[1, 3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 13, 5847, 13089, 1593, 3937, 1294, 1925, 19286, 1046, 4]]).long() inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") +input_ids = torch.tensor([[1, 5, 1091, 19227, 4994, 2811, 1429, 5165, 1897, 1429, 5165, 2811, 16753, 2391, 2811, 1429, 1689, 45971, 1095, 45629, 1897, 1429, 14653, 2811, 1429, 4147, 1278, 3519, 17253, 1897, 1429, 26204, 2811, 16753, 4994, 2811, 1429, 6371, 1897, 1429, 48649, 2811, 16753, 17611, 2811, 16753, 4994, 2811, 1429, 3607, 1897, 1429, 14653, 2811, 1429, 1784, 5970, 1321, 3468, 1044, 1324, 3596, 1046, 5151, 12717, 1044, 13461, 50666, 1429, 8092, 2811, 16753, 4994, 2811, 1429, 3607, 1897, 1429, 31222, 2811, 12161, 1099, 79092, 1897, 1429, 38600, 10432, 31597, 1429, 14653, 2811, 1429, 1784, 6138, 5476, 1317, 2210, 1046, 90463, 1593, 1562, 1278, 8616, 7285, 2613, 47579, 1429, 15760, 2811, 
12161, 17611, 1897, 1429, 8092, 4964, 2821, 27028, 6, 3, 7493, 1681, 1278, 17253, 2479, 9406, 1294, 6993, 4]]) # Generate generate_ids = model.generate(**inputs, max_new_tokens=15) processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index d72d8a13fc9384..e5a4d5b34522da 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -429,7 +429,7 @@ def forward( if not return_dict: return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + last_hidden_state=hidden_states, hidden_states=[hidden_states], attentions=all_attentions ) From 65441271f2b6354674cd5ad9d33db8c722c5e434 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Thu, 12 Sep 2024 18:28:48 +0200 Subject: [PATCH 09/58] updates --- .../models/pixtral/convert_pixtral_weights_to_hf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 513cd50bc24c85..1e608a4699d8cf 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -214,7 +214,7 @@ def permute_for_rope(value, n_heads, config): config.image_token_index = 10 config.vision_feature_select_strategy = "full" model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config).to("cuda") -processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") +processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", image_token = "[IMG]") processor.tokenizer = tokenizer prompt = "USER: \nWhat's the content of the image? 
ASSISTANT:" url = "https://www.ilankelman.org/stopsigns/australia.jpg" @@ -226,5 +226,7 @@ def permute_for_rope(value, n_heads, config): input_ids = torch.tensor([[1, 5, 1091, 19227, 4994, 2811, 1429, 5165, 1897, 1429, 5165, 2811, 16753, 2391, 2811, 1429, 1689, 45971, 1095, 45629, 1897, 1429, 14653, 2811, 1429, 4147, 1278, 3519, 17253, 1897, 1429, 26204, 2811, 16753, 4994, 2811, 1429, 6371, 1897, 1429, 48649, 2811, 16753, 17611, 2811, 16753, 4994, 2811, 1429, 3607, 1897, 1429, 14653, 2811, 1429, 1784, 5970, 1321, 3468, 1044, 1324, 3596, 1046, 5151, 12717, 1044, 13461, 50666, 1429, 8092, 2811, 16753, 4994, 2811, 1429, 3607, 1897, 1429, 31222, 2811, 12161, 1099, 79092, 1897, 1429, 38600, 10432, 31597, 1429, 14653, 2811, 1429, 1784, 6138, 5476, 1317, 2210, 1046, 90463, 1593, 1562, 1278, 8616, 7285, 2613, 47579, 1429, 15760, 2811, 12161, 17611, 1897, 1429, 8092, 4964, 2821, 27028, 6, 3, 7493, 1681, 1278, 17253, 2479, 9406, 1294, 6993, 4]]) # Generate + + generate_ids = model.generate(**inputs, max_new_tokens=15) processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] From a45122b98489d9ff75d196ad5cbbb15d9871ae1d Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 08:34:16 +0100 Subject: [PATCH 10/58] Pixtral processor (#33454) * rough outline * Add in image break and end tokens * Fix * Udo some formatting changes * Set patch_size default * Fix --- src/transformers/__init__.py | 20 +- src/transformers/models/__init__.py | 2 +- .../models/auto/configuration_auto.py | 4 +- .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 4 +- .../models/auto/processing_auto.py | 2 +- .../models/auto/tokenization_auto.py | 2 +- src/transformers/models/pixtral/__init__.py | 21 +- .../models/pixtral/configuration_pixtral.py | 1 - .../pixtral/image_processing_pixtral.py | 412 ++++++++++++++++++ .../models/pixtral/processing_pixtral.py | 190 ++++++++ 11 files changed, 640 insertions(+), 19 deletions(-) create mode 100644 src/transformers/models/pixtral/image_processing_pixtral.py create mode 100644 src/transformers/models/pixtral/processing_pixtral.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 94bfae8ebcdeff..5ff1da3f1e0a08 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -528,7 +528,6 @@ ], "models.pixtral": [ "PixtralConfig", - ], "models.llava_next": [ "LlavaNextConfig", @@ -647,6 +646,7 @@ "models.phi": ["PhiConfig"], "models.phi3": ["Phi3Config"], "models.phobert": ["PhobertTokenizer"], + "models.pixtral": ["PixtralConfig", "PixtralProcessor"], "models.pix2struct": [ "Pix2StructConfig", "Pix2StructProcessor", @@ -1202,6 +1202,7 @@ _import_structure["models.owlv2"].append("Owlv2ImageProcessor") _import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"]) _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"]) + _import_structure["models.pixtral"].append("PixtralImageProcessor") _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"]) _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) @@ -5300,10 +5301,6 @@ LlavaConfig, LlavaProcessor, ) - from .models.pixtral import ( - PixtralConfig, - - ) from .models.llava_next import ( LlavaNextConfig, LlavaNextProcessor, @@ -5448,6 +5445,10 @@ 
Pix2StructTextConfig, Pix2StructVisionConfig, ) + from .models.pixtral import ( + PixtralConfig, + PixtralProcessor, + ) from .models.plbart import PLBartConfig from .models.poolformer import ( PoolFormerConfig, @@ -6023,6 +6024,7 @@ from .models.owlvit import OwlViTFeatureExtractor, OwlViTImageProcessor from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor from .models.pix2struct import Pix2StructImageProcessor + from .models.pixtral import PixtralImageProcessor from .models.poolformer import ( PoolFormerFeatureExtractor, PoolFormerImageProcessor, @@ -7111,10 +7113,6 @@ LlavaForConditionalGeneration, LlavaPreTrainedModel, ) - from .models.pixtral import ( - PixtralModel, - PixtralPreTrainedModel, - ) from .models.llava_next import ( LlavaNextForConditionalGeneration, LlavaNextPreTrainedModel, @@ -7466,6 +7464,10 @@ Pix2StructTextModel, Pix2StructVisionModel, ) + from .models.pixtral import ( + PixtralModel, + PixtralPreTrainedModel, + ) from .models.plbart import ( PLBartForCausalLM, PLBartForConditionalGeneration, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0ecadd8d22167b..2022048cd4553f 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -130,7 +130,6 @@ lilt, llama, llava, - pixtral, llava_next, llava_next_video, llava_onevision, @@ -188,6 +187,7 @@ phi3, phobert, pix2struct, + pixtral, plbart, poolformer, pop2piano, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index ac2b3ca34949ce..97d9c60fa41c23 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -147,7 +147,6 @@ ("lilt", "LiltConfig"), ("llama", "LlamaConfig"), ("llava", "LlavaConfig"), - ("pixtral", "PixtralConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), ("llava_onevision", "LlavaOnevisionConfig"), @@ -206,6 +205,7 @@ ("phi", "PhiConfig"), ("phi3", "Phi3Config"), ("pix2struct", "Pix2StructConfig"), + ("pixtral", "PixtralConfig"), ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), @@ -444,7 +444,6 @@ ("llama2", "Llama2"), ("llama3", "Llama3"), ("llava", "LLaVa"), - ("pixtral", "Pixtral"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), ("llava_onevision", "LLaVA-Onevision"), @@ -511,6 +510,7 @@ ("phi3", "Phi3"), ("phobert", "PhoBERT"), ("pix2struct", "Pix2Struct"), + ("pixtral", "Pixtral"), ("plbart", "PLBart"), ("poolformer", "PoolFormer"), ("pop2piano", "Pop2Piano"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index c83c43518a6a31..95d9ddef8f7979 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -114,6 +114,7 @@ ("owlvit", ("OwlViTImageProcessor",)), ("perceiver", ("PerceiverImageProcessor",)), ("pix2struct", ("Pix2StructImageProcessor",)), + ("pixtral", ("PixtralImageProcessor",)), ("poolformer", ("PoolFormerImageProcessor",)), ("pvt", ("PvtImageProcessor",)), ("pvt_v2", ("PvtImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1f860c25e8dbcc..eb2ab82c960d3b 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -193,6 +193,7 @@ ("persimmon", "PersimmonModel"), ("phi", 
"PhiModel"), ("phi3", "Phi3Model"), + ("pixtral", "PixtralModel"), ("plbart", "PLBartModel"), ("poolformer", "PoolFormerModel"), ("prophetnet", "ProphetNetModel"), @@ -277,7 +278,6 @@ ("xmod", "XmodModel"), ("yolos", "YolosModel"), ("yoso", "YosoModel"), - ("pixtral", "PixtralModel"), ] ) @@ -729,12 +729,12 @@ ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), - ("pixtral", "PixtralModel"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), + ("pixtral", "PixtralModel"), ("qwen2_vl", "Qwen2VLForConditionalGeneration"), ("video_llava", "VideoLlavaForConditionalGeneration"), ("vipllava", "VipLlavaForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index d055693316df18..82d325248eabfb 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -71,7 +71,6 @@ ("layoutlmv2", "LayoutLMv2Processor"), ("layoutlmv3", "LayoutLMv3Processor"), ("llava", "LlavaProcessor"), - ("pixtral", "PixtralProcessor"), ("llava_next", "LlavaNextProcessor"), ("llava_next_video", "LlavaNextVideoProcessor"), ("llava_onevision", "LlavaOnevisionProcessor"), @@ -83,6 +82,7 @@ ("owlvit", "OwlViTProcessor"), ("paligemma", "PaliGemmaProcessor"), ("pix2struct", "Pix2StructProcessor"), + ("pixtral", "PixtralProcessor"), ("pop2piano", "Pop2PianoProcessor"), ("qwen2_audio", "Qwen2AudioProcessor"), ("qwen2_vl", "Qwen2VLProcessor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 00bbef64d99aa9..2f0e8591740da4 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -257,7 +257,6 @@ ), ), ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("pixtral", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava-onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), @@ -386,6 +385,7 @@ ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), + ("pixtral", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py index cb43777e2d5a2e..2020287559a124 100644 --- a/src/transformers/models/pixtral/__init__.py +++ b/src/transformers/models/pixtral/__init__.py @@ -13,11 +13,12 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available _import_structure = { "configuration_pixtral": ["PixtralConfig"], + "processing_pixtral": ["PixtralProcessor"], } @@ -32,9 +33,17 @@ "PixtralPreTrainedModel", ] +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_pixtral"] = ["PixtralImageProcessor"] + if TYPE_CHECKING: - from .configuration_pixtral import PixtralConfig + from .configuration_pixtral import PixtralConfig, PixtralProcessor try: if not is_torch_available(): @@ -47,6 +56,14 @@ PixtralPreTrainedModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_pixtral import PixtralImageProcessor + else: import sys diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 90ce7785f0408a..667466872ff13e 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -15,7 +15,6 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING logger = logging.get_logger(__name__) diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py new file mode 100644 index 00000000000000..3481b793ec7e9b --- /dev/null +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -0,0 +1,412 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Pixtral.""" + +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_kwargs, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging +from ...utils.import_utils import requires_backends + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +# Adapted from function in image_transforms.py t oensure any transparent pixels are converted to white. +def convert_to_rgb(image: ImageInput) -> ImageInput: + """ + Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image + as is. 
+    Args:
+        image (Image):
+            The image to convert.
+    """
+    requires_backends(convert_to_rgb, ["vision"])
+
+    if not isinstance(image, PIL.Image.Image):
+        return image
+
+    if image.mode == "RGB":
+        return image
+
+    # First we convert to RGBA to set background to white.
+    image = image.convert("RGBA")
+
+    # Create a new image with a white background.
+    new_image = PIL.Image.new("RGBA", image.size, "WHITE")
+    new_image.paste(image, (0, 0), image)
+    new_image = new_image.convert("RGB")
+    return new_image
+
+
+def _num_image_tokens(image_size: Tuple[int, int], patch_size: Tuple[int, int]) -> Tuple[int, int]:
+    """
+    Calculate the number of image tokens given the image size and patch size.
+
+    Args:
+        image_size (`Tuple[int, int]`):
+            The size of the image as `(height, width)`.
+        patch_size (`Tuple[int, int]`):
+            The patch size as `(height, width)`.
+
+    Returns:
+        `Tuple[int, int]`: The number of image tokens along the height and width dimensions.
+    """
+    height, width = image_size
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    num_width_tokens = (width - 1) // patch_width + 1
+    num_height_tokens = (height - 1) // patch_height + 1
+    return num_height_tokens, num_width_tokens
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    patch_size: Union[int, Tuple[int, int], List[int], Tuple[int]],
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple:
+    """
+    Find the target (height, width) dimension of the output image after resizing given the input image and the desired
+    size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `Tuple[int, int]`):
+            Maximum `(height, width)` allowed for the resized image. If an integer is passed, `(size, size)` is used.
+        patch_size (`int` or `Tuple[int, int]`):
+            The patch_size as `(height, width)` to use for resizing the image. If patch_size is an integer,
+            `(patch_size, patch_size)` will be used.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+
+    Returns:
+        `tuple`: The target (height, width) dimension of the output image after resizing.
+    """
+    max_height, max_width = size if isinstance(size, (tuple, list)) else (size, size)
+    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
+    height, width = get_image_size(input_image, input_data_format)
+
+    ratio = max(height / max_height, width / max_width)
+
+    if ratio > 1:
+        # Original implementation uses `round`, which relies on banker's rounding and can lead to surprising results
+        height = int(np.ceil(height / ratio))
+        width = int(np.ceil(width / ratio))
+
+    num_height_tokens, num_width_tokens = _num_image_tokens((height, width), (patch_height, patch_width))
+    return num_height_tokens * patch_height, num_width_tokens * patch_width
+
+
+class PixtralImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Pixtral image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`Dict[str, int]`, *optional*, defaults to `{"longest_edge": 1024}`):
+            Describes the maximum input dimensions to the model; the longest edge of the resized image will not exceed
+            `size["longest_edge"]`. Can be overridden by `size` in the `preprocess` method.
+        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
+            Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + patch_size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"longest_edge": 1024} + patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16} + + self.do_resize = do_resize + self.size = size + self.patch_size = patch_size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] + self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] + self.do_convert_rgb = do_convert_rgb + self._valid_processor_keys = [ + "images", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + ] + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + patch_size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dict containing the longest possible edge of the image. 
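The white-background handling in `convert_to_rgb` above (enabled through `do_convert_rgb`) is easy to see in isolation; a small PIL-only sketch:

import numpy as np
from PIL import Image

# Pasting an RGBA image onto a white canvas with its own alpha as the mask means fully
# transparent pixels come out white rather than black after the RGB conversion.
rgba = Image.new("RGBA", (2, 2), (255, 0, 0, 0))   # fully transparent "red"
background = Image.new("RGBA", rgba.size, "WHITE")
background.paste(rgba, (0, 0), rgba)
rgb = background.convert("RGB")
print(np.array(rgb)[0, 0])  # [255 255 255]
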
+ patch_size (`Dict[str, int]`): + Patch size used to calculate the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "longest_edge" in size: + size = (size["longest_edge"], size["longest_edge"]) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("size must contain either 'longest_edge' or 'height' and 'width'.") + + if "height" in patch_size and "width" in patch_size: + patch_size = (patch_size["height"], patch_size["width"]) + else: + raise ValueError("patch_size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + patch_size=patch_size, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + patch_size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Describes the maximum input dimensions to the model. + patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`): + Patch size in the model. Used to calculate the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. 
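As a usage sketch of the `resize` method above, the snippet below assumes a checkout that contains this new module (the import path is otherwise illustrative) and checks that the output always snaps to whole 16x16 patches:

import numpy as np
from transformers.models.pixtral.image_processing_pixtral import PixtralImageProcessor

image_processor = PixtralImageProcessor()
image = np.random.randint(0, 256, (900, 1500, 3), dtype=np.uint8)  # H x W x C toy input

resized = image_processor.resize(
    image,
    size={"longest_edge": 1024},
    patch_size={"height": 16, "width": 16},
)
# 1500 is the longest edge, so the image is scaled down to 1024 wide, and both sides are then
# rounded up to a multiple of the patch size: (900, 1500) -> (624, 1024).
height, width = resized.shape[:2]
assert height % 16 == 0 and width % 16 == 0
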
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + patch_size = get_size_dict(patch_size, default_to_square=True) + + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + patch_size = patch_size if patch_size is not None else self.patch_size + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_resize: + image = self.resize( + image=image, + size=size, + patch_size=patch_size, + resample=resample, + input_data_format=input_data_format, + ) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py new file mode 100644 index 00000000000000..e9d3d581189a32 --- /dev/null +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -0,0 +1,190 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Pixtral. +""" + +from typing import List, Optional, Union + +from ...feature_extraction_utils import BatchFeature +from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class PixtralProcessor(ProcessorMixin): + r""" + Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor. + + [`PixtralProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the + [`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information. + + Args: + image_processor ([`CLIPImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`LlamaTokenizerFast`], *optional*): + The tokenizer is a required input. + patch_size (`int`, *optional*, defaults to 16): + Patch size from the vision tower. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + image_token (`str`, *optional*, defaults to `"[IMG]"`): + Special token used to denote image location. 
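Closing out the image processor: a hedged end-to-end sketch of `preprocess` as defined above. At this point in the series it returns a single `pixel_values` batch (a later commit in this series changes the output layout), and the import again assumes this branch is installed.

import numpy as np
from PIL import Image
from transformers.models.pixtral.image_processing_pixtral import PixtralImageProcessor

image_processor = PixtralImageProcessor()
image = Image.fromarray(np.random.randint(0, 256, (600, 800, 3), dtype=np.uint8))

batch = image_processor.preprocess(image, return_tensors="pt")
# 800 already fits inside the 1024 longest edge, so nothing is downscaled; both sides are
# rounded up to the next multiple of 16: 600 -> 608 and 800 -> 800.
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 608, 800])
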
+ image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`): + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = [ + "chat_template", + "patch_size", + "image_token", + "image_break_token", + ] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size: int = 16, + chat_template=None, + image_token="[IMG]", # set the default and let users change if they have peculiar special tokens in rare cases + image_break_token="[IMG_BREAK]", + image_end_token="[IMG_END]", + **kwargs, + ): + self.patch_size = patch_size + self.image_token = image_token + self.image_break_token = image_break_token + self.image_end_token = image_end_token + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + images: ImageInput = None, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length=None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`, *optional*): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. 
+ - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if images is not None: + image_inputs = self.image_processor(images, patch_size=self.patch_size, return_tensors=return_tensors) + else: + image_inputs = {} + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + # try to expand inputs in processing if we have the necessary parts + prompt_strings = text + if image_inputs.get("pixel_values") is not None: + # Replace the image token with the expanded image token sequence + pixel_values = image_inputs["pixel_values"] + height, width = get_image_size(to_numpy_array(pixel_values[0])) + num_height_tokens = height // self.patch_size + num_width_tokens = width // self.patch_size + + prompt_strings = [] + replace_tokens = [self.image_token] * num_width_tokens + [self.image_break_token] * num_height_tokens + replace_tokens[-1] = self.image_end_token + replace_str = "".join(replace_tokens) + for sample in text: + sample = sample.replace(self.image_token, replace_str) + prompt_strings.append(sample) + + text_inputs = self.tokenizer( + prompt_strings, + return_tensors=return_tensors, + padding=padding, + truncation=truncation, + max_length=max_length, + ) + return BatchFeature(data={**text_inputs, **image_inputs}) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. 
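The image-token expansion in `__call__` above is easiest to follow with plain numbers. The sketch below lays the tokens out the way the follow-up commits later in this series settle on: one `[IMG]` per patch column, an `[IMG_BREAK]` after each patch row, and a final `[IMG_END]`; the 608x800 size is illustrative.

image_token, image_break_token, image_end_token = "[IMG]", "[IMG_BREAK]", "[IMG_END]"
patch_size = 16
height, width = 608, 800                   # size of the preprocessed image
num_height_tokens = height // patch_size   # 38 patch rows
num_width_tokens = width // patch_size     # 50 patches per row

rows = [[image_token] * num_width_tokens + [image_break_token] for _ in range(num_height_tokens)]
replace_tokens = [token for row in rows for token in row]  # flatten row by row
replace_tokens[-1] = image_end_token                       # the last break becomes the end marker
replace_str = "".join(replace_tokens)

prompt = "USER: [IMG]\nWhat's the content of the image? ASSISTANT:"
expanded = prompt.replace(image_token, replace_str, 1)
assert len(replace_tokens) == num_height_tokens * (num_width_tokens + 1)  # 38 * 51 tokens
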
+ """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) From b6db4eec3869438c349517afb688b4c070bcf6bb Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 09:51:05 +0100 Subject: [PATCH 11/58] Fix token expansion --- src/transformers/models/pixtral/processing_pixtral.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index e9d3d581189a32..efaf8e960d525c 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -47,6 +47,9 @@ class PixtralProcessor(ProcessorMixin): image_token (`str`, *optional*, defaults to `"[IMG]"`): Special token used to denote image location. image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`): + Special token used to denote the end of a line of pixels in an image. + image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`): + Special token used to denote the end of an image input. """ attributes = ["image_processor", "tokenizer"] @@ -55,6 +58,7 @@ class PixtralProcessor(ProcessorMixin): "patch_size", "image_token", "image_break_token", + "image_end_token", ] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -150,7 +154,7 @@ def __call__( num_width_tokens = width // self.patch_size prompt_strings = [] - replace_tokens = [self.image_token] * num_width_tokens + [self.image_break_token] * num_height_tokens + replace_tokens = [[self.image_token] * num_width_tokens + self.image_break_token] * num_height_tokens replace_tokens[-1] = self.image_end_token replace_str = "".join(replace_tokens) for sample in text: From b8df95ddba44cc8ff4e0f0336eab57b608bad5e0 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 10:51:36 +0200 Subject: [PATCH 12/58] nit in conversion script --- .../pixtral/convert_pixtral_weights_to_hf.py | 63 +++++++++---------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 1e608a4699d8cf..228f1a4c78c71c 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -1,4 +1,4 @@ -from transformers import LlavaConfig, LlavaForConditionalGeneration, AutoTokenizer, MistralConfig, PixtralConfig, PreTrainedTokenizerFast +from transformers import LlavaConfig, LlavaForConditionalGeneration, PixtralProcessor, MistralConfig, PixtralConfig, PreTrainedTokenizerFast, PixtralImageProcessor import torch from safetensors.torch import load_file as safe_load_file @@ -169,40 +169,42 @@ def converted(self) -> Tokenizer: } -new_state_dict = {} -all_keys = "\n"+ "\n".join(original_state_dict.keys()) -old_keys = all_keys -for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items(): - all_keys = re.sub(r"\n"+ old,r"\n"+new,all_keys) -OLD_TO_NEW = dict(zip(old_keys.split("\n"), all_keys.split("\n"))) - -new_dict={} def permute_for_rope(value, n_heads, config): dim1 = value.shape[0] dim2 = 
config.hidden_size return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) -for key, value in original_state_dict.items(): +def convert_dictionnary(original_state_dict): + new_dict={} + + all_keys = "\n"+ "\n".join(original_state_dict.keys()) + old_keys = all_keys + for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items(): + all_keys = re.sub(r"\n"+ old,r"\n"+new,all_keys) + + OLD_TO_NEW = dict(zip(old_keys.split("\n"), all_keys.split("\n"))) + + for key, value in original_state_dict.items(): - new_key = OLD_TO_NEW[key] - if "vision_encoder" in key: - _config = vision_config - num_attention_heads = _config.num_attention_heads - else: - _config = text_config - if "q_proj" in new_key: + new_key = OLD_TO_NEW[key] + if "vision_encoder" in key: + _config = vision_config num_attention_heads = _config.num_attention_heads - if "k_proj" in new_key: - num_attention_heads = _config.num_key_value_heads - # convert the text model (basically mistral model) + else: + _config = text_config + if "q_proj" in new_key: + num_attention_heads = _config.num_attention_heads + if "k_proj" in new_key: + num_attention_heads = _config.num_key_value_heads + # convert the text model (basically mistral model) - if "q_proj" in new_key or "k_proj" in new_key: - value = permute_for_rope(value,num_attention_heads, _config) + if "q_proj" in new_key or "k_proj" in new_key: + value = permute_for_rope(value,num_attention_heads, _config) - new_dict[new_key] = value + new_dict[new_key] = value config.text_config.head_dim = 128 # with torch.device("meta"): @@ -213,20 +215,15 @@ def permute_for_rope(value, n_heads, config): config.vision_feature_layer = -1 config.image_token_index = 10 config.vision_feature_select_strategy = "full" +config.image_seq_length = 1 model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config).to("cuda") -processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", image_token = "[IMG]") +image_processor = PixtralImageProcessor() +processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token = "[IMG]") processor.tokenizer = tokenizer -prompt = "USER: \nWhat's the content of the image? ASSISTANT:" +prompt = "USER: [IMG]\nWhat's the content of the image? 
ASSISTANT:" url = "https://www.ilankelman.org/stopsigns/australia.jpg" image = Image.open(requests.get(url, stream=True).raw) - -prompt = '[INST][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_BREAK][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG][IMG_END]Describe this image in one sentence.[/INST]' -input_ids_ = torch.tensor([[1, 3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 13, 5847, 13089, 1593, 3937, 1294, 1925, 19286, 1046, 4]]).long() inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") -input_ids = torch.tensor([[1, 5, 1091, 19227, 4994, 2811, 1429, 5165, 1897, 1429, 5165, 2811, 16753, 2391, 2811, 1429, 1689, 45971, 1095, 45629, 1897, 1429, 14653, 2811, 1429, 4147, 1278, 3519, 17253, 1897, 1429, 26204, 2811, 16753, 4994, 2811, 1429, 6371, 1897, 1429, 48649, 2811, 16753, 17611, 2811, 16753, 4994, 2811, 1429, 3607, 1897, 1429, 14653, 2811, 1429, 1784, 5970, 1321, 3468, 1044, 1324, 3596, 1046, 5151, 12717, 1044, 13461, 50666, 1429, 8092, 2811, 16753, 4994, 2811, 1429, 3607, 1897, 1429, 31222, 2811, 12161, 1099, 79092, 1897, 1429, 38600, 10432, 31597, 1429, 14653, 2811, 1429, 1784, 6138, 5476, 1317, 2210, 1046, 90463, 1593, 1562, 1278, 8616, 7285, 2613, 47579, 
1429, 15760, 2811, 12161, 17611, 1897, 1429, 8092, 4964, 2821, 27028, 6, 3, 7493, 1681, 1278, 17253, 2479, 9406, 1294, 6993, 4]]) -# Generate - - generate_ids = model.generate(**inputs, max_new_tokens=15) -processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] +print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) From 185c4358948f43a1642e00a6f78cd6f1f290db83 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 09:55:28 +0100 Subject: [PATCH 13/58] Fix image token list creation --- src/transformers/models/pixtral/processing_pixtral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index efaf8e960d525c..c1da48808036ee 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -154,7 +154,9 @@ def __call__( num_width_tokens = width // self.patch_size prompt_strings = [] - replace_tokens = [[self.image_token] * num_width_tokens + self.image_break_token] * num_height_tokens + replace_tokens = [[self.image_token] * num_width_tokens + [self.image_break_token]] * num_height_tokens + # Flatten list + replace_tokens = [item for sublist in replace_tokens for item in sublist] replace_tokens[-1] = self.image_end_token replace_str = "".join(replace_tokens) for sample in text: From 92c273569bdacb0a5dca8931c975573cc473ec57 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 13:20:32 +0200 Subject: [PATCH 14/58] done --- .../pixtral/convert_pixtral_weights_to_hf.py | 27 +++++++++++++++++-- .../models/pixtral/modeling_pixtral.py | 25 ++++++++--------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 228f1a4c78c71c..203d4d92cd9f62 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -220,10 +220,33 @@ def convert_dictionnary(original_state_dict): image_processor = PixtralImageProcessor() processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token = "[IMG]") processor.tokenizer = tokenizer -prompt = "USER: [IMG]\nWhat's the content of the image? ASSISTANT:" +prompt = "[INST]\nWhat's the content of the image?" 
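A quick illustration of what the `permute_for_rope` helper in this conversion script does to a query or key projection: it reorders each head's rows from the interleaved rotary layout to the half-split layout used by the Hugging Face rotate-half implementation. The toy sizes below are illustrative.

import torch

n_heads, head_dim, hidden = 2, 6, 4
value = torch.arange(n_heads * head_dim * hidden, dtype=torch.float32).reshape(n_heads * head_dim, hidden)

dim1 = value.shape[0]
permuted = value.view(n_heads, dim1 // n_heads // 2, 2, hidden).transpose(1, 2).reshape(dim1, hidden)

# Within the first head, rows now come in the order 0, 2, 4, 1, 3, 5: the rows at even offsets
# of each rotary pair first, then the rows at odd offsets.
print(permuted[:6, 0].tolist())  # [0.0, 8.0, 16.0, 4.0, 12.0, 20.0]
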
url = "https://www.ilankelman.org/stopsigns/australia.jpg" image = Image.open(requests.get(url, stream=True).raw) inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") -generate_ids = model.generate(**inputs, max_new_tokens=15) + + +messages = [ + { + "role": "user", + "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": prompt}] + }, +] + +tok = MistralTokenizer.from_model(model_name) + + +from mistral_common.protocol.instruct.request import ChatCompletionRequest +tokenized = tok.encode_chat_completion( + ChatCompletionRequest( + messages=messages, + model=model_name, + ) +) + +inputs["input_ids"] = torch.tensor([tokenized.tokens], dtype=torch.long, device="cuda") +inputs["pixel_values"] = torch.tensor(tokenized.images, device="cuda") +del inputs["attention_mask"] +generate_ids = model.generate(**inputs, max_new_tokens=100) print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index e5a4d5b34522da..b053885154cf80 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -81,6 +81,16 @@ class PixtralCausalLMOutputWithPast(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None +def position_ids_in_meshgrid(patch_embeds_list, max_width): + positions = [] + for patch in patch_embeds_list: + height, width = patch.shape[-2:] + mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij") + h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2,-1) + ids = h_grid * max_width + v_grid + positions.append(ids[:,0]) + return torch.cat(positions) + class PixtralRotaryEmbedding(nn.Module): """ @@ -114,11 +124,11 @@ def __init__(self, config, device): freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1), ], dim=-1, - ).reshape(-1, self.dim) # we reshape to only index on the position indexes, not tuple of indexes + ).reshape(-1, self.dim//2) # we reshape to only index on the position indexes, not tuple of indexes # Different from paper, but it uses a different permutation in order to obtain the same calculation # TODO maybe make it torch compatible later on. 
We can also just slice - self.register_buffer("inv_freq", inv_freq, persistent=False) + self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False) @torch.no_grad() def forward(self, x, position_ids): @@ -287,15 +297,6 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -def position_ids_in_meshgrid(patch_embeds_list): - positions = [] - for patch in patch_embeds_list: - height, width = patch.shape[-2:] - mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij") - h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2,-1) - ids = h_grid * height + v_grid - positions.append(ids[:,0]) - return torch.cat(positions) @@ -614,7 +615,7 @@ def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwarg patch_embeds = self.ln_pre(patch_embeds) # positional embeddings - position_ids = position_ids_in_meshgrid(patch_embeds_list).to(self.device) + position_ids = position_ids_in_meshgrid(patch_embeds_list, max_width=self.config.image_size // self.config.patch_size).to(self.device) position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) attention_mask = None From ea2d9fb4af322fcb1f899d01b5d260fce74c7cb2 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 13:23:20 +0200 Subject: [PATCH 15/58] add expected results --- .../models/pixtral/convert_pixtral_weights_to_hf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 203d4d92cd9f62..cf5e59a931ec60 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -249,4 +249,7 @@ def convert_dictionnary(original_state_dict): inputs["pixel_values"] = torch.tensor(tokenized.images, device="cuda") del inputs["attention_mask"] generate_ids = model.generate(**inputs, max_new_tokens=100) -print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) +print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) +""" +What's the content of the image?The image depicts a vibrant street scene in what appears to be a Chinatown district, characterized by its traditional architectural elements and cultural signage. A prominent feature is the red and white stop sign in the foreground, which has been adorned with a banner that reads "OPTUS." Behind the stop sign, there's an ornate gate with intricate designs and Chinese characters, marking the entrance to the district. 
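Stepping back to the modelling change above: with `max_width` passed in, `position_ids_in_meshgrid` gives every patch the id `row * max_width + column` of a fixed-width reference grid (image_size // patch_size at the call site), so the 2D rotary embedding can index height and width frequencies consistently across images of different shapes. A standalone mirror with illustrative shapes:

import torch

def position_ids_in_meshgrid(patch_embeds_list, max_width):
    positions = []
    for patch in patch_embeds_list:
        height, width = patch.shape[-2:]
        mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
        h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1)
        ids = h_grid * max_width + v_grid
        positions.append(ids[:, 0])
    return torch.cat(positions)

# Two "images" whose patch grids are 2x3 and 1x2, against a 64-patch-wide reference grid
# (a 1024-pixel image size with 16-pixel patches, as in the call site above).
dummy = [torch.zeros(1, 8, 2, 3), torch.zeros(1, 8, 1, 2)]
print(position_ids_in_meshgrid(dummy, max_width=64).tolist())
# [0, 1, 2, 64, 65, 66, 0, 1]
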
The gate is flanked by buildings with colorful facades and signs in both English and Chinese +""" \ No newline at end of file From 6ee62a736a9583cd418e50a98a3fc82c4d62d2dc Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 12:28:33 +0100 Subject: [PATCH 16/58] Process list of list of images (#33465) --- .../pixtral/image_processing_pixtral.py | 127 +++++++++++++----- .../models/pixtral/processing_pixtral.py | 40 ++++-- 2 files changed, 117 insertions(+), 50 deletions(-) diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 3481b793ec7e9b..f3a67f481a0b09 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -14,7 +14,7 @@ # limitations under the License. """Image processor class for Pixtral.""" -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np @@ -31,6 +31,7 @@ get_image_size, infer_channel_dimension_format, is_scaled_image, + is_valid_image, make_list_of_images, to_numpy_array, valid_images, @@ -48,7 +49,40 @@ import PIL -# Adapted from function in image_transforms.py t oensure any transparent pixels are converted to white. +# Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images +def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: + """ + Convert a single image or a list of images to a list of numpy arrays. + + Args: + images (`ImageInput`): + A single image or a list of images. + + Returns: + A list of numpy arrays. + """ + # If it's a single image, convert it to a list of lists + if is_valid_image(images): + images = [[images]] + # If it's a list of images, it's a single batch, so convert it to a list of lists + elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]): + images = [images] + # If it's a list of batches, it's already in the right format + elif ( + isinstance(images, (list, tuple)) + and len(images) > 0 + and isinstance(images[0], (list, tuple)) + and is_valid_image(images[0][0]) + ): + pass + else: + raise ValueError( + "Invalid input type. Must be a single image, a list of images, or a list of batches of images." + ) + return images + + +# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white. def convert_to_rgb(image: ImageInput) -> ImageInput: """ Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image @@ -134,6 +168,18 @@ def get_resize_output_image_size( return num_height_tokens * patch_height, num_width_tokens * patch_width +# Hack to get tensor conversion used in BatchFeature without batching the images +def _get_is_as_tensor_fns(tensor_type: Union[str, TensorType]) -> Tuple[Callable, Callable]: + return BatchFeature()._get_is_as_tensor_fns(tensor_type) + + +def convert_to_tensor(array, tensor_type: Union[str, TensorType]) -> Any: + is_tensor, as_tensor = _get_is_as_tensor_fns(tensor_type) + if is_tensor(array): + return array + return as_tensor(array) + + class PixtralImageProcessor(BaseImageProcessor): r""" Constructs a Pixtral image processor. @@ -333,11 +379,11 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
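The batching change in this commit is easiest to see in isolation: the Pixtral-specific `make_list_of_images` above normalizes a single image, a flat list, or a list of per-sample lists into the same nested structure, so one sample can carry several differently sized images. A simplified mirror, restricted to NumPy arrays for brevity:

import numpy as np

def make_list_of_images(images):
    if isinstance(images, np.ndarray):          # single image -> one sample with one image
        return [[images]]
    if isinstance(images, (list, tuple)) and images and isinstance(images[0], np.ndarray):
        return [images]                          # flat list -> a single sample
    return images                                # already a list of per-sample lists

single = np.zeros((64, 64, 3), dtype=np.uint8)
assert len(make_list_of_images(single)) == 1                        # one sample
assert len(make_list_of_images([single, single])[0]) == 2           # one sample, two images
assert len(make_list_of_images([[single], [single, single]])) == 2  # two samples
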
""" + patch_size = patch_size if patch_size is not None else self.patch_size patch_size = get_size_dict(patch_size, default_to_square=True) do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size - patch_size = patch_size if patch_size is not None else self.patch_size resample = resample if resample is not None else self.resample do_rescale = do_rescale if do_rescale is not None else self.do_rescale rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor @@ -348,13 +394,14 @@ def preprocess( validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) - images = make_list_of_images(images) + images_list = make_list_of_images(images) - if not valid_images(images): + if not valid_images(images_list[0][0]): raise ValueError( "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." ) + validate_preprocess_arguments( do_rescale=do_rescale, rescale_factor=rescale_factor, @@ -367,12 +414,12 @@ def preprocess( ) if do_convert_rgb: - images = [convert_to_rgb(image) for image in images] + images_list = [[convert_to_rgb(image) for image in images] for images in images_list] # All transformations expect numpy arrays. - images = [to_numpy_array(image) for image in images] + images_list = [[to_numpy_array(image) for image in images] for images in images_list] - if is_scaled_image(images[0]) and do_rescale: + if is_scaled_image(images_list[0][0]) and do_rescale: logger.warning_once( "It looks like you are trying to rescale already rescaled images. If the input" " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." @@ -380,33 +427,41 @@ def preprocess( if input_data_format is None: # We assume that all images have the same channel dimension format. 
- input_data_format = infer_channel_dimension_format(images[0]) - - all_images = [] - for image in images: - if do_resize: - image = self.resize( - image=image, - size=size, - patch_size=patch_size, - resample=resample, - input_data_format=input_data_format, - ) - - if do_rescale: - image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) - - if do_normalize: - image = self.normalize( - image=image, mean=image_mean, std=image_std, input_data_format=input_data_format - ) - - all_images.append(image) - - images = [ - to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) - for image in all_images + input_data_format = infer_channel_dimension_format(images_list[0][0]) + + batch_images = [] + batch_image_sizes = [] + for sample_images in images_list: + images = [] + image_sizes = [] + for image in sample_images: + if do_resize: + image = self.resize( + image=image, + size=size, + patch_size=patch_size, + resample=resample, + input_data_format=input_data_format, + ) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + images.append(image) + image_sizes.append(get_image_size(image, input_data_format)) + batch_images.append(images) + batch_image_sizes.append(image_sizes) + + images_list = [ + [to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images] + for images in batch_images ] - data = {"pixel_values": images} - return BatchFeature(data=data, tensor_type=return_tensors) + # Convert to tensor type outside of BatchFeature to avoid batching the images of different sizes + images_list = [[convert_to_tensor(image, return_tensors) for image in images] for images in images_list] + return BatchFeature(data={"images": images_list, "image_sizes": batch_image_sizes}, tensor_type=None) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index c1da48808036ee..9a9b7a23a41371 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -19,7 +19,7 @@ from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput, get_image_size, to_numpy_array +from ...image_utils import ImageInput from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType, logging @@ -146,21 +146,33 @@ def __call__( # try to expand inputs in processing if we have the necessary parts prompt_strings = text - if image_inputs.get("pixel_values") is not None: + if image_inputs.get("images") is not None: # Replace the image token with the expanded image token sequence - pixel_values = image_inputs["pixel_values"] - height, width = get_image_size(to_numpy_array(pixel_values[0])) - num_height_tokens = height // self.patch_size - num_width_tokens = width // self.patch_size - + images = image_inputs["images"] + image_sizes = image_inputs.pop("image_sizes") prompt_strings = [] - replace_tokens = [[self.image_token] * num_width_tokens + [self.image_break_token]] * num_height_tokens - # Flatten list - replace_tokens = [item for sublist in replace_tokens for item in sublist] - replace_tokens[-1] = self.image_end_token 
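A standalone sketch of the per-sample expansion in this hunk (its final join-and-substitute lines continue just below): each `[IMG]` marker in the prompt is swapped, in order, for the token block of the matching image. The literal temporary marker string used by the real code does not survive the rendering above, so the `marker` value here is a stand-in, and the sizes are illustrative.

image_token, image_break, image_end, patch = "[IMG]", "[IMG_BREAK]", "[IMG_END]", 16
marker = "{placeholder}"  # stand-in for the temporary marker used by the real code

def expand(sample, image_sizes):
    replace_strings = []
    for height, width in image_sizes:
        rows = [[image_token] * (width // patch) + [image_break]] * (height // patch)
        tokens = [token for row in rows for token in row]
        tokens[-1] = image_end
        replace_strings.append("".join(tokens))
        sample = sample.replace(image_token, marker, 1)
    while marker in sample:
        sample = sample.replace(marker, replace_strings.pop(0), 1)
    return sample

prompt = "USER: [IMG] and [IMG]\nCompare the two images. ASSISTANT:"
expanded = expand(prompt, [(32, 48), (16, 16)])  # a 2x3-patch image and a 1x1-patch image
assert expanded.count(image_end) == 2
assert expanded.count(image_token) == 6 + 1      # 2*3 patches, then a single patch
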
- replace_str = "".join(replace_tokens) - for sample in text: - sample = sample.replace(self.image_token, replace_str) + + for sample_images, sample_image_sizes, sample in zip(images, image_sizes, text): + replace_strings = [] + # First calculate the number of tokens needed for each image and put in a placeholder + for image, image_size in zip(sample_images, sample_image_sizes): + height, width = image_size + num_height_tokens = height // self.patch_size + num_width_tokens = width // self.patch_size + replace_tokens = [ + [self.image_token] * num_width_tokens + [self.image_break_token] + ] * num_height_tokens + # Flatten list + replace_tokens = [item for sublist in replace_tokens for item in sublist] + replace_tokens[-1] = self.image_end_token + replace_str = "".join(replace_tokens) + replace_strings.append(replace_str) + sample = sample.replace(self.image_token, "", 1) + + while "" in sample: + replace_str = replace_strings.pop(0) + sample = sample.replace("", replace_str, 1) + prompt_strings.append(sample) text_inputs = self.tokenizer( From 5f33680e5d0a9f2e53c4995f6ce432bf634b7e1c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 13:28:49 +0200 Subject: [PATCH 17/58] updates --- .../pixtral/convert_pixtral_weights_to_hf.py | 285 +++++++++--------- .../models/pixtral/modeling_pixtral.py | 58 ++-- 2 files changed, 172 insertions(+), 171 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index cf5e59a931ec60..e4fd58cbc5f14b 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -1,29 +1,54 @@ -from transformers import LlavaConfig, LlavaForConditionalGeneration, PixtralProcessor, MistralConfig, PixtralConfig, PreTrainedTokenizerFast, PixtralImageProcessor - -import torch -from safetensors.torch import load_file as safe_load_file import regex as re - -from PIL import Image import requests -from transformers import AutoProcessor - - - +import torch from mistral_common.tokens.tokenizers.mistral import MistralTokenizer +from PIL import Image +from safetensors.torch import load_file as safe_load_file -# Load Mistral tokenizer +from transformers import ( + LlavaConfig, + LlavaForConditionalGeneration, + MistralConfig, + PixtralConfig, + PixtralImageProcessor, + PixtralProcessor, + PreTrainedTokenizerFast, +) +from transformers.convert_slow_tokenizer import * -model_name = "mistralai/Pixtral-12B-2409" -tokenizer = MistralTokenizer.from_model(model_name) -vocab = tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial -all_special = [token.value if hasattr(token,"value") else token for token in tokenizer.instruct_tokenizer.tokenizer._all_special_tokens] -specials_tokens = {token : all_special.index(token) for token in all_special} -specials_tokens.update(vocab) -vocab = specials_tokens -from transformers.convert_slow_tokenizer import * +OLD_KEY_TO_NEW_KEY_MAPPING = { + # Layer Normalization Weights + r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", + r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.ffn_norm.weight", + # Self Attention Projections + r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.attention.q_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": 
r"vision_tower.transformer.layers.\1.attention.k_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.attention.v_proj.weight", + r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.attention.o_proj.weight", + # MLP Projections + r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.feed_forward.down_proj.weight", + r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.feed_forward.up_proj.weight", + # Additional mappings + r"vision_encoder": r"vision_tower", + r"vision_language_adapter.w_in": r"multi_modal_projector.linear_1", + r"vision_language_adapter.w_out": r"multi_modal_projector.linear_2", + r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight", + r"layers.(\d+).attention.wk.weight": r"language_model.model.layers.\1.self_attn.k_proj.weight", + r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight", + r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight", + r"layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.mlp.gate_proj.weight", + r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.mlp.down_proj.weight", + r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.mlp.up_proj.weight", + r"layers.(\d+).ffn_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight", + r"layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight", + r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight", + r"output.weight": r"language_model.lm_head.weight", + r"norm.weight": r"language_model.model.norm.weight", +} + class MistralConverter: """ A general tiktoken converter. @@ -46,13 +71,13 @@ def __init__( def extract_vocab_merges_from_model(self, vocab: str): try: - from tiktoken.load import load_tiktoken_bpe + pass except Exception: raise ValueError( "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`." 
) - bpe_ranks = vocab + bpe_ranks = vocab byte_encoder = bytes_to_unicode() def token_bytes_to_string(b): @@ -61,7 +86,7 @@ def token_bytes_to_string(b): merges = [] vocab = {} for idx, (token, rank) in enumerate(bpe_ranks.items()): - if token not in all_special: + if token not in self.additional_special_tokens: vocab[token_bytes_to_string(token)] = idx if len(token) == 1: continue @@ -100,94 +125,47 @@ def converted(self) -> Tokenizer: return tokenizer -tokenizer = PreTrainedTokenizerFast(tokenizer_object = MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted()) - - -text_config = MistralConfig( - attention_dropout=0.0, - bos_token_id=1, - eos_token_id=2, - head_dim=128, - hidden_act="silu", - hidden_size=5120, - initializer_range=0.02, - intermediate_size=14336, - max_position_embeddings=1024000, - model_type="mistral", - num_attention_heads=32, - num_hidden_layers=40, - num_key_value_heads=8, - rms_norm_eps=1e-05, - rope_theta=1000000000.0, - sliding_window=None, - tie_word_embeddings=False, - vocab_size=131072 -) -vision_config = PixtralConfig() -config = LlavaConfig(vision_config, text_config) -config.architectures = ["LlavaForConditionalGeneration"] -config.text_config.head_dim = 128 -config.save_pretrained("../pixtral") +def convert_mistral_tokenizer(): + model_name = "mistralai/Pixtral-12B-2409" -tokenizer.model_input_names = ['input_ids', 'attention_mask'] -original_state_dict = safe_load_file("../pixtral/consolidated.safetensors") + tokenizer = MistralTokenizer.from_model(model_name) + vocab = tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial + all_special = [ + token.value if hasattr(token, "value") else token + for token in tokenizer.instruct_tokenizer.tokenizer._all_special_tokens + ] + specials_tokens = {token: all_special.index(token) for token in all_special} + specials_tokens.update(vocab) + vocab = specials_tokens -OLD_KEY_TO_NEW_KEY_MAPPING = { - # Layer Normalization Weights - r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", - r"vision_encoder.transformer.layers.(\d+).ffn_norm.weight": r"vision_tower.transformer.layers.\1.ffn_norm.weight", - - # Self Attention Projections - r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.attention.q_proj.weight", - r"vision_encoder.transformer.layers.(\d+).attention.wk.weight": r"vision_tower.transformer.layers.\1.attention.k_proj.weight", - r"vision_encoder.transformer.layers.(\d+).attention.wv.weight": r"vision_tower.transformer.layers.\1.attention.v_proj.weight", - r"vision_encoder.transformer.layers.(\d+).attention.wo.weight": r"vision_tower.transformer.layers.\1.attention.o_proj.weight", - - # MLP Projections - r"vision_encoder.transformer.layers.(\d+).feed_forward.w1.weight": r"vision_tower.transformer.layers.\1.feed_forward.gate_proj.weight", - r"vision_encoder.transformer.layers.(\d+).feed_forward.w2.weight": r"vision_tower.transformer.layers.\1.feed_forward.down_proj.weight", - r"vision_encoder.transformer.layers.(\d+).feed_forward.w3.weight": r"vision_tower.transformer.layers.\1.feed_forward.up_proj.weight", - - # Additional mappings - r"vision_encoder": r"vision_tower", - r"vision_language_adapter.w_in": r"multi_modal_projector.linear_1", - r"vision_language_adapter.w_out": r"multi_modal_projector.linear_2", - r"layers.(\d+).attention.wq.weight": r"language_model.model.layers.\1.self_attn.q_proj.weight", - r"layers.(\d+).attention.wk.weight": 
r"language_model.model.layers.\1.self_attn.k_proj.weight", - r"layers.(\d+).attention.wv.weight": r"language_model.model.layers.\1.self_attn.v_proj.weight", - r"layers.(\d+).attention.wo.weight": r"language_model.model.layers.\1.self_attn.o_proj.weight", - r"layers.(\d+).feed_forward.w1.weight": r"language_model.model.layers.\1.mlp.gate_proj.weight", - r"layers.(\d+).feed_forward.w2.weight": r"language_model.model.layers.\1.mlp.down_proj.weight", - r"layers.(\d+).feed_forward.w3.weight": r"language_model.model.layers.\1.mlp.up_proj.weight", - r"layers.(\d+).ffn_norm.weight": r"language_model.model.layers.\1.post_attention_layernorm.weight", - r"layers.(\d+).attention_norm.weight": r"language_model.model.layers.\1.input_layernorm.weight", - r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight", - r"output.weight": r"language_model.lm_head.weight", - r"norm.weight": r"language_model.model.norm.weight" + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted() + ) + tokenizer.model_input_names = ["input_ids", "attention_mask"] -} + return tokenizer def permute_for_rope(value, n_heads, config): - dim1 = value.shape[0] - dim2 = config.hidden_size - return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + dim1 = value.shape[0] + dim2 = config.hidden_size + return value.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + -def convert_dictionnary(original_state_dict): - new_dict={} +def convert_dictionnary(original_state_dict, vision_config, text_config): + new_dict = {} - all_keys = "\n"+ "\n".join(original_state_dict.keys()) + all_keys = "\n" + "\n".join(original_state_dict.keys()) old_keys = all_keys for old, new in OLD_KEY_TO_NEW_KEY_MAPPING.items(): - all_keys = re.sub(r"\n"+ old,r"\n"+new,all_keys) + all_keys = re.sub(r"\n" + old, r"\n" + new, all_keys) OLD_TO_NEW = dict(zip(old_keys.split("\n"), all_keys.split("\n"))) for key, value in original_state_dict.items(): - new_key = OLD_TO_NEW[key] if "vision_encoder" in key: _config = vision_config @@ -200,56 +178,85 @@ def convert_dictionnary(original_state_dict): num_attention_heads = _config.num_key_value_heads # convert the text model (basically mistral model) - if "q_proj" in new_key or "k_proj" in new_key: - value = permute_for_rope(value,num_attention_heads, _config) + value = permute_for_rope(value, num_attention_heads, _config) new_dict[new_key] = value - -config.text_config.head_dim = 128 -# with torch.device("meta"): -# model = LlavaForConditionalGeneration(config) -# model.load_state_dict(new_dict, strict=True, assign=True) - -# model.save_pretrained("../pixtral") -config.vision_feature_layer = -1 -config.image_token_index = 10 -config.vision_feature_select_strategy = "full" -config.image_seq_length = 1 -model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config).to("cuda") -image_processor = PixtralImageProcessor() -processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token = "[IMG]") -processor.tokenizer = tokenizer -prompt = "[INST]\nWhat's the content of the image?" 
-url = "https://www.ilankelman.org/stopsigns/australia.jpg" -image = Image.open(requests.get(url, stream=True).raw) -inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") - - - -messages = [ - { - "role": "user", - "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": prompt}] - }, -] - -tok = MistralTokenizer.from_model(model_name) - - -from mistral_common.protocol.instruct.request import ChatCompletionRequest -tokenized = tok.encode_chat_completion( - ChatCompletionRequest( - messages=messages, - model=model_name, + return new_dict + +def convert_mistral_model(): + + text_config = MistralConfig( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + head_dim=128, + hidden_act="silu", + hidden_size=5120, + initializer_range=0.02, + intermediate_size=14336, + max_position_embeddings=1024000, + model_type="mistral", + num_attention_heads=32, + num_hidden_layers=40, + num_key_value_heads=8, + rms_norm_eps=1e-05, + rope_theta=1000000000.0, + sliding_window=None, + tie_word_embeddings=False, + vocab_size=131072, ) -) -inputs["input_ids"] = torch.tensor([tokenized.tokens], dtype=torch.long, device="cuda") -inputs["pixel_values"] = torch.tensor(tokenized.images, device="cuda") -del inputs["attention_mask"] -generate_ids = model.generate(**inputs, max_new_tokens=100) -print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) + vision_config = PixtralConfig() + config = LlavaConfig(vision_config, text_config) + config.architectures = ["LlavaForConditionalGeneration"] + config.text_config.head_dim = 128 + config.save_pretrained("../pixtral") + + original_state_dict = safe_load_file("../pixtral/consolidated.safetensors") + new_dict = convert_dictionnary(original_state_dict, vision_config, text_config) + + config.text_config.head_dim = 128 + with torch.device("meta"): + model = LlavaForConditionalGeneration(config) + model.load_state_dict(new_dict, strict=True, assign=True) + + model.save_pretrained("../pixtral") + config.vision_feature_layer = -1 + config.image_token_index = 10 + config.vision_feature_select_strategy = "full" + config.image_seq_length = 1 + tokenizer = convert_mistral_tokenizer() + model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config).to("cuda") + image_processor = PixtralImageProcessor() + processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") + prompt = "[INST]\nWhat's the content of the image?" 
+ url = "https://www.ilankelman.org/stopsigns/australia.jpg" + image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") + # inputs["input_ids"] = torch.tensor([tokenized.tokens], dtype=torch.long, device="cuda") + # inputs["pixel_values"] = torch.tensor(tokenized.images, device="cuda") + del inputs["attention_mask"] + generate_ids = model.generate(**inputs, max_new_tokens=100) + print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) + +# messages = [ +# {"role": "user", "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": prompt}]}, +# ] + +# tok = MistralTokenizer.from_model(model_name) + + +# from mistral_common.protocol.instruct.request import ChatCompletionRequest + + +# tokenized = tok.encode_chat_completion( +# ChatCompletionRequest( +# messages=messages, +# model=model_name, +# ) +# ) + """ What's the content of the image?The image depicts a vibrant street scene in what appears to be a Chinatown district, characterized by its traditional architectural elements and cultural signage. A prominent feature is the red and white stop sign in the foreground, which has been adorned with a banner that reads "OPTUS." Behind the stop sign, there's an ornate gate with intricate designs and Chinese characters, marking the entrance to the district. The gate is flanked by buildings with colorful facades and signs in both English and Chinese -""" \ No newline at end of file +""" diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index b053885154cf80..71c288e8fb0190 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -23,16 +23,13 @@ from ... import PreTrainedModel from ...activations import ACT2FN -from ...modeling_outputs import ModelOutput, BaseModelOutput +from ...modeling_outputs import BaseModelOutput, ModelOutput from ...utils import ( add_start_docstrings, - add_start_docstrings_to_model_forward, logging, - replace_return_docstrings, ) -from ..auto import AutoModelForCausalLM from .configuration_pixtral import PixtralConfig -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS + logger = logging.get_logger(__name__) @@ -81,37 +78,39 @@ class PixtralCausalLMOutputWithPast(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + def position_ids_in_meshgrid(patch_embeds_list, max_width): positions = [] for patch in patch_embeds_list: height, width = patch.shape[-2:] mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij") - h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2,-1) + h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1) ids = h_grid * max_width + v_grid - positions.append(ids[:,0]) + positions.append(ids[:, 0]) return torch.cat(positions) class PixtralRotaryEmbedding(nn.Module): """ - The key with pixtral embedding is just that you have a frequency for each pixel positions. - If you have height x width pixels (or embedding pixels) + The key with pixtral embedding is just that you have a frequency for each pixel positions. + If you have height x width pixels (or embedding pixels) - then the frequency used for ROPE is given by indexing the pre_computed frequency on the - width and height. 
+ then the frequency used for ROPE is given by indexing the pre_computed frequency on the + width and height. - What you output is of dimension batch, height * width, dim with dim the embed dim. + What you output is of dimension batch, height * width, dim with dim the embed dim. - This simply means that for each image hidden states, you are going to add - a corresponding positional embedding, based on it's index in the grid. + This simply means that for each image hidden states, you are going to add + a corresponding positional embedding, based on it's index in the grid. """ + def __init__(self, config, device): super().__init__() self.rope_type = "default" self.dim = config.head_dim self.base = config.rope_theta max_patches_per_side = config.image_size // config.patch_size - freqs = 1.0 / (self.base**(torch.arange(0, self.dim, 2).float() / self.dim)) + freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) h = torch.arange(max_patches_per_side, device=freqs.device) w = torch.arange(max_patches_per_side, device=freqs.device) @@ -124,7 +123,7 @@ def __init__(self, config, device): freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1), ], dim=-1, - ).reshape(-1, self.dim//2) # we reshape to only index on the position indexes, not tuple of indexes + ).reshape(-1, self.dim // 2) # we reshape to only index on the position indexes, not tuple of indexes # Different from paper, but it uses a different permutation in order to obtain the same calculation # TODO maybe make it torch compatible later on. We can also just slice @@ -147,7 +146,6 @@ def forward(self, x, position_ids): sin = emb.sin() return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - def _dynamic_frequency_update(self, position_ids, device): """ dynamic RoPE layers should recompute `inv_freq` in the following situations: @@ -167,7 +165,6 @@ def _dynamic_frequency_update(self, position_ids, device): self.max_seq_len_cached = self.original_max_seq_len - # Copied from transformers.models.llama.modeling_llama.rotate_half def rotate_half(x): """Rotates half the hidden dims of the input.""" @@ -203,6 +200,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=0): k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed + class PixtralAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -261,6 +259,7 @@ def forward( return attn_output, attn_weights + # Copied from gemma2 class PixtralMLP(nn.Module): def __init__(self, config): @@ -297,17 +296,13 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - - class PixtralAttentionLayer(nn.Module): def __init__(self, config): super().__init__() self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) self.feed_forward = PixtralMLP(config) self.attention = PixtralAttention(config) - self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) - + self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5) def forward( self, @@ -349,7 +344,6 @@ def forward( return outputs - class PixtralTransformer(nn.Module): def __init__(self, config): super().__init__() @@ -434,8 +428,6 @@ def forward( ) - - PIXTRAL_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads @@ -489,6 +481,7 @@ def _init_weights(self, module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() + PIXTRAL_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): @@ -563,12 +556,14 @@ def _init_weights(self, module): the complete sequence length. """ + @add_start_docstrings( """The PIXTRAL model which consists of a vision backbone and a language model.""", PIXTRAL_START_DOCSTRING, ) class PixtralModel(PixtralPreTrainedModel): base_model_prefix = "vision_encoder" + def __init__(self, config): super().__init__(config) self.config = config @@ -605,17 +600,16 @@ def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwarg all tokens of all images of shape (N_toks, D) """ # pass images through initial convolution independently - patch_embeds_list = [ - self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in images - ] + patch_embeds_list = [self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in images] # flatten to a single sequence - patch_embeds = torch.cat( - [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) + patch_embeds = torch.cat([p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) patch_embeds = self.ln_pre(patch_embeds) # positional embeddings - position_ids = position_ids_in_meshgrid(patch_embeds_list, max_width=self.config.image_size // self.config.patch_size).to(self.device) + position_ids = position_ids_in_meshgrid( + patch_embeds_list, max_width=self.config.image_size // self.config.patch_size + ).to(self.device) position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) attention_mask = None From cc18d8887bc4474a7bfb3be4c3fcb984eb67357d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 14:41:33 +0200 Subject: [PATCH 18/58] working image and processor --- .../pixtral/image_processing_pixtral.py | 52 ++++++++++++++++- .../models/pixtral/processing_pixtral.py | 58 +++++++++++++++++-- 2 files changed, 102 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index f3a67f481a0b09..e0ab682e1cac86 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -38,7 +38,7 @@ validate_kwargs, validate_preprocess_arguments, ) -from ...utils import TensorType, is_vision_available, logging +from ...utils import TensorType, is_vision_available, logging, is_torch_device, is_torch_dtype from ...utils.import_utils import requires_backends @@ -168,6 +168,54 @@ def get_resize_output_image_size( return num_height_tokens * patch_height, num_width_tokens * patch_width +class BatchMixFeature(BatchFeature): + def to(self, *args, **kwargs) -> "BatchFeature": + """ + Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in + different `dtypes` and sending the `BatchFeature` to a different `device`. + + Args: + args (`Tuple`): + Will be passed to the `to(...)` function of the tensors. + kwargs (`Dict`, *optional*): + Will be passed to the `to(...)` function of the tensors. + + Returns: + [`BatchFeature`]: The same instance after modification. 
+ """ + requires_backends(self, ["torch"]) + import torch # noqa + + new_data = {} + device = kwargs.get("device") + # Check if the args are a device or a dtype + if device is None and len(args) > 0: + # device should be always the first argument + arg = args[0] + if is_torch_dtype(arg): + # The first argument is a dtype + pass + elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int): + device = arg + else: + # it's something else + raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.") + # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` + for k, v in self.items(): + # check if v is a floating point + if isinstance(k, list): + new_data[k] = [element.to(*args, **kwargs) for sample in v for element in sample] + if torch.is_floating_point(v): + # cast and send to device + new_data[k] = v.to(*args, **kwargs) + elif device is not None: + new_data[k] = v.to(device=device) + else: + new_data[k] = v + self.data = new_data + return self + + # Hack to get tensor conversion used in BatchFeature without batching the images def _get_is_as_tensor_fns(tensor_type: Union[str, TensorType]) -> Tuple[Callable, Callable]: return BatchFeature()._get_is_as_tensor_fns(tensor_type) @@ -464,4 +512,4 @@ def preprocess( # Convert to tensor type outside of BatchFeature to avoid batching the images of different sizes images_list = [[convert_to_tensor(image, return_tensors) for image in images] for images in images_list] - return BatchFeature(data={"images": images_list, "image_sizes": batch_image_sizes}, tensor_type=None) + return BatchFeature(data={"pixel_values": images_list, "image_sizes": batch_image_sizes}, tensor_type=None) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 9a9b7a23a41371..eee0f90ff34557 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -22,12 +22,58 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType, logging - +from ...utils import TensorType, logging, is_torch_device, is_torch_dtype, requires_backends, is_torch_tensor logger = logging.get_logger(__name__) +class BatchMixFeature(BatchFeature): + def to(self, *args, **kwargs) -> "BatchFeature": + """ + Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in + different `dtypes` and sending the `BatchFeature` to a different `device`. + + Args: + args (`Tuple`): + Will be passed to the `to(...)` function of the tensors. + kwargs (`Dict`, *optional*): + Will be passed to the `to(...)` function of the tensors. + + Returns: + [`BatchFeature`]: The same instance after modification. + """ + requires_backends(self, ["torch"]) + import torch # noqa + + new_data = {} + device = kwargs.get("device") + # Check if the args are a device or a dtype + if device is None and len(args) > 0: + # device should be always the first argument + arg = args[0] + if is_torch_dtype(arg): + # The first argument is a dtype + pass + elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int): + device = arg + else: + # it's something else + raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. 
This is not supported.") + # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` + for k, v in self.items(): + # check if v is a floating point + if isinstance(v, list): + new_data[k] = [element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] + elif torch.is_floating_point(v): + # cast and send to device + new_data[k] = v.to(*args, **kwargs) + elif device is not None: + new_data[k] = v.to(device=device) + else: + new_data[k] = v + self.data = new_data + return self + class PixtralProcessor(ProcessorMixin): r""" Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor. @@ -88,7 +134,7 @@ def __call__( truncation: Union[bool, str, TruncationStrategy] = None, max_length=None, return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, - ) -> BatchFeature: + ) -> BatchMixFeature: """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode @@ -146,9 +192,9 @@ def __call__( # try to expand inputs in processing if we have the necessary parts prompt_strings = text - if image_inputs.get("images") is not None: + if image_inputs.get("pixel_values") is not None: # Replace the image token with the expanded image token sequence - images = image_inputs["images"] + images = image_inputs["pixel_values"] image_sizes = image_inputs.pop("image_sizes") prompt_strings = [] @@ -182,7 +228,7 @@ def __call__( truncation=truncation, max_length=max_length, ) - return BatchFeature(data={**text_inputs, **image_inputs}) + return BatchMixFeature(data={**text_inputs, **image_inputs}) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): From f04075e878506572911b50c8185d131f7e565677 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 14:44:06 +0200 Subject: [PATCH 19/58] this is the expected format --- .../pixtral/convert_pixtral_weights_to_hf.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index e4fd58cbc5f14b..f610dc1e0ce4d6 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -213,24 +213,24 @@ def convert_mistral_model(): config.text_config.head_dim = 128 config.save_pretrained("../pixtral") - original_state_dict = safe_load_file("../pixtral/consolidated.safetensors") - new_dict = convert_dictionnary(original_state_dict, vision_config, text_config) + # original_state_dict = safe_load_file("../pixtral/consolidated.safetensors") + # new_dict = convert_dictionnary(original_state_dict, vision_config, text_config) - config.text_config.head_dim = 128 - with torch.device("meta"): - model = LlavaForConditionalGeneration(config) - model.load_state_dict(new_dict, strict=True, assign=True) + # config.text_config.head_dim = 128 + # with torch.device("meta"): + # model = LlavaForConditionalGeneration(config) + # model.load_state_dict(new_dict, strict=True, assign=True) - model.save_pretrained("../pixtral") + # model.save_pretrained("../pixtral") config.vision_feature_layer = -1 config.image_token_index = 10 
config.vision_feature_select_strategy = "full" config.image_seq_length = 1 tokenizer = convert_mistral_tokenizer() - model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config).to("cuda") + model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config, low_cpu_mem_usage=True).to("cuda") image_processor = PixtralImageProcessor() processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") - prompt = "[INST]\nWhat's the content of the image?" + prompt = "[INST][IMG]\nWhat's the content of the image?[/INST]" url = "https://www.ilankelman.org/stopsigns/australia.jpg" image = Image.open(requests.get(url, stream=True).raw) inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") @@ -240,6 +240,7 @@ def convert_mistral_model(): generate_ids = model.generate(**inputs, max_new_tokens=100) print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) +convert_mistral_model() # messages = [ # {"role": "user", "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": prompt}]}, # ] From 732071b97c208fdb294bfff34b8291ab61c58192 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 15:08:02 +0200 Subject: [PATCH 20/58] some fixes --- src/transformers/__init__.py | 11 +------ .../pixtral/convert_pixtral_weights_to_hf.py | 31 +++++++++++++------ .../pixtral/image_processing_pixtral.py | 6 ++-- .../models/pixtral/processing_pixtral.py | 8 +++-- 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5ff1da3f1e0a08..12ec78e8bdfab0 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -526,9 +526,6 @@ "LlavaConfig", "LlavaProcessor", ], - "models.pixtral": [ - "PixtralConfig", - ], "models.llava_next": [ "LlavaNextConfig", "LlavaNextProcessor", @@ -1364,7 +1361,7 @@ "AlignVisionModel", ] ) - + (_import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"]),) _import_structure["models.altclip"].extend( [ "AltCLIPModel", @@ -2529,12 +2526,6 @@ "LlavaPreTrainedModel", ] ) - _import_structure["models.pixtral"].extend( - [ - "PixtralModel", - "PixtralPreTrainedModel", - ] - ) _import_structure["models.llava_next"].extend( [ "LlavaNextForConditionalGeneration", diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index f610dc1e0ce4d6..1ebf45c762ce10 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -1,9 +1,9 @@ import regex as re import requests -import torch from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from PIL import Image -from safetensors.torch import load_file as safe_load_file +from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors +from tokenizers.models import BPE from transformers import ( LlavaConfig, @@ -14,8 +14,7 @@ PixtralProcessor, PreTrainedTokenizerFast, ) -from transformers.convert_slow_tokenizer import * - +from transformers.convert_slow_tokenizer import bytes_to_unicode OLD_KEY_TO_NEW_KEY_MAPPING = { @@ -49,6 +48,7 @@ r"norm.weight": r"language_model.model.norm.weight", } + class MistralConverter: """ A general tiktoken converter. 
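As an aside on the remapping logic shown above: OLD_KEY_TO_NEW_KEY_MAPPING is consumed by convert_dictionnary, which joins every original checkpoint key into one newline-separated string, rewrites it with re.sub once per pattern, and zips the old and new key lists back together. A rough standalone sketch of that renaming strategy, using toy keys rather than the real checkpoint layout:

import re

mapping = {
    r"vision_encoder.transformer.layers.(\d+).attention.wq.weight": r"vision_tower.transformer.layers.\1.attention.q_proj.weight",
    r"tok_embeddings.weight": r"language_model.model.embed_tokens.weight",
}
old_keys = [
    "vision_encoder.transformer.layers.0.attention.wq.weight",
    "tok_embeddings.weight",
]
joined = "\n" + "\n".join(old_keys)
for old, new in mapping.items():
    joined = re.sub(r"\n" + old, r"\n" + new, joined)
old_to_new = dict(zip(old_keys, joined.split("\n")[1:]))  # original name -> transformers-style name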
@@ -148,7 +148,6 @@ def convert_mistral_tokenizer(): return tokenizer - def permute_for_rope(value, n_heads, config): dim1 = value.shape[0] dim2 = config.hidden_size @@ -184,8 +183,8 @@ def convert_dictionnary(original_state_dict, vision_config, text_config): new_dict[new_key] = value return new_dict -def convert_mistral_model(): +def convert_mistral_model(): text_config = MistralConfig( attention_dropout=0.0, bos_token_id=1, @@ -227,19 +226,33 @@ def convert_mistral_model(): config.vision_feature_select_strategy = "full" config.image_seq_length = 1 tokenizer = convert_mistral_tokenizer() - model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config, low_cpu_mem_usage=True).to("cuda") + model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config, low_cpu_mem_usage=True).to( + "cuda" + ) image_processor = PixtralImageProcessor() processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") prompt = "[INST][IMG]\nWhat's the content of the image?[/INST]" url = "https://www.ilankelman.org/stopsigns/australia.jpg" - image = Image.open(requests.get(url, stream=True).raw) - inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda") + + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), + ] + PROMPT = "[INST][IMG][IMG][IMG][IMG]\nWhat's the content of the image?[/INST]." + + + # image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda") # inputs["input_ids"] = torch.tensor([tokenized.tokens], dtype=torch.long, device="cuda") # inputs["pixel_values"] = torch.tensor(tokenized.images, device="cuda") del inputs["attention_mask"] generate_ids = model.generate(**inputs, max_new_tokens=100) print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) + convert_mistral_model() # messages = [ # {"role": "user", "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": prompt}]}, diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index e0ab682e1cac86..c3ac72443a9549 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -31,14 +31,14 @@ get_image_size, infer_channel_dimension_format, is_scaled_image, - is_valid_image, make_list_of_images, to_numpy_array, valid_images, validate_kwargs, + is_valid_image, validate_preprocess_arguments, ) -from ...utils import TensorType, is_vision_available, logging, is_torch_device, is_torch_dtype +from ...utils import TensorType, is_torch_device, is_torch_dtype, is_vision_available, logging from ...utils.import_utils import requires_backends @@ -444,7 +444,7 @@ def preprocess( images_list = make_list_of_images(images) - if not valid_images(images_list[0][0]): + if not valid_images(images_list[0]): raise ValueError( "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "torch.Tensor, tf.Tensor or jax.ndarray." 
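The BatchMixFeature helpers added for the image processor and the processor (the latter is touched in the next file) exist because every sample contributes a list of differently sized image tensors, so .to() has to walk nested lists instead of calling tensor.to() on each value directly. A rough standalone sketch of that pattern, with made-up shapes and a hypothetical helper name:

import torch

def move_nested(value, device):
    # simplified: recurse through lists and move only floating-point tensors, leaving other values untouched
    if isinstance(value, (list, tuple)):
        return [move_nested(v, device) for v in value]
    if torch.is_tensor(value) and torch.is_floating_point(value):
        return value.to(device)
    return value

pixel_values = [
    [torch.rand(3, 64, 48), torch.rand(3, 32, 80)],  # sample 1: two images of different sizes
    [torch.rand(3, 16, 16)],                          # sample 2: a single image
]
pixel_values = move_nested(pixel_values, "cpu")  # pass "cuda" when a GPU is available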
diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index eee0f90ff34557..4d1a2af9390a5c 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -22,7 +22,8 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy -from ...utils import TensorType, logging, is_torch_device, is_torch_dtype, requires_backends, is_torch_tensor +from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends + logger = logging.get_logger(__name__) @@ -63,7 +64,9 @@ def to(self, *args, **kwargs) -> "BatchFeature": for k, v in self.items(): # check if v is a floating point if isinstance(v, list): - new_data[k] = [element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] + new_data[k] = [ + element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) + ] elif torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) @@ -74,6 +77,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": self.data = new_data return self + class PixtralProcessor(ProcessorMixin): r""" Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor. From 3a15b4e7c8b5513fc5ad26ccbcda58aeef8628ce Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 16:13:22 +0200 Subject: [PATCH 21/58] push current updated --- .../pixtral/convert_pixtral_weights_to_hf.py | 47 ++++++++++++++----- .../models/pixtral/modeling_pixtral.py | 2 +- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 1ebf45c762ce10..f0d24659a5525e 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -241,9 +241,21 @@ def convert_mistral_model(): Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), ] - PROMPT = "[INST][IMG][IMG][IMG][IMG]\nWhat's the content of the image?[/INST]." + PROMPT = "[INST]Describe the images[IMG] and [IMG] and [IMG] and [IMG][/INST]" + """ +Describe the content of each of the 4 following images: [IMG_START][IMG_START][IMG_START][IMG_START]Sure, here are the descriptions of the four images: + +1. **A black dog with blue eyes** is drinking from a stream in a lush landscape with mountains in the background**. + +2. **The dog appears to be in mid-stride, with its tongue outstretched towards the stream**. + +3. **The landscape is detailed with green grass and trees, and the stream meanders the mountains**. + +4. 
**In the distance, the dog appears to be running, with a sense of + + """ # image = Image.open(requests.get(url, stream=True).raw) inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda") # inputs["input_ids"] = torch.tensor([tokenized.tokens], dtype=torch.long, device="cuda") @@ -252,24 +264,33 @@ def convert_mistral_model(): generate_ids = model.generate(**inputs, max_new_tokens=100) print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) + messages = [ + {"role": "user", "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "image_url", "image_url": {"url": url}}, {"type": "image_url", "image_url": {"url": url}}, {"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": "Describe the content of each image"}]}, + ] + + model_name = "mistralai/Pixtral-12B-2409" + tok = MistralTokenizer.from_model(model_name) -convert_mistral_model() -# messages = [ -# {"role": "user", "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": prompt}]}, -# ] -# tok = MistralTokenizer.from_model(model_name) + from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk -# from mistral_common.protocol.instruct.request import ChatCompletionRequest + EXPECTED_TOKENS = tok.encode_chat_completion( + ChatCompletionRequest( + messages=[ + UserMessage( + content=[ + TextChunk(text="Describe the images"), + ] + [ImageChunk(image=img) for img in IMG_URLS] + ) + ], + model="pixtral", + ) + ) + assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS +convert_mistral_model() -# tokenized = tok.encode_chat_completion( -# ChatCompletionRequest( -# messages=messages, -# model=model_name, -# ) -# ) """ What's the content of the image?The image depicts a vibrant street scene in what appears to be a Chinatown district, characterized by its traditional architectural elements and cultural signage. A prominent feature is the red and white stop sign in the foreground, which has been adorned with a banner that reads "OPTUS." Behind the stop sign, there's an ornate gate with intricate designs and Chinese characters, marking the entrance to the district. The gate is flanked by buildings with colorful facades and signs in both English and Chinese diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 71c288e8fb0190..9b2fcef4a4e2d7 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -85,7 +85,7 @@ def position_ids_in_meshgrid(patch_embeds_list, max_width): height, width = patch.shape[-2:] mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij") h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1) - ids = h_grid * max_width + v_grid + ids = h_grid + v_grid * max_width positions.append(ids[:, 0]) return torch.cat(positions) From b773bde93f0e3fac1240583d11859f18557cc81c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 17:16:26 +0200 Subject: [PATCH 22/58] working mult images! 
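To make several images in one prompt work, the patch embeddings of all images are packed into a single sequence and a block-diagonal additive mask keeps attention inside each image (see generate_block_attention_mask in the modeling diff below). A rough standalone sketch of that masking idea, with made-up patch counts:

import torch

patch_counts = [4, 9, 6]  # patches contributed by each image (made up)
seq_len = sum(patch_counts)
dtype = torch.float32
mask = torch.full((seq_len, seq_len), torch.finfo(dtype).min, dtype=dtype)

ends = torch.tensor(patch_counts).cumsum(-1)
starts = torch.tensor([0] + patch_counts[:-1]).cumsum(-1)
for start, end in zip(starts, ends):
    mask[start:end, start:end] = 0  # zero (= attend) only within each image's block

mask = mask[None, None, :, :]  # (batch, heads, seq, seq), added to the attention scores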
--- .../pixtral/convert_pixtral_weights_to_hf.py | 85 ++++++++++--------- .../models/pixtral/modeling_pixtral.py | 23 ++++- 2 files changed, 64 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index f0d24659a5525e..84a36e29c8310c 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -15,7 +15,7 @@ PreTrainedTokenizerFast, ) from transformers.convert_slow_tokenizer import bytes_to_unicode - +import torch OLD_KEY_TO_NEW_KEY_MAPPING = { # Layer Normalization Weights @@ -226,14 +226,11 @@ def convert_mistral_model(): config.vision_feature_select_strategy = "full" config.image_seq_length = 1 tokenizer = convert_mistral_tokenizer() - model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config, low_cpu_mem_usage=True).to( + model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config, low_cpu_mem_usage=True, torch_dtype = torch.bfloat16).to( "cuda" ) image_processor = PixtralImageProcessor() processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") - prompt = "[INST][IMG]\nWhat's the content of the image?[/INST]" - url = "https://www.ilankelman.org/stopsigns/australia.jpg" - IMG_URLS = [ Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), @@ -241,53 +238,34 @@ def convert_mistral_model(): Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), ] - PROMPT = "[INST]Describe the images[IMG] and [IMG] and [IMG] and [IMG][/INST]" - - - """ -Describe the content of each of the 4 following images: [IMG_START][IMG_START][IMG_START][IMG_START]Sure, here are the descriptions of the four images: - -1. **A black dog with blue eyes** is drinking from a stream in a lush landscape with mountains in the background**. + PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" -2. **The dog appears to be in mid-stride, with its tongue outstretched towards the stream**. - -3. **The landscape is detailed with green grass and trees, and the stream meanders the mountains**. - -4. 
**In the distance, the dog appears to be running, with a sense of - - """ # image = Image.open(requests.get(url, stream=True).raw) inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda") - # inputs["input_ids"] = torch.tensor([tokenized.tokens], dtype=torch.long, device="cuda") - # inputs["pixel_values"] = torch.tensor(tokenized.images, device="cuda") - del inputs["attention_mask"] - generate_ids = model.generate(**inputs, max_new_tokens=100) + generate_ids = model.generate(**inputs, max_new_tokens=500) print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) - messages = [ - {"role": "user", "content": [{"type": "image_url", "image_url": {"url": url}}, {"type": "image_url", "image_url": {"url": url}}, {"type": "image_url", "image_url": {"url": url}}, {"type": "image_url", "image_url": {"url": url}}, {"type": "text", "text": "Describe the content of each image"}]}, - ] - model_name = "mistralai/Pixtral-12B-2409" - tok = MistralTokenizer.from_model(model_name) + # model_name = "mistralai/Pixtral-12B-2409" + # tok = MistralTokenizer.from_model(model_name) - from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk + # from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk - EXPECTED_TOKENS = tok.encode_chat_completion( - ChatCompletionRequest( - messages=[ - UserMessage( - content=[ - TextChunk(text="Describe the images"), - ] + [ImageChunk(image=img) for img in IMG_URLS] - ) - ], - model="pixtral", - ) - ) - assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS + # EXPECTED_TOKENS = tok.encode_chat_completion( + # ChatCompletionRequest( + # messages=[ + # UserMessage( + # content=[ + # TextChunk(text="Describe the images"), + # ] + [ImageChunk(image=img) for img in IMG_URLS] + # ) + # ], + # model="pixtral", + # ) + # ) + # assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS convert_mistral_model() @@ -295,3 +273,26 @@ def convert_mistral_model(): """ What's the content of the image?The image depicts a vibrant street scene in what appears to be a Chinatown district, characterized by its traditional architectural elements and cultural signage. A prominent feature is the red and white stop sign in the foreground, which has been adorned with a banner that reads "OPTUS." Behind the stop sign, there's an ornate gate with intricate designs and Chinese characters, marking the entrance to the district. The gate is flanked by buildings with colorful facades and signs in both English and Chinese """ + +""" +Describe the images. +Sure, let's break down each image description: + +1. **Image 1:** + - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera. + - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur. + +2. **Image 2:** + - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley. + - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image. + +3. 
**Image 3:** + - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset. + - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene. + +4. **Image 4:** + - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers. + - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. + +Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. +""" \ No newline at end of file diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 9b2fcef4a4e2d7..79a46ec8abb4ae 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -85,7 +85,7 @@ def position_ids_in_meshgrid(patch_embeds_list, max_width): height, width = patch.shape[-2:] mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij") h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1) - ids = h_grid + v_grid * max_width + ids = h_grid * max_width + v_grid positions.append(ids[:, 0]) return torch.cat(positions) @@ -612,8 +612,27 @@ def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwarg ).to(self.device) position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) - attention_mask = None + # LAST TODO: + attention_mask = generate_block_attention_mask([p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds) out = self.transformer(patch_embeds, attention_mask, position_embedding) # remove batch dimension of the single sequence return out + +def generate_block_attention_mask(patch_embeds_list, tensor): + dtype = tensor.dtype + device = tensor.device + # Get the number of patches (sequence length) from the first element in patch_embeds_list + seq_len = tensor.shape[1] + d_min = torch.finfo(dtype).min + causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device) + # Create an empty attention mask (1: attend, 0: no attend) + + # Fill the mask with 1s within blocks + block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1) + block_start_idx = torch.tensor([0]+patch_embeds_list[:-1]).cumsum(-1) + for start, end in zip(block_start_idx, block_end_idx): + causal_mask[start:end, start:end] = 0 + + causal_mask = causal_mask[None, None, :, :].expand( tensor.shape[0], 1, -1, -1) + return causal_mask \ No newline at end of file From 6c58167bf60f591c633820ca0d1a76f5af119f76 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 17:26:01 +0200 Subject: [PATCH 23/58] add a small integration test --- .../models/pixtral/modeling_pixtral.py | 3 -- tests/models/llava/test_modeling_llava.py | 47 +++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 79a46ec8abb4ae..86915123b24dee 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ 
b/src/transformers/models/pixtral/modeling_pixtral.py @@ -612,11 +612,8 @@ def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwarg ).to(self.device) position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) - # LAST TODO: attention_mask = generate_block_attention_mask([p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds) out = self.transformer(patch_embeds, attention_mask, position_embedding) - - # remove batch dimension of the single sequence return out def generate_block_attention_mask(patch_embeds_list, tensor): diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 2fed802b5a2fb3..ba2baad623b05e 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -569,3 +569,50 @@ def test_expansion_in_processing(self): # check that both inputs are handled correctly and generate the same output self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist()) + + @slow + @require_bitsandbytes + def test_pixtral(self): + model_id = "hf-internal-testing/pixtral-12b" + model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), + ] + PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" + + # image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda") + generate_ids = model.generate(**inputs, max_new_tokens=500) + ouptut = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + # fmt: off + EXPECTED_GENERATION = """ +Describe the images. +Sure, let's break down each image description: + +1. **Image 1:** + - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera. + - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur. + +2. **Image 2:** + - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley. + - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image. + +3. **Image 3:** + - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset. + - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene. + +4. **Image 4:** + - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers. 
+ - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. + +Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. +""" + # fmt: on + # check that both inputs are handled correctly and generate the same output + self.assertListEqual(ouptut, EXPECTED_GENERATION) From c4c32fb1192c4e4eaaeb77fdb667e550346e1067 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 16:27:44 +0100 Subject: [PATCH 24/58] Uodate configuration docstring --- .../models/pixtral/configuration_pixtral.py | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 667466872ff13e..732b62625e8581 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -32,23 +32,30 @@ class PixtralConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): - The config object or dictionary of the vision backbone. - text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): - The config object or dictionary of the text backbone. - ignore_index (`int`, *optional*, defaults to -100): - The ignore index for the loss function. - image_token_index (`int`, *optional*, defaults to 32000): - The image token index to encode the image prompt. - projector_hidden_act (`str`, *optional*, defaults to `"gelu"`): - The activation function used by the multimodal projector. - vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): - The feature selection strategy used to select the vision feature from the vision backbone. - Can be one of `"default"` or `"full"`. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. - image_seq_length (`int`, *optional*, defaults to 576): - Sequence length of one image embedding. + hidden_size (`int`, `optional`, defaults to 1024): + Dimension of the hidden representations. + intermediate_size (`int`, `optional`, defaults to 4096): + Dimension of the MLP representations. + num_hidden_layers (`int`, `optional`, defaults to 24): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, `optional`, defaults to 16): + Number of attention heads in the Transformer decoder. + num_channels (`int`, `optional`, defaults to 3): + Number of input channels in the input images. + image_size (`int`, `optional`, defaults to 1024): + Max dimension of the input images. + patch_size (`int`, `optional`, defaults to 16): + Size of the image patches. + hidden_activation (`str`, `optional`, defaults to "gelu"): + Activation function used in the hidden layers. + layer_norm_eps (`float`, `optional`, defaults to 1e-5): + Epsilon value for layer normalization. + attention_dropout (`float`, `optional`, defaults to 0.0): + Dropout probability for the attention layers. + rope_theta (`float`, `optional`, defaults to 10000.0): + The base period of the RoPE embeddings. 
+ tie_word_embeddings (`bool`, `optional`, defaults to False): + Whether to tie the word embeddings with the input embeddings. Example: From 9c621af620d34aaee02b08fa6898abf9a8343bcc Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 16:28:55 +0100 Subject: [PATCH 25/58] Formatting --- .../pixtral/convert_pixtral_weights_to_hf.py | 15 +++++++-------- .../models/pixtral/image_processing_pixtral.py | 4 +--- .../models/pixtral/modeling_pixtral.py | 18 +++++++++++------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 84a36e29c8310c..c7a5fc2bd50de5 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -1,5 +1,6 @@ import regex as re import requests +import torch from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from PIL import Image from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors @@ -15,7 +16,7 @@ PreTrainedTokenizerFast, ) from transformers.convert_slow_tokenizer import bytes_to_unicode -import torch + OLD_KEY_TO_NEW_KEY_MAPPING = { # Layer Normalization Weights @@ -226,9 +227,9 @@ def convert_mistral_model(): config.vision_feature_select_strategy = "full" config.image_seq_length = 1 tokenizer = convert_mistral_tokenizer() - model = LlavaForConditionalGeneration.from_pretrained("../pixtral", config=config, low_cpu_mem_usage=True, torch_dtype = torch.bfloat16).to( - "cuda" - ) + model = LlavaForConditionalGeneration.from_pretrained( + "../pixtral", config=config, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 + ).to("cuda") image_processor = PixtralImageProcessor() processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") @@ -245,14 +246,11 @@ def convert_mistral_model(): generate_ids = model.generate(**inputs, max_new_tokens=500) print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) - # model_name = "mistralai/Pixtral-12B-2409" # tok = MistralTokenizer.from_model(model_name) - # from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk - # EXPECTED_TOKENS = tok.encode_chat_completion( # ChatCompletionRequest( # messages=[ @@ -267,6 +265,7 @@ def convert_mistral_model(): # ) # assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS + convert_mistral_model() @@ -295,4 +294,4 @@ def convert_mistral_model(): - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. 
-""" \ No newline at end of file +""" diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index c3ac72443a9549..37850cb0017223 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -20,7 +20,6 @@ from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import ( - get_resize_output_image_size, resize, to_channel_dimension_format, ) @@ -31,11 +30,10 @@ get_image_size, infer_channel_dimension_format, is_scaled_image, - make_list_of_images, + is_valid_image, to_numpy_array, valid_images, validate_kwargs, - is_valid_image, validate_preprocess_arguments, ) from ...utils import TensorType, is_torch_device, is_torch_dtype, is_vision_available, logging diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 86915123b24dee..a95ae346324af4 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -612,24 +612,28 @@ def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwarg ).to(self.device) position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) - attention_mask = generate_block_attention_mask([p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds) + # LAST TODO: + attention_mask = generate_block_attention_mask( + [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds + ) out = self.transformer(patch_embeds, attention_mask, position_embedding) return out + def generate_block_attention_mask(patch_embeds_list, tensor): dtype = tensor.dtype device = tensor.device # Get the number of patches (sequence length) from the first element in patch_embeds_list seq_len = tensor.shape[1] d_min = torch.finfo(dtype).min - causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device) + causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device) # Create an empty attention mask (1: attend, 0: no attend) - + # Fill the mask with 1s within blocks block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1) - block_start_idx = torch.tensor([0]+patch_embeds_list[:-1]).cumsum(-1) - for start, end in zip(block_start_idx, block_end_idx): + block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1) + for start, end in zip(block_start_idx, block_end_idx): causal_mask[start:end, start:end] = 0 - causal_mask = causal_mask[None, None, :, :].expand( tensor.shape[0], 1, -1, -1) - return causal_mask \ No newline at end of file + causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1) + return causal_mask From 172b5bc403dc8d464e5207f6df3b402771e6b212 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 16:31:56 +0100 Subject: [PATCH 26/58] Config docstring fix --- .../models/pixtral/configuration_pixtral.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 732b62625e8581..da7479b342b754 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -62,16 +62,10 @@ class PixtralConfig(PretrainedConfig): ```python >>> from 
transformers import PixtralModel, PixtralConfig, CLIPVisionConfig, LlamaConfig - >>> # Initializing a CLIP-vision config - >>> vision_config = CLIPVisionConfig() + >>> # Initializing a Pixtral 12B style configuration + >>> config = PixtralConfig() - >>> # Initializing a Llama config - >>> text_config = LlamaConfig() - - >>> # Initializing a Pixtral pixtral-1.5-7b style configuration - >>> configuration = PixtralConfig(vision_config, text_config) - - >>> # Initializing a model from the pixtral-1.5-7b style configuration + >>> # Initializing a model from the pixtral 12B style configuration >>> model = PixtralModel(configuration) >>> # Accessing the model configuration From d34bac092f2e8a019fb2b235ea934e82e3f70ed2 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 17:35:30 +0200 Subject: [PATCH 27/58] simplify model test --- tests/models/pixtral/test_modeling_pixtral.py | 473 +++--------------- 1 file changed, 80 insertions(+), 393 deletions(-) diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index 0c8a82e0b0b787..f888536a1001e7 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -48,128 +48,98 @@ if is_vision_available(): from PIL import Image - -class PixtralVisionText2TextModelTester: +class PixtralModelTester: def __init__( self, parent, - ignore_index=-100, - image_token_index=0, - projector_hidden_act="gelu", - seq_length=7, - vision_feature_select_strategy="default", - vision_feature_layer=-1, - text_config={ - "model_type": "llama", - "seq_length": 7, - "is_training": True, - "use_input_mask": True, - "use_token_type_ids": False, - "use_labels": True, - "vocab_size": 99, - "hidden_size": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.1, - "attention_probs_dropout_prob": 0.1, - "max_position_embeddings": 512, - "type_vocab_size": 16, - "type_sequence_label_size": 2, - "initializer_range": 0.02, - "num_labels": 3, - "num_choices": 4, - "pad_token_id": 0, - }, + batch_size=12, + image_size=30, + patch_size=2, + num_channels=3, is_training=True, - vision_config={ - "image_size": 30, - "patch_size": 2, - "num_channels": 3, - "is_training": True, - "hidden_size": 32, - "projection_dim": 32, - "num_hidden_layers": 2, - "num_attention_heads": 4, - "intermediate_size": 37, - "dropout": 0.1, - "attention_dropout": 0.1, - "initializer_range": 0.02, - }, + hidden_size=32, + projection_dim=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + scope=None, ): self.parent = parent - self.ignore_index = ignore_index - self.image_token_index = image_token_index - self.projector_hidden_act = projector_hidden_act - self.vision_feature_select_strategy = vision_feature_select_strategy - self.vision_feature_layer = vision_feature_layer - self.text_config = text_config - self.vision_config = vision_config - self.seq_length = seq_length - - self.num_hidden_layers = text_config["num_hidden_layers"] - self.vocab_size = text_config["vocab_size"] - self.hidden_size = text_config["hidden_size"] - self.num_attention_heads = text_config["num_attention_heads"] + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels self.is_training = is_training + self.hidden_size = hidden_size + self.projection_dim = projection_dim + 
self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.dropout = dropout + self.attention_dropout = attention_dropout + self.initializer_range = initializer_range + self.scope = scope + + # in ViT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token) + num_patches = (image_size // patch_size) ** 2 + self.seq_length = num_patches + 1 + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + config = self.get_config() - self.batch_size = 3 - self.num_channels = 3 - self.image_size = 336 - self.encoder_seq_length = 231 + return config, pixel_values def get_config(self): return PixtralConfig( - text_config=self.text_config, - vision_config=self.vision_config, - ignore_index=self.ignore_index, - image_token_index=self.image_token_index, - projector_hidden_act=self.projector_hidden_act, - vision_feature_select_strategy=self.vision_feature_select_strategy, - vision_feature_layer=self.vision_feature_layer, + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + projection_dim=self.projection_dim, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + initializer_range=self.initializer_range, ) - def prepare_config_and_inputs(self): - pixel_values = floats_tensor( - [ - self.batch_size, - self.vision_config["num_channels"], - self.vision_config["image_size"], - self.vision_config["image_size"], - ] - ) - config = self.get_config() - - return config, pixel_values + def create_and_check_model(self, config, pixel_values): + model = PixtralModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_model_with_projection(self, config, pixel_values): + model = PixtralModel(config=config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + result = model(pixel_values) + # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token) + image_size = (self.image_size, self.image_size) + patch_size = (self.patch_size, self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size)) + self.parent.assertEqual(result.image_embeds.shape, (self.batch_size, self.projection_dim)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs - input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 - attention_mask = input_ids.ne(1).to(torch_device) - # we are giving 3 images let's make sure we pass in 3 image tokens - input_ids[:, 1] = config.image_token_index - 
inputs_dict = { - "pixel_values": pixel_values, - "input_ids": input_ids, - "attention_mask": attention_mask, - } + inputs_dict = {"pixel_values": pixel_values} return config, inputs_dict - def create_and_check_pixtral_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask): - model = PixtralModel(config=config) - model.to(torch_device) - model.eval() - with torch.autocast(device_type="cuda", dtype=torch.float16): - logits = model( - input_ids=input_ids, - attention_mask=attention_mask, - pixel_values=pixel_values.to(torch.bfloat16), - return_dict=True, - )["logits"] - self.parent.assertFalse(torch.isnan(logits).any().item()) - @require_torch class PixtralModelModelTest(ModelTesterMixin, unittest.TestCase): @@ -182,7 +152,7 @@ class PixtralModelModelTest(ModelTesterMixin, unittest.TestCase): test_head_masking = False def setUp(self): - self.model_tester = PixtralVisionText2TextModelTester(self) + self.model_tester = PixtralModelTester(self) self.config_tester = ConfigTester(self, config_class=PixtralConfig, has_text_modality=False) # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs @@ -246,11 +216,11 @@ def test_training_gradient_checkpointing_use_reentrant(self): def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Compile not yet supported because in LLava models") + @unittest.skip(reason="Compile not yet supported because in Pixtral models") def test_sdpa_can_compile_dynamic(self): pass - @unittest.skip(reason="Compile not yet supported because in LLava models") + @unittest.skip(reason="Compile not yet supported because in Pixtral models") def test_sdpa_can_dispatch_on_flash(self): pass @@ -258,7 +228,7 @@ def test_sdpa_can_dispatch_on_flash(self): @require_torch class PixtralModelIntegrationTest(unittest.TestCase): def setUp(self): - self.processor = AutoProcessor.from_pretrained("pixtral-hf/bakPixtral-v1-hf") + self.processor = AutoProcessor.from_pretrained("hf-internal-testing/pixtral-12b") def tearDown(self): gc.collect() @@ -268,9 +238,9 @@ def tearDown(self): @require_bitsandbytes def test_small_model_integration_test(self): # Let' s make sure we test the preprocessing to replace what is used - model = PixtralModel.from_pretrained("pixtral-hf/bakPixtral-v1-hf", load_in_4bit=True) + model = PixtralModel.from_pretrained("hf-internal-testing/pixtral-12b", load_in_4bit=True) - prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" + prompt = "[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]" image_file = "https://pixtral-vl.github.io/static/images/view.jpg" raw_image = Image.open(requests.get(image_file, stream=True).raw) inputs = self.processor(prompt, raw_image, return_tensors="pt") @@ -284,287 +254,4 @@ def test_small_model_integration_test(self): self.assertEqual( self.processor.decode(output[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT, - ) - - @slow - @require_bitsandbytes - def test_small_model_integration_test_llama_single(self): - # Let' s make sure we test the preprocessing to replace what is used - model_id = "mistral-community/pixtral" - - model = PixtralModel.from_pretrained("mistral-community/pixtral", load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? 
ASSISTANT:" - image_file = "https://pixtral-vl.github.io/static/images/view.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) - - output = model.generate(**inputs, max_new_tokens=900, do_sample=False) - EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip - - self.assertEqual( - processor.decode(output[0], skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - @require_bitsandbytes - def test_small_model_integration_test_llama_batched(self): - # Let' s make sure we test the preprocessing to replace what is used - model_id = "mistral-community/pixtral" - - model = PixtralModel.from_pretrained("mistral-community/pixtral", load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:", - "USER: \nWhat is this? ASSISTANT:", - ] - image1 = Image.open(requests.get("https://pixtral-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = processor(prompts, images=[image1, image2], return_tensors="pt", padding=True) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip - - self.assertEqual( - processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - @require_bitsandbytes - def test_small_model_integration_test_batch(self): - # Let' s make sure we test the preprocessing to replace what is used - model = PixtralModel.from_pretrained("pixtral-hf/bakPixtral-v1-hf", load_in_4bit=True) - # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT:", - "USER: \nWhat is this?\nASSISTANT:", - ] - image1 = Image.open(requests.get("https://pixtral-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = self.processor(prompts, images=[image1, image2], return_tensors="pt", padding=True) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = [ - 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', - 'USER: \nWhat is this?\nASSISTANT: Cats' - ] # fmt: skip - self.assertEqual( - self.processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - @require_bitsandbytes - def test_small_model_integration_test_llama_batched_regression(self): - # Let' s make sure we test the preprocessing to replace what is used - model_id = "mistral-community/pixtral" - - # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) - model = PixtralModel.from_pretrained( - "mistral-community/pixtral", load_in_4bit=True, attn_implementation="eager" - ) - processor = AutoProcessor.from_pretrained(model_id, pad_token="") - - prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", - "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", - ] - image1 = Image.open(requests.get("https://pixtral-vl.github.io/static/images/view.jpg", stream=True).raw) - image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) - - inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True) - - output = model.generate(**inputs, max_new_tokens=20) - - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip - - self.assertEqual( - processor.batch_decode(output, skip_special_tokens=True), - EXPECTED_DECODED_TEXT, - ) - - @slow - @require_torch - @require_vision - def test_batched_generation(self): - model = PixtralModel.from_pretrained("mistral-community/pixtral", load_in_4bit=True) - - processor = AutoProcessor.from_pretrained("mistral-community/pixtral") - - prompt1 = "\n\nUSER: What's the the difference of two images?\nASSISTANT:" - prompt2 = "\nUSER: Describe the image.\nASSISTANT:" - prompt3 = "\nUSER: Describe the image.\nASSISTANT:" - url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" - url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" - image1 = Image.open(requests.get(url1, stream=True).raw) - image2 = Image.open(requests.get(url2, stream=True).raw) - - inputs = processor( - text=[prompt1, prompt2, prompt3], - images=[image1, image2, image1, image2], - return_tensors="pt", - padding=True, - ).to(torch_device) - - model = model.eval() - - EXPECTED_OUTPUT = [ - "\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", - "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", - "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. 
The llama is the", - ] - - generate_ids = model.generate(**inputs, max_new_tokens=20) - outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) - self.assertEqual(outputs, EXPECTED_OUTPUT) - - @slow - @require_bitsandbytes - def test_pixtral_index_error_bug(self): - # This is a reproducer of https://github.com/huggingface/transformers/pull/28032 and makes sure it does not happen anymore - # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043 for - # more details - model_id = "mistral-community/pixtral" - model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) - - processor = AutoProcessor.from_pretrained(model_id) - - # Simulate a super long prompt - user_prompt = "Describe the image:?\n" * 200 - prompt = f"USER: \n{user_prompt}ASSISTANT:" - image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" - - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) - - # Make sure that `generate` works - _ = model.generate(**inputs, max_new_tokens=20) - - @slow - @require_torch_gpu - def test_pixtral_merge_inputs_error_bug(self): - # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore - model_id = "mistral-community/pixtral" - model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) - - # Simulate some user inputs - pixel_values = torch.randn( - (1, 3, 336, 336), - dtype=torch.float, - device=torch_device, - ) - input_ids = torch.tensor( - [ - [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900], - ], - dtype=torch.long, - device=torch_device, - ) - attention_mask = torch.tensor( - [[0, 0, 1, 1, 1, 1, 1, 1, 1]], - dtype=torch.long, - device=torch_device, - ) - - # Make sure that the loss is properly computed - loss = model( - pixel_values=pixel_values, - input_ids=input_ids, - attention_mask=attention_mask, - labels=input_ids, - ).loss - loss.backward() - - def test_tokenizer_integration(self): - slow_tokenizer = AutoTokenizer.from_pretrained("liuhaotian/pixtral-v1.6-34b", use_fast=False) - slow_tokenizer.add_tokens("", True) - - fast_tokenizer = AutoTokenizer.from_pretrained( - "liuhaotian/pixtral-v1.6-34b", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - from_slow=True, - legacy=False, - ) - fast_tokenizer.add_tokens("", True) - - prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip - self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) - self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) - - @slow - @require_bitsandbytes - def test_generation_no_images(self): - model_id = "mistral-community/pixtral" - model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - # Prepare inputs with no images - inputs = processor("Hello, I am", return_tensors="pt").to(torch_device) - - # Make sure that `generate` works - _ = model.generate(**inputs, max_new_tokens=20) - - @slow - @require_bitsandbytes - def 
test_generation_siglip_backbone(self): - model_id = "pixtral-hf/pixtral-interleave-qwen-0.5b-hf" - model = PixtralModel.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device) - processor = AutoProcessor.from_pretrained(model_id) - - # check processing with expansion of inputs (w/o expansion should work with any backbone) - processor.vision_feature_select_strategy = "default" - processor.patch_size = 14 - - image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - inputs = processor( - text="<|im_start|>user\n\nWhat are these?<|im_end|>\n<|im_start|>assistant", - images=raw_image, - return_tensors="pt", - ).to(torch_device, torch.float16) - - # Make sure that `generate` works - output = model.generate(**inputs, max_new_tokens=30) - - EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat" - self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT) - - @slow - @require_bitsandbytes - def test_expansion_in_processing(self): - model_id = "mistral-community/pixtral" - model = PixtralModel.from_pretrained(model_id, load_in_4bit=True) - processor = AutoProcessor.from_pretrained(model_id) - - prompt = "USER: \nDescribe the image:\nASSISTANT:" - image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" - raw_image = Image.open(requests.get(image_file, stream=True).raw) - - # check processing with expansion of inputs - processor.vision_feature_select_strategy = "default" - processor.patch_size = 14 - inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) - self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593) - - # check processing without expansion of inputs (legacy behavior) - processor.vision_feature_select_strategy = None - processor.patch_size = None - inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) - self.assertTrue(inputs.input_ids.shape[-1] == 18) - - # generate exactly 20 tokens - output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20) - output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20) - - # check that both inputs are handled correctly and generate the same output - self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist()) + ) \ No newline at end of file From e0907561d09c27729d28c1d6ed12d00013353a2e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 17:42:41 +0200 Subject: [PATCH 28/58] fixup modeling and etests --- .../models/pixtral/modeling_pixtral.py | 101 ++++-------------- tests/models/llava/test_modeling_llava.py | 2 +- tests/models/pixtral/test_modeling_pixtral.py | 8 +- 3 files changed, 27 insertions(+), 84 deletions(-) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index a95ae346324af4..39d2ec8781de7c 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch Pixtral model.""" -from dataclasses import dataclass from typing import List, Optional, Tuple, Union import torch @@ -23,7 +22,7 @@ from ... 
import PreTrainedModel from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput, ModelOutput +from ...modeling_outputs import BaseModelOutput from ...utils import ( add_start_docstrings, logging, @@ -33,51 +32,6 @@ logger = logging.get_logger(__name__) -_CONFIG_FOR_DOC = "PixtralConfig" - - -@dataclass -# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Pixtral -class PixtralCausalLMOutputWithPast(ModelOutput): - """ - Base class for Pixtral causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): - Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, - sequence_length, hidden_size)`. 
- - image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver - """ - - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - past_key_values: Optional[List[torch.FloatTensor]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - def position_ids_in_meshgrid(patch_embeds_list, max_width): positions = [] @@ -260,7 +214,7 @@ def forward( return attn_output, attn_weights -# Copied from gemma2 +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixstral class PixtralMLP(nn.Module): def __init__(self, config): super().__init__() @@ -276,6 +230,7 @@ def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Pixtral class PixtralRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ @@ -456,22 +411,15 @@ class PixtralPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["PixtralVisionAttention"] _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True _supports_cache_class = True def _init_weights(self, module): - # important: this ported version of Pixtral isn't meant for training from scratch - only - # inference and fine-tuning - so the proper init weights code has been removed - the original codebase - # https://github.com/haotian-liu/LLaVA/tree/main/pixtral should serve for that purpose std = ( self.config.initializer_range if hasattr(self.config, "initializer_range") else self.config.text_config.initializer_range ) - if hasattr(module, "class_embedding"): - module.class_embedding.data.normal_(mean=0.0, std=std) - if isinstance(module, (nn.Linear, nn.Conv2d)): module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: @@ -557,6 +505,22 @@ def _init_weights(self, module): """ +def generate_block_attention_mask(patch_embeds_list, tensor): + dtype = tensor.dtype + device = tensor.device + seq_len = tensor.shape[1] + d_min = torch.finfo(dtype).min + causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device) + + block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1) + block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1) + for start, end in zip(block_start_idx, block_end_idx): + causal_mask[start:end, start:end] = 0 + + causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1) + return causal_mask + + @add_start_docstrings( """The PIXTRAL model which consists of a vision backbone and a language model.""", PIXTRAL_START_DOCSTRING, @@ -590,7 +554,9 @@ def device(self) -> torch.device: def dtype(self) -> torch.device: return next(self.parameters()).dtype - def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwargs) -> torch.Tensor: + def forward( + self, images: List[torch.Tensor], output_hidden_states=False, *kwargs + ) -> Union[Tuple, BaseModelOutput]: """ Args: images: list of N_img images of variable sizes, @@ -612,28 +578,7 @@ def forward(self, images: List[torch.Tensor], output_hidden_states=False, *kwarg ).to(self.device) position_embedding = self.patch_positional_embedding(patch_embeds, position_ids) - # LAST TODO: attention_mask = generate_block_attention_mask( [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds ) - out = 
self.transformer(patch_embeds, attention_mask, position_embedding) - return out - - -def generate_block_attention_mask(patch_embeds_list, tensor): - dtype = tensor.dtype - device = tensor.device - # Get the number of patches (sequence length) from the first element in patch_embeds_list - seq_len = tensor.shape[1] - d_min = torch.finfo(dtype).min - causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device) - # Create an empty attention mask (1: attend, 0: no attend) - - # Fill the mask with 1s within blocks - block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1) - block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1) - for start, end in zip(block_start_idx, block_end_idx): - causal_mask[start:end, start:end] = 0 - - causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1) - return causal_mask + return self.transformer(patch_embeds, attention_mask, position_embedding) diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index ba2baad623b05e..9dd64bcbd6cc0d 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -615,4 +615,4 @@ def test_pixtral(self): """ # fmt: on # check that both inputs are handled correctly and generate the same output - self.assertListEqual(ouptut, EXPECTED_GENERATION) + self.assertListEqual(ouptut, EXPECTED_GENERATION) diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index f888536a1001e7..afc45ea9d89543 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -21,7 +21,6 @@ from transformers import ( AutoProcessor, - AutoTokenizer, PixtralConfig, PixtralModel, is_torch_available, @@ -30,14 +29,12 @@ from transformers.testing_utils import ( require_bitsandbytes, require_torch, - require_torch_gpu, - require_vision, slow, torch_device, ) from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor +from ...test_modeling_common import ModelTesterMixin, floats_tensor if is_torch_available(): @@ -48,6 +45,7 @@ if is_vision_available(): from PIL import Image + class PixtralModelTester: def __init__( self, @@ -254,4 +252,4 @@ def test_small_model_integration_test(self): self.assertEqual( self.processor.decode(output[0], skip_special_tokens=True), EXPECTED_DECODED_TEXT, - ) \ No newline at end of file + ) From 3725e233d2e8b8ce27bdd1ce0bf418dd5930ffdc Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 16:45:13 +0100 Subject: [PATCH 29/58] Return BatchMixFeature in image processor --- .../pixtral/image_processing_pixtral.py | 98 +++++++++---------- .../models/pixtral/processing_pixtral.py | 1 + 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 37850cb0017223..2a8abef7cbb719 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -47,6 +47,54 @@ import PIL +class BatchMixFeature(BatchFeature): + def to(self, *args, **kwargs) -> "BatchMixFeature": + """ + Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in + different `dtypes` and sending the `BatchFeature` to a different `device`. 
+ + Args: + args (`Tuple`): + Will be passed to the `to(...)` function of the tensors. + kwargs (`Dict`, *optional*): + Will be passed to the `to(...)` function of the tensors. + + Returns: + [`BatchFeature`]: The same instance after modification. + """ + requires_backends(self, ["torch"]) + import torch # noqa + + new_data = {} + device = kwargs.get("device") + # Check if the args are a device or a dtype + if device is None and len(args) > 0: + # device should be always the first argument + arg = args[0] + if is_torch_dtype(arg): + # The first argument is a dtype + pass + elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int): + device = arg + else: + # it's something else + raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. This is not supported.") + # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` + for k, v in self.items(): + # check if v is a floating point + if isinstance(k, list): + new_data[k] = [element.to(*args, **kwargs) for sample in v for element in sample] + if torch.is_floating_point(v): + # cast and send to device + new_data[k] = v.to(*args, **kwargs) + elif device is not None: + new_data[k] = v.to(device=device) + else: + new_data[k] = v + self.data = new_data + return self + + # Copied from transformers.models.idefics2.image_processing_idefics2.make_list_of_images def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: """ @@ -166,54 +214,6 @@ def get_resize_output_image_size( return num_height_tokens * patch_height, num_width_tokens * patch_width -class BatchMixFeature(BatchFeature): - def to(self, *args, **kwargs) -> "BatchFeature": - """ - Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in - different `dtypes` and sending the `BatchFeature` to a different `device`. - - Args: - args (`Tuple`): - Will be passed to the `to(...)` function of the tensors. - kwargs (`Dict`, *optional*): - Will be passed to the `to(...)` function of the tensors. - - Returns: - [`BatchFeature`]: The same instance after modification. - """ - requires_backends(self, ["torch"]) - import torch # noqa - - new_data = {} - device = kwargs.get("device") - # Check if the args are a device or a dtype - if device is None and len(args) > 0: - # device should be always the first argument - arg = args[0] - if is_torch_dtype(arg): - # The first argument is a dtype - pass - elif isinstance(arg, str) or is_torch_device(arg) or isinstance(arg, int): - device = arg - else: - # it's something else - raise ValueError(f"Attempting to cast a BatchFeature to type {str(arg)}. 
This is not supported.") - # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` - for k, v in self.items(): - # check if v is a floating point - if isinstance(k, list): - new_data[k] = [element.to(*args, **kwargs) for sample in v for element in sample] - if torch.is_floating_point(v): - # cast and send to device - new_data[k] = v.to(*args, **kwargs) - elif device is not None: - new_data[k] = v.to(device=device) - else: - new_data[k] = v - self.data = new_data - return self - - # Hack to get tensor conversion used in BatchFeature without batching the images def _get_is_as_tensor_fns(tensor_type: Union[str, TensorType]) -> Tuple[Callable, Callable]: return BatchFeature()._get_is_as_tensor_fns(tensor_type) @@ -510,4 +510,4 @@ def preprocess( # Convert to tensor type outside of BatchFeature to avoid batching the images of different sizes images_list = [[convert_to_tensor(image, return_tensors) for image in images] for images in images_list] - return BatchFeature(data={"pixel_values": images_list, "image_sizes": batch_image_sizes}, tensor_type=None) + return BatchMixFeature(data={"pixel_values": images_list, "image_sizes": batch_image_sizes}, tensor_type=None) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 4d1a2af9390a5c..e77c643ee9fc86 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -28,6 +28,7 @@ logger = logging.get_logger(__name__) +# Copied from transformers.models.pixtral.image_processing_pixtral.BatchMixFeature class BatchMixFeature(BatchFeature): def to(self, *args, **kwargs) -> "BatchFeature": """ From 26adfeccd8b0898fd8b4c27c38c324d8211bfdcc Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 17:47:18 +0200 Subject: [PATCH 30/58] fix some copies --- docs/source/en/index.md | 1 + .../models/pixtral/modeling_pixtral.py | 20 +++++++++---------- src/transformers/utils/dummy_pt_objects.py | 14 +++++++++++++ .../utils/dummy_vision_objects.py | 7 +++++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 8e3a4da8b021de..97148840a2d2ea 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -253,6 +253,7 @@ Flax), PyTorch, and/or TensorFlow. | [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ | | [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ | | [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ | +| [Pixtral](model_doc/pixtral) | ✅ | ❌ | ❌ | | [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ | | [PoolFormer](model_doc/poolformer) | ✅ | ❌ | ❌ | | [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 39d2ec8781de7c..d814457fc523e2 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -128,7 +128,7 @@ def rotate_half(x): # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=0): +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. 
Args: @@ -158,7 +158,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=0): class PixtralAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__ def __init__(self, config): super().__init__() self.config = config @@ -214,27 +213,26 @@ def forward( return attn_output, attn_weights -# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixstral -class PixtralMLP(nn.Module): +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixtral +class PixstralMLP(nn.Module): def __init__(self, config): super().__init__() - self.config = config self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_activation] + self.act_fn = ACT2FN[config.hidden_act] - def forward(self, x): - return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + def forward(self, hidden_state): + return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state)) # Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Pixtral class PixtralRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): """ - LlamaRMSNorm is equivalent to T5LayerNorm + PixtralRMSNorm is equivalent to T5LayerNorm """ super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) @@ -404,7 +402,6 @@ def forward( "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", PIXTRAL_START_DOCSTRING, ) -# Copied from transformers.models.llava.modeling_llava.LlavaPreTrainedModel with Llava->Pixtral,llava->pixtral class PixtralPreTrainedModel(PreTrainedModel): config_class = PixtralConfig base_model_prefix = "model" @@ -414,6 +411,9 @@ class PixtralPreTrainedModel(PreTrainedModel): _supports_cache_class = True def _init_weights(self, module): + # important: this ported version of Pixtral isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/LLaVA/tree/main/pixtral should serve for that purpose std = ( self.config.initializer_range if hasattr(self.config, "initializer_range") diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index b9ce0d0f15bbf5..2db7b38b580375 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -7067,6 +7067,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class PixtralModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PixtralPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class PLBartForCausalLM(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 2493954a518b2c..436378582e54ca 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ 
b/src/transformers/utils/dummy_vision_objects.py @@ -506,6 +506,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class PixtralImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class PoolFormerFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From 4ee5cfb41224a2d17abfdbd073079899f3a654c9 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 17:48:33 +0200 Subject: [PATCH 31/58] update --- src/transformers/models/pixtral/image_processing_pixtral.py | 6 ++++-- src/transformers/models/pixtral/processing_pixtral.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 2a8abef7cbb719..9f3218b744b0a8 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -36,7 +36,7 @@ validate_kwargs, validate_preprocess_arguments, ) -from ...utils import TensorType, is_torch_device, is_torch_dtype, is_vision_available, logging +from ...utils import TensorType, is_torch_device, is_torch_dtype, is_vision_available, logging, is_torch_tensor from ...utils.import_utils import requires_backends @@ -83,7 +83,9 @@ def to(self, *args, **kwargs) -> "BatchMixFeature": for k, v in self.items(): # check if v is a floating point if isinstance(k, list): - new_data[k] = [element.to(*args, **kwargs) for sample in v for element in sample] + new_data[k] = [ + element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) + ] if torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index e77c643ee9fc86..ceca7586e46b4b 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -30,7 +30,7 @@ # Copied from transformers.models.pixtral.image_processing_pixtral.BatchMixFeature class BatchMixFeature(BatchFeature): - def to(self, *args, **kwargs) -> "BatchFeature": + def to(self, *args, **kwargs) -> "BatchMixFeature": """ Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in different `dtypes` and sending the `BatchFeature` to a different `device`. 
From 66de9670c1f10668524be1b241efebf1c44b60af Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 17:50:15 +0200 Subject: [PATCH 32/58] nits --- src/transformers/models/pixtral/image_processing_pixtral.py | 2 +- src/transformers/models/pixtral/processing_pixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 9f3218b744b0a8..801d9a6a102357 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -82,7 +82,7 @@ def to(self, *args, **kwargs) -> "BatchMixFeature": # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor` for k, v in self.items(): # check if v is a floating point - if isinstance(k, list): + if isinstance(v, list): new_data[k] = [ element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index ceca7586e46b4b..da0e03bfb21d4e 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -68,7 +68,7 @@ def to(self, *args, **kwargs) -> "BatchMixFeature": new_data[k] = [ element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] - elif torch.is_floating_point(v): + if torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) elif device is not None: From 07c760047b1f7795798dc444d123262da861722b Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 17:34:26 +0100 Subject: [PATCH 33/58] Update model docstring --- .../models/pixtral/modeling_pixtral.py | 77 +++---------------- 1 file changed, 11 insertions(+), 66 deletions(-) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index d814457fc523e2..783b12f895a580 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -25,6 +25,7 @@ from ...modeling_outputs import BaseModelOutput from ...utils import ( add_start_docstrings, + add_start_docstrings_to_model_forward, logging, ) from .configuration_pixtral import PixtralConfig @@ -432,64 +433,8 @@ def _init_weights(self, module): PIXTRAL_INPUTS_DOCSTRING = r""" Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): - The tensors corresponding to the input images. Pixel values can be obtained using - [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses - [`CLIPImageProcessor`] for processing images). - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. - - If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that - don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all - `decoder_input_ids` of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - vision_feature_layer (`int`, *optional*, defaults to -2): - The index of the layer to select the vision feature. - vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): - The feature selection strategy used to select the vision feature from the vision backbone. - Can be one of `"default"` or `"full"`. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). + images: list of N_img images of variable sizes, + each of shape (C, H, W) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -498,10 +443,6 @@ def _init_weights(self, module): more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, - this tensor is not affected by padding. 
It is used to update the cache in the correct position and to infer - the complete sequence length. """ @@ -554,13 +495,17 @@ def device(self) -> torch.device: def dtype(self) -> torch.device: return next(self.parameters()).dtype + @add_start_docstrings_to_model_forward(PIXTRAL_INPUTS_DOCSTRING) def forward( - self, images: List[torch.Tensor], output_hidden_states=False, *kwargs + self, + images: List[torch.Tensor], + output_hidden_states: Optional[bool] = False, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + *args, + **kwargs, ) -> Union[Tuple, BaseModelOutput]: """ - Args: - images: list of N_img images of variable sizes, - each of shape (C, H, W) Returns: image_features: tensor of token features for all tokens of all images of shape (N_toks, D) From ff04e9fdea1b19966f7bdcfe03582d96e939c7e5 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 17:41:41 +0100 Subject: [PATCH 34/58] Apply suggestions from code review --- docs/source/en/model_doc/pixtral.md | 4 +--- src/transformers/modeling_utils.py | 11 +++++------ .../models/pixtral/configuration_pixtral.py | 4 ++-- .../models/pixtral/image_processing_pixtral.py | 2 ++ 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index 30c62af3df3467..23b8196d8cd9a8 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -16,8 +16,6 @@ rendered properly in your Markdown viewer. # Pixtral -# Pixtral - ## Overview The Pixtral model was proposed in []() by . @@ -31,7 +29,7 @@ Tips: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ) and [Amy Roberts](https://huggingface.co/amyeroberts). The original code can be found [here](). diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index d0a13da512156b..a3568eb18c3334 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -548,12 +548,11 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], is_quantized: bool # Check format of the archive with safe_open(checkpoint_file, framework="pt") as f: metadata = f.metadata() - if metadata is not None: - if metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: - raise OSError( - f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " - "you save your model with the `save_pretrained` method." - ) + if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) return safe_load_file(checkpoint_file) try: if ( diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index da7479b342b754..f982e2434d7dde 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -37,9 +37,9 @@ class PixtralConfig(PretrainedConfig): intermediate_size (`int`, `optional`, defaults to 4096): Dimension of the MLP representations. num_hidden_layers (`int`, `optional`, defaults to 24): - Number of hidden layers in the Transformer decoder. 
+ Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, `optional`, defaults to 16): - Number of attention heads in the Transformer decoder. + Number of attention heads in the Transformer encoder. num_channels (`int`, `optional`, defaults to 3): Number of input channels in the input images. image_size (`int`, `optional`, defaults to 1024): diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 801d9a6a102357..6ab1f5a04aaf52 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -237,6 +237,8 @@ class PixtralImageProcessor(BaseImageProcessor): Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by `do_resize` in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 1024}`): + Size of the maximum dimension of either the height or width dimension of the image. Used to control how + images are resized. If either the height or width are greater than `size["longest_edge"]` then both the height and width are rescaled by `height / ratio`, `width /ratio` where `ratio = max(height / longest_edge, width / longest_edge)` patch_size (`Dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): From e7fae23d07c0f1d6610a18626af8d1225cbfe509 Mon Sep 17 00:00:00 2001 From: Amy Roberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 17:53:47 +0100 Subject: [PATCH 35/58] Fix up --- docs/source/en/model_doc/pixtral.md | 12 +++++-- src/transformers/__init__.py | 2 +- .../models/pixtral/configuration_pixtral.py | 32 ++++++++----------- .../pixtral/image_processing_pixtral.py | 4 +-- .../models/pixtral/modeling_pixtral.py | 2 +- 5 files changed, 28 insertions(+), 24 deletions(-) diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index 23b8196d8cd9a8..15424d8953f5dc 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -29,8 +29,7 @@ Tips: -This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ) and [Amy Roberts](https://huggingface.co/amyeroberts). -The original code can be found [here](). +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ) and [Amy Roberts](https://huggingface.co/amyeroberts). The original code can be found [here](). ## PixtralConfig @@ -41,3 +40,12 @@ The original code can be found [here](). 
[[autodoc]] PixtralModel - forward + +## PixtralImageProcessor + +[[autodoc]] PixtralImageProcessor + - preprocess + +## PixtralProcessor + +[[autodoc]] PixtralProcessor diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 12ec78e8bdfab0..506c4c1a048f55 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1361,7 +1361,7 @@ "AlignVisionModel", ] ) - (_import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"]),) + _import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"]) _import_structure["models.altclip"].extend( [ "AltCLIPModel", diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index f982e2434d7dde..3879c853fc8eb0 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -32,35 +32,33 @@ class PixtralConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. Args: - hidden_size (`int`, `optional`, defaults to 1024): + hidden_size (`int`, *optional*, defaults to 1024): Dimension of the hidden representations. - intermediate_size (`int`, `optional`, defaults to 4096): + intermediate_size (`int`, *optional*, defaults to 4096): Dimension of the MLP representations. - num_hidden_layers (`int`, `optional`, defaults to 24): + num_hidden_layers (`int`, *optional*, defaults to 24): Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, `optional`, defaults to 16): + num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads in the Transformer encoder. - num_channels (`int`, `optional`, defaults to 3): + num_channels (`int`, *optional*, defaults to 3): Number of input channels in the input images. - image_size (`int`, `optional`, defaults to 1024): + image_size (`int`, *optional*, defaults to 1024): Max dimension of the input images. - patch_size (`int`, `optional`, defaults to 16): + patch_size (`int`, *optional*, defaults to 16): Size of the image patches. - hidden_activation (`str`, `optional`, defaults to "gelu"): + hidden_act (`str`, *optional*, defaults to `"gelu"`): Activation function used in the hidden layers. - layer_norm_eps (`float`, `optional`, defaults to 1e-5): - Epsilon value for layer normalization. - attention_dropout (`float`, `optional`, defaults to 0.0): + attention_dropout (`float`, *optional*, defaults to 0.0): Dropout probability for the attention layers. - rope_theta (`float`, `optional`, defaults to 10000.0): + rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - tie_word_embeddings (`bool`, `optional`, defaults to False): + tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie the word embeddings with the input embeddings. 
Example: ```python - >>> from transformers import PixtralModel, PixtralConfig, CLIPVisionConfig, LlamaConfig + >>> from transformers import PixtralModel, PixtralConfig >>> # Initializing a Pixtral 12B style configuration >>> config = PixtralConfig() @@ -83,8 +81,7 @@ def __init__( num_channels=3, image_size=1024, patch_size=16, - hidden_activation="gelu", - layer_norm_eps=1e-5, + hidden_act="gelu", attention_dropout=0.0, rope_theta=10000.0, tie_word_embeddings=False, @@ -100,8 +97,7 @@ def __init__( self.patch_size = patch_size self.image_size = image_size self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_activation = hidden_activation + self.hidden_act = hidden_act self.rope_theta = rope_theta self.tie_word_embeddings = tie_word_embeddings self.head_dim = hidden_size // num_attention_heads diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 6ab1f5a04aaf52..3e8694ffcb4c31 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -36,7 +36,7 @@ validate_kwargs, validate_preprocess_arguments, ) -from ...utils import TensorType, is_torch_device, is_torch_dtype, is_vision_available, logging, is_torch_tensor +from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, is_vision_available, logging from ...utils.import_utils import requires_backends @@ -237,7 +237,7 @@ class PixtralImageProcessor(BaseImageProcessor): Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by `do_resize` in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 1024}`): - Size of the maximum dimension of either the height or width dimension of the image. Used to control how + Size of the maximum dimension of either the height or width dimension of the image. Used to control how images are resized. If either the height or width are greater than `size["longest_edge"]` then both the height and width are rescaled by `height / ratio`, `width /ratio` where `ratio = max(height / longest_edge, width / longest_edge)` patch_size (`Dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`): Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method. 
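For intuition, the `longest_edge` / `patch_size` behaviour described in this docstring amounts to roughly the following (a minimal sketch only; the helper name and the exact rounding are assumptions for illustration, not the library implementation):

```python
import math

def pixtral_output_size(height, width, longest_edge=1024, patch_height=16, patch_width=16):
    # Downscale only when one side exceeds the longest allowed edge.
    ratio = max(height / longest_edge, width / longest_edge)
    if ratio > 1:
        height = math.ceil(height / ratio)
        width = math.ceil(width / ratio)
    # Round up to a whole number of patches so the image splits evenly into patch_size tiles.
    num_height_tokens = (height - 1) // patch_height + 1
    num_width_tokens = (width - 1) // patch_width + 1
    return num_height_tokens * patch_height, num_width_tokens * patch_width

print(pixtral_output_size(2048, 1536))  # (1024, 768): scaled down by ratio 2, already patch-aligned
print(pixtral_output_size(500, 316))    # (512, 320): below the longest edge, only rounded up to patch multiples
```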
diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py
index 783b12f895a580..500c437870af8d 100644
--- a/src/transformers/models/pixtral/modeling_pixtral.py
+++ b/src/transformers/models/pixtral/modeling_pixtral.py
@@ -215,7 +215,7 @@ def forward(
 
 
 # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixtral
-class PixstralMLP(nn.Module):
+class PixtralMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.hidden_size = config.hidden_size

From 324ba36e251aa9e94a1d21e26d4251dc0e68a4f5 Mon Sep 17 00:00:00 2001
From: Arthur Zucker
Date: Fri, 13 Sep 2024 19:27:41 +0200
Subject: [PATCH 36/58] updates

---
 docs/source/en/model_doc/pixtral.md           |  70 ++++++++-
 .../pixtral/convert_pixtral_weights_to_hf.py  | 139 ++++++++----------
 .../pixtral/image_processing_pixtral.py       |   2 +-
 .../models/pixtral/modeling_pixtral.py        |   2 +-
 4 files changed, 123 insertions(+), 90 deletions(-)

diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md
index 30c62af3df3467..1e99847f3b83bb 100644
--- a/docs/source/en/model_doc/pixtral.md
+++ b/docs/source/en/model_doc/pixtral.md
@@ -20,25 +20,81 @@ rendered properly in your Markdown viewer.
 
 ## Overview
 
-The Pixtral model was proposed in []() by .
-
+The Pixtral model was released by the Mistral AI team in [vLLM](https://github.com/vllm-project/vllm/pull/8377), where a version of the code can be found!
 
 
 Tips:
 
-
+- Pixtral is a multimodal model; its main contributions are 2D RoPE on the image patches and support for arbitrary image sizes (images are neither padded together nor resized).
+- This model follows the `Llava` family, meaning image embeddings are inserted in place of the `[IMG]` token placeholders.
+- The format for one or multiple prompts is the following:
+```
+"[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
+```
+Then, the processor will replace each `[IMG]` token with a number of `[IMG]` tokens that depends on the height and the width of the image. Each *row* of the image is separated by an `[IMG_BREAK]` token, and each image is separated by an `[IMG_END]` token.
+
+This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) and [ArthurZ](https://huggingface.co/ArthurZ).
+
+Here is an example of how to run it:
+
+```python
+import requests
+from PIL import Image
+from transformers import LlavaForConditionalGeneration, AutoProcessor
+
+model_id = "hf-internal-testing/pixtral-12b"
+model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+processor = AutoProcessor.from_pretrained(model_id)
+
+IMG_URLS = [
+    Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw),
+    Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw),
+    Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw),
+    Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw),
+]
+PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]"
+
+inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda")
+generate_ids = model.generate(**inputs, max_new_tokens=500)
+output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+
+EXPECTED_GENERATION = """
+Describe the images.
+Sure, let's break down each image description: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). -The original code can be found [here](). +1. **Image 1:** + - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera. + - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur. +2. **Image 2:** + - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley. + - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image. +3. **Image 3:** + - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset. + - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene. + +4. **Image 4:** + - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers. + - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. + +Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. +""" + +``` ## PixtralConfig [[autodoc]] PixtralConfig +## PixtralImageProcessor + +[[autodoc]] PixtralImageProcessor + - forward + +## PixtralProcessor +[[autodoc]] PixtralProcessor + - forward + ## PixtralModel [[autodoc]] PixtralModel diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index c7a5fc2bd50de5..9be6b7aa7a1494 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -1,8 +1,9 @@ +import argparse + import regex as re -import requests import torch from mistral_common.tokens.tokenizers.mistral import MistralTokenizer -from PIL import Image +from safetensors.torch import safe_load_file from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors from tokenizers.models import BPE @@ -18,6 +19,28 @@ from transformers.convert_slow_tokenizer import bytes_to_unicode +""" +# Here is how to get the original tokens! 
+model_name = "mistralai/Pixtral-12B-2409" +tok = MistralTokenizer.from_model(model_name) + +from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk + +EXPECTED_TOKENS = tok.encode_chat_completion( + ChatCompletionRequest( + messages=[ + UserMessage( + content=[ + TextChunk(text="Describe the images"), + ] + [ImageChunk(image=img) for img in IMG_URLS] + ) + ], + model="pixtral", + ) +) +assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS +""" + OLD_KEY_TO_NEW_KEY_MAPPING = { # Layer Normalization Weights r"vision_encoder.transformer.layers.(\d+).input_layernorm.weight": r"vision_tower.transformer.layers.\1.attention_norm.weight", @@ -185,7 +208,7 @@ def convert_dictionnary(original_state_dict, vision_config, text_config): return new_dict -def convert_mistral_model(): +def convert_mistral_model(input_dir, output_dir): text_config = MistralConfig( attention_dropout=0.0, bos_token_id=1, @@ -208,90 +231,46 @@ def convert_mistral_model(): ) vision_config = PixtralConfig() - config = LlavaConfig(vision_config, text_config) + config = LlavaConfig( + vision_config, + text_config, + vision_feature_layer=-1, + image_token_index=10, + vision_feature_select_strategy="full", + image_seq_length=1, + ) config.architectures = ["LlavaForConditionalGeneration"] - config.text_config.head_dim = 128 - config.save_pretrained("../pixtral") - - # original_state_dict = safe_load_file("../pixtral/consolidated.safetensors") - # new_dict = convert_dictionnary(original_state_dict, vision_config, text_config) - - # config.text_config.head_dim = 128 - # with torch.device("meta"): - # model = LlavaForConditionalGeneration(config) - # model.load_state_dict(new_dict, strict=True, assign=True) - - # model.save_pretrained("../pixtral") - config.vision_feature_layer = -1 - config.image_token_index = 10 - config.vision_feature_select_strategy = "full" - config.image_seq_length = 1 - tokenizer = convert_mistral_tokenizer() - model = LlavaForConditionalGeneration.from_pretrained( - "../pixtral", config=config, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16 - ).to("cuda") - image_processor = PixtralImageProcessor() - processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") - - IMG_URLS = [ - Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), - ] - PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" - - # image = Image.open(requests.get(url, stream=True).raw) - inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda") - generate_ids = model.generate(**inputs, max_new_tokens=500) - print(processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]) + config.save_pretrained(output_dir) - # model_name = "mistralai/Pixtral-12B-2409" - # tok = MistralTokenizer.from_model(model_name) + original_state_dict = safe_load_file(f"{input_dir}/consolidated.safetensors") + new_dict = convert_dictionnary(original_state_dict, vision_config, text_config) - # from mistral_common.protocol.instruct.request import ChatCompletionRequest, UserMessage, ImageChunk, TextChunk + with torch.device("meta"): + model = LlavaForConditionalGeneration(config) + 
model.load_state_dict(new_dict, strict=True, assign=True) - # EXPECTED_TOKENS = tok.encode_chat_completion( - # ChatCompletionRequest( - # messages=[ - # UserMessage( - # content=[ - # TextChunk(text="Describe the images"), - # ] + [ImageChunk(image=img) for img in IMG_URLS] - # ) - # ], - # model="pixtral", - # ) - # ) - # assert tokenizer.decode(inputs["input_ids"][0]) == EXPECTED_TOKENS + model.save_pretrained(output_dir) + tokenizer = convert_mistral_tokenizer() + image_processor = PixtralImageProcessor() + processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor, image_token="[IMG]") + processor.save_pretrained(output_dir) -convert_mistral_model() - - -""" -What's the content of the image?The image depicts a vibrant street scene in what appears to be a Chinatown district, characterized by its traditional architectural elements and cultural signage. A prominent feature is the red and white stop sign in the foreground, which has been adorned with a banner that reads "OPTUS." Behind the stop sign, there's an ornate gate with intricate designs and Chinese characters, marking the entrance to the district. The gate is flanked by buildings with colorful facades and signs in both English and Chinese -""" - -""" -Describe the images. -Sure, let's break down each image description: - -1. **Image 1:** - - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera. - - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur. -2. **Image 2:** - - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley. - - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image. +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_dir", + help="Location of LLaMA weights, which contains tokenizer.model and model folders", + ) + parser.add_argument( + "--output_dir", + help="Location to write HF model and tokenizer", + ) -3. **Image 3:** - - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset. - - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene. + args = parser.parse_args() + convert_mistral_model(args.input_dir, args.output_dir) -4. **Image 4:** - - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers. - - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. -Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. 
-""" +if __name__ == "__main__": + main() diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 801d9a6a102357..70fcd565bb64d2 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -36,7 +36,7 @@ validate_kwargs, validate_preprocess_arguments, ) -from ...utils import TensorType, is_torch_device, is_torch_dtype, is_vision_available, logging, is_torch_tensor +from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, is_vision_available, logging from ...utils.import_utils import requires_backends diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index d814457fc523e2..653b8d9e4c9869 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -214,7 +214,7 @@ def forward( # Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixtral -class PixstralMLP(nn.Module): +class PixtralMLP(nn.Module): def __init__(self, config): super().__init__() self.hidden_size = config.hidden_size From c4ad4e5efadfc766f6774e619675572a71800d6d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 19:28:46 +0200 Subject: [PATCH 37/58] revert modeling changes --- src/transformers/modeling_utils.py | 42 ++++++++++++++---------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index d0a13da512156b..359509f469a703 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -548,12 +548,11 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], is_quantized: bool # Check format of the archive with safe_open(checkpoint_file, framework="pt") as f: metadata = f.metadata() - if metadata is not None: - if metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: - raise OSError( - f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " - "you save your model with the `save_pretrained` method." - ) + if metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) return safe_load_file(checkpoint_file) try: if ( @@ -3752,22 +3751,21 @@ def from_pretrained( with safe_open(resolved_archive_file, framework="pt") as f: metadata = f.metadata() - if metadata is not None: - if metadata.get("format") == "pt": - pass - elif metadata.get("format") == "tf": - from_tf = True - logger.info("A TensorFlow safetensors file is being loaded in a PyTorch model.") - elif metadata.get("format") == "flax": - from_flax = True - logger.info("A Flax safetensors file is being loaded in a PyTorch model.") - elif metadata.get("format") == "mlx": - # This is a mlx file, we assume weights are compatible with pt - pass - else: - raise ValueError( - f"Incompatible safetensors file. 
File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}" - ) + if metadata.get("format") == "pt": + pass + elif metadata.get("format") == "tf": + from_tf = True + logger.info("A TensorFlow safetensors file is being loaded in a PyTorch model.") + elif metadata.get("format") == "flax": + from_flax = True + logger.info("A Flax safetensors file is being loaded in a PyTorch model.") + elif metadata.get("format") == "mlx": + # This is a mlx file, we assume weights are compatible with pt + pass + else: + raise ValueError( + f"Incompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}" + ) from_pt = not (from_tf | from_flax) From 9f2d98b4ab0b784eefece18e490e789f28b2772c Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 19:31:48 +0200 Subject: [PATCH 38/58] update --- docs/source/en/model_doc/pixtral.md | 4 ++-- src/transformers/__init__.py | 4 ++-- src/transformers/models/auto/configuration_auto.py | 2 +- src/transformers/models/auto/tokenization_auto.py | 2 +- src/transformers/models/pixtral/__init__.py | 4 ++-- src/transformers/models/pixtral/configuration_pixtral.py | 6 +++--- .../models/pixtral/convert_pixtral_weights_to_hf.py | 4 ++-- src/transformers/models/pixtral/modeling_pixtral.py | 7 +++---- tests/models/pixtral/test_modeling_pixtral.py | 6 +++--- 9 files changed, 19 insertions(+), 20 deletions(-) diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index 1e99847f3b83bb..ff4b6b5854f556 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -80,9 +80,9 @@ Each image captures a different scene, from a close-up of a dog to expansive nat """ ``` -## PixtralConfig +## PixtralVisionConfig -[[autodoc]] PixtralConfig +[[autodoc]] PixtralVisionConfig ## PixtralImageProcessor diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 12ec78e8bdfab0..d654b0cf2c755a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -643,7 +643,7 @@ "models.phi": ["PhiConfig"], "models.phi3": ["Phi3Config"], "models.phobert": ["PhobertTokenizer"], - "models.pixtral": ["PixtralConfig", "PixtralProcessor"], + "models.pixtral": ["PixtralVisionConfig", "PixtralProcessor"], "models.pix2struct": [ "Pix2StructConfig", "Pix2StructProcessor", @@ -5437,7 +5437,7 @@ Pix2StructVisionConfig, ) from .models.pixtral import ( - PixtralConfig, + PixtralVisionConfig, PixtralProcessor, ) from .models.plbart import PLBartConfig diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 97d9c60fa41c23..2cd7d550d90b7a 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -205,7 +205,7 @@ ("phi", "PhiConfig"), ("phi3", "Phi3Config"), ("pix2struct", "Pix2StructConfig"), - ("pixtral", "PixtralConfig"), + ("pixtral", "PixtralVisionConfig"), ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 2f0e8591740da4..e735579108d857 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -385,7 +385,7 @@ ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("phobert", ("PhobertTokenizer", None)), ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" 
if is_tokenizers_available() else None)), - ("pixtral", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), ("prophetnet", ("ProphetNetTokenizer", None)), ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py index 2020287559a124..f3a55bea25f859 100644 --- a/src/transformers/models/pixtral/__init__.py +++ b/src/transformers/models/pixtral/__init__.py @@ -17,7 +17,7 @@ _import_structure = { - "configuration_pixtral": ["PixtralConfig"], + "configuration_pixtral": ["PixtralVisionConfig"], "processing_pixtral": ["PixtralProcessor"], } @@ -43,7 +43,7 @@ if TYPE_CHECKING: - from .configuration_pixtral import PixtralConfig, PixtralProcessor + from .configuration_pixtral import PixtralVisionConfig, PixtralProcessor try: if not is_torch_available(): diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index da7479b342b754..12f9bd29421a54 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -20,7 +20,7 @@ logger = logging.get_logger(__name__) -class PixtralConfig(PretrainedConfig): +class PixtralVisionConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`PixtralModel`]. It is used to instantiate an Pixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -60,10 +60,10 @@ class PixtralConfig(PretrainedConfig): Example: ```python - >>> from transformers import PixtralModel, PixtralConfig, CLIPVisionConfig, LlamaConfig + >>> from transformers import PixtralModel, PixtralVisionConfig, CLIPVisionConfig, LlamaConfig >>> # Initializing a Pixtral 12B style configuration - >>> config = PixtralConfig() + >>> config = PixtralVisionConfig() >>> # Initializing a model from the pixtral 12B style configuration >>> model = PixtralModel(configuration) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 9be6b7aa7a1494..f2354a79c53925 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -11,7 +11,7 @@ LlavaConfig, LlavaForConditionalGeneration, MistralConfig, - PixtralConfig, + PixtralVisionConfig, PixtralImageProcessor, PixtralProcessor, PreTrainedTokenizerFast, @@ -230,7 +230,7 @@ def convert_mistral_model(input_dir, output_dir): vocab_size=131072, ) - vision_config = PixtralConfig() + vision_config = PixtralVisionConfig() config = LlavaConfig( vision_config, text_config, diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 653b8d9e4c9869..ef3edc449e12fb 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -27,7 +27,7 @@ add_start_docstrings, logging, ) -from .configuration_pixtral import PixtralConfig +from .configuration_pixtral import PixtralVisionConfig logger = logging.get_logger(__name__) @@ -300,7 +300,6 @@ def forward( class PixtralTransformer(nn.Module): def __init__(self, 
config): super().__init__() - self.config = config self.layers = torch.nn.ModuleList() for _ in range(config.num_hidden_layers): self.layers.append(PixtralAttentionLayer(config)) @@ -391,7 +390,7 @@ def forward( and behavior. Parameters: - config ([`PixtralConfig`] or [`PixtralVisionConfig`]): + config ([`PixtralVisionConfig`] or [`PixtralVisionConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. @@ -403,7 +402,7 @@ def forward( PIXTRAL_START_DOCSTRING, ) class PixtralPreTrainedModel(PreTrainedModel): - config_class = PixtralConfig + config_class = PixtralVisionConfig base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["PixtralVisionAttention"] diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index afc45ea9d89543..3c351af325d3cd 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -21,7 +21,7 @@ from transformers import ( AutoProcessor, - PixtralConfig, + PixtralVisionConfig, PixtralModel, is_torch_available, is_vision_available, @@ -92,7 +92,7 @@ def prepare_config_and_inputs(self): return config, pixel_values def get_config(self): - return PixtralConfig( + return PixtralVisionConfig( image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, @@ -151,7 +151,7 @@ class PixtralModelModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = PixtralModelTester(self) - self.config_tester = ConfigTester(self, config_class=PixtralConfig, has_text_modality=False) + self.config_tester = ConfigTester(self, config_class=PixtralVisionConfig, has_text_modality=False) # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): From 97b4d937809f1abbda29e04495c74f5829968292 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 19:33:48 +0200 Subject: [PATCH 39/58] update --- src/transformers/__init__.py | 2 +- src/transformers/models/pixtral/__init__.py | 2 +- .../models/pixtral/convert_pixtral_weights_to_hf.py | 2 +- tests/models/pixtral/test_modeling_pixtral.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c96cc1955608a6..03c9ff3988defa 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -5437,8 +5437,8 @@ Pix2StructVisionConfig, ) from .models.pixtral import ( - PixtralVisionConfig, PixtralProcessor, + PixtralVisionConfig, ) from .models.plbart import PLBartConfig from .models.poolformer import ( diff --git a/src/transformers/models/pixtral/__init__.py b/src/transformers/models/pixtral/__init__.py index f3a55bea25f859..e09ed8e60127dd 100644 --- a/src/transformers/models/pixtral/__init__.py +++ b/src/transformers/models/pixtral/__init__.py @@ -43,7 +43,7 @@ if TYPE_CHECKING: - from .configuration_pixtral import PixtralVisionConfig, PixtralProcessor + from .configuration_pixtral import PixtralProcessor, PixtralVisionConfig try: if not is_torch_available(): diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index f2354a79c53925..44717e2a5b101c 100644 --- 
a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -11,9 +11,9 @@ LlavaConfig, LlavaForConditionalGeneration, MistralConfig, - PixtralVisionConfig, PixtralImageProcessor, PixtralProcessor, + PixtralVisionConfig, PreTrainedTokenizerFast, ) from transformers.convert_slow_tokenizer import bytes_to_unicode diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index 3c351af325d3cd..18e3d6d5a2309d 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -21,8 +21,8 @@ from transformers import ( AutoProcessor, - PixtralVisionConfig, PixtralModel, + PixtralVisionConfig, is_torch_available, is_vision_available, ) From ce23dc368395c4e763a005b18df6b8f2f5c436d9 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 19:38:19 +0200 Subject: [PATCH 40/58] fix load safe --- .../models/pixtral/convert_pixtral_weights_to_hf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 44717e2a5b101c..491a182cffe9a0 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -3,7 +3,7 @@ import regex as re import torch from mistral_common.tokens.tokenizers.mistral import MistralTokenizer -from safetensors.torch import safe_load_file +from safetensors.torch import load_file as safe_load_file from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors from tokenizers.models import BPE From bbf516c3db722cbfa82ab34b9e23eb5c88a35186 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 19:39:46 +0200 Subject: [PATCH 41/58] addd liscence --- .../models/pixtral/configuration_pixtral.py | 2 +- .../models/pixtral/convert_pixtral_weights_to_hf.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index b91586edf7e5eb..dcc1e458ca78a3 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved. +# Copyright 2024 HuggingFace Inc. team. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 491a182cffe9a0..99cdc6dcd3b319 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -1,3 +1,16 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse import regex as re From b783e7a47139da67f01a3f798875558dc0da80a7 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 19:44:29 +0200 Subject: [PATCH 42/58] update --- src/transformers/models/pixtral/modeling_pixtral.py | 4 ++-- src/transformers/models/pixtral/processing_pixtral.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index d8bab77a47180b..0ea97dfdcf09b3 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -497,7 +497,7 @@ def dtype(self) -> torch.device: @add_start_docstrings_to_model_forward(PIXTRAL_INPUTS_DOCSTRING) def forward( self, - images: List[torch.Tensor], + pixel_values: List[torch.Tensor], output_hidden_states: Optional[bool] = False, output_attentions: Optional[bool] = None, return_dict: Optional[bool] = None, @@ -506,7 +506,7 @@ def forward( ) -> Union[Tuple, BaseModelOutput]: """ Returns: - image_features: tensor of token features for + pixel_values: tensor of token features for all tokens of all images of shape (N_toks, D) """ # pass images through initial convolution independently diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index da0e03bfb21d4e..ef4317ffc1e4e5 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2023 The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 443917f47ac54f95d872d147540ff0ab128333a6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 19:45:42 +0200 Subject: [PATCH 43/58] use pixel_values as required by the model --- src/transformers/models/pixtral/modeling_pixtral.py | 4 ++-- src/transformers/models/pixtral/processing_pixtral.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 0ea97dfdcf09b3..7bb6da9ae2c5fe 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -432,7 +432,7 @@ def _init_weights(self, module): PIXTRAL_INPUTS_DOCSTRING = r""" Args: - images: list of N_img images of variable sizes, + pixel_values: list of N_img images of variable sizes, each of shape (C, H, W) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned @@ -510,7 +510,7 @@ def forward( all tokens of all images of shape (N_toks, D) """ # pass images through initial convolution independently - patch_embeds_list = [self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in images] + patch_embeds_list = [self.patch_conv(img.unsqueeze(0).to(self.dtype)) for img in pixel_values] # flatten to a single sequence patch_embeds = torch.cat([p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index ef4317ffc1e4e5..e9d77f9de702a3 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -182,7 +182,7 @@ def __call__( - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not - `None`). + `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if images is not None: From f9291eac7a7a787b755729c81e27e3942e33286f Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 20:05:12 +0200 Subject: [PATCH 44/58] skip some tests and refactor --- docs/source/en/index.md | 2 +- src/transformers/models/auto/modeling_auto.py | 1 - .../models/pixtral/modeling_pixtral.py | 1 + tests/models/pixtral/test_modeling_pixtral.py | 115 ++++++++++++------ 4 files changed, 78 insertions(+), 41 deletions(-) diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 97148840a2d2ea..c18426de4c031c 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -253,7 +253,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [Phi3](model_doc/phi3) | ✅ | ❌ | ❌ | | [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ | | [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ | -| [Pixtral](model_doc/pixtral) | ✅ | ❌ | ❌ | +| [Pixtral](model_doc/pixtral) | ❌ | ❌ | ❌ | | [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ | | [PoolFormer](model_doc/poolformer) | ✅ | ❌ | ❌ | | [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ | diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index eb2ab82c960d3b..e0d15f1e236590 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -734,7 +734,6 @@ ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), - ("pixtral", "PixtralModel"), ("qwen2_vl", "Qwen2VLForConditionalGeneration"), ("video_llava", "VideoLlavaForConditionalGeneration"), ("vipllava", "VipLlavaForConditionalGeneration"), diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 7bb6da9ae2c5fe..081a28e35cedfe 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -301,6 +301,7 @@ def forward( class PixtralTransformer(nn.Module): def __init__(self, config): super().__init__() + self.config = config self.layers = torch.nn.ModuleList() for _ in range(config.num_hidden_layers): self.layers.append(PixtralAttentionLayer(config)) diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index 18e3d6d5a2309d..bd41fa1c9e62fb 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -153,48 +153,13 @@ def setUp(self): self.model_tester = PixtralModelTester(self) self.config_tester = ConfigTester(self, config_class=PixtralVisionConfig, has_text_modality=False) - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + @unittest.skip("model does not support input embeds") def test_inputs_embeds(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - wte = model.get_input_embeddings() - inputs["inputs_embeds"] = wte(input_ids) - - with torch.no_grad(): - model(**inputs) + pass - # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs - # while some other models require pixel_values to be present + @unittest.skip("model does not support input embeds") def test_inputs_embeds_matches_input_ids(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = self._prepare_for_class(inputs_dict, model_class) - input_ids = inputs["input_ids"] - del inputs["input_ids"] - del inputs["pixel_values"] - - inputs_embeds = model.get_input_embeddings()(input_ids) - - with torch.no_grad(): - out_ids = model(input_ids=input_ids, **inputs)[0] - out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] - self.assertTrue(torch.allclose(out_embeds, out_ids)) + pass @unittest.skip( reason="This architecure seem to 
not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" @@ -222,6 +187,78 @@ def test_sdpa_can_compile_dynamic(self): def test_sdpa_can_dispatch_on_flash(self): pass + @unittest.skip(reason="Not supported yet") + def test_attention_outputs(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_cpu_offload(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_batching_equivalence(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_disk_offload_bin(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_retain_grad_hidden_states_attentions(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_multi_gpu_data_parallel_forward(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_model_parallelism(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_model_outputs_equivalence(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_save_load(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_model_get_set_embeddings(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_model_main_input_name(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_initialization(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_hidden_states_output(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_gradient_checkpointing_backward_compatibility(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_feed_forward_chunking(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_disk_offload_safetensors(self): + pass + + @unittest.skip(reason="Not supported yet") + def test_determinism(self): + pass + @require_torch class PixtralModelIntegrationTest(unittest.TestCase): From db84a7d35ca1ee25f18d83a37ad02b08da428b17 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 13 Sep 2024 19:05:49 +0100 Subject: [PATCH 45/58] Add pixtral image processing tests (#33476) * Image processing tests * Add processing tests * woops * defaults reflect pixtral image processor --- .../pixtral/image_processing_pixtral.py | 2 + .../models/pixtral/processing_pixtral.py | 26 +- .../pixtral/test_image_processing_pixtral.py | 221 +++++++++++++++++ .../models/pixtral/test_processor_pixtral.py | 230 ++++++++++++++++++ 4 files changed, 477 insertions(+), 2 deletions(-) create mode 100644 tests/models/pixtral/test_image_processing_pixtral.py create mode 100644 tests/models/pixtral/test_processor_pixtral.py diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index 3e8694ffcb4c31..de3c644653bea7 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -281,6 +281,7 @@ def __init__( super().__init__(**kwargs) size = size if size is not None else {"longest_edge": 1024} patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16} + patch_size = get_size_dict(patch_size, default_to_square=True) self.do_resize = do_resize self.size = size @@ -296,6 +297,7 @@ def __init__( "images", "do_resize", "size", + "patch_size", "resample", "do_rescale", "rescale_factor", diff --git 
a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index e9d77f9de702a3..468e6421309481 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -19,7 +19,7 @@ from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature -from ...image_utils import ImageInput +from ...image_utils import ImageInput, is_valid_image, load_image from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType, is_torch_device, is_torch_dtype, is_torch_tensor, logging, requires_backends @@ -27,6 +27,15 @@ logger = logging.get_logger(__name__) +# Copied from transformers.models.idefics2.processing_idefics2.is_url +def is_url(val) -> bool: + return isinstance(val, str) and val.startswith("http") + + +# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_url +def is_image_or_image_url(elem): + return is_url(elem) or is_valid_image(elem) + # Copied from transformers.models.pixtral.image_processing_pixtral.BatchMixFeature class BatchMixFeature(BatchFeature): @@ -87,7 +96,7 @@ class PixtralProcessor(ProcessorMixin): [`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information. Args: - image_processor ([`CLIPImageProcessor`], *optional*): + image_processor ([`PixtralImageProcessor`], *optional*): The image processor is a required input. tokenizer ([`LlamaTokenizerFast`], *optional*): The tokenizer is a required input. @@ -186,6 +195,19 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ if images is not None: + if is_image_or_image_url(images): + images = [[images]] + elif isinstance(images, list) and is_image_or_image_url(images[0]): + images = [images] + elif ( + not isinstance(images, list) + and not isinstance(images[0], list) + and not is_image_or_image_url(images[0][0]) + ): + raise ValueError( + "Invalid input images. Please provide a single image or a list of images or a list of list of images." + ) + images = [[load_image(im) for im in sample] for sample in images] image_inputs = self.image_processor(images, patch_size=self.patch_size, return_tensors=return_tensors) else: image_inputs = {} diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py new file mode 100644 index 00000000000000..2c571c1b70ca1e --- /dev/null +++ b/tests/models/pixtral/test_image_processing_pixtral.py @@ -0,0 +1,221 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import unittest + +import numpy as np + +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from transformers.models.llava_next.image_processing_llava_next import select_best_resolution +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import PixtralImageProcessor + + +class PixtralImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + image_size=18, + max_num_images_per_sample=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + patch_size=None, + do_normalize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + do_convert_rgb=True, + ): + size = size if size is not None else {"longest_edge": 24} + patch_size = patch_size if patch_size is not None else {"height": 8, "width": 8} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.image_size = image_size + self.max_num_images_per_sample = max_num_images_per_sample + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.patch_size = patch_size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "patch_size": self.patch_size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_convert_rgb": self.do_convert_rgb, + } + + def expected_output_image_shape(self, image): + if isinstance(image, Image.Image): + width, height = image.size + elif isinstance(image, np.ndarray): + height, width = image.shape[:2] + elif isinstance(image, torch.Tensor): + height, width = image.shape[-2:] + + max_height = max_width = self.size.get("longest_edge") + + ratio = max(height / max_height, width / max_width) + if ratio > 1: + height = int(np.ceil(height / ratio)) + width = int(np.ceil(width / ratio)) + + patch_height, patch_width = self.patch_size["height"], self.patch_size["width"] + num_height_tokens = (height - 1) // patch_height + 1 + num_width_tokens = (width - 1) // patch_width + 1 + + height = num_height_tokens * patch_height + width = num_width_tokens * patch_width + + return self.num_channels, height, width + + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + # Use prepare_image_inputs to make a list of list of single images + + images_list = [] + for _ in range(self.batch_size): + images = [] + for _ in range(random.randint(1, self.max_num_images_per_sample)): + img = prepare_image_inputs( + batch_size=1, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + )[0] + images.append(img) + images_list.append(images) + return images_list + + +@require_torch +@require_vision +class PixtralImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = PixtralImageProcessor if is_vision_available() else None + + def setUp(self): 
+ super().setUp() + self.image_processor_tester = PixtralImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "patch_size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs_list = self.image_processor_tester.prepare_image_inputs() + for image_inputs in image_inputs_list: + for image in image_inputs: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").images + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0]) + self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape) + + # Test batched + batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").images + for encoded_images, images in zip(batch_encoded_images, image_inputs_list): + for encoded_image, image in zip(encoded_images, images): + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image) + self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs_list = self.image_processor_tester.prepare_image_inputs(numpify=True) + for image_inputs in image_inputs_list: + for image in image_inputs: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").images + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0]) + self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape) + + # Test batched + batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").images + for encoded_images, images in zip(batch_encoded_images, image_inputs_list): + for encoded_image, image in zip(encoded_images, images): + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image) + self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs_list = self.image_processor_tester.prepare_image_inputs(torchify=True) + for image_inputs in image_inputs_list: + for image in image_inputs: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").images + 
expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0]) + self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape) + + # Test batched + batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").images + for encoded_images, images in zip(batch_encoded_images, image_inputs_list): + for encoded_image, image in zip(encoded_images, images): + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image) + self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape) + + @unittest.skip( + reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" + ) # FIXME Amy + def test_call_numpy_4_channels(self): + pass diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py new file mode 100644 index 00000000000000..04e48cbf54a8cc --- /dev/null +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -0,0 +1,230 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import requests +import unittest + +import torch + +from transformers.testing_utils import require_vision +from transformers.utils import is_vision_available + + +if is_vision_available(): + from PIL import Image + from transformers import AutoProcessor, PixtralProcessor, PixtralImageProcessor, AutoTokenizer + + +@require_vision +class PixtralProcessorTest(unittest.TestCase): + processor_class = PixtralProcessor + + @classmethod + def setUpClass(cls): + cls.url_0 = "https://www.ilankelman.org/stopsigns/australia.jpg" + cls.image_0 = Image.open(requests.get(cls.url_0, stream=True).raw) + cls.url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg" + cls.image_1 = Image.open(requests.get(cls.url_1, stream=True).raw) + cls.url_2 = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" + cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw) + + def setUp(self): + super().setUp() + + # FIXME - just load the processor directly from the checkpoint + tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf") + image_processor = PixtralImageProcessor() + self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor) + + def test_chat_template(self): + expected_prompt = "USER: [IMG]\nWhat is shown in this image? 
ASSISTANT:" + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + formatted_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True) + self.assertEqual(expected_prompt, formatted_prompt) + + def test_image_token_filling(self): + # Important to check with non square image + image = torch.randint(0, 2, (3, 500, 316)) + expected_image_tokens = 1526 + image_token_index = 32000 + + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": "What is shown in this image?"}, + ], + }, + ] + inputs = self.processor( + text=[self.processor.apply_chat_template(messages)], + images=[image], + return_tensors="pt", + ) + image_tokens = (inputs["input_ids"] == image_token_index).sum().item() + self.assertEqual(expected_image_tokens, image_tokens) + + def test_processor_with_single_image(self): + prompt_string = "USER: [IMG]\nWhat's the content of the image? ASSISTANT:" + + # Make small for checking image token expansion + self.processor.image_processor.size = {"longest_edge": 30} + self.processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = self.processor(text=prompt_string, images=self.image_0, return_tensors="pt") + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 1) + self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_image["images"], list) + self.assertTrue(len(inputs_image["images"]) == 1) + self.assertIsInstance(inputs_image["images"][0], list) + self.assertTrue(len(inputs_image["images"][0]) == 1) + self.assertIsInstance(inputs_image["images"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_image["input_ids"] + self.assertEqual( + list(input_ids[0]), + # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:" + [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 319, 1799, 9047, 13566, 29901] + ) + # fmt: on + + # Test passing in a url + inputs_url = self.processor(text=prompt_string, images=self.url_0, return_tensors="pt") + self.assertIn("input_ids", inputs_url) + self.assertTrue(len(inputs_url["input_ids"]) == 1) + self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_url["images"], list) + self.assertTrue(len(inputs_url["images"]) == 1) + self.assertIsInstance(inputs_url["images"][0], list) + self.assertTrue(len(inputs_url["images"][0]) == 1) + self.assertIsInstance(inputs_url["images"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_url["input_ids"] + self.assertEqual( + list(input_ids[0]), + # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:" + [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 319, 1799, 9047, 13566, 29901] + ) + # fmt: on + + def test_processor_with_multiple_images_single_list(self): + prompt_string = "USER: [IMG][IMG]\nWhat's the difference between these two images? 
ASSISTANT:" + + # Make small for checking image token expansion + self.processor.image_processor.size = {"longest_edge": 30} + self.processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = self.processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt") + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 1) + self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_image["images"], list) + self.assertTrue(len(inputs_image["images"]) == 1) + self.assertIsInstance(inputs_image["images"][0], list) + self.assertTrue(len(inputs_image["images"][0]) == 2) + self.assertIsInstance(inputs_image["images"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_image["input_ids"] + self.assertEqual( + list(input_ids[0]), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] + [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + ) + # fmt: on + + # Test passing in a url + inputs_url = self.processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt") + self.assertIn("input_ids", inputs_url) + self.assertTrue(len(inputs_url["input_ids"]) == 1) + self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_url["images"], list) + self.assertTrue(len(inputs_url["images"]) == 1) + self.assertIsInstance(inputs_url["images"][0], list) + self.assertTrue(len(inputs_url["images"][0]) == 2) + self.assertIsInstance(inputs_url["images"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_url["input_ids"] + self.assertEqual( + list(input_ids[0]), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] + [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + ) + # fmt: on + + def test_processor_with_multiple_images_multiple_lists(self): + prompt_string = [ + "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:", + "USER: [IMG]\nWhat's the content of the image? 
ASSISTANT:", + ] + image_inputs = [[self.image_0, self.image_1], [self.image_2]] + + # Make small for checking image token expansion + self.processor.image_processor.size = {"longest_edge": 30} + self.processor.image_processor.patch_size = {"height": 2, "width": 2} + + # Test passing in an image + inputs_image = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True) + self.assertIn("input_ids", inputs_image) + self.assertTrue(len(inputs_image["input_ids"]) == 2) + self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_image["images"], list) + self.assertTrue(len(inputs_image["images"]) == 2) + self.assertIsInstance(inputs_image["images"][0], list) + self.assertTrue(len(inputs_image["images"][0]) == 2) + self.assertIsInstance(inputs_image["images"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_image["input_ids"] + self.assertEqual( + list(input_ids[0]), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] + [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + ) + # fmt: on + + # Test passing in a url + inputs_url = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True) + self.assertIn("input_ids", inputs_url) + self.assertTrue(len(inputs_url["input_ids"]) == 2) + self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) + self.assertIsInstance(inputs_url["images"], list) + self.assertTrue(len(inputs_url["images"]) == 2) + self.assertIsInstance(inputs_url["images"][0], list) + self.assertTrue(len(inputs_url["images"][0]) == 2) + self.assertIsInstance(inputs_url["images"][0][0], torch.Tensor) + + # fmt: off + input_ids = inputs_url["input_ids"] + self.assertEqual( + list(input_ids[0]), + # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? 
ASSISTANT:"] + [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + ) + # fmt: on From 908233f8dd1b3d45dc99f1f207af00b5dea8867b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 20:08:32 +0200 Subject: [PATCH 46/58] fixup post merge --- src/transformers/models/pixtral/processing_pixtral.py | 3 ++- tests/models/pixtral/test_image_processing_pixtral.py | 6 +----- tests/models/pixtral/test_processor_pixtral.py | 5 +++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 468e6421309481..b1359fad5d8c0d 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -27,12 +27,13 @@ logger = logging.get_logger(__name__) + # Copied from transformers.models.idefics2.processing_idefics2.is_url def is_url(val) -> bool: return isinstance(val, str) and val.startswith("http") -# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_url +# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url def is_image_or_image_url(elem): return is_url(elem) or is_valid_image(elem) diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py index 2c571c1b70ca1e..7086960e764038 100644 --- a/tests/models/pixtral/test_image_processing_pixtral.py +++ b/tests/models/pixtral/test_image_processing_pixtral.py @@ -18,8 +18,6 @@ import numpy as np -from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD -from transformers.models.llava_next.image_processing_llava_next import select_best_resolution from transformers.testing_utils import require_torch, require_vision from transformers.utils import is_torch_available, is_vision_available @@ -214,8 +212,6 @@ def test_call_pytorch(self): expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image) self.assertEqual(tuple(encoded_image.shape), expected_output_image_shape) - @unittest.skip( - reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet" - ) # FIXME Amy + @unittest.skip(reason="PixtralImageProcessor doesn't treat 4 channel PIL and numpy consistently yet") # FIXME Amy def test_call_numpy_4_channels(self): pass diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 04e48cbf54a8cc..3f5cdb6da03d79 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import requests import unittest +import requests import torch from transformers.testing_utils import require_vision @@ -22,7 +22,8 @@ if is_vision_available(): from PIL import Image - from transformers import AutoProcessor, PixtralProcessor, PixtralImageProcessor, AutoTokenizer + + from transformers import AutoTokenizer, PixtralImageProcessor, PixtralProcessor @require_vision From 8c0f8f61d457db2116370f94b90a87f8a870c98e Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 20:11:37 +0200 Subject: [PATCH 47/58] images -> pixel values --- .../models/pixtral/test_processor_pixtral.py | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 3f5cdb6da03d79..ad83500ae80bc3 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -43,7 +43,7 @@ def setUp(self): super().setUp() # FIXME - just load the processor directly from the checkpoint - tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf") + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/pixtral-12b") image_processor = PixtralImageProcessor() self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor) @@ -97,11 +97,11 @@ def test_processor_with_single_image(self): self.assertIn("input_ids", inputs_image) self.assertTrue(len(inputs_image["input_ids"]) == 1) self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) - self.assertIsInstance(inputs_image["images"], list) - self.assertTrue(len(inputs_image["images"]) == 1) - self.assertIsInstance(inputs_image["images"][0], list) - self.assertTrue(len(inputs_image["images"][0]) == 1) - self.assertIsInstance(inputs_image["images"][0][0], torch.Tensor) + self.assertIsInstance(inputs_image["pixel_values"], list) + self.assertTrue(len(inputs_image["pixel_values"]) == 1) + self.assertIsInstance(inputs_image["pixel_values"][0], list) + self.assertTrue(len(inputs_image["pixel_values"][0]) == 1) + self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor) # fmt: off input_ids = inputs_image["input_ids"] @@ -117,11 +117,11 @@ def test_processor_with_single_image(self): self.assertIn("input_ids", inputs_url) self.assertTrue(len(inputs_url["input_ids"]) == 1) self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) - self.assertIsInstance(inputs_url["images"], list) - self.assertTrue(len(inputs_url["images"]) == 1) - self.assertIsInstance(inputs_url["images"][0], list) - self.assertTrue(len(inputs_url["images"][0]) == 1) - self.assertIsInstance(inputs_url["images"][0][0], torch.Tensor) + self.assertIsInstance(inputs_url["pixel_values"], list) + self.assertTrue(len(inputs_url["pixel_values"]) == 1) + self.assertIsInstance(inputs_url["pixel_values"][0], list) + self.assertTrue(len(inputs_url["pixel_values"][0]) == 1) + self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor) # fmt: off input_ids = inputs_url["input_ids"] @@ -144,11 +144,11 @@ def test_processor_with_multiple_images_single_list(self): self.assertIn("input_ids", inputs_image) self.assertTrue(len(inputs_image["input_ids"]) == 1) self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) - self.assertIsInstance(inputs_image["images"], list) - self.assertTrue(len(inputs_image["images"]) == 1) - self.assertIsInstance(inputs_image["images"][0], list) - self.assertTrue(len(inputs_image["images"][0]) == 2) - 
self.assertIsInstance(inputs_image["images"][0][0], torch.Tensor) + self.assertIsInstance(inputs_image["pixel_values"], list) + self.assertTrue(len(inputs_image["pixel_values"]) == 1) + self.assertIsInstance(inputs_image["pixel_values"][0], list) + self.assertTrue(len(inputs_image["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor) # fmt: off input_ids = inputs_image["input_ids"] @@ -164,11 +164,11 @@ def test_processor_with_multiple_images_single_list(self): self.assertIn("input_ids", inputs_url) self.assertTrue(len(inputs_url["input_ids"]) == 1) self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) - self.assertIsInstance(inputs_url["images"], list) - self.assertTrue(len(inputs_url["images"]) == 1) - self.assertIsInstance(inputs_url["images"][0], list) - self.assertTrue(len(inputs_url["images"][0]) == 2) - self.assertIsInstance(inputs_url["images"][0][0], torch.Tensor) + self.assertIsInstance(inputs_url["pixel_values"], list) + self.assertTrue(len(inputs_url["pixel_values"]) == 1) + self.assertIsInstance(inputs_url["pixel_values"][0], list) + self.assertTrue(len(inputs_url["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor) # fmt: off input_ids = inputs_url["input_ids"] @@ -195,11 +195,11 @@ def test_processor_with_multiple_images_multiple_lists(self): self.assertIn("input_ids", inputs_image) self.assertTrue(len(inputs_image["input_ids"]) == 2) self.assertIsInstance(inputs_image["input_ids"], torch.Tensor) - self.assertIsInstance(inputs_image["images"], list) - self.assertTrue(len(inputs_image["images"]) == 2) - self.assertIsInstance(inputs_image["images"][0], list) - self.assertTrue(len(inputs_image["images"][0]) == 2) - self.assertIsInstance(inputs_image["images"][0][0], torch.Tensor) + self.assertIsInstance(inputs_image["pixel_values"], list) + self.assertTrue(len(inputs_image["pixel_values"]) == 2) + self.assertIsInstance(inputs_image["pixel_values"][0], list) + self.assertTrue(len(inputs_image["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_image["pixel_values"][0][0], torch.Tensor) # fmt: off input_ids = inputs_image["input_ids"] @@ -215,11 +215,11 @@ def test_processor_with_multiple_images_multiple_lists(self): self.assertIn("input_ids", inputs_url) self.assertTrue(len(inputs_url["input_ids"]) == 2) self.assertIsInstance(inputs_url["input_ids"], torch.Tensor) - self.assertIsInstance(inputs_url["images"], list) - self.assertTrue(len(inputs_url["images"]) == 2) - self.assertIsInstance(inputs_url["images"][0], list) - self.assertTrue(len(inputs_url["images"][0]) == 2) - self.assertIsInstance(inputs_url["images"][0][0], torch.Tensor) + self.assertIsInstance(inputs_url["pixel_values"], list) + self.assertTrue(len(inputs_url["pixel_values"]) == 2) + self.assertIsInstance(inputs_url["pixel_values"][0], list) + self.assertTrue(len(inputs_url["pixel_values"][0]) == 2) + self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor) # fmt: off input_ids = inputs_url["input_ids"] From b7d7760cc489aa936a84b713e4ec1ebcadddbe0b Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 20:12:34 +0200 Subject: [PATCH 48/58] oups sorry Mr docbuilder --- docs/source/en/model_doc/pixtral.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index 2596dd2fe51c8f..abe3645f064452 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -82,15 +82,6 @@ Each image 
captures a different scene, from a close-up of a dog to expansive nat [[autodoc]] PixtralVisionConfig -## PixtralImageProcessor - -[[autodoc]] PixtralImageProcessor - - forward - -## PixtralProcessor -[[autodoc]] PixtralProcessor - - forward - ## PixtralModel [[autodoc]] PixtralModel From be154e18301ccaa225ed936159654fbdbe96567d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 20:15:58 +0200 Subject: [PATCH 49/58] isort --- src/transformers/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 03c9ff3988defa..36775d8454ab8c 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -643,13 +643,13 @@ "models.phi": ["PhiConfig"], "models.phi3": ["Phi3Config"], "models.phobert": ["PhobertTokenizer"], - "models.pixtral": ["PixtralVisionConfig", "PixtralProcessor"], "models.pix2struct": [ "Pix2StructConfig", "Pix2StructProcessor", "Pix2StructTextConfig", "Pix2StructVisionConfig", ], + "models.pixtral": ["PixtralProcessor", "PixtralVisionConfig"], "models.plbart": ["PLBartConfig"], "models.poolformer": ["PoolFormerConfig"], "models.pop2piano": ["Pop2PianoConfig"], @@ -1199,8 +1199,8 @@ _import_structure["models.owlv2"].append("Owlv2ImageProcessor") _import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"]) _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"]) - _import_structure["models.pixtral"].append("PixtralImageProcessor") _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"]) + _import_structure["models.pixtral"].append("PixtralImageProcessor") _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) @@ -1361,7 +1361,6 @@ "AlignVisionModel", ] ) - _import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"]) _import_structure["models.altclip"].extend( [ "AltCLIPModel", @@ -2979,6 +2978,7 @@ "Pix2StructVisionModel", ] ) + _import_structure["models.pixtral"].extend(["PixtralModel", "PixtralPreTrainedModel"]) _import_structure["models.plbart"].extend( [ "PLBartForCausalLM", From bd721e26d04b5f7bb9d060b1039b026c34ed8ec6 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 20:16:57 +0200 Subject: [PATCH 50/58] fix --- .../models/pixtral/test_image_processing_pixtral.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py index 7086960e764038..3994201c065c45 100644 --- a/tests/models/pixtral/test_image_processing_pixtral.py +++ b/tests/models/pixtral/test_image_processing_pixtral.py @@ -159,12 +159,12 @@ def test_call_pil(self): self.assertIsInstance(image, Image.Image) # Test not batched input - encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").images + encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0]) self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape) # Test batched - batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").images + batch_encoded_images = 
image_processing(image_inputs_list, return_tensors="pt").pixel_values for encoded_images, images in zip(batch_encoded_images, image_inputs_list): for encoded_image, image in zip(encoded_images, images): expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image) @@ -180,12 +180,12 @@ def test_call_numpy(self): self.assertIsInstance(image, np.ndarray) # Test not batched input - encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").images + encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0]) self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape) # Test batched - batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").images + batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values for encoded_images, images in zip(batch_encoded_images, image_inputs_list): for encoded_image, image in zip(encoded_images, images): expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image) @@ -201,12 +201,12 @@ def test_call_pytorch(self): self.assertIsInstance(image, torch.Tensor) # Test not batched input - encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").images + encoded_images = image_processing(image_inputs_list[0][0], return_tensors="pt").pixel_values expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs_list[0][0]) self.assertEqual(tuple(encoded_images[0][0].shape), expected_output_image_shape) # Test batched - batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").images + batch_encoded_images = image_processing(image_inputs_list, return_tensors="pt").pixel_values for encoded_images, images in zip(batch_encoded_images, image_inputs_list): for encoded_image, image in zip(encoded_images, images): expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image) From 2eda3530ea25675fd0a7befffdc06c9de2accf92 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 21:22:22 +0200 Subject: [PATCH 51/58] fix processor tests --- .../models/pixtral/test_processor_pixtral.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index ad83500ae80bc3..aea06df1c350c6 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -47,6 +47,7 @@ def setUp(self): image_processor = PixtralImageProcessor() self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor) + @unittest.skip("No chat template was set for this model (yet)") def test_chat_template(self): expected_prompt = "USER: [IMG]\nWhat is shown in this image? 
ASSISTANT:" @@ -62,6 +63,7 @@ def test_chat_template(self): formatted_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True) self.assertEqual(expected_prompt, formatted_prompt) + @unittest.skip("No chat template was set for this model (yet)") def test_image_token_filling(self): # Important to check with non square image image = torch.randint(0, 2, (3, 500, 316)) @@ -106,9 +108,9 @@ def test_processor_with_single_image(self): # fmt: off input_ids = inputs_image["input_ids"] self.assertEqual( - list(input_ids[0]), + input_ids[0].tolist(), # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:" - [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 319, 1799, 9047, 13566, 29901] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] ) # fmt: on @@ -126,9 +128,9 @@ def test_processor_with_single_image(self): # fmt: off input_ids = inputs_url["input_ids"] self.assertEqual( - list(input_ids[0]), + input_ids[0].tolist(), # Equivalent to "USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the content of the image? ASSISTANT:" - [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 319, 1799, 9047, 13566, 29901] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 4701, 1307, 1278, 3937, 1063, 1349, 4290, 16002, 41150, 1058] ) # fmt: on @@ -153,7 +155,7 @@ def test_processor_with_multiple_images_single_list(self): # fmt: off input_ids = inputs_image["input_ids"] self.assertEqual( - list(input_ids[0]), + input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] ) @@ -169,11 +171,10 @@ def test_processor_with_multiple_images_single_list(self): self.assertIsInstance(inputs_url["pixel_values"][0], list) self.assertTrue(len(inputs_url["pixel_values"][0]) == 2) self.assertIsInstance(inputs_url["pixel_values"][0][0], torch.Tensor) - # fmt: off input_ids = inputs_url["input_ids"] self.assertEqual( - list(input_ids[0]), + input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? 
ASSISTANT:"] [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] ) @@ -184,6 +185,7 @@ def test_processor_with_multiple_images_multiple_lists(self): "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:", "USER: [IMG]\nWhat's the content of the image? ASSISTANT:", ] + self.processor.tokenizer.pad_token = "" image_inputs = [[self.image_0, self.image_1], [self.image_2]] # Make small for checking image token expansion @@ -204,9 +206,9 @@ def test_processor_with_multiple_images_multiple_lists(self): # fmt: off input_ids = inputs_image["input_ids"] self.assertEqual( - list(input_ids[0]), + input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] - [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] ) # fmt: on @@ -224,8 +226,9 @@ def test_processor_with_multiple_images_multiple_lists(self): # fmt: off input_ids = inputs_url["input_ids"] self.assertEqual( - list(input_ids[0]), + input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? 
ASSISTANT:"] - [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] ) # fmt: on + From 36e55259d011edc2359c1b000e8871d528b47edf Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 21:24:26 +0200 Subject: [PATCH 52/58] small fixes --- .../models/pixtral/convert_pixtral_weights_to_hf.py | 5 ++++- tests/models/pixtral/test_processor_pixtral.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 99cdc6dcd3b319..439704bb829d31 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -178,7 +178,10 @@ def convert_mistral_tokenizer(): vocab = specials_tokens tokenizer = PreTrainedTokenizerFast( - tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted() + tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(), + bos_token="", + unk_token="", + eos_token="", ) tokenizer.model_input_names = ["input_ids", "attention_mask"] diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index aea06df1c350c6..4f6897c50e8427 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -231,4 +231,3 @@ def test_processor_with_multiple_images_multiple_lists(self): [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] ) # fmt: on - From 7d4bb1991026e59556521b467dc0f63402f26172 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 21:25:29 +0200 Subject: [PATCH 53/58] nit --- .../models/pixtral/convert_pixtral_weights_to_hf.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py index 439704bb829d31..c4190082d99471 100644 --- a/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py +++ b/src/transformers/models/pixtral/convert_pixtral_weights_to_hf.py @@ -107,13 +107,6 @@ def __init__( self.additional_special_tokens = additional_special_tokens def extract_vocab_merges_from_model(self, vocab: str): - try: - pass - except Exception: - raise ValueError( - "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`." 
- ) - bpe_ranks = vocab byte_encoder = bytes_to_unicode() From df84fe7f76e29538d230248998dc446fa6d8d990 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 21:52:55 +0200 Subject: [PATCH 54/58] update --- tests/models/pixtral/test_processor_pixtral.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 4f6897c50e8427..b70cab1c074480 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -157,7 +157,7 @@ def test_processor_with_multiple_images_single_list(self): self.assertEqual( input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] - [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] ) # fmt: on @@ -176,7 +176,7 @@ def test_processor_with_multiple_images_single_list(self): self.assertEqual( input_ids[0].tolist(), # Equivalent to ["USER: [IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END][IMG][IMG][IMG_BREAK][IMG][IMG][IMG_END]\nWhat's the difference between these two images? ASSISTANT:"] - [1, 3148, 1001, 29901, 518, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 29933, 1525, 22311, 3816, 7833, 29954, 3816, 7833, 29954, 3816, 7833, 29954, 29918, 11794, 29962, 13, 5618, 29915, 29879, 278, 4328, 1546, 1438, 1023, 4558, 29973, 319, 1799, 9047, 13566, 29901] + [21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058] ) # fmt: on From 0da8b22033bfb585392d2abc9053eeaae2dae2c8 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Fri, 13 Sep 2024 22:03:31 +0200 Subject: [PATCH 55/58] last nits --- docs/source/en/model_doc/pixtral.md | 13 +++++++------ .../models/pixtral/image_processing_pixtral.py | 2 +- .../models/pixtral/processing_pixtral.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index abe3645f064452..ed321b75050983 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -36,17 +36,18 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) Here is an example of how to run it: ```python -from transformers import AutoModelForConditionalGeneration, AutoProcessor +from transformers import LlavaForConditionalGeneration, AutoProcessor +from PIL import Image model_id = "hf-internal-testing/pixtral-12b" -model = AutoModelForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) +model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) processor = AutoProcessor.from_pretrained(model_id) 
IMG_URLS = [ - Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), - Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), + "https://picsum.photos/id/237/400/300", + "https://picsum.photos/id/231/200/300", + "https://picsum.photos/id/27/500/500", + "https://picsum.photos/id/17/150/600", ] PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" diff --git a/src/transformers/models/pixtral/image_processing_pixtral.py b/src/transformers/models/pixtral/image_processing_pixtral.py index de3c644653bea7..c6d18420bec575 100644 --- a/src/transformers/models/pixtral/image_processing_pixtral.py +++ b/src/transformers/models/pixtral/image_processing_pixtral.py @@ -86,7 +86,7 @@ def to(self, *args, **kwargs) -> "BatchMixFeature": new_data[k] = [ element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] - if torch.is_floating_point(v): + elif torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) elif device is not None: diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index b1359fad5d8c0d..9362703c8aa6da 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -78,7 +78,7 @@ def to(self, *args, **kwargs) -> "BatchMixFeature": new_data[k] = [ element.to(*args, **kwargs) for sample in v for element in sample if is_torch_tensor(element) ] - if torch.is_floating_point(v): + elif torch.is_floating_point(v): # cast and send to device new_data[k] = v.to(*args, **kwargs) elif device is not None: From 8cfff1afda3e2416b99207b89f870abdc102f176 Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Sat, 14 Sep 2024 00:27:36 +0200 Subject: [PATCH 56/58] oups this was really breaking! 
--- docs/source/en/model_doc/pixtral.md | 2 +- src/transformers/models/pixtral/modeling_pixtral.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index ed321b75050983..8df2bf5af5f9ca 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -40,7 +40,7 @@ from transformers import LlavaForConditionalGeneration, AutoProcessor from PIL import Image model_id = "hf-internal-testing/pixtral-12b" -model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) +model = LlavaForConditionalGeneration.from_pretrained(model_id).to("cuda") processor = AutoProcessor.from_pretrained(model_id) IMG_URLS = [ diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 081a28e35cedfe..130d04015daee4 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -194,7 +194,7 @@ def forward( value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2) cos, sin = position_embeddings - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=0) attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale From 1effab222252f9539666d8c112f2f50e518134fa Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Sat, 14 Sep 2024 00:42:52 +0200 Subject: [PATCH 57/58] nits --- src/transformers/models/pixtral/modeling_pixtral.py | 12 ------------ tests/models/llava/test_modeling_llava.py | 2 +- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 130d04015daee4..0e10c78b7852af 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -483,18 +483,6 @@ def __init__(self, config): self.transformer = PixtralTransformer(config) self.patch_positional_embedding = PixtralRotaryEmbedding(config, device=self.device) - @property - def max_patches_per_side(self) -> int: - return self.args.image_size // self.args.patch_size - - @property - def device(self) -> torch.device: - return next(self.parameters()).device - - @property - def dtype(self) -> torch.device: - return next(self.parameters()).dtype - @add_start_docstrings_to_model_forward(PIXTRAL_INPUTS_DOCSTRING) def forward( self, diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 9dd64bcbd6cc0d..5c05480ffa6dbb 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -574,7 +574,7 @@ def test_expansion_in_processing(self): @require_bitsandbytes def test_pixtral(self): model_id = "hf-internal-testing/pixtral-12b" - model = LlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + model = LlavaForConditionalGeneration.from_pretrained(model_id) processor = AutoProcessor.from_pretrained(model_id) IMG_URLS = [ From f33fe1988a5bf858cc5f27ca2cf45e900a169e5d Mon Sep 17 00:00:00 2001 From: Arthur Zucker Date: Sat, 14 Sep 2024 09:44:00 +0200 Subject: [PATCH 58/58] is composition needs to be true --- src/transformers/models/llava/configuration_llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index f2338a7c5a5df7..3a4cb09855f0ec 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig): ```""" model_type = "llava" - is_composition = False + is_composition = True def __init__( self,
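
The final `is composition needs to be true` commit flips `is_composition` on `LlavaConfig`. Below is a minimal sketch of the effect, assuming the usual composite-config serialization behaviour of `PretrainedConfig`; the `mistral` text backbone and the output directory name are illustrative choices, not something the patch prescribes:

```python
from transformers import LlavaConfig, PixtralVisionConfig

# Build a Llava config around the new Pixtral vision tower added by this PR.
config = LlavaConfig(
    vision_config=PixtralVisionConfig(),      # from this patch series
    text_config={"model_type": "mistral"},    # illustrative text backbone
)

# With `is_composition = True`, the config is written out with its full sub-config
# dicts instead of being diffed against a default LlavaConfig (whose vision tower
# is CLIP), so the Pixtral vision settings should survive a save/load round trip.
config.save_pretrained("pixtral-llava-config")
reloaded = LlavaConfig.from_pretrained("pixtral-llava-config")
print(type(reloaded.vision_config).__name__)  # expected: PixtralVisionConfig
```

The `oups this was really breaking!` commit switches the vision attention to `apply_rotary_pos_emb(..., unsqueeze_dim=0)`. Here is a self-contained sketch of the broadcasting problem that change addresses, with made-up shapes (the real ones depend on the checkpoint configuration):

```python
import torch

# Assumed shapes: queries are (batch, heads, patches, head_dim), while the patch
# rotary embedding produces one (cos, sin) row per patch position.
batch, heads, patches, head_dim = 1, 16, 64, 64
q = torch.randn(batch, heads, patches, head_dim)
cos = torch.randn(patches, head_dim)

# unsqueeze_dim=0 -> (1, patches, head_dim): aligns with the patch axis of q.
assert (q * cos.unsqueeze(0)).shape == q.shape

# The default unsqueeze_dim=1 -> (patches, 1, head_dim): the patch axis would be
# matched against the head axis, which raises here (and would silently
# mis-broadcast whenever patches happened to equal heads).
try:
    _ = q * cos.unsqueeze(1)
except RuntimeError as err:
    print("broadcasting fails as expected:", err)
```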