46 changes: 24 additions & 22 deletions src/transformers/processing_utils.py
@@ -654,26 +654,18 @@ def to_dict(self) -> dict[str, Any]:
Returns:
`dict[str, Any]`: Dictionary of all the attributes that make up this processor instance.
"""
output = copy.deepcopy(self.__dict__)
# shallow copy to avoid deepcopy errors
output = self.__dict__.copy()

# Get the kwargs in `__init__`.
sig = inspect.signature(self.__init__)
# Only save the attributes that are presented in the kwargs of `__init__`.
# or in the attributes
attrs_to_save = list(sig.parameters) + self.__class__.attributes
# extra attributes to be kept
attrs_to_save += ["auto_map"]

if "tokenizer" in output:
del output["tokenizer"]
if "qformer_tokenizer" in output:
del output["qformer_tokenizer"]
if "protein_tokenizer" in output:
del output["protein_tokenizer"]
if "char_tokenizer" in output:
del output["char_tokenizer"]
if "chat_template" in output:
del output["chat_template"]
# Save only the attributes that are either passed as kwargs to `__init__`,
# defined in the class's `attributes` list, or included in "auto_map".
attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"]

# Drop attributes that are serialized separately: tokenizers and chat_template
for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]:
output.pop(key, None)

def save_public_processor_class(dictionary):
# make sure private name "_processor_class" is correctly
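As a quick illustration of the consolidated key-dropping above, here is a minimal, standalone sketch; the dictionary contents are made up, only the `pop` pattern mirrors the diff:

```python
# dict.pop(key, None) removes a key if present and is a no-op otherwise,
# so the repeated `if key in output: del output[key]` checks collapse into one loop.
output = {"image_processor": "...", "tokenizer": "...", "chat_template": "..."}  # made-up contents

for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]:
    output.pop(key, None)

print(output)  # {'image_processor': '...'}
```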
@@ -748,7 +740,7 @@ def __repr__(self):
attributes_repr = "\n".join(attributes_repr)
return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}"

def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
def save_pretrained(self, save_directory, save_jinja_files=False, push_to_hub: bool = False, **kwargs):
"""
Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it
can be reloaded using the [`~ProcessorMixin.from_pretrained`] method.
Expand Down Expand Up @@ -792,9 +784,12 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs):
if hasattr(attribute, "_set_processor_class"):
attribute._set_processor_class(self.__class__.__name__)

# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
if attribute_name == "tokenizer":
attribute.save_pretrained(save_directory)
# If the attribute can save itself (i.e. exposes `save_pretrained`), write it to its own subfolder to avoid overwriting
if hasattr(attribute, "save_pretrained"):
# use the attribute_name as the subfolder name so each attribute gets a unique location
attribute_save_dir = os.path.join(save_directory, attribute_name)
os.makedirs(attribute_save_dir, exist_ok=True)
attribute.save_pretrained(attribute_save_dir, save_jinja_files=save_jinja_files)
elif attribute._auto_class is not None:
custom_object_save(attribute, save_directory, config=attribute)
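For context, a minimal sketch of the per-attribute layout this branch writes; the attribute names and checkpoint are illustrative assumptions, not taken from the diff:

```python
import os
import tempfile

from transformers import BertTokenizerFast

# Hypothetical attribute object; anything exposing `save_pretrained` follows the same path.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

with tempfile.TemporaryDirectory() as save_directory:
    # Mirrors the new logic: each attribute is saved in a subfolder named after
    # the attribute, so two tokenizers no longer overwrite each other's files.
    for attribute_name, attribute in {"tokenizer1": tokenizer, "tokenizer2": tokenizer}.items():
        attribute_save_dir = os.path.join(save_directory, attribute_name)
        os.makedirs(attribute_save_dir, exist_ok=True)
        attribute.save_pretrained(attribute_save_dir)

    print(sorted(os.listdir(save_directory)))  # ['tokenizer1', 'tokenizer2']
```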

@@ -1425,7 +1420,14 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
else:
attribute_class = cls.get_possibly_dynamic_module(class_name)

args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
# updated loading path to handle attributes (e.g. multiple tokenizers) saved in their own subfolders
attribute_path = os.path.join(pretrained_model_name_or_path, attribute_name)
if os.path.isdir(attribute_path):
# load from the attribute-specific subfolder
args.append(attribute_class.from_pretrained(attribute_path, **kwargs))
else:
# otherwise fall back to the original flat layout for backward compatibility
args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))

return args

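A minimal sketch of the fallback that `_get_arguments_from_pretrained` now applies, with a dummy class standing in for the real attribute classes; the names here are illustrative only:

```python
import os


class DummyAttribute:
    """Stand-in for a tokenizer/feature-extractor class exposing `from_pretrained`."""

    @classmethod
    def from_pretrained(cls, path, **kwargs):
        print(f"loading from {path}")
        return cls()


def load_attribute(pretrained_model_name_or_path, attribute_name, attribute_class=DummyAttribute):
    # Prefer the per-attribute subfolder written by the new save logic...
    attribute_path = os.path.join(pretrained_model_name_or_path, attribute_name)
    if os.path.isdir(attribute_path):
        return attribute_class.from_pretrained(attribute_path)
    # ...and fall back to the flat, pre-existing layout for older checkpoints.
    return attribute_class.from_pretrained(pretrained_model_name_or_path)


load_attribute("/tmp/my_processor", "tokenizer1")  # hypothetical local checkpoint path
```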
51 changes: 51 additions & 0 deletions tests/test_processor_utils.py
@@ -0,0 +1,51 @@
import os
import tempfile

from transformers import (
ProcessorMixin,
BertTokenizerFast,
RobertaTokenizerFast,
)
from transformers.testing_utils import TestCasePlus


class TestProcessorSavePretrainedMultipleAttributes(TestCasePlus):
def test_processor_loads_separate_attributes(self):

class OtherProcessor(ProcessorMixin):
name = "other-processor"
attributes = ["tokenizer1", "tokenizer2"]

# Must be class names as strings
tokenizer1_class = "BertTokenizerFast"
tokenizer2_class = "RobertaTokenizerFast"

def __init__(self, tokenizer1, tokenizer2):
super().__init__(tokenizer1=tokenizer1, tokenizer2=tokenizer2)

# Initialize tokenizers
tokenizer1 = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer2 = RobertaTokenizerFast.from_pretrained("roberta-base")

processor = OtherProcessor(tokenizer1=tokenizer1, tokenizer2=tokenizer2)

with tempfile.TemporaryDirectory() as temp_dir:
# Save tokenizers in separate folders
tokenizer1_dir = os.path.join(temp_dir, "tokenizer1")
tokenizer1.save_pretrained(tokenizer1_dir)

tokenizer2_dir = os.path.join(temp_dir, "tokenizer2")
tokenizer2.save_pretrained(tokenizer2_dir)

# Save processor metadata
processor.save_pretrained(temp_dir, push_to_hub=False)

# Reload tokenizers
loaded_tokenizer1 = BertTokenizerFast.from_pretrained(tokenizer1_dir)
loaded_tokenizer2 = RobertaTokenizerFast.from_pretrained(tokenizer2_dir)

# Recreate processor with loaded tokenizers
new_processor = OtherProcessor(tokenizer1=loaded_tokenizer1,
tokenizer2=loaded_tokenizer2)

# Assert the two tokenizers are of different classes
assert new_processor.tokenizer1.__class__ != new_processor.tokenizer2.__class__
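As a possible follow-up (not part of this test), the full round-trip the PR is meant to enable could be asserted directly inside the temporary directory, assuming the per-attribute loading path resolves both subfolders:

```python
# Hedged sketch: exercises ProcessorMixin.from_pretrained end to end.
reloaded = OtherProcessor.from_pretrained(temp_dir)
assert isinstance(reloaded.tokenizer1, BertTokenizerFast)
assert isinstance(reloaded.tokenizer2, RobertaTokenizerFast)
```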