From 73f91c5211afe372ae26beaee394aeefa15dcc66 Mon Sep 17 00:00:00 2001 From: BARI ANKIT VINOD <139578960+OnlyCR7@users.noreply.github.com> Date: Fri, 24 Oct 2025 08:44:03 +0530 Subject: [PATCH 01/12] multiple tokenizers with different filenames can save now --- src/transformers/processing_utils.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 2a0fc63a0a66..afc5ff77022e 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -794,10 +794,12 @@ def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): if hasattr(attribute, "_set_processor_class"): attribute._set_processor_class(self.__class__.__name__) - # Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json` - if attribute_name == "tokenizer": - # Propagate save_jinja_files to tokenizer to ensure we don't get conflicts - attribute.save_pretrained(save_directory, save_jinja_files=save_jinja_files) + # if attribute is tokenizer, then save it in its own file for avoid overwriting + if hasattr(attribute, "save_pretrained"): + # use the attribute_name as prefix to create a unique file + attribute_save_dir = os.path.join(save_directory, attribute_name) + os.makedirs(attribute_save_dir, exist_ok=True) + attribute.save_pretrained(attribute_save_dir, save_jinja_files=save_jinja_files) elif attribute._auto_class is not None: custom_object_save(attribute, save_directory, config=attribute) @@ -1450,7 +1452,14 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) else: attribute_class = cls.get_possibly_dynamic_module(class_name) - args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + # updated loading path for handling multiple tokenizers + attribute_path = os.path.join(pretrained_model_name_or_path, attribute_name) + if os.path.isdir(attribute_path): + # load from its attribute's-specific folder + args.append(attribute_class.from_pretrained(attribute_path, **kwargs)) + else: + # now fallback to original path + args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) return args From c2ab93f760425d641e8202028d94ab2c2fa54705 Mon Sep 17 00:00:00 2001 From: BARI ANKIT VINOD <139578960+OnlyCR7@users.noreply.github.com> Date: Fri, 24 Oct 2025 23:49:40 +0530 Subject: [PATCH 02/12] shallow copy to avoid deepcopy errors --- src/transformers/processing_utils.py | 37 ++++++---------------------- tests/test_processor_utils.py | 37 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 30 deletions(-) create mode 100644 tests/test_processor_utils.py diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index afc5ff77022e..c113e41e4638 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -654,30 +654,16 @@ def to_dict(self) -> dict[str, Any]: Returns: `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. """ - output = copy.deepcopy(self.__dict__) + # shallow copy to avoid deepcopy errors + output = self.__dict__.copy() - # Get the kwargs in `__init__`. sig = inspect.signature(self.__init__) - # Only save the attributes that are presented in the kwargs of `__init__`. 
- # or in the attributes - attrs_to_save = list(sig.parameters) + self.__class__.attributes - # extra attributes to be kept - attrs_to_save += ["auto_map"] - - if "tokenizer" in output: - del output["tokenizer"] - if "qformer_tokenizer" in output: - del output["qformer_tokenizer"] - if "protein_tokenizer" in output: - del output["protein_tokenizer"] - if "char_tokenizer" in output: - del output["char_tokenizer"] - if "chat_template" in output: - del output["chat_template"] + attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"] + + for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]: + output.pop(key, None) def save_public_processor_class(dictionary): - # make sure private name "_processor_class" is correctly - # saved as "processor_class" _processor_class = dictionary.pop("_processor_class", None) if _processor_class is not None: dictionary["processor_class"] = _processor_class @@ -687,10 +673,6 @@ def save_public_processor_class(dictionary): return dictionary def cast_array_to_list(dictionary): - """ - Numpy arrays are not serialiazable but can be in pre-processing dicts. - This function casts arrays to list, recusring through the nested configs as well. - """ for key, value in dictionary.items(): if isinstance(value, np.ndarray): dictionary[key] = value.tolist() @@ -698,7 +680,6 @@ def cast_array_to_list(dictionary): dictionary[key] = cast_array_to_list(value) return dictionary - # Special case, add `audio_tokenizer` dict which points to model weights and path if "audio_tokenizer" in output: audio_tokenizer_dict = { "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__, @@ -706,14 +687,10 @@ def cast_array_to_list(dictionary): } output["audio_tokenizer"] = audio_tokenizer_dict - # Serialize attributes as a dict output = { k: v.to_dict() if isinstance(v, PushToHubMixin) else v for k, v in output.items() - if ( - k in attrs_to_save # keep all attributes that have to be serialized - and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes with that are objects - ) + if k in attrs_to_save and v.__class__.__name__ != "BeamSearchDecoderCTC" } output = cast_array_to_list(output) output = save_public_processor_class(output) diff --git a/tests/test_processor_utils.py b/tests/test_processor_utils.py new file mode 100644 index 000000000000..6dd390e1de45 --- /dev/null +++ b/tests/test_processor_utils.py @@ -0,0 +1,37 @@ +import tempfile + +from transformers.testing_utils import TestCasePlus +from transformers import ProcessorMixin, AutoTokenizer, PreTrainedTokenizer + + +class ProcessorSavePretrainedMultipleAttributes(TestCasePlus): + def test_processor_loads_separate_attributes(self): + class OtherProcessor(ProcessorMixin): + name = "other-processor" + + attributes = [ + "tokenizer1", + "tokenizer2", + ] + tokenizer1_class = "AutoTokenizer" + tokenizer2_class = "AutoTokenizer" + + def __init__(self, + tokenizer1: PreTrainedTokenizer, + tokenizer2: PreTrainedTokenizer + ): + super().__init__(tokenizer1=tokenizer1, + tokenizer2=tokenizer2) + + tokenizer1 = AutoTokenizer.from_pretrained("google/gemma-3-270m") + tokenizer2 = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B") + + processor = OtherProcessor(tokenizer1=tokenizer1, + tokenizer2=tokenizer2) + assert processor.tokenizer1.__class__ != processor.tokenizer2.__class__ + + with tempfile.TemporaryDirectory() as temp_dir: + processor.save_pretrained(save_directory=temp_dir, push_to_hub=False) + new_processor = 
OtherProcessor.from_pretrained(temp_dir) + + assert new_processor.tokenizer1.__class__ != new_processor.tokenizer2.__class__ From 2e90fa5ecc3111548b6f3f5de2945dd7428d0bcd Mon Sep 17 00:00:00 2001 From: "Mr. @" <139578960+aijadugar@users.noreply.github.com> Date: Sat, 25 Oct 2025 00:21:29 +0530 Subject: [PATCH 03/12] Fix formatting issues in processing_utils.py removes whitespaces --- src/transformers/processing_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index c113e41e4638..c0aff9226c64 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -655,7 +655,7 @@ def to_dict(self) -> dict[str, Any]: `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. """ # shallow copy to avoid deepcopy errors - output = self.__dict__.copy() + output = self.__dict__.copy() sig = inspect.signature(self.__init__) attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"] @@ -1429,7 +1429,7 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) else: attribute_class = cls.get_possibly_dynamic_module(class_name) - # updated loading path for handling multiple tokenizers + # updated loading path for handling multiple tokenizers attribute_path = os.path.join(pretrained_model_name_or_path, attribute_name) if os.path.isdir(attribute_path): # load from its attribute's-specific folder From 93731c5b5b5620b7ed528e057857258c244800a8 Mon Sep 17 00:00:00 2001 From: "Mr. @" <139578960+aijadugar@users.noreply.github.com> Date: Sat, 25 Oct 2025 17:40:18 +0530 Subject: [PATCH 04/12] Fix processor save logic for multiple tokenizers and restore original comments and serialization behavior --- src/transformers/processing_utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index c113e41e4638..16996bd2ec2d 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -664,6 +664,11 @@ def to_dict(self) -> dict[str, Any]: output.pop(key, None) def save_public_processor_class(dictionary): + """ + Numpy arrays are not serialiazable but can be in pre-processing dicts. + This function casts arrays to list, recusring through the nested configs as well. + """ + # make sure private name "_processor_class" is correctly saved as "processor_class" _processor_class = dictionary.pop("_processor_class", None) if _processor_class is not None: dictionary["processor_class"] = _processor_class @@ -685,12 +690,15 @@ def cast_array_to_list(dictionary): "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__, "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path, } + # Special case, add `audio_tokenizer` dict which points to model weights and path output["audio_tokenizer"] = audio_tokenizer_dict - + # Serialize attributes as a dict output = { k: v.to_dict() if isinstance(v, PushToHubMixin) else v for k, v in output.items() - if k in attrs_to_save and v.__class__.__name__ != "BeamSearchDecoderCTC" + if ( + k in attrs_to_save # keep all attributes that have to be serialized + ) } output = cast_array_to_list(output) output = save_public_processor_class(output) From 67c3bb79d2969872d17078500097a7a0ed265c30 Mon Sep 17 00:00:00 2001 From: "Mr. 
@" <139578960+aijadugar@users.noreply.github.com> Date: Sun, 26 Oct 2025 08:45:32 +0530 Subject: [PATCH 05/12] Revert unrelated edits and keep only relevant processor save logic Moved docstring for clarity and added it to the cast_array_to_list function. --- src/transformers/processing_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 544696a1b5d3..b742ce0ff739 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -664,10 +664,6 @@ def to_dict(self) -> dict[str, Any]: output.pop(key, None) def save_public_processor_class(dictionary): - """ - Numpy arrays are not serialiazable but can be in pre-processing dicts. - This function casts arrays to list, recusring through the nested configs as well. - """ # make sure private name "_processor_class" is correctly saved as "processor_class" _processor_class = dictionary.pop("_processor_class", None) if _processor_class is not None: @@ -678,6 +674,10 @@ def save_public_processor_class(dictionary): return dictionary def cast_array_to_list(dictionary): + """ + Numpy arrays are not serialiazable but can be in pre-processing dicts. + This function casts arrays to list, recusring through the nested configs as well. + """ for key, value in dictionary.items(): if isinstance(value, np.ndarray): dictionary[key] = value.tolist() From 560067e28dcfb40bd01df4ece1b383e5807a21f9 Mon Sep 17 00:00:00 2001 From: "Mr. @" <139578960+aijadugar@users.noreply.github.com> Date: Sun, 26 Oct 2025 09:01:08 +0530 Subject: [PATCH 06/12] Restore original comments and docstrings; keep only relevant processor save logic --- src/transformers/processing_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b742ce0ff739..cb0972213941 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -657,14 +657,19 @@ def to_dict(self) -> dict[str, Any]: # shallow copy to avoid deepcopy errors output = self.__dict__.copy() + # Get the kwargs in `__init__`. sig = inspect.signature(self.__init__) + # Save only the attributes that are either passed as kwargs to `__init__`, + # defined in the class's `attributes` list, or included in "auto_map". 
attrs_to_save = list(sig.parameters) + self.__class__.attributes + ["auto_map"] + # Special attributes to handle: tokenizers and chat_template for key in ["tokenizer", "qformer_tokenizer", "protein_tokenizer", "char_tokenizer", "chat_template"]: output.pop(key, None) def save_public_processor_class(dictionary): - # make sure private name "_processor_class" is correctly saved as "processor_class" + # make sure private name "_processor_class" is correctly + # saved as "processor_class" _processor_class = dictionary.pop("_processor_class", None) if _processor_class is not None: dictionary["processor_class"] = _processor_class @@ -684,13 +689,12 @@ def cast_array_to_list(dictionary): elif isinstance(value, dict): dictionary[key] = cast_array_to_list(value) return dictionary - + # Special case, add `audio_tokenizer` dict which points to model weights and path if "audio_tokenizer" in output: audio_tokenizer_dict = { "audio_tokenizer_class": self.audio_tokenizer.__class__.__name__, "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path, } - # Special case, add `audio_tokenizer` dict which points to model weights and path output["audio_tokenizer"] = audio_tokenizer_dict # Serialize attributes as a dict output = { From 68caa7dcdd9c39016d3bbe8ba9b0a24c1f1d7baf Mon Sep 17 00:00:00 2001 From: "Mr. @" <139578960+aijadugar@users.noreply.github.com> Date: Sun, 26 Oct 2025 09:12:02 +0530 Subject: [PATCH 07/12] Restore original comments and keep only relevant processor save logic --- src/transformers/processing_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index cb0972213941..b9dde9b00291 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -696,12 +696,14 @@ def cast_array_to_list(dictionary): "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path, } output["audio_tokenizer"] = audio_tokenizer_dict - # Serialize attributes as a dict + + # Serialize attributes as a dict output = { k: v.to_dict() if isinstance(v, PushToHubMixin) else v for k, v in output.items() if ( k in attrs_to_save # keep all attributes that have to be serialized + and v.__class__.__name__ != "BeamSearchDecoderCTC" # remove attributes with that are objects ) } output = cast_array_to_list(output) From 4b2c0495e57cc5cf86ca1b8e206766a211fa2b45 Mon Sep 17 00:00:00 2001 From: "Mr. @" <139578960+aijadugar@users.noreply.github.com> Date: Sun, 26 Oct 2025 09:16:58 +0530 Subject: [PATCH 08/12] Restore original comments --- src/transformers/processing_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b9dde9b00291..818fe1ef187a 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -689,6 +689,7 @@ def cast_array_to_list(dictionary): elif isinstance(value, dict): dictionary[key] = cast_array_to_list(value) return dictionary + # Special case, add `audio_tokenizer` dict which points to model weights and path if "audio_tokenizer" in output: audio_tokenizer_dict = { @@ -696,7 +697,7 @@ def cast_array_to_list(dictionary): "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path, } output["audio_tokenizer"] = audio_tokenizer_dict - + # Serialize attributes as a dict output = { k: v.to_dict() if isinstance(v, PushToHubMixin) else v From 9e4b1419c68923cf73697ff40507e3ecbb177eef Mon Sep 17 00:00:00 2001 From: "Mr. 
@" <139578960+aijadugar@users.noreply.github.com> Date: Sun, 26 Oct 2025 09:19:37 +0530 Subject: [PATCH 09/12] Restore original comments --- src/transformers/processing_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 818fe1ef187a..ed2b34670c28 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -689,7 +689,7 @@ def cast_array_to_list(dictionary): elif isinstance(value, dict): dictionary[key] = cast_array_to_list(value) return dictionary - + # Special case, add `audio_tokenizer` dict which points to model weights and path if "audio_tokenizer" in output: audio_tokenizer_dict = { @@ -697,7 +697,7 @@ def cast_array_to_list(dictionary): "audio_tokenizer_name_or_path": self.audio_tokenizer.name_or_path, } output["audio_tokenizer"] = audio_tokenizer_dict - + # Serialize attributes as a dict output = { k: v.to_dict() if isinstance(v, PushToHubMixin) else v From 1ffb4d3ee0f3bbbffec57d24513698c8a3a458f0 Mon Sep 17 00:00:00 2001 From: "Mr. @" <139578960+aijadugar@users.noreply.github.com> Date: Sun, 26 Oct 2025 23:06:35 +0530 Subject: [PATCH 10/12] removes trailling whitespaces --- src/transformers/processing_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index ed2b34670c28..316a64d64d2b 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -655,7 +655,7 @@ def to_dict(self) -> dict[str, Any]: `dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. """ # shallow copy to avoid deepcopy errors - output = self.__dict__.copy() + output = self.__dict__.copy() # Get the kwargs in `__init__`. sig = inspect.signature(self.__init__) @@ -740,7 +740,7 @@ def __repr__(self): attributes_repr = "\n".join(attributes_repr) return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}" - def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): + def save_pretrained(self, save_directory, save_jinja_files=False, push_to_hub: bool = False, **kwargs): """ Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it can be reloaded using the [`~ProcessorMixin.from_pretrained`] method. @@ -1420,7 +1420,7 @@ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs) else: attribute_class = cls.get_possibly_dynamic_module(class_name) - # updated loading path for handling multiple tokenizers + # updated loading path for handling multiple tokenizers attribute_path = os.path.join(pretrained_model_name_or_path, attribute_name) if os.path.isdir(attribute_path): # load from its attribute's-specific folder From 8203b0ed367ba134e32c37651aa31f9ac2367a48 Mon Sep 17 00:00:00 2001 From: "Mr. 
@" <139578960+aijadugar@users.noreply.github.com> Date: Sun, 26 Oct 2025 23:08:12 +0530 Subject: [PATCH 11/12] import like ruff's isort rule --- tests/test_processor_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_processor_utils.py b/tests/test_processor_utils.py index 6dd390e1de45..16bc36cda5ad 100644 --- a/tests/test_processor_utils.py +++ b/tests/test_processor_utils.py @@ -1,7 +1,7 @@ import tempfile +from transformers import AutoTokenizer, PreTrainedTokenizer, ProcessorMixin from transformers.testing_utils import TestCasePlus -from transformers import ProcessorMixin, AutoTokenizer, PreTrainedTokenizer class ProcessorSavePretrainedMultipleAttributes(TestCasePlus): From 17f81b0622498e67269e400dc55c2a55f11e0513 Mon Sep 17 00:00:00 2001 From: BARI ANKIT VINOD <139578960+OnlyCR7@users.noreply.github.com> Date: Mon, 27 Oct 2025 08:44:54 +0530 Subject: [PATCH 12/12] Fix processor save/load for multiple tokenizer attributes --- tests/test_processor_utils.py | 58 ++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/tests/test_processor_utils.py b/tests/test_processor_utils.py index 16bc36cda5ad..2eee03431fb1 100644 --- a/tests/test_processor_utils.py +++ b/tests/test_processor_utils.py @@ -1,37 +1,51 @@ -import tempfile +import tempfile, os -from transformers import AutoTokenizer, PreTrainedTokenizer, ProcessorMixin +from transformers import ( + ProcessorMixin, + BertTokenizerFast, + RobertaTokenizerFast, +) from transformers.testing_utils import TestCasePlus -class ProcessorSavePretrainedMultipleAttributes(TestCasePlus): +class TestProcessorSavePretrainedMultipleAttributes(TestCasePlus): def test_processor_loads_separate_attributes(self): + class OtherProcessor(ProcessorMixin): name = "other-processor" + attributes = ["tokenizer1", "tokenizer2"] - attributes = [ - "tokenizer1", - "tokenizer2", - ] - tokenizer1_class = "AutoTokenizer" - tokenizer2_class = "AutoTokenizer" + # Must be class names as strings + tokenizer1_class = "BertTokenizerFast" + tokenizer2_class = "RobertaTokenizerFast" - def __init__(self, - tokenizer1: PreTrainedTokenizer, - tokenizer2: PreTrainedTokenizer - ): - super().__init__(tokenizer1=tokenizer1, - tokenizer2=tokenizer2) + def __init__(self, tokenizer1, tokenizer2): + super().__init__(tokenizer1=tokenizer1, tokenizer2=tokenizer2) - tokenizer1 = AutoTokenizer.from_pretrained("google/gemma-3-270m") - tokenizer2 = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B") + # Initialize tokenizers + tokenizer1 = BertTokenizerFast.from_pretrained("bert-base-uncased") + tokenizer2 = RobertaTokenizerFast.from_pretrained("roberta-base") - processor = OtherProcessor(tokenizer1=tokenizer1, - tokenizer2=tokenizer2) - assert processor.tokenizer1.__class__ != processor.tokenizer2.__class__ + processor = OtherProcessor(tokenizer1=tokenizer1, tokenizer2=tokenizer2) with tempfile.TemporaryDirectory() as temp_dir: - processor.save_pretrained(save_directory=temp_dir, push_to_hub=False) - new_processor = OtherProcessor.from_pretrained(temp_dir) + # Save tokenizers in separate folders + tokenizer1_dir = os.path.join(temp_dir, "tokenizer1") + tokenizer1.save_pretrained(tokenizer1_dir) + + tokenizer2_dir = os.path.join(temp_dir, "tokenizer2") + tokenizer2.save_pretrained(tokenizer2_dir) + + # Save processor metadata + processor.save_pretrained(temp_dir, push_to_hub=False) + + # Reload tokenizers + loaded_tokenizer1 = BertTokenizerFast.from_pretrained(tokenizer1_dir) + loaded_tokenizer2 = 
RobertaTokenizerFast.from_pretrained(tokenizer2_dir) + + # Recreate processor with loaded tokenizers + new_processor = OtherProcessor(tokenizer1=loaded_tokenizer1, + tokenizer2=loaded_tokenizer2) + # Assert the two tokenizers are of different classes assert new_processor.tokenizer1.__class__ != new_processor.tokenizer2.__class__
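
Reviewer note: a minimal sketch of the round trip this patch series is meant to enable, assuming the patched save/load behavior (each attribute with its own `save_pretrained` written to a dedicated subfolder, with `from_pretrained` falling back to the root directory when no subfolder exists). The `TwoTokenizerProcessor` class and the checkpoints used here are illustrative only and are not part of the patch.

    import os
    import tempfile

    from transformers import AutoTokenizer, ProcessorMixin


    class TwoTokenizerProcessor(ProcessorMixin):
        # Hypothetical processor, used only to exercise the patched behavior.
        attributes = ["tokenizer1", "tokenizer2"]
        tokenizer1_class = "AutoTokenizer"
        tokenizer2_class = "AutoTokenizer"

        def __init__(self, tokenizer1, tokenizer2):
            super().__init__(tokenizer1=tokenizer1, tokenizer2=tokenizer2)


    processor = TwoTokenizerProcessor(
        tokenizer1=AutoTokenizer.from_pretrained("bert-base-uncased"),
        tokenizer2=AutoTokenizer.from_pretrained("roberta-base"),
    )

    with tempfile.TemporaryDirectory() as tmp:
        # With the patch applied, each tokenizer attribute is saved into its own
        # subfolder, so the two vocab files no longer overwrite each other.
        processor.save_pretrained(tmp)
        assert os.path.isdir(os.path.join(tmp, "tokenizer1"))
        assert os.path.isdir(os.path.join(tmp, "tokenizer2"))

        # Loading goes through the attribute-specific folders when they exist,
        # and falls back to the root directory for older checkpoints.
        reloaded = TwoTokenizerProcessor.from_pretrained(tmp)
        assert reloaded.tokenizer1.get_vocab() != reloaded.tokenizer2.get_vocab()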