diff --git a/src/adapters/models/bart/adapter_model.py b/src/adapters/models/bart/adapter_model.py
index 4e07fc5f10..34a5615644 100644
--- a/src/adapters/models/bart/adapter_model.py
+++ b/src/adapters/models/bart/adapter_model.py
@@ -1,5 +1,6 @@
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.bart.modeling_bart import (
     BART_INPUTS_DOCSTRING,
     BART_START_DOCSTRING,
@@ -18,7 +19,9 @@
 @add_start_docstrings(
     "BART Model with the option to add multiple flexible prediction heads on top.", BART_START_DOCSTRING
 )
-class BartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, BartPreTrainedModel):
+class BartAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, BartPreTrainedModel, GenerationMixin
+):
     _tied_weights_keys = [
         "encoder.embed_tokens.weight",
         "decoder.embed_tokens.weight",
diff --git a/src/adapters/models/bert/adapter_model.py b/src/adapters/models/bert/adapter_model.py
index a15f3e4327..3be78bd5bd 100644
--- a/src/adapters/models/bert/adapter_model.py
+++ b/src/adapters/models/bert/adapter_model.py
@@ -1,3 +1,4 @@
+from transformers.generation import GenerationMixin
 from transformers.models.bert.modeling_bert import (
     BERT_INPUTS_DOCSTRING,
     BERT_START_DOCSTRING,
@@ -16,7 +17,9 @@
     """Bert Model transformer with the option to add multiple flexible heads on top.""",
     BERT_START_DOCSTRING,
 )
-class BertAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, BertPreTrainedModel):
+class BertAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, BertPreTrainedModel, GenerationMixin
+):
 
     head_types = [
         "classification",
diff --git a/src/adapters/models/bert_generation/adapter_model.py b/src/adapters/models/bert_generation/adapter_model.py
index d3822e24a7..0bbe5ad51f 100644
--- a/src/adapters/models/bert_generation/adapter_model.py
+++ b/src/adapters/models/bert_generation/adapter_model.py
@@ -1,3 +1,4 @@
+from transformers.generation import GenerationMixin
 from transformers.models.bert_generation.modeling_bert_generation import (
     BERT_GENERATION_INPUTS_DOCSTRING,
     BERT_GENERATION_START_DOCSTRING,
@@ -17,7 +18,7 @@
     BERT_GENERATION_START_DOCSTRING,
 )
 class BertGenerationAdapterModel(
-    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, BertGenerationPreTrainedModel
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, BertGenerationPreTrainedModel, GenerationMixin
 ):
     _keys_to_ignore_on_load_unexpected = [r"lm_head.bias"]
 
diff --git a/src/adapters/models/distilbert/adapter_model.py b/src/adapters/models/distilbert/adapter_model.py
index 3f38c893ca..d7b09dfe1e 100644
--- a/src/adapters/models/distilbert/adapter_model.py
+++ b/src/adapters/models/distilbert/adapter_model.py
@@ -1,5 +1,6 @@
 import torch.nn as nn
 
+from transformers.generation import GenerationMixin
 from transformers.models.distilbert.modeling_distilbert import (
     DISTILBERT_INPUTS_DOCSTRING,
     DISTILBERT_START_DOCSTRING,
@@ -18,7 +19,7 @@
     DISTILBERT_START_DOCSTRING,
 )
 class DistilBertAdapterModel(
-    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, DistilBertPreTrainedModel
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, DistilBertPreTrainedModel, GenerationMixin
 ):
     head_types = [
         "classification",
diff --git a/src/adapters/models/electra/adapter_model.py b/src/adapters/models/electra/adapter_model.py
index 57e20fadbe..83bc8f9184 100644
--- a/src/adapters/models/electra/adapter_model.py
+++ b/src/adapters/models/electra/adapter_model.py
@@ -1,3 +1,4 @@
+from transformers.generation import GenerationMixin
 from transformers.models.electra.modeling_electra import (
     ELECTRA_INPUTS_DOCSTRING,
     ELECTRA_START_DOCSTRING,
@@ -16,7 +17,9 @@
     """Electra Model transformer with the option to add multiple flexible heads on top.""",
     ELECTRA_START_DOCSTRING,
 )
-class ElectraAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, ElectraPreTrainedModel):
+class ElectraAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, ElectraPreTrainedModel, GenerationMixin
+):
 
     head_types = [
         "classification",
diff --git a/src/adapters/models/gpt2/adapter_model.py b/src/adapters/models/gpt2/adapter_model.py
index 2cfbdc8821..c6b96d1204 100644
--- a/src/adapters/models/gpt2/adapter_model.py
+++ b/src/adapters/models/gpt2/adapter_model.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.gpt2.modeling_gpt2 import GPT2_START_DOCSTRING, GPT2Model, GPT2PreTrainedModel
 from transformers.utils import add_start_docstrings
 
@@ -25,7 +26,9 @@
 """,
     GPT2_START_DOCSTRING,
 )
-class GPT2AdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, GPT2PreTrainedModel):
+class GPT2AdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, GPT2PreTrainedModel, GenerationMixin
+):
     head_types = [
         "classification",
         "multilabel_classification",
diff --git a/src/adapters/models/gptj/adapter_model.py b/src/adapters/models/gptj/adapter_model.py
index f029f840d6..c075aeac1a 100644
--- a/src/adapters/models/gptj/adapter_model.py
+++ b/src/adapters/models/gptj/adapter_model.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.gptj.modeling_gptj import GPTJ_START_DOCSTRING, GPTJModel, GPTJPreTrainedModel
 from transformers.utils import add_start_docstrings
 
@@ -25,7 +26,9 @@
 """,
     GPTJ_START_DOCSTRING,
 )
-class GPTJAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, GPTJPreTrainedModel):
+class GPTJAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, GPTJPreTrainedModel, GenerationMixin
+):
     head_types = [
         "classification",
         "multilabel_classification",
diff --git a/src/adapters/models/llama/adapter_model.py b/src/adapters/models/llama/adapter_model.py
index c3116fbe14..39d93ad9b5 100644
--- a/src/adapters/models/llama/adapter_model.py
+++ b/src/adapters/models/llama/adapter_model.py
@@ -3,6 +3,7 @@
 
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.llama.modeling_llama import LLAMA_START_DOCSTRING, LlamaModel, LlamaPreTrainedModel
 from transformers.utils import add_start_docstrings
 
@@ -26,7 +27,9 @@
 """,
     LLAMA_START_DOCSTRING,
 )
-class LlamaAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, LlamaPreTrainedModel):
+class LlamaAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, LlamaPreTrainedModel, GenerationMixin
+):
     head_types = [
         "classification",
         "multilabel_classification",
diff --git a/src/adapters/models/mbart/adapter_model.py b/src/adapters/models/mbart/adapter_model.py
index ebbfb45efa..06e31650fa 100644
--- a/src/adapters/models/mbart/adapter_model.py
+++ b/src/adapters/models/mbart/adapter_model.py
@@ -1,5 +1,6 @@
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.mbart.modeling_mbart import (
     MBART_INPUTS_DOCSTRING,
     MBART_START_DOCSTRING,
@@ -19,7 +20,9 @@
 @add_start_docstrings(
     "MBART Model with the option to add multiple flexible prediction heads on top.", MBART_START_DOCSTRING
 )
-class MBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, MBartPreTrainedModel):
+class MBartAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, MBartPreTrainedModel, GenerationMixin
+):
     _tied_weights_keys = [
         "encoder.embed_tokens.weight",
         "decoder.embed_tokens.weight",
diff --git a/src/adapters/models/mistral/adapter_model.py b/src/adapters/models/mistral/adapter_model.py
index 1909fccdec..595cace188 100644
--- a/src/adapters/models/mistral/adapter_model.py
+++ b/src/adapters/models/mistral/adapter_model.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.mistral.modeling_mistral import MISTRAL_START_DOCSTRING, MistralModel, MistralPreTrainedModel
 from transformers.utils import add_start_docstrings
 
@@ -25,7 +26,9 @@
 """,
     MISTRAL_START_DOCSTRING,
 )
-class MistralAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, MistralPreTrainedModel):
+class MistralAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, MistralPreTrainedModel, GenerationMixin
+):
     head_types = [
         "classification",
         "multilabel_classification",
diff --git a/src/adapters/models/mt5/adapter_model.py b/src/adapters/models/mt5/adapter_model.py
index 418b47b13f..705d0852ef 100644
--- a/src/adapters/models/mt5/adapter_model.py
+++ b/src/adapters/models/mt5/adapter_model.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.mt5.modeling_mt5 import (
     MT5_INPUTS_DOCSTRING,
     MT5_START_DOCSTRING,
@@ -22,7 +23,9 @@
 @add_start_docstrings(
     "MT5 Model with the option to add multiple flexible prediction heads on top.", MT5_START_DOCSTRING
 )
-class MT5AdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, MT5PreTrainedModel):
+class MT5AdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, MT5PreTrainedModel, GenerationMixin
+):
     _tied_weights_keys = [
         "encoder.embed_tokens.weight",
         "decoder.embed_tokens.weight",
diff --git a/src/adapters/models/plbart/adapter_model.py b/src/adapters/models/plbart/adapter_model.py
index 2aaaf0b9fa..0475fd077d 100644
--- a/src/adapters/models/plbart/adapter_model.py
+++ b/src/adapters/models/plbart/adapter_model.py
@@ -1,5 +1,6 @@
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.plbart.modeling_plbart import (
     PLBART_INPUTS_DOCSTRING,
     PLBART_START_DOCSTRING,
@@ -18,7 +19,9 @@
 @add_start_docstrings(
     "PLBART Model with the option to add multiple flexible prediction heads on top.", PLBART_START_DOCSTRING
 )
-class PLBartAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPreTrainedModel):
+class PLBartAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, PLBartPreTrainedModel, GenerationMixin
+):
     _tied_weights_keys = [
         "encoder.embed_tokens.weight",
         "decoder.embed_tokens.weight",
diff --git a/src/adapters/models/roberta/adapter_model.py b/src/adapters/models/roberta/adapter_model.py
index ab9411ef7d..5a9af959d8 100644
--- a/src/adapters/models/roberta/adapter_model.py
+++ b/src/adapters/models/roberta/adapter_model.py
@@ -1,3 +1,4 @@
+from transformers.generation import GenerationMixin
 from transformers.models.roberta.modeling_roberta import (
     ROBERTA_INPUTS_DOCSTRING,
     ROBERTA_START_DOCSTRING,
@@ -16,7 +17,9 @@
     """Roberta Model transformer with the option to add multiple flexible heads on top.""",
     ROBERTA_START_DOCSTRING,
 )
-class RobertaAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, RobertaPreTrainedModel):
+class RobertaAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, RobertaPreTrainedModel, GenerationMixin
+):
     head_types = [
         "classification",
         "multilabel_classification",
diff --git a/src/adapters/models/t5/adapter_model.py b/src/adapters/models/t5/adapter_model.py
index 5aa7aff4fd..5f2b324380 100644
--- a/src/adapters/models/t5/adapter_model.py
+++ b/src/adapters/models/t5/adapter_model.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.t5.modeling_t5 import T5_INPUTS_DOCSTRING, T5_START_DOCSTRING, T5Model, T5PreTrainedModel
 from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward
 
@@ -15,7 +16,9 @@
 
 
 @add_start_docstrings("T5 Model with the option to add multiple flexible prediction heads on top.", T5_START_DOCSTRING)
-class T5AdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, T5PreTrainedModel):
+class T5AdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, T5PreTrainedModel, GenerationMixin
+):
     _tied_weights_keys = [
         "encoder.embed_tokens.weight",
         "decoder.embed_tokens.weight",
diff --git a/src/adapters/models/whisper/adapter_model.py b/src/adapters/models/whisper/adapter_model.py
index d76ae610c5..4bcc026927 100644
--- a/src/adapters/models/whisper/adapter_model.py
+++ b/src/adapters/models/whisper/adapter_model.py
@@ -1,6 +1,7 @@
 import torch
 
 from transformers import EncoderDecoderCache, StaticCache
+from transformers.generation import GenerationMixin
 from transformers.models.whisper.modeling_whisper import (
     WHISPER_INPUTS_DOCSTRING,
     WHISPER_START_DOCSTRING,
@@ -19,7 +20,9 @@
 @add_start_docstrings(
     "WHISPER Model with the option to add multiple flexible prediction heads on top.", WHISPER_START_DOCSTRING
 )
-class WhisperAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, WhisperPreTrainedModel):
+class WhisperAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, WhisperPreTrainedModel, GenerationMixin
+):
     _tied_weights_keys = []
     head_types = ["seq2seq_lm"]
diff --git a/src/adapters/models/xlm_roberta/adapter_model.py b/src/adapters/models/xlm_roberta/adapter_model.py
index 1cab4aaac5..559202d52d 100644
--- a/src/adapters/models/xlm_roberta/adapter_model.py
+++ b/src/adapters/models/xlm_roberta/adapter_model.py
@@ -1,3 +1,4 @@
+from transformers.generation import GenerationMixin
 from transformers.models.xlm_roberta.modeling_xlm_roberta import (
     XLM_ROBERTA_INPUTS_DOCSTRING,
     XLM_ROBERTA_START_DOCSTRING,
@@ -17,7 +18,7 @@
     XLM_ROBERTA_START_DOCSTRING,
 )
 class XLMRobertaAdapterModel(
-    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, XLMRobertaPreTrainedModel
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, XLMRobertaPreTrainedModel, GenerationMixin
 ):
 
     head_types = [
diff --git a/src/adapters/models/xmod/adapter_model.py b/src/adapters/models/xmod/adapter_model.py
index a179fc6be8..e81f49dee0 100644
--- a/src/adapters/models/xmod/adapter_model.py
+++ b/src/adapters/models/xmod/adapter_model.py
@@ -2,6 +2,7 @@
 
 import torch
 
+from transformers.generation import GenerationMixin
 from transformers.models.xmod.modeling_xmod import (
     XMOD_INPUTS_DOCSTRING,
     XMOD_START_DOCSTRING,
@@ -20,7 +21,9 @@
     """X-MOD Model transformer with the option to add multiple flexible heads on top.""",
     XMOD_START_DOCSTRING,
 )
-class XmodAdapterModel(EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, XmodPreTrainedModel):
+class XmodAdapterModel(
+    EmbeddingAdaptersWrapperMixin, ModelWithFlexibleHeadsAdaptersMixin, XmodPreTrainedModel, GenerationMixin
+):
     head_types = [
         "classification",
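Context for the change above: recent transformers releases expect any model class that is used with generate() to inherit from GenerationMixin explicitly, rather than picking the method up implicitly via PreTrainedModel, which is why every adapter model class now lists it as an additional base. The snippet below is a minimal usage sketch, not part of the patch: the "gpt2" checkpoint, the head name "lm_head", and the prompt text are illustrative assumptions; it only demonstrates that generate() remains available on a flexible-heads class once a language-modeling head is attached.

# Minimal sketch (illustrative checkpoint and head name, not part of the patch).
from adapters import GPT2AdapterModel
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2AdapterModel.from_pretrained("gpt2")

# Attach a causal LM prediction head so the flexible-heads model has logits to decode from.
model.add_causal_lm_head("lm_head")

inputs = tokenizer("Adapters can now", return_tensors="pt")
# generate() is provided by GenerationMixin, which the patched classes inherit explicitly.
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))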