
Commit adff279

Remove references to video_load_backend and video_fps for processor
Signed-off-by: cyy <[email protected]>
Parent: 44b3888

File tree

7 files changed (+9, −18 lines)


docs/source/en/chat_templating_multimodal.md

Lines changed: 2 additions & 8 deletions
@@ -195,10 +195,6 @@ messages = [
 
 Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that control the sampling process.
 
-The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
-
-The examples below use Decord as the backend because it is a bit faster than PyAV.
-
 <hfoptions id="sampling">
 <hfoption id="fixed number of frames">
 
@@ -213,7 +209,6 @@ processed_chat = processor.apply_chat_template(
     return_dict=True,
     return_tensors="pt",
     num_frames=32,
-    video_load_backend="decord",
 )
 print(processed_chat.keys())
 ```
@@ -223,16 +218,15 @@ These inputs are now ready to be used in [`~GenerationMixin.generate`].
 </hfoption>
 <hfoption id="fps">
 
-For longer videos, it may be better to sample more frames for better representation with the `video_fps` parameter. This determines how many frames per second to extract. As an example, if a video is 10 seconds long and `video_fps=2`, then the model samples 20 frames. In other words, 2 frames are uniformly sampled every 10 seconds.
+For longer videos, it may be better to sample more frames for a better representation with the `fps` parameter. This determines how many frames per second to extract. As an example, if a video is 10 seconds long and `fps=2`, then the model samples 20 frames; in other words, 2 frames are uniformly sampled from each second.
 
 ```py
 processed_chat = processor.apply_chat_template(
     messages,
     add_generation_prompt=True,
     tokenize=True,
     return_dict=True,
-    video_fps=16,
-    video_load_backend="decord",
+    fps=16,
 )
 print(processed_chat.keys())
 ```
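
The sampling arithmetic in the rewritten paragraph is easy to sanity-check. A minimal sketch in Python, assuming uniform sampling; `frames_to_sample` is a hypothetical helper for illustration, not a transformers API:

```py
# Sketch of the uniform-sampling arithmetic described above.
# `frames_to_sample` is a hypothetical helper, not part of transformers.
def frames_to_sample(duration_s: float, fps: float) -> int:
    # `fps` frames are taken uniformly from each second of video
    return int(duration_s * fps)

assert frames_to_sample(10, 2) == 20  # 10 s at fps=2 -> 20 frames
```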

docs/source/en/model_doc/qwen2_5_omni.md

Lines changed: 1 addition & 1 deletion
@@ -83,7 +83,7 @@ inputs = processor.apply_chat_template(
     tokenize=True,
     return_dict=True,
     return_tensors="pt",
-    video_fps=1,
+    fps=1,
 
     # kwargs to be passed to `Qwen2-5-OmniProcessor`
     padding=True,
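
For downstream callers, the migration is a one-line rename. A minimal sketch, assuming an illustrative checkpoint and conversation (neither is taken from this commit):

```py
from transformers import AutoProcessor

# Illustrative checkpoint; any video-capable processor takes the same kwarg.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

conversation = [
    {"role": "user", "content": [
        {"type": "video", "path": "sample_video.mp4"},  # hypothetical local file
        {"type": "text", "text": "Describe this video."},
    ]},
]

inputs = processor.apply_chat_template(
    conversation,
    fps=1,  # was `video_fps=1` before this commit
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
```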

docs/source/en/model_doc/qwen2_5_vl.md

Lines changed: 1 addition & 1 deletion
@@ -146,7 +146,7 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 inputs = processor.apply_chat_template(
     conversation,
-    video_fps=1,
+    fps=1,
     add_generation_prompt=True,
     tokenize=True,
     return_dict=True,

docs/source/en/model_doc/qwen2_vl.md

Lines changed: 2 additions & 2 deletions
@@ -99,7 +99,7 @@ conversation = [
 
 inputs = processor.apply_chat_template(
     conversation,
-    video_fps=1,
+    fps=1,
     add_generation_prompt=True,
     tokenize=True,
     return_dict=True,
@@ -169,7 +169,7 @@ conversations = [conversation1, conversation2, conversation3, conversation4]
 # Preparation for batch inference
 inputs = processor.apply_chat_template(
     conversations,
-    video_fps=1,
+    fps=1,
     add_generation_prompt=True,
     tokenize=True,
     return_dict=True,

docs/source/ko/model_doc/qwen2_vl.md

Lines changed: 2 additions & 2 deletions
@@ -97,7 +97,7 @@ conversation = [
 
 inputs = processor.apply_chat_template(
     conversation,
-    video_fps=1,
+    fps=1,
     add_generation_prompt=True,
     tokenize=True,
     return_dict=True,
@@ -167,7 +167,7 @@ conversations = [conversation1, conversation2, conversation3, conversation4]
 # Preparation for batch inference
 inputs = processor.apply_chat_template(
     conversations,
-    video_fps=1,
+    fps=1,
     add_generation_prompt=True,
     tokenize=True,
     return_dict=True,

src/transformers/processing_utils.py

Lines changed: 1 addition & 2 deletions
@@ -1448,7 +1448,6 @@ def validate_init_kwargs(processor_config, valid_kwargs):
 
         return unused_kwargs, valid_kwargs
 
-    @deprecate_kwarg("video_fps", version="4.58", new_name="fps")
     @deprecate_kwarg(
         "video_load_backend",
         version="4.59",
@@ -1622,7 +1621,7 @@ def apply_chat_template(
             if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token):
                 kwargs["add_special_tokens"] = False
 
-        # Always sample frames by default unless explicitly set to `False` by users. If users do not pass `num_frames`/`video_fps`
+        # Always sample frames by default unless explicitly set to `False` by users. If users do not pass `num_frames`/`fps`,
         # sampling should not be done for BC.
         if "do_sample_frames" not in kwargs and (
             kwargs.get("fps") is not None or kwargs.get("num_frames") is not None
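
The deleted decorator is what used to remap `video_fps` to `fps` at call time; with it gone, `video_fps` now reaches `apply_chat_template` as an ordinary unrecognized kwarg. As context, here is a minimal sketch of what such a kwarg-renaming decorator does (illustrative only, not the actual `deprecate_kwarg` implementation in transformers):

```py
import functools
import warnings

def rename_kwarg(old_name, new_name, version):
    # Illustrative sketch, not the real `deprecate_kwarg` helper.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if old_name in kwargs:
                warnings.warn(
                    f"`{old_name}` is deprecated and removed in v{version}; "
                    f"use `{new_name}` instead.",
                    FutureWarning,
                )
                # Prefer an explicitly passed new kwarg over the old one.
                kwargs.setdefault(new_name, kwargs.pop(old_name))
            return func(*args, **kwargs)
        return wrapper
    return decorator
```

The retained `do_sample_frames` check preserves backward compatibility: frame sampling is only switched on by default when the caller actually passes `fps` or `num_frames`.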

tests/models/perception_lm/test_modeling_perception_lm.py

Lines changed: 0 additions & 2 deletions
@@ -436,7 +436,6 @@ def test_small_model_integration_test(self):
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
-            video_load_backend="decord",
             padding=True,
             padding_side="left",
         ).to(torch_device)
@@ -462,7 +461,6 @@ def test_small_model_integration_test_batched(self):
             tokenize=True,
             return_dict=True,
             return_tensors="pt",
-            video_load_backend="decord",
             padding=True,
             padding_side="left",
         ).to(torch_device)
