diff --git a/docs/docs/providers/agents/index.mdx b/docs/docs/providers/agents/index.mdx
index 06eb104afa..52b92734e6 100644
--- a/docs/docs/providers/agents/index.mdx
+++ b/docs/docs/providers/agents/index.mdx
@@ -1,7 +1,7 @@
 ---
 description: "Agents

- APIs for creating and interacting with agentic systems."
+APIs for creating and interacting with agentic systems."
 sidebar_label: Agents
 title: Agents
 ---
@@ -12,6 +12,6 @@ title: Agents

 Agents

- APIs for creating and interacting with agentic systems.
+APIs for creating and interacting with agentic systems.

 This section contains documentation for all available providers for the **agents** API.
diff --git a/docs/docs/providers/batches/index.mdx b/docs/docs/providers/batches/index.mdx
index 2c64b277f8..18e5e314d2 100644
--- a/docs/docs/providers/batches/index.mdx
+++ b/docs/docs/providers/batches/index.mdx
@@ -1,14 +1,14 @@
 ---
 description: "The Batches API enables efficient processing of multiple requests in a single operation,
- particularly useful for processing large datasets, batch evaluation workflows, and
- cost-effective inference at scale.
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.

- The API is designed to allow use of openai client libraries for seamless integration.
+The API is designed to allow use of openai client libraries for seamless integration.

- This API provides the following extensions:
- - idempotent batch creation
+This API provides the following extensions:
+ - idempotent batch creation

- Note: This API is currently under active development and may undergo changes."
+Note: This API is currently under active development and may undergo changes."
 sidebar_label: Batches
 title: Batches
 ---
@@ -18,14 +18,14 @@ title: Batches
 ## Overview

 The Batches API enables efficient processing of multiple requests in a single operation,
- particularly useful for processing large datasets, batch evaluation workflows, and
- cost-effective inference at scale.
+particularly useful for processing large datasets, batch evaluation workflows, and
+cost-effective inference at scale.

- The API is designed to allow use of openai client libraries for seamless integration.
+The API is designed to allow use of openai client libraries for seamless integration.

- This API provides the following extensions:
- - idempotent batch creation
+This API provides the following extensions:
+ - idempotent batch creation

- Note: This API is currently under active development and may undergo changes.
+Note: This API is currently under active development and may undergo changes.

 This section contains documentation for all available providers for the **batches** API.
diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx
index ebbaf1be18..1dc479675f 100644
--- a/docs/docs/providers/inference/index.mdx
+++ b/docs/docs/providers/inference/index.mdx
@@ -1,9 +1,9 @@
 ---
 description: "Llama Stack Inference API for generating completions, chat completions, and embeddings.

- This API provides the raw interface to the underlying models. Two kinds of models are supported:
- - LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
- - Embedding models: these models generate embeddings to be used for semantic search."
+This API provides the raw interface to the underlying models. Two kinds of models are supported:
+- LLM models: these models generate \"raw\" and \"chat\" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search."
 sidebar_label: Inference
 title: Inference
 ---
@@ -14,8 +14,8 @@ title: Inference

 Llama Stack Inference API for generating completions, chat completions, and embeddings.

- This API provides the raw interface to the underlying models. Two kinds of models are supported:
- - LLM models: these models generate "raw" and "chat" (conversational) completions.
- - Embedding models: these models generate embeddings to be used for semantic search.
+This API provides the raw interface to the underlying models. Two kinds of models are supported:
+- LLM models: these models generate "raw" and "chat" (conversational) completions.
+- Embedding models: these models generate embeddings to be used for semantic search.

 This section contains documentation for all available providers for the **inference** API.
diff --git a/docs/static/deprecated-llama-stack-spec.html b/docs/static/deprecated-llama-stack-spec.html
index ffda7552b3..473090de77 100644
--- a/docs/static/deprecated-llama-stack-spec.html
+++ b/docs/static/deprecated-llama-stack-spec.html
@@ -9096,6 +9096,10 @@
           "type": "string",
           "description": "(Optional) Truncation strategy applied to the response"
         },
+        "max_output_tokens": {
+          "type": "integer",
+          "description": "(Optional) Upper bound for response tokens generation"
+        },
         "input": {
           "type": "array",
           "items": {
@@ -9914,6 +9918,9 @@
         },
         "max_infer_iters": {
           "type": "integer"
+        },
+        "max_output_tokens": {
+          "type": "integer"
         }
       },
       "additionalProperties": false,
@@ -9983,6 +9990,10 @@
         "truncation": {
           "type": "string",
           "description": "(Optional) Truncation strategy applied to the response"
+        },
+        "max_output_tokens": {
+          "type": "integer",
+          "description": "(Optional) Upper bound for response tokens generation"
         }
       },
       "additionalProperties": false,
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 0e672f9145..e097b41d21 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -6740,6 +6740,10 @@ components:
         type: string
         description: >-
           (Optional) Truncation strategy applied to the response
+      max_output_tokens:
+        type: integer
+        description: >-
+          (Optional) Upper bound for response tokens generation
       input:
         type: array
         items:
@@ -7351,6 +7355,8 @@ components:
           (Optional) Additional fields to include in the response.
       max_infer_iters:
         type: integer
+      max_output_tokens:
+        type: integer
       additionalProperties: false
       required:
         - input
@@ -7414,6 +7420,10 @@ components:
         type: string
         description: >-
           (Optional) Truncation strategy applied to the response
+      max_output_tokens:
+        type: integer
+        description: >-
+          (Optional) Upper bound for response tokens generation
       additionalProperties: false
       required:
         - created_at
diff --git a/docs/static/llama-stack-spec.html b/docs/static/llama-stack-spec.html
index c570dcddf7..56eab35f43 100644
--- a/docs/static/llama-stack-spec.html
+++ b/docs/static/llama-stack-spec.html
@@ -7503,6 +7503,10 @@
           "type": "string",
           "description": "(Optional) Truncation strategy applied to the response"
         },
+        "max_output_tokens": {
+          "type": "integer",
+          "description": "(Optional) Upper bound for response tokens generation"
+        },
         "input": {
           "type": "array",
           "items": {
@@ -8009,6 +8013,9 @@
         },
         "max_infer_iters": {
           "type": "integer"
+        },
+        "max_output_tokens": {
+          "type": "integer"
         }
       },
       "additionalProperties": false,
@@ -8078,6 +8085,10 @@
         "truncation": {
           "type": "string",
           "description": "(Optional) Truncation strategy applied to the response"
+        },
+        "max_output_tokens": {
+          "type": "integer",
+          "description": "(Optional) Upper bound for response tokens generation"
         }
       },
       "additionalProperties": false,
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 3e1431b229..55299cae1d 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -5660,6 +5660,10 @@ components:
         type: string
         description: >-
           (Optional) Truncation strategy applied to the response
+      max_output_tokens:
+        type: integer
+        description: >-
+          (Optional) Upper bound for response tokens generation
       input:
         type: array
         items:
@@ -6014,6 +6018,8 @@ components:
           (Optional) Additional fields to include in the response.
       max_infer_iters:
         type: integer
+      max_output_tokens:
+        type: integer
       additionalProperties: false
       required:
         - input
@@ -6077,6 +6083,10 @@ components:
         type: string
         description: >-
           (Optional) Truncation strategy applied to the response
+      max_output_tokens:
+        type: integer
+        description: >-
+          (Optional) Upper bound for response tokens generation
       additionalProperties: false
       required:
         - created_at
diff --git a/docs/static/stainless-llama-stack-spec.html b/docs/static/stainless-llama-stack-spec.html
index 167a4aa3ce..5c8e661bef 100644
--- a/docs/static/stainless-llama-stack-spec.html
+++ b/docs/static/stainless-llama-stack-spec.html
@@ -9512,6 +9512,10 @@
           "type": "string",
           "description": "(Optional) Truncation strategy applied to the response"
         },
+        "max_output_tokens": {
+          "type": "integer",
+          "description": "(Optional) Upper bound for response tokens generation"
+        },
         "input": {
           "type": "array",
           "items": {
@@ -10018,6 +10022,9 @@
         },
         "max_infer_iters": {
           "type": "integer"
+        },
+        "max_output_tokens": {
+          "type": "integer"
         }
       },
       "additionalProperties": false,
@@ -10087,6 +10094,10 @@
         "truncation": {
           "type": "string",
           "description": "(Optional) Truncation strategy applied to the response"
+        },
+        "max_output_tokens": {
+          "type": "integer",
+          "description": "(Optional) Upper bound for response tokens generation"
         }
       },
       "additionalProperties": false,
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 6dc1041f1e..d5f84e26fc 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -7105,6 +7105,10 @@ components:
         type: string
         description: >-
           (Optional) Truncation strategy applied to the response
+      max_output_tokens:
+        type: integer
+        description: >-
+          (Optional) Upper bound for response tokens generation
       input:
         type: array
         items:
@@ -7459,6 +7463,8 @@ components:
           (Optional) Additional fields to include in the response.
       max_infer_iters:
         type: integer
+      max_output_tokens:
+        type: integer
       additionalProperties: false
       required:
         - input
@@ -7522,6 +7528,10 @@ components:
         type: string
         description: >-
           (Optional) Truncation strategy applied to the response
+      max_output_tokens:
+        type: integer
+        description: >-
+          (Optional) Upper bound for response tokens generation
       additionalProperties: false
       required:
         - created_at
diff --git a/llama_stack/apis/agents/agents.py b/llama_stack/apis/agents/agents.py
index cdf47308ec..b6c1630959 100644
--- a/llama_stack/apis/agents/agents.py
+++ b/llama_stack/apis/agents/agents.py
@@ -825,6 +825,7 @@ async def create_openai_response(
                 "List of shields to apply during response generation. Shields provide safety and content moderation."
             ),
         ] = None,
+        max_output_tokens: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a new OpenAI response.
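
For orientation, here is a minimal sketch of how a caller would exercise the new parameter once the API change above lands, going through the OpenAI-compatible Responses endpoint. The base URL, API key, and model id are illustrative assumptions, not values taken from this patch.

```python
# Sketch: capping generation length via the new `max_output_tokens` parameter.
# The base_url, api_key, and model id are placeholder assumptions for a local
# Llama Stack deployment; substitute whatever your deployment actually uses.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="not-needed")

response = client.responses.create(
    model="meta-llama/Llama-3.2-3B-Instruct",  # hypothetical model id
    input="Summarize what the Responses API does in one sentence.",
    max_output_tokens=64,  # upper bound on tokens generated for this response
)

# The cap is echoed back on the response object alongside the other settings.
print(response.max_output_tokens)
print(response.output_text)
```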
diff --git a/llama_stack/apis/agents/openai_responses.py b/llama_stack/apis/agents/openai_responses.py
index 0f3511ea35..23d73fc3b0 100644
--- a/llama_stack/apis/agents/openai_responses.py
+++ b/llama_stack/apis/agents/openai_responses.py
@@ -363,6 +363,7 @@ class OpenAIResponseObject(BaseModel):
     :param text: Text formatting configuration for the response
     :param top_p: (Optional) Nucleus sampling parameter used for generation
     :param truncation: (Optional) Truncation strategy applied to the response
+    :param max_output_tokens: (Optional) Upper bound for response tokens generation
     """

     created_at: int
@@ -380,6 +381,7 @@ class OpenAIResponseObject(BaseModel):
     text: OpenAIResponseText = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text"))
     top_p: float | None = None
     truncation: str | None = None
+    max_output_tokens: int | None = None


 @json_schema_type
diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index 8ccdcb0e11..b5da96b433 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -204,6 +204,7 @@ async def create_openai_response(
         store: bool | None = True,
         stream: bool | None = False,
         temperature: float | None = None,
+        max_output_tokens: int | None = None,
         text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         include: list[str] | None = None,
@@ -224,6 +225,7 @@ async def create_openai_response(
             previous_response_id=previous_response_id,
             store=store,
             temperature=temperature,
+            max_output_tokens=max_output_tokens,
             text=text,
             tools=tools,
             max_infer_iters=max_infer_iters,
@@ -252,6 +254,7 @@ async def _create_streaming_response(
         previous_response_id: str | None = None,
         store: bool | None = True,
         temperature: float | None = None,
+        max_output_tokens: int | None = None,
         text: OpenAIResponseText | None = None,
         tools: list[OpenAIResponseInputTool] | None = None,
         max_infer_iters: int | None = 10,
@@ -268,6 +271,7 @@ async def _create_streaming_response(
             messages=messages,
             response_tools=tools,
             temperature=temperature,
+            max_tokens=max_output_tokens,
             response_format=response_format,
             inputs=input,
         )
diff --git a/llama_stack/providers/inline/agents/meta_reference/responses/types.py b/llama_stack/providers/inline/agents/meta_reference/responses/types.py
index d3b5a16bd3..044485f236 100644
--- a/llama_stack/providers/inline/agents/meta_reference/responses/types.py
+++ b/llama_stack/providers/inline/agents/meta_reference/responses/types.py
@@ -63,6 +63,7 @@ class ChatCompletionContext(BaseModel):
     response_format: OpenAIResponseFormatParam
     approval_requests: list[OpenAIResponseMCPApprovalRequest] = []
     approval_responses: dict[str, OpenAIResponseMCPApprovalResponse] = {}
+    max_tokens: int | None = None

     def __init__(
         self,
@@ -72,6 +73,7 @@ def __init__(
         temperature: float | None,
         response_format: OpenAIResponseFormatParam,
         inputs: list[OpenAIResponseInput] | str,
+        max_tokens: int | None = None,
     ):
         super().__init__(
             model=model,
diff --git a/tests/integration/agents/test_openai_responses.py b/tests/integration/agents/test_openai_responses.py
index 6648257e69..21c560ae94 100644
--- a/tests/integration/agents/test_openai_responses.py
+++ b/tests/integration/agents/test_openai_responses.py
@@ -297,3 +297,38 @@ def test_function_call_output_response_with_none_arguments(openai_client, client
     assert response.output[0].type == "function_call"
     assert response.output[0].arguments == "{}"
     _ = response.output[0].call_id
+
+
+def test_response_with_max_output_tokens(compat_client, text_model_id):
+    """Test that the `max_output_tokens` parameter is used."""
+    if not isinstance(compat_client, OpenAI):
+        pytest.skip("This test requires the OpenAI client.")
+
+    response = compat_client.responses.create(
+        model=text_model_id,
+        input=[
+            {
+                "role": "user",
+                "content": "what's the current time? You MUST call the `get_current_time` function to find out.",
+            }
+        ],
+        max_output_tokens=15,
+        stream=False,
+    )
+
+    assert response.id is not None
+    assert response.model == text_model_id
+
+    assert hasattr(response, "max_output_tokens")
+    assert response.max_output_tokens == 15
+
+    output_text = ""
+    for item in response.output:
+        if item.type == "message" and item.role == "assistant":
+            if item.content and item.content.type == "text":
+                output_text = item.content.text
+                break
+
+    assert output_text, "Assistant response content should not be empty"
+
+    assert len(output_text.split()) < 30
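
One hunk this diff does not show is where `ChatCompletionContext.max_tokens` is ultimately consumed. Conceptually, the streaming orchestrator forwards it to the underlying chat-completion call together with the temperature and response format it already carries; the sketch below illustrates that mapping. The helper name and kwargs dict are hypothetical, not code from this patch.

```python
# Hypothetical sketch of how the orchestrator could translate the response-level
# context into chat-completion arguments; the function name and dict shape are
# illustrative only, not part of this patch.
from typing import Any


def build_chat_completion_kwargs(ctx: "ChatCompletionContext") -> dict[str, Any]:
    kwargs: dict[str, Any] = {
        "model": ctx.model,
        "messages": ctx.messages,
        "temperature": ctx.temperature,
        "response_format": ctx.response_format,
        "stream": True,
    }
    # `max_output_tokens` on the Responses API maps onto `max_tokens` of the
    # underlying chat completion; only pass it through when the caller set it.
    if ctx.max_tokens is not None:
        kwargs["max_tokens"] = ctx.max_tokens
    return kwargs
```

This mirrors how `_create_streaming_response` above already hands `temperature` and `response_format` to the context, with `max_tokens=max_output_tokens` now carried alongside them.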