Commit 53b1572

chore(apis): unpublish deprecated /v1/inference apis (#3297)
# What does this PR do?

Unpublish (make unavailable to users) the following APIs:

- `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by `/v1/openai/v1/chat/completions`
- `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
- `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by `/v1/openai/v1/batches`

Note: the implementations are still available for internal use; e.g., agents uses chat-completion.
1 parent 60484c5 · commit 53b1572

23 files changed: +3134 −1347 lines
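For clients still calling the unpublished routes, a minimal migration sketch against the OpenAI-compatible replacements (the base URL, port, and model id below are illustrative assumptions, not values from this commit):

```python
# Migration sketch: the unpublished /v1/inference/* routes map onto the
# OpenAI-compatible /v1/openai/v1/* routes, so any OpenAI client works.
# Base URL, port (8321), and model id are illustrative assumptions.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",
    api_key="none",  # placeholder; a local stack typically ignores the key
)

# Replaces POST /v1/inference/chat-completion:
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed example model id
    messages=[{"role": "user", "content": "Hello!"}],
)
print(chat.choices[0].message.content)

# Replaces POST /v1/inference/completion:
completion = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Write a haiku about the ocean.",
)
print(completion.choices[0].text)
```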

docs/static/llama-stack-spec.html

Lines changed: 0 additions & 169 deletions
@@ -210,55 +210,6 @@
         }
       }
     },
-    "/v1/inference/completion": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponse"
-                }
-              },
-              "text/event-stream": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponseStreamChunk"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "Inference"
-        ],
-        "summary": "Generate a completion for the given content using the specified model.",
-        "description": "Generate a completion for the given content using the specified model.",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/CompletionRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/agents": {
       "get": {
         "responses": {
@@ -7299,126 +7250,6 @@
       "title": "ToolCallDelta",
       "description": "A tool call content delta for streaming responses."
     },
-    "CompletionRequest": {
-      "type": "object",
-      "properties": {
-        "model_id": {
-          "type": "string",
-          "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-        },
-        "content": {
-          "$ref": "#/components/schemas/InterleavedContent",
-          "description": "The content to generate a completion for."
-        },
-        "sampling_params": {
-          "$ref": "#/components/schemas/SamplingParams",
-          "description": "(Optional) Parameters to control the sampling strategy."
-        },
-        "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat",
-          "description": "(Optional) Grammar specification for guided (structured) decoding."
-        },
-        "stream": {
-          "type": "boolean",
-          "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
-        },
-        "logprobs": {
-          "type": "object",
-          "properties": {
-            "top_k": {
-              "type": "integer",
-              "default": 0,
-              "description": "How many tokens (for each position) to return log probabilities for."
-            }
-          },
-          "additionalProperties": false,
-          "description": "(Optional) If specified, log probabilities for each token position will be returned."
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model_id",
-        "content"
-      ],
-      "title": "CompletionRequest"
-    },
-    "CompletionResponse": {
-      "type": "object",
-      "properties": {
-        "metrics": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/MetricInResponse"
-          },
-          "description": "(Optional) List of metrics associated with the API response"
-        },
-        "content": {
-          "type": "string",
-          "description": "The generated completion text"
-        },
-        "stop_reason": {
-          "type": "string",
-          "enum": [
-            "end_of_turn",
-            "end_of_message",
-            "out_of_tokens"
-          ],
-          "description": "Reason why generation stopped"
-        },
-        "logprobs": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/TokenLogProbs"
-          },
-          "description": "Optional log probabilities for generated tokens"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "content",
-        "stop_reason"
-      ],
-      "title": "CompletionResponse",
-      "description": "Response from a completion request."
-    },
-    "CompletionResponseStreamChunk": {
-      "type": "object",
-      "properties": {
-        "metrics": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/MetricInResponse"
-          },
-          "description": "(Optional) List of metrics associated with the API response"
-        },
-        "delta": {
-          "type": "string",
-          "description": "New content generated since last chunk. This can be one or more tokens."
-        },
-        "stop_reason": {
-          "type": "string",
-          "enum": [
-            "end_of_turn",
-            "end_of_message",
-            "out_of_tokens"
-          ],
-          "description": "Optional reason why generation stopped, if complete"
-        },
-        "logprobs": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/TokenLogProbs"
-          },
-          "description": "Optional log probabilities for generated tokens"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "delta"
-      ],
-      "title": "CompletionResponseStreamChunk",
-      "description": "A chunk of a streamed completion response."
-    },
     "AgentConfig": {
       "type": "object",
       "properties": {

docs/static/llama-stack-spec.yaml

Lines changed: 0 additions & 143 deletions
@@ -132,43 +132,6 @@ paths:
             schema:
               $ref: '#/components/schemas/ChatCompletionRequest'
         required: true
-  /v1/inference/completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            If stream=False, returns a CompletionResponse with the full completion.
-            If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/CompletionResponse'
-            text/event-stream:
-              schema:
-                $ref: '#/components/schemas/CompletionResponseStreamChunk'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate a completion for the given content using the specified model.
-      description: >-
-        Generate a completion for the given content using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/CompletionRequest'
-        required: true
   /v1/agents:
     get:
       responses:
@@ -5292,112 +5255,6 @@ components:
       title: ToolCallDelta
       description: >-
         A tool call content delta for streaming responses.
-    CompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The content to generate a completion for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        stream:
-          type: boolean
-          description: >-
-            (Optional) If True, generate an SSE event stream of the response. Defaults
-            to False.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - content
-      title: CompletionRequest
-    CompletionResponse:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        content:
-          type: string
-          description: The generated completion text
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: Reason why generation stopped
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - content
-        - stop_reason
-      title: CompletionResponse
-      description: Response from a completion request.
-    CompletionResponseStreamChunk:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        delta:
-          type: string
-          description: >-
-            New content generated since last chunk. This can be one or more tokens.
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: >-
-            Optional reason why generation stopped, if complete
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - delta
-      title: CompletionResponseStreamChunk
-      description: >-
-        A chunk of a streamed completion response.
     AgentConfig:
       type: object
       properties:
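The deleted streaming schema is easiest to read through its usage: every SSE chunk carried a required `delta` string, and the final chunk added a `stop_reason`. A small self-contained sketch of accumulating such chunks (the example chunks are hand-written, not captured server output):

```python
# Sketch: accumulate chunks shaped like the deleted CompletionResponseStreamChunk
# schema; "delta" is required on every chunk, "stop_reason" only once complete.
from collections.abc import Iterable


def accumulate(chunks: Iterable[dict]) -> tuple[str, str | None]:
    text, stop_reason = "", None
    for chunk in chunks:
        text += chunk["delta"]
        stop_reason = chunk.get("stop_reason", stop_reason)
    return text, stop_reason


# Hand-written example chunks, not captured server output:
chunks = [
    {"delta": "Hello"},
    {"delta": ", world"},
    {"delta": "!", "stop_reason": "end_of_turn"},
]
print(accumulate(chunks))  # ('Hello, world!', 'end_of_turn')
```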

llama_stack/apis/inference/inference.py

Lines changed: 0 additions & 1 deletion
@@ -1008,7 +1008,6 @@ class InferenceProvider(Protocol):
 
     model_store: ModelStore | None = None
 
-    @webmethod(route="/inference/completion", method="POST", level=LLAMA_STACK_API_V1)
     async def completion(
         self,
         model_id: str,
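This one-line change is what actually unpublishes the endpoint: `@webmethod` registers a protocol method as an HTTP route, so removing the decorator leaves `completion` callable in-process (e.g. by agents) while the route disappears. A toy sketch of that registration pattern, not Llama Stack's real router code:

```python
# Toy model of decorator-driven route publication; NOT Llama Stack's actual
# router code. The point: the decorator registers the route, the function
# itself is untouched, so undecorated methods stay callable in-process.
ROUTES: dict[tuple[str, str], object] = {}


def webmethod(route: str, method: str = "POST"):
    def wrap(fn):
        ROUTES[(method, route)] = fn  # published: the HTTP server can see it
        return fn  # the function is returned unchanged
    return wrap


class ToyProvider:
    @webmethod(route="/published-endpoint")
    async def published(self) -> str: ...

    # No decorator: importable and callable internally, but no HTTP route.
    async def internal_only(self) -> str: ...


assert ("POST", "/published-endpoint") in ROUTES
assert len(ROUTES) == 1  # internal_only was never registered
```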

llama_stack/providers/utils/inference/prompt_adapter.py

Lines changed: 8 additions & 0 deletions
@@ -192,6 +192,14 @@ async def localize_image_content(uri: str) -> tuple[bytes, str] | None:
             format = "png"
 
         return content, format
+    elif uri.startswith("data"):
+        # data:image/{format};base64,{data}
+        match = re.match(r"data:image/(\w+);base64,(.+)", uri)
+        if not match:
+            raise ValueError(f"Invalid data URL format, {uri[:40]}...")
+        fmt, image_data = match.groups()
+        content = base64.b64decode(image_data)
+        return content, fmt
     else:
         return None
 
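The new `elif` branch accepts inline `data:` URIs alongside HTTP(S) URLs. The same parse-and-decode step, extracted into a self-contained demo (the example URI is hand-made; `parse_image_data_url` is a hypothetical helper name for illustration):

```python
# Standalone demo of the data-URI branch added above: parse
# data:image/{format};base64,{data} and return decoded bytes plus the format.
import base64
import re


def parse_image_data_url(uri: str) -> tuple[bytes, str]:
    match = re.match(r"data:image/(\w+);base64,(.+)", uri)
    if not match:
        raise ValueError(f"Invalid data URL format, {uri[:40]}...")
    fmt, image_data = match.groups()
    return base64.b64decode(image_data), fmt


# Hand-made example: base64 of b"hi" stands in for real PNG bytes.
uri = "data:image/png;base64," + base64.b64encode(b"hi").decode()
content, fmt = parse_image_data_url(uri)
print(fmt, content)  # png b'hi'
```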
