Commit 53b1572

chore(apis): unpublish deprecated /v1/inference apis (#3297)
# What does this PR do?

Unpublish (make unavailable to users) the following APIs:

- `/v1/inference/completion`, replaced by `/v1/openai/v1/completions`
- `/v1/inference/chat-completion`, replaced by `/v1/openai/v1/chat/completions`
- `/v1/inference/embeddings`, replaced by `/v1/openai/v1/embeddings`
- `/v1/inference/batch-completion`, replaced by `/v1/openai/v1/batches`
- `/v1/inference/batch-chat-completion`, replaced by `/v1/openai/v1/batches`

Note: the implementations are still available for internal use; e.g., agents uses chat-completion.
1 parent 60484c5 · commit 53b1572

23 files changed: +3134 −1347 lines
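For clients still calling the unpublished routes, a minimal migration sketch against the OpenAI-compatible replacements (the base URL, port, and model id below are illustrative assumptions, not values from this commit):

```python
# Migration sketch: the unpublished /v1/inference/* routes map onto the
# OpenAI-compatible /v1/openai/v1/* routes, so any OpenAI client works.
# Base URL, port (8321), and model id are illustrative assumptions.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8321/v1/openai/v1",
    api_key="none",  # placeholder; a local stack typically ignores the key
)

# Replaces POST /v1/inference/chat-completion:
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed example model id
    messages=[{"role": "user", "content": "Hello!"}],
)
print(chat.choices[0].message.content)

# Replaces POST /v1/inference/completion:
completion = client.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt="Write a haiku about the ocean.",
)
print(completion.choices[0].text)
```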

docs/static/llama-stack-spec.html

Lines changed: 0 additions & 169 deletions
@@ -210,55 +210,6 @@
         }
       }
     },
-    "/v1/inference/completion": {
-      "post": {
-        "responses": {
-          "200": {
-            "description": "If stream=False, returns a CompletionResponse with the full completion. If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.",
-            "content": {
-              "application/json": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponse"
-                }
-              },
-              "text/event-stream": {
-                "schema": {
-                  "$ref": "#/components/schemas/CompletionResponseStreamChunk"
-                }
-              }
-            }
-          },
-          "400": {
-            "$ref": "#/components/responses/BadRequest400"
-          },
-          "429": {
-            "$ref": "#/components/responses/TooManyRequests429"
-          },
-          "500": {
-            "$ref": "#/components/responses/InternalServerError500"
-          },
-          "default": {
-            "$ref": "#/components/responses/DefaultError"
-          }
-        },
-        "tags": [
-          "Inference"
-        ],
-        "summary": "Generate a completion for the given content using the specified model.",
-        "description": "Generate a completion for the given content using the specified model.",
-        "parameters": [],
-        "requestBody": {
-          "content": {
-            "application/json": {
-              "schema": {
-                "$ref": "#/components/schemas/CompletionRequest"
-              }
-            }
-          },
-          "required": true
-        }
-      }
-    },
     "/v1/agents": {
       "get": {
         "responses": {
@@ -7299,126 +7250,6 @@
       "title": "ToolCallDelta",
       "description": "A tool call content delta for streaming responses."
     },
-    "CompletionRequest": {
-      "type": "object",
-      "properties": {
-        "model_id": {
-          "type": "string",
-          "description": "The identifier of the model to use. The model must be registered with Llama Stack and available via the /models endpoint."
-        },
-        "content": {
-          "$ref": "#/components/schemas/InterleavedContent",
-          "description": "The content to generate a completion for."
-        },
-        "sampling_params": {
-          "$ref": "#/components/schemas/SamplingParams",
-          "description": "(Optional) Parameters to control the sampling strategy."
-        },
-        "response_format": {
-          "$ref": "#/components/schemas/ResponseFormat",
-          "description": "(Optional) Grammar specification for guided (structured) decoding."
-        },
-        "stream": {
-          "type": "boolean",
-          "description": "(Optional) If True, generate an SSE event stream of the response. Defaults to False."
-        },
-        "logprobs": {
-          "type": "object",
-          "properties": {
-            "top_k": {
-              "type": "integer",
-              "default": 0,
-              "description": "How many tokens (for each position) to return log probabilities for."
-            }
-          },
-          "additionalProperties": false,
-          "description": "(Optional) If specified, log probabilities for each token position will be returned."
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "model_id",
-        "content"
-      ],
-      "title": "CompletionRequest"
-    },
-    "CompletionResponse": {
-      "type": "object",
-      "properties": {
-        "metrics": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/MetricInResponse"
-          },
-          "description": "(Optional) List of metrics associated with the API response"
-        },
-        "content": {
-          "type": "string",
-          "description": "The generated completion text"
-        },
-        "stop_reason": {
-          "type": "string",
-          "enum": [
-            "end_of_turn",
-            "end_of_message",
-            "out_of_tokens"
-          ],
-          "description": "Reason why generation stopped"
-        },
-        "logprobs": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/TokenLogProbs"
-          },
-          "description": "Optional log probabilities for generated tokens"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "content",
-        "stop_reason"
-      ],
-      "title": "CompletionResponse",
-      "description": "Response from a completion request."
-    },
-    "CompletionResponseStreamChunk": {
-      "type": "object",
-      "properties": {
-        "metrics": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/MetricInResponse"
-          },
-          "description": "(Optional) List of metrics associated with the API response"
-        },
-        "delta": {
-          "type": "string",
-          "description": "New content generated since last chunk. This can be one or more tokens."
-        },
-        "stop_reason": {
-          "type": "string",
-          "enum": [
-            "end_of_turn",
-            "end_of_message",
-            "out_of_tokens"
-          ],
-          "description": "Optional reason why generation stopped, if complete"
-        },
-        "logprobs": {
-          "type": "array",
-          "items": {
-            "$ref": "#/components/schemas/TokenLogProbs"
-          },
-          "description": "Optional log probabilities for generated tokens"
-        }
-      },
-      "additionalProperties": false,
-      "required": [
-        "delta"
-      ],
-      "title": "CompletionResponseStreamChunk",
-      "description": "A chunk of a streamed completion response."
-    },
     "AgentConfig": {
       "type": "object",
       "properties": {

docs/static/llama-stack-spec.yaml

Lines changed: 0 additions & 143 deletions
@@ -132,43 +132,6 @@ paths:
             schema:
               $ref: '#/components/schemas/ChatCompletionRequest'
         required: true
-  /v1/inference/completion:
-    post:
-      responses:
-        '200':
-          description: >-
-            If stream=False, returns a CompletionResponse with the full completion.
-            If stream=True, returns an SSE event stream of CompletionResponseStreamChunk.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/CompletionResponse'
-            text/event-stream:
-              schema:
-                $ref: '#/components/schemas/CompletionResponseStreamChunk'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          $ref: >-
-            #/components/responses/TooManyRequests429
-        '500':
-          $ref: >-
-            #/components/responses/InternalServerError500
-        default:
-          $ref: '#/components/responses/DefaultError'
-      tags:
-        - Inference
-      summary: >-
-        Generate a completion for the given content using the specified model.
-      description: >-
-        Generate a completion for the given content using the specified model.
-      parameters: []
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/CompletionRequest'
-        required: true
   /v1/agents:
     get:
       responses:
@@ -5292,112 +5255,6 @@ components:
       title: ToolCallDelta
       description: >-
         A tool call content delta for streaming responses.
-    CompletionRequest:
-      type: object
-      properties:
-        model_id:
-          type: string
-          description: >-
-            The identifier of the model to use. The model must be registered with
-            Llama Stack and available via the /models endpoint.
-        content:
-          $ref: '#/components/schemas/InterleavedContent'
-          description: >-
-            The content to generate a completion for.
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: >-
-            (Optional) Parameters to control the sampling strategy.
-        response_format:
-          $ref: '#/components/schemas/ResponseFormat'
-          description: >-
-            (Optional) Grammar specification for guided (structured) decoding.
-        stream:
-          type: boolean
-          description: >-
-            (Optional) If True, generate an SSE event stream of the response. Defaults
-            to False.
-        logprobs:
-          type: object
-          properties:
-            top_k:
-              type: integer
-              default: 0
-              description: >-
-                How many tokens (for each position) to return log probabilities for.
-          additionalProperties: false
-          description: >-
-            (Optional) If specified, log probabilities for each token position will
-            be returned.
-      additionalProperties: false
-      required:
-        - model_id
-        - content
-      title: CompletionRequest
-    CompletionResponse:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        content:
-          type: string
-          description: The generated completion text
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: Reason why generation stopped
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - content
-        - stop_reason
-      title: CompletionResponse
-      description: Response from a completion request.
-    CompletionResponseStreamChunk:
-      type: object
-      properties:
-        metrics:
-          type: array
-          items:
-            $ref: '#/components/schemas/MetricInResponse'
-          description: >-
-            (Optional) List of metrics associated with the API response
-        delta:
-          type: string
-          description: >-
-            New content generated since last chunk. This can be one or more tokens.
-        stop_reason:
-          type: string
-          enum:
-            - end_of_turn
-            - end_of_message
-            - out_of_tokens
-          description: >-
-            Optional reason why generation stopped, if complete
-        logprobs:
-          type: array
-          items:
-            $ref: '#/components/schemas/TokenLogProbs'
-          description: >-
-            Optional log probabilities for generated tokens
-      additionalProperties: false
-      required:
-        - delta
-      title: CompletionResponseStreamChunk
-      description: >-
-        A chunk of a streamed completion response.
     AgentConfig:
       type: object
       properties:
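The deleted streaming schema is easiest to read through its usage: every SSE chunk carried a required `delta` string, and the final chunk added a `stop_reason`. A small self-contained sketch of accumulating such chunks (the example chunks are hand-written, not captured server output):

```python
# Sketch: accumulate chunks shaped like the deleted CompletionResponseStreamChunk
# schema; "delta" is required on every chunk, "stop_reason" only once complete.
from collections.abc import Iterable


def accumulate(chunks: Iterable[dict]) -> tuple[str, str | None]:
    text, stop_reason = "", None
    for chunk in chunks:
        text += chunk["delta"]
        stop_reason = chunk.get("stop_reason", stop_reason)
    return text, stop_reason


# Hand-written example chunks, not captured server output:
chunks = [
    {"delta": "Hello"},
    {"delta": ", world"},
    {"delta": "!", "stop_reason": "end_of_turn"},
]
print(accumulate(chunks))  # ('Hello, world!', 'end_of_turn')
```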

llama_stack/apis/inference/inference.py

Lines changed: 0 additions & 1 deletion
@@ -1008,7 +1008,6 @@ class InferenceProvider(Protocol):
 
     model_store: ModelStore | None = None
 
-    @webmethod(route="/inference/completion", method="POST", level=LLAMA_STACK_API_V1)
     async def completion(
         self,
         model_id: str,
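This one-line change is what actually unpublishes the endpoint: `@webmethod` registers a protocol method as an HTTP route, so removing the decorator leaves `completion` callable in-process (e.g. by agents) while the route disappears. A toy sketch of that registration pattern, not Llama Stack's real router code:

```python
# Toy model of decorator-driven route publication; NOT Llama Stack's actual
# router code. The point: the decorator registers the route, the function
# itself is untouched, so undecorated methods stay callable in-process.
ROUTES: dict[tuple[str, str], object] = {}


def webmethod(route: str, method: str = "POST"):
    def wrap(fn):
        ROUTES[(method, route)] = fn  # published: the HTTP server can see it
        return fn  # the function is returned unchanged
    return wrap


class ToyProvider:
    @webmethod(route="/published-endpoint")
    async def published(self) -> str: ...

    # No decorator: importable and callable internally, but no HTTP route.
    async def internal_only(self) -> str: ...


assert ("POST", "/published-endpoint") in ROUTES
assert len(ROUTES) == 1  # internal_only was never registered
```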

llama_stack/providers/utils/inference/prompt_adapter.py

Lines changed: 8 additions & 0 deletions
@@ -192,6 +192,14 @@ async def localize_image_content(uri: str) -> tuple[bytes, str] | None:
             format = "png"
 
         return content, format
+    elif uri.startswith("data"):
+        # data:image/{format};base64,{data}
+        match = re.match(r"data:image/(\w+);base64,(.+)", uri)
+        if not match:
+            raise ValueError(f"Invalid data URL format, {uri[:40]}...")
+        fmt, image_data = match.groups()
+        content = base64.b64decode(image_data)
+        return content, fmt
     else:
         return None
 
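The new `elif` branch accepts inline `data:` URIs alongside HTTP(S) URLs. The same parse-and-decode step, extracted into a self-contained demo (the example URI is hand-made; `parse_image_data_url` is a hypothetical helper name for illustration):

```python
# Standalone demo of the data-URI branch added above: parse
# data:image/{format};base64,{data} and return decoded bytes plus the format.
import base64
import re


def parse_image_data_url(uri: str) -> tuple[bytes, str]:
    match = re.match(r"data:image/(\w+);base64,(.+)", uri)
    if not match:
        raise ValueError(f"Invalid data URL format, {uri[:40]}...")
    fmt, image_data = match.groups()
    return base64.b64decode(image_data), fmt


# Hand-made example: base64 of b"hi" stands in for real PNG bytes.
uri = "data:image/png;base64," + base64.b64encode(b"hi").decode()
content, fmt = parse_image_data_url(uri)
print(fmt, content)  # png b'hi'
```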
