Skip to content

Commit 5f8d4ae

Browse files
authored
[Feature] support audio tts (#5333)
1 parent 83dbc4e commit 5f8d4ae

File tree

5 files changed

+86
-9
lines changed

5 files changed

+86
-9
lines changed

fastdeploy/engine/request.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,7 @@ def __repr__(self) -> str:
416416
f"send_idx={self.send_idx}, "
417417
f"text={self.text!r}, "
418418
f"token_ids={self.token_ids}, "
419+
f"decode_type={self.decode_type}, "
419420
f"draft_token_ids={self.draft_token_ids}, "
420421
f"reasoning_content={self.reasoning_content!r}, "
421422
f"logprobs={self.logprobs}, "

fastdeploy/entrypoints/openai/protocol.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ class ChatMessage(BaseModel):
210210
content: Optional[str] = None
211211
multimodal_content: Optional[List[Any]] = None
212212
reasoning_content: Optional[str] = None
213+
audio_content: Optional[str] = None
213214
tool_calls: Optional[List[DeltaToolCall | ToolCall]] = None
214215
prompt_token_ids: Optional[List[int]] = None
215216
completion_token_ids: Optional[List[int]] = None
@@ -272,6 +273,7 @@ class DeltaMessage(BaseModel):
272273
role: Optional[str] = None
273274
content: Optional[str] = None
274275
multimodal_content: Optional[List[Any]] = None
276+
audio_content: Optional[str] = None
275277
prompt_token_ids: Optional[List[int]] = None
276278
completion_token_ids: Optional[List[int]] = None
277279
reasoning_content: Optional[str] = None

fastdeploy/entrypoints/openai/response_processors.py

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
# limitations under the License.
1515
"""
1616

17-
from typing import Any, List, Optional
17+
import inspect
18+
from typing import Any, Dict, List, Optional
1819

1920
from fastdeploy.entrypoints.openai.usage_calculator import count_tokens
2021
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
@@ -34,19 +35,22 @@ def __init__(
3435
data_processor,
3536
enable_mm_output: Optional[bool] = False,
3637
eoi_token_id: Optional[int] = 101032,
38+
eoa_token_id: Optional[int] = 2048,
3739
eos_token_id: Optional[int] = 2,
3840
decoder_base_url: Optional[str] = None,
3941
):
4042
self.data_processor = data_processor
4143
self.enable_mm_output = enable_mm_output
4244
self.eoi_token_id = eoi_token_id
45+
self.eoa_token_id = eoa_token_id
4346
self.eos_token_id = eos_token_id
4447
if decoder_base_url is not None:
4548
self.decoder_client = AsyncTokenizerClient(base_url=decoder_base_url)
4649
else:
4750
self.decoder_client = None
4851
self._mm_buffer: List[Any] = [] # Buffer for accumulating image token_ids
4952
self._end_image_code_request_output: Optional[Any] = None
53+
self._audio_buffer: Dict[Any] = {}
5054
self._multipart_buffer = []
5155

5256
def enable_multimodal_content(self):
@@ -80,16 +84,54 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
8084
for request_output in request_outputs:
8185
api_server_logger.debug(f"request_output {request_output}")
8286
if not self.enable_mm_output:
83-
yield self.data_processor.process_response_dict(
84-
response_dict=request_output,
85-
stream=stream,
86-
enable_thinking=enable_thinking,
87-
include_stop_str_in_output=include_stop_str_in_output,
88-
)
87+
outputs = request_output.get("outputs", None)
88+
token_ids = outputs.get("token_ids", None) if outputs is not None else None
89+
req_id = request_output.get("request_id", None)
90+
if outputs is not None and token_ids is not None and req_id is not None:
91+
decode_type = request_output["outputs"].get("decode_type", 0) or 0
92+
if decode_type == 0: # text
93+
tts = req_id in self._audio_buffer
94+
if token_ids[-1] == self.eos_token_id:
95+
all_audio_tokens = self._audio_buffer.pop(req_id, [])
96+
else:
97+
all_audio_tokens = None
98+
if inspect.iscoroutinefunction(self.data_processor.process_response_dict):
99+
response = await self.data_processor.process_response_dict(
100+
response_dict=request_output,
101+
stream=stream,
102+
enable_thinking=enable_thinking,
103+
include_stop_str_in_output=include_stop_str_in_output,
104+
audio_tokens=all_audio_tokens,
105+
tts=tts,
106+
)
107+
else:
108+
response = self.data_processor.process_response_dict(
109+
response_dict=request_output,
110+
stream=stream,
111+
enable_thinking=enable_thinking,
112+
include_stop_str_in_output=include_stop_str_in_output,
113+
audio_tokens=all_audio_tokens,
114+
tts=tts,
115+
)
116+
yield response
117+
elif decode_type == 2: # audio
118+
if self.eoa_token_id is not None and self.eoa_token_id in token_ids:
119+
continue
120+
if req_id in self._audio_buffer:
121+
self._audio_buffer[req_id].append(token_ids)
122+
else:
123+
self._audio_buffer[req_id] = [token_ids]
124+
else:
125+
yield self.data_processor.process_response_dict(
126+
response_dict=request_output,
127+
stream=stream,
128+
enable_thinking=enable_thinking,
129+
include_stop_str_in_output=include_stop_str_in_output,
130+
)
89131
elif stream:
90132
decode_type = request_output["outputs"].get("decode_type", 0)
91133
token_ids = request_output["outputs"]["token_ids"]
92-
if decode_type == 0:
134+
if decode_type == 0: # text
93135
if self.eoi_token_id and self.eoi_token_id in token_ids:
94136
if self._mm_buffer:
95137
all_tokens = self._mm_buffer
@@ -118,7 +160,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
118160
request_output["outputs"]["multipart"] = [text]
119161
yield request_output
120162

121-
elif decode_type == 1:
163+
elif decode_type == 1: # image
122164
self._mm_buffer.append(token_ids)
123165
self._end_image_code_request_output = request_output
124166
else:

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,9 @@ async def chat_completion_stream_generator(
329329
else:
330330
choice.delta.content = ""
331331

332+
if res["outputs"].get("audio_content", None) is not None:
333+
choice.delta.audio_content = res["outputs"]["audio_content"]
334+
332335
if request.return_token_ids:
333336
choice.delta.prompt_token_ids = list(prompt_token_ids)
334337
choice.delta.prompt_tokens = prompt_tokens
@@ -389,6 +392,10 @@ async def chat_completion_stream_generator(
389392
delta_message.multimodal_content = output["multipart"]
390393
else:
391394
delta_message.content = output["text"]
395+
396+
if output.get("audio_content", None) is not None:
397+
delta_message.audio_content = output["audio_content"]
398+
392399
if not res["finished"] and "delta_message" in output:
393400
delta_message_output = output["delta_message"]
394401
if delta_message_output is None:
@@ -689,6 +696,9 @@ async def _create_chat_completion_choice(
689696
else:
690697
message.content = output["text"]
691698

699+
if output.get("audio_content", None) is not None:
700+
message.audio_content = output["audio_content"]
701+
692702
logprobs_full_res = None
693703
draft_logprobs_full_res = None
694704
prompt_logprobs_full_res = None

tests/entrypoints/openai/test_response_processors.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,28 @@ async def test_text_only_mode(self):
5656
self.assertEqual(results[0]["processed"], True)
5757
self.assertEqual(results[0]["raw"]["outputs"]["text"], "hello")
5858

59+
async def test_audio_tts(self):
60+
"""Audio TTS: decode_type==2 outputs are buffered per request_id; decode_type==0 outputs pass through data_processor."""
61+
processor = ChatResponseProcessor(self.mock_data_processor)
62+
request_outputs = [
63+
{"request_id": "req1", "outputs": {"decode_type": 2, "token_ids": [[11, 22]]}},
64+
{"request_id": "req1", "outputs": {"decode_type": 0, "token_ids": [1]}},
65+
{"request_id": "req1", "outputs": {"decode_type": 2, "token_ids": [[11, 22]]}},
66+
{"request_id": "req1", "outputs": {"decode_type": 0, "token_ids": [2]}},
67+
]
68+
69+
results = [
70+
r
71+
async for r in processor.process_response_chat(
72+
request_outputs, stream=True, enable_thinking=False, include_stop_str_in_output=False
73+
)
74+
]
75+
76+
self.assertEqual(results[0]["processed"], True)
77+
self.assertEqual(results[0]["raw"]["outputs"]["token_ids"], [1])
78+
self.assertEqual(results[1]["processed"], True)
79+
self.assertEqual(results[1]["raw"]["outputs"]["token_ids"], [2])
80+
5981
async def test_streaming_text_and_image(self):
6082
"""流式模式下:text → image → text"""
6183
request_outputs = [

0 commit comments

Comments
 (0)