diff --git a/ai_chatbot_backend/app/services/generation/base_handler.py b/ai_chatbot_backend/app/services/generation/base_handler.py
index deac28be..52e7376c 100644
--- a/ai_chatbot_backend/app/services/generation/base_handler.py
+++ b/ai_chatbot_backend/app/services/generation/base_handler.py
@@ -158,8 +158,14 @@ async def run(self) -> AsyncIterator[str]:
if hasattr(output, 'choices') and output.choices:
delta = output.choices[0].delta
- if hasattr(delta, 'reasoning_content') and delta.reasoning_content:
- self.ctx.accumulated_reasoning += delta.reasoning_content
+ reasoning_content = (
+ getattr(delta, 'reasoning_content', None)
+ or (delta.model_extra or {}).get('reasoning_content')
+ or (delta.model_extra or {}).get('reasoning')
+ or ''
+ )
+ if reasoning_content:
+ self.ctx.accumulated_reasoning += reasoning_content
if hasattr(delta, 'content') and delta.content:
self.ctx.accumulated_content += delta.content
@@ -227,8 +233,13 @@ async def run(self) -> AsyncIterator[str]:
async for audio_event in self._interleave_audio(channels['final']):
yield audio_event
else:
- # for/else: flush remaining chunks after stream ends
- channels = self.ctx.previous_channels
+ # for/else: flush any content held back by PARTIAL_TAIL_GUARD
+ # Recompute channels from the final accumulated text (bypasses the guard)
+ if self.ctx.accumulated_reasoning:
+ final_text = f"{self.ctx.accumulated_reasoning}{self.ctx.accumulated_content}"
+ else:
+ final_text = self.ctx.accumulated_content
+ channels = extract_channels(final_text) or self.ctx.previous_channels
chunks = {
c: channels[c][len(self.ctx.previous_channels.get(c, "")):]
for c in channels
@@ -241,6 +252,7 @@ async def run(self) -> AsyncIterator[str]:
yield sse(ResponseDelta(seq=self.ctx.text_seq, text_channel=channel, text=chunk))
self.ctx.text_seq += 1
print(chunk, end="")
+ self.ctx.previous_channels = channels
# Flush remaining audio
if self.audio_response and 'final' in channels:
diff --git a/ai_chatbot_backend/app/services/generation/model_call.py b/ai_chatbot_backend/app/services/generation/model_call.py
index 201780a5..ec266156 100644
--- a/ai_chatbot_backend/app/services/generation/model_call.py
+++ b/ai_chatbot_backend/app/services/generation/model_call.py
@@ -12,8 +12,12 @@
SAMPLING_PARAMS = {
"temperature": 0.6,
"top_p": 0.95,
- "max_tokens": 6000,
- "extra_body": {"top_k": 20, "min_p": 0}
+ "max_tokens": 2000,
+ "extra_body": {
+ "top_k": 20,
+ "min_p": 0,
+ "chat_template_kwargs": {"thinking_budget": 512},
+ }
}
diff --git a/ai_chatbot_backend/app/services/generation/parser.py b/ai_chatbot_backend/app/services/generation/parser.py
index 9d597bd0..e4023bd3 100644
--- a/ai_chatbot_backend/app/services/generation/parser.py
+++ b/ai_chatbot_backend/app/services/generation/parser.py
@@ -171,6 +171,18 @@ def extract_channels(text: str) -> dict:
if not text:
return {"analysis": "", "final": ""}
+    # Handle case where vLLM strips the opening <think> token but passes </think> through.
+    # In this case the text starts with thinking content and contains </think> with no leading <think>.
+    if "</think>" in text and not re.match(r"^\s*<think>", text):
+        parts = text.split("</think>", 1)
+        return {"analysis": parts[0].strip(), "final": parts[1].strip()}
+    # NOTE(review): additional added lines here (beginning "incomplete_patterns = [") were
+    # destroyed by tag-stripping when this patch was captured — recover the exact lines from VCS.

     # Treat `<think>...</think>` as a wrapper when it is a leading block.
    if re.match(r"^\s*<think>", text):
        if "</think>" in text:
@@ -180,16 +192,20 @@ def extract_channels(text: str) -> dict:
return {"analysis": m.group("analysis").strip(), "final": m.group("final").strip()}
         parts = text.split("</think>", 1)
-        return {"analysis": parts[0].strip(), "final": parts[1].strip()}
+        analysis = re.sub(r"^\s*<think>\s*", "", parts[0]).strip()
+        return {"analysis": analysis, "final": parts[1].strip()}
     # Streaming: `<think>` started but hasn't closed yet.
-    incomplete_patterns = [...]  # partial prefixes of `<think>` must be listed so they are stripped before emitting.
+    incomplete_patterns = [...]
+    # NOTE(review): the lines stripping incomplete-tag tails into `cleaned_text` were garbled
+    # by tag-stripping when this patch was captured — recover the exact lines from VCS.
+    # Strip the leading <think> tag so analysis is tag-free (consistent with complete case)
+    analysis = re.sub(r"^\s*<think>\s*", "", cleaned_text).strip()
+    return {"analysis": analysis, "final": ""}
# No think wrapper → everything is final (supports pure-JSON outputs).
thinking = _extract_top_level_json_string_field(text, "thinking")
diff --git a/ai_chatbot_backend/app/services/query/reformulation.py b/ai_chatbot_backend/app/services/query/reformulation.py
index 6f50445d..af792133 100644
--- a/ai_chatbot_backend/app/services/query/reformulation.py
+++ b/ai_chatbot_backend/app/services/query/reformulation.py
@@ -1,7 +1,22 @@
from typing import Any
+from openai import AsyncOpenAI
+
from app.config import settings
-from app.dependencies.model import get_vllm_chat_client
+
+# Singleton client for reformulation — avoids leaking a new connection pool on every call
+_reformulation_client: AsyncOpenAI | None = None
+
+
+def _get_reformulation_client() -> AsyncOpenAI:
+ global _reformulation_client
+ if _reformulation_client is None:
+ _reformulation_client = AsyncOpenAI(
+ base_url=settings.vllm_chat_url,
+ api_key=settings.vllm_api_key,
+ )
+ return _reformulation_client
+
# Query reformulator prompt
_QUERY_REFORMULATOR_PROMPT = (
@@ -14,8 +29,6 @@
"to align terminology and target specific topics. Include relevant constraints "
"(dates, versions, scope), and avoid adding facts not in the history. "
"Return only the rewritten query as question in plain text—no quotes, no extra text."
- "# Valid channels: analysis, commentary, final. Channel must be included for every message."
- "Calls to these tools must go to the commentary channel: 'functions'.<|end|>"
)
@@ -70,17 +83,26 @@ async def build_retrieval_query(
print(f"[DEBUG] Reformulation input ({len(request_content)} chars):\n{request_content[:2000]}...")
- client = get_vllm_chat_client()
+ client = _get_reformulation_client()
response = await client.chat.completions.create(
model=settings.vllm_chat_model,
messages=chat,
temperature=0.6,
top_p=0.95,
- max_tokens=500,
- extra_body={"top_k": 20, "min_p": 0}
+ max_tokens=512,
+ timeout=30.0,
+ extra_body={"top_k": 20, "min_p": 0, "chat_template_kwargs": {"enable_thinking": False}},
+ )
+ msg = response.choices[0].message
+ content = msg.content or ""
+ # reasoning_content is a vLLM extension stored in model_extra by the openai SDK
+ reasoning = (
+ getattr(msg, "reasoning_content", None)
+ or (msg.model_extra or {}).get("reasoning_content")
+ or (msg.model_extra or {}).get("reasoning")
+ or ""
)
- # vLLM with --reasoning-parser separates reasoning_content from content
- # Use content directly (final response without thinking)
- text = response.choices[0].message.content or ""
- print(f"[INFO] Generated RAG-Query: {text.strip()}")
- return text.strip()
+ print(f"[DEBUG] Reformulation raw: content={repr(content[:200])}, reasoning_content={repr(reasoning[:200])}")
+ text = content.strip() or reasoning.strip()
+ print(f"[INFO] Generated RAG-Query: {text[:200]}")
+ return text or user_message
diff --git a/ai_chatbot_backend/scripts/start_vllm_servers.sh b/ai_chatbot_backend/scripts/start_vllm_servers.sh
index 9e469138..30388b63 100755
--- a/ai_chatbot_backend/scripts/start_vllm_servers.sh
+++ b/ai_chatbot_backend/scripts/start_vllm_servers.sh
@@ -18,7 +18,7 @@ BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Server configurations
-CHAT_MODEL="cpatonn/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit"
+CHAT_MODEL="/home/tai25/models/qwen3.5-27b-awq-4bit"
CHAT_PORT=8001
CHAT_GPUS="0,1"
@@ -187,7 +187,7 @@ main() {
# Start Chat Model Server (Port 8001) - requires 2 GPUs for tensor parallel
start_server "chat" "$CHAT_MODEL" "$CHAT_PORT" "$CHAT_GPUS" \
"--tensor-parallel-size 2" \
- "--gpu-memory-utilization 0.47" \
+ "--gpu-memory-utilization 0.55" \
"--max-model-len 10000" \
"--max_num_seqs 32" \
"--reasoning-parser deepseek_r1"
@@ -202,7 +202,7 @@ main() {
start_server "embed" "$EMBEDDING_MODEL" "$EMBEDDING_PORT" "$EMBEDDING_GPUS" \
"--max-model-len 10000" \
"--max-num-seqs 32" \
- "--gpu-memory-utilization 0.4"
+ "--gpu-memory-utilization 0.3"
if ! wait_for_server $EMBEDDING_PORT "Embedding"; then
log_error "Embedding server failed to start. Check tmux session for errors."
@@ -212,7 +212,7 @@ main() {
# Start Whisper Server (Port 8003)
start_server "whisper" "$WHISPER_MODEL" "$WHISPER_PORT" "$WHISPER_GPUS" \
- "--gpu-memory-utilization 0.37"
+ "--gpu-memory-utilization 0.2"
if ! wait_for_server $WHISPER_PORT "Whisper"; then
log_error "Whisper server failed to start. Check tmux session for errors."