[Bug] Responses API always falls back to JSON mode which breaks with web search #8958

@casper-hansen

Description

What happened?

I need to use the Responses API with web search in order to optimize over training data with GEPA. However, when I try this, it fails with the following error:

WARNING dspy.adapters.json_adapter: Failed to use structured output format, falling back to JSON mode.

Traceback (most recent call last):
  File "/.venv/lib/python3.11/site-packages/dspy/adapters/chat_adapter.py", line 38, in __call__
    return super().__call__(lm, lm_kwargs, signature, demos, inputs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/.venv/lib/python3.11/site-packages/dspy/adapters/base.py", line 155, in __call__
    outputs = lm(messages=inputs, **lm_kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/.venv/lib/python3.11/site-packages/dspy/utils/callback.py", line 326, in sync_wrapper
    return fn(instance, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/.venv/lib/python3.11/site-packages/dspy/clients/base_lm.py", line 86, in __call__
    outputs = self._process_lm_response(response, prompt, messages, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/.venv/lib/python3.11/site-packages/dspy/clients/base_lm.py", line 55, in _process_lm_response
    outputs = self._process_response(response)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/.venv/lib/python3.11/site-packages/dspy/clients/base_lm.py", line 233, in _process_response
    output_item_type = output_item.type
                       ^^^^^^^^^^^^^^^^
AttributeError: 'dict' object has no attribute 'type'
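
The structured-output path dies inside _process_response: with the web_search tool enabled, the Responses API output list appears to contain plain dicts (e.g. the web search call items), so attribute access on output_item.type raises the AttributeError above. A minimal sketch of dict-tolerant access, purely illustrative (the helper name is made up; this is not the actual DSPy code):

def _get_output_item_type(output_item):
    # Web-search output items can come back as plain dicts rather than typed
    # objects, so fall back to key lookup when attribute access would fail.
    if isinstance(output_item, dict):
        return output_item.get("type")
    return getattr(output_item, "type", None)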

And later, when it falls back to JSON mode, the request is rejected because the OpenAI API does not allow web search to be combined with JSON mode:

litellm.BadRequestError: OpenAIException - {
  "error": {
    "message": "Web Search cannot be used with JSON mode.",
    "type": "invalid_request_error",
    "param": "response_format",
    "code": null
  }
}

Steps to reproduce

# uv pip install git+https://github.com/stanfordnlp/dspy.git[dev]
import os
import dspy
from datasets import load_dataset

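# LM using the Responses API with the web_search tool; this configuration triggers the failure.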
gpt5_high_search = dspy.LM(
    "openai/gpt-5",
    model_type="responses",
    api_key=os.getenv("OPENAI_API_KEY"),
    api_base=os.getenv("OPENAI_BASE_URL"),
    temperature=1.0,
    max_tokens=128000,
    tools=[{"type": "web_search"}],
    reasoning={"effort": "high"},
)

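# Same model without web search; used by the judge (Verifier) and as the GEPA reflection LM.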
gpt5_high = dspy.LM(
    "openai/gpt-5",
    model_type="responses",
    api_key=os.getenv("OPENAI_API_KEY"),
    api_base=os.getenv("OPENAI_BASE_URL"),
    temperature=1.0,
    max_tokens=128000,
    reasoning={"effort": "high"},
)

class Response(dspy.Signature):
    """
    You are a biomedical expert. You must attempt to answer the question below with a correct conclusion.
    """
    question = dspy.InputField(desc="The hard biological question to answer.")
    answer = dspy.OutputField(desc="The answer to the hard biological question.")

class Judge(dspy.Signature):
    """Judge whether the following response to question is correct or not based on the precise and unambiguous correct_answer.

Your judgement must be in the format and criteria specified below:

correct: Answer 'yes' if the generated response matches the correct_answer, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there is any inconsistency, ambiguity, non-equivalence, or if the generated answer is incorrect.
    """
    question = dspy.InputField(desc="The hard biological question to answer.")
    response = dspy.InputField(desc="The generated response.")
    correct_answer = dspy.InputField(desc="The correct answer, ground truth.")
    issues = dspy.OutputField(desc="The specific issues with the answer.")
    correct = dspy.OutputField(desc="'yes' if correct, otherwise 'no'")

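# The generator answers with the web-search LM; the verifier judges with the plain LM, each selected via dspy.context.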
class Generator(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate = dspy.Predict(Response)
    
    def forward(self, question):
        with dspy.context(lm=gpt5_high_search):
            result = self.generate(question=question)
        
        return result

class Verifier(dspy.Module):
    def __init__(self):
        super().__init__()
        self.verify = dspy.Predict(Judge)
    
    def forward(self, question, response, correct_answer):
        with dspy.context(lm=gpt5_high):
            result = self.verify(question=question, response=response, correct_answer=correct_answer)
        
        return result

def init_dataset():
    ds = load_dataset("casperhansen/pmc-oa-markdown-qa")
    train_split = [
        dspy.Example(
            {
                "question": x["question"],
                "answer": x["answer"],
            }
        ).with_inputs("question")
        for x in ds["train"]
    ]
    test_split = [
        dspy.Example(
            {
                "question": x["question"],
                "answer": x["answer"],
            }
        ).with_inputs("question")
        for x in ds["test"]
    ]

    train_set = train_split[:int(0.8 * len(ds["train"]))]
    val_set = train_split[int(0.8 * len(ds["train"])):]
    test_set = test_split

    return train_set, val_set, test_set

train_set, val_set, test_set = init_dataset()
generator = Generator()
verifier = Verifier()

def metric(
    example: dspy.Example,
    prediction: dspy.Prediction,
    trace=None,
    pred_name=None,
    pred_trace=None,
) -> dspy.Prediction:
    try:
        judgement: dspy.Prediction = verifier(
            question=example.question,
            response=prediction.answer,
            correct_answer=example.answer,
        )

        if judgement.correct.lower() == "yes":
            score = 1.0
        else:
            score = 0.0

        return dspy.Prediction(
            score=score,
            feedback=judgement.issues,
        )
    except Exception as ex:
        print(ex)
        return dspy.Prediction(
            score=0.0,
            feedback="Error during metric computation",
        )

def single_run():
    pred = generator(**train_set[0].inputs())
    output = verifier(train_set[0].question, pred.answer, train_set[0].answer)

    print(pred)
    print("="*80)
    print(output)

print(">>>>> eval on generator")

evaluator = dspy.Evaluate(
    devset=test_set,
    metric=metric,
    num_threads=100,
    display_progress=True,
    max_errors=1,
    provide_traceback=True,
)

evaluator(generator)

optimizer = dspy.GEPA(
    auto="light",
    metric=metric,
    reflection_lm=gpt5_high,
    # increasing batch size can lead to worse performance due to increased context size
    reflection_minibatch_size=5,
    use_merge=True,
    max_merge_invocations=10,
    num_threads=100,
)

print(">>>>> training")

optimized_generator = optimizer.compile(
    generator,
    trainset=train_set,
    valset=val_set,
)

optimized_generator.save("optimized_generator.json")

print(">>>>> eval on optimized generator")

evaluator(optimized_generator)
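
A much smaller script should hit the same code path. This is a hedged sketch (the QA signature and the question are illustrative and not part of the run above), assuming only dspy from main and OPENAI_API_KEY in the environment:

import dspy

lm = dspy.LM(
    "openai/gpt-5",
    model_type="responses",
    temperature=1.0,
    max_tokens=16000,
    tools=[{"type": "web_search"}],
)

class QA(dspy.Signature):
    """Answer the question."""
    question = dspy.InputField()
    answer = dspy.OutputField()

with dspy.context(lm=lm):
    # Expected: the structured-output warning, then the JSON-mode BadRequestError.
    print(dspy.Predict(QA)(question="Which enzyme converts angiotensin I to angiotensin II?"))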

DSPy version

latest on main branch (e9c36ab)
