LLM Response Mocking
MockServer can mock LLM API responses from any major provider using the httpLlmResponse action. You describe the completion in a provider-neutral format and MockServer encodes it into the correct wire format for the target provider — headers, JSON structure, streaming framing, and all. This lets you test AI-powered applications deterministically, without calling a real LLM.
- Basic completion mock
- Supported providers
- Streaming responses
- Tool calls
- Multi-turn conversations
- Session isolation
- Cost budget
- Chaos / fault injection
- Streaming — client examples (all languages)
Basic Completion Mock
Create an expectation with an httpLlmResponse action. The provider tells MockServer which wire format to produce; the completion describes what to return.
The Java client has a typed fluent API (LlmMockBuilder + HttpLlmResponse). All other clients send the expectation as raw JSON using their generic expectation method (mockAnyResponse in JavaScript; a direct PUT to /mockserver/expectation in other languages).
import static org.mockserver.client.LlmMockBuilder.llmMock;
import static org.mockserver.model.Completion.completion;
import static org.mockserver.model.Provider.OPENAI;
import static org.mockserver.model.Usage.usage;
// OpenAI — POST /v1/chat/completions
llmMock("/v1/chat/completions")
.withProvider(OPENAI)
.withModel("gpt-4o")
.respondingWith(
completion()
.withText("MockServer is an open-source HTTP mock server.")
.withStopReason("stop")
.withUsage(usage().withInputTokens(12).withOutputTokens(8))
)
.applyTo(mockServerClient);
var mockServerClient = require('mockserver-client').mockServerClient;
// OpenAI — POST /v1/chat/completions
mockServerClient("localhost", 1080).mockAnyResponse({
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "MockServer is an open-source HTTP mock server.",
"stopReason": "stop",
"usage": { "inputTokens": 12, "outputTokens": 8 }
}
}
}).then(
function () { console.log("expectation created"); },
function (error) { console.log(error); }
);
import requests
# OpenAI — POST /v1/chat/completions
requests.put(
"http://localhost:1080/mockserver/expectation",
json={
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "MockServer is an open-source HTTP mock server.",
"stopReason": "stop",
"usage": {"inputTokens": 12, "outputTokens": 8}
}
}
}
)
require 'net/http'
require 'json'
# OpenAI — POST /v1/chat/completions
uri = URI('http://localhost:1080/mockserver/expectation')
http = Net::HTTP.new(uri.host, uri.port)
req = Net::HTTP::Put.new(uri.path, 'Content-Type' => 'application/json')
req.body = JSON.generate({
'httpRequest' => { 'method' => 'POST', 'path' => '/v1/chat/completions' },
'httpLlmResponse' => {
'provider' => 'OPENAI',
'model' => 'gpt-4o',
'completion' => {
'text' => 'MockServer is an open-source HTTP mock server.',
'stopReason' => 'stop',
'usage' => { 'inputTokens' => 12, 'outputTokens' => 8 }
}
}
})
http.request(req)
package main
import (
"bytes"
"encoding/json"
"net/http"
)
// OpenAI — POST /v1/chat/completions
// createLlmExpectation registers a non-streaming OpenAI chat-completion mock
// with the MockServer instance listening on localhost:1080.
//
// The expectation matches POST /v1/chat/completions and answers with an
// httpLlmResponse action. The function panics if the expectation cannot be
// encoded or sent, so that a broken test setup fails loudly instead of
// silently running against a server with no expectation registered.
func createLlmExpectation() {
	body, err := json.Marshal(map[string]interface{}{
		"httpRequest": map[string]interface{}{
			"method": "POST",
			"path":   "/v1/chat/completions",
		},
		"httpLlmResponse": map[string]interface{}{
			"provider": "OPENAI",
			"model":    "gpt-4o",
			"completion": map[string]interface{}{
				"text":       "MockServer is an open-source HTTP mock server.",
				"stopReason": "stop",
				"usage":      map[string]int{"inputTokens": 12, "outputTokens": 8},
			},
		},
	})
	if err != nil {
		panic(err)
	}

	req, err := http.NewRequest(http.MethodPut,
		"http://localhost:1080/mockserver/expectation", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	// The body must be closed even though it is not read; otherwise the
	// underlying connection is leaked and cannot be reused.
	defer resp.Body.Close()
}
using System.Net.Http;
using System.Text;
using System.Text.Json;
// OpenAI — POST /v1/chat/completions
var expectation = new
{
httpRequest = new { method = "POST", path = "/v1/chat/completions" },
httpLlmResponse = new
{
provider = "OPENAI",
model = "gpt-4o",
completion = new
{
text = "MockServer is an open-source HTTP mock server.",
stopReason = "stop",
usage = new { inputTokens = 12, outputTokens = 8 }
}
}
};
using var client = new HttpClient();
var json = JsonSerializer.Serialize(expectation);
await client.PutAsync(
"http://localhost:1080/mockserver/expectation",
new StringContent(json, Encoding.UTF8, "application/json"));
use serde_json::json;
// OpenAI — POST /v1/chat/completions
let client = reqwest::blocking::Client::new();
client.put("http://localhost:1080/mockserver/expectation")
.json(&json!({
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "MockServer is an open-source HTTP mock server.",
"stopReason": "stop",
"usage": { "inputTokens": 12, "outputTokens": 8 }
}
}
}))
.send()
.unwrap();
<?php
// OpenAI — POST /v1/chat/completions
$expectation = [
'httpRequest' => ['method' => 'POST', 'path' => '/v1/chat/completions'],
'httpLlmResponse' => [
'provider' => 'OPENAI',
'model' => 'gpt-4o',
'completion' => [
'text' => 'MockServer is an open-source HTTP mock server.',
'stopReason' => 'stop',
'usage' => ['inputTokens' => 12, 'outputTokens' => 8],
],
],
];
$ch = curl_init('http://localhost:1080/mockserver/expectation');
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT');
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($expectation));
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
curl_exec($ch);
curl_close($ch);
# OpenAI — POST /v1/chat/completions
curl -v -X PUT "http://localhost:1080/mockserver/expectation" -d '{
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "MockServer is an open-source HTTP mock server.",
"stopReason": "stop",
"usage": { "inputTokens": 12, "outputTokens": 8 }
}
}
}'
# Anthropic — POST /v1/messages
curl -v -X PUT "http://localhost:1080/mockserver/expectation" -d '{
"httpRequest": {
"method": "POST",
"path": "/v1/messages"
},
"httpLlmResponse": {
"provider": "ANTHROPIC",
"model": "claude-sonnet-4-20250514",
"completion": {
"text": "MockServer is an open-source HTTP mock server.",
"stopReason": "end_turn",
"usage": { "inputTokens": 12, "outputTokens": 8 }
}
}
}'
When a request matches POST /v1/chat/completions, MockServer returns an OpenAI-formatted JSON response with the specified text, model, stop reason, and token usage — including the correct id, created, and object fields that the OpenAI SDK expects.
Supported Providers
Each provider value produces the correct API response format for that provider's SDK:
| Provider | Typical API path | Notes |
|---|---|---|
| OPENAI | /v1/chat/completions | Chat Completions API format |
| OPENAI_RESPONSES | /v1/responses | OpenAI Responses API format |
| ANTHROPIC | /v1/messages | Anthropic Messages API format |
| GEMINI | /v1beta/models/{model}:generateContent | Google Gemini format |
| BEDROCK | /model/{model}/converse | AWS Bedrock Converse API; streaming uses application/vnd.amazon.eventstream binary framing |
| AZURE_OPENAI | /openai/deployments/{deployment}/chat/completions | Azure-hosted OpenAI format (delegates to OpenAI codec) |
| OLLAMA | /api/chat | Ollama local model API format |
Streaming Responses
Set streaming to true on the completion to return a stream instead of a single JSON response. MockServer splits the text into token-sized chunks and emits them in the provider's native streaming format (SSE for most providers; NDJSON for Ollama; binary application/vnd.amazon.eventstream framing for Bedrock).
Use streamingPhysics to control timing — useful for testing loading indicators, timeouts, and backpressure handling:
{
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "This response is streamed token by token.",
"streaming": true,
"streamingPhysics": {
"timeToFirstToken": {
"timeUnit": "MILLISECONDS",
"value": 200
},
"tokensPerSecond": 50,
"jitter": 0.1,
"seed": 42
},
"usage": {
"inputTokens": 10,
"outputTokens": 8
}
}
}
}
| Field | Description |
|---|---|
| timeToFirstToken | Delay before the first streamed chunk (e.g. the first SSE event) is sent |
| tokensPerSecond | Base token emission rate (1 – 10000) |
| jitter | Fractional uniform deviation from the base rate (0.0 – 1.0) |
| seed | PRNG seed for reproducible inter-token timing |
Tool Calls
To mock an LLM response that invokes tools (function calling), add toolCalls to the completion:
{
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"toolCalls": [
{
"id": "call_abc123",
"name": "get_weather",
"arguments": "{\"location\": \"London\"}"
}
],
"stopReason": "tool_use"
}
}
}
Multi-Turn Conversations
For agent testing, you often need to script a sequence of LLM responses that depend on what the agent sent. MockServer supports this through conversation predicates combined with scenario state to create multi-turn conversation flows.
Each turn uses conversationPredicates to match against the conversation history in the request body, and scenario state to track which turn the conversation is on:
[
{
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "I'll look up the weather for you.",
"toolCalls": [
{
"id": "call_1",
"name": "get_weather",
"arguments": "{\"location\": \"London\"}"
}
],
"stopReason": "tool_use"
},
"conversationPredicates": {
"turnIndex": 0,
"latestMessageContains": "weather"
}
},
"times": { "remainingTimes": 1 }
},
{
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "The weather in London is 18C and sunny.",
"stopReason": "stop"
},
"conversationPredicates": {
"containsToolResultFor": "get_weather"
}
},
"times": { "remainingTimes": 1 }
}
]
Conversation predicates
Predicates match against the parsed conversation in the request body (decoded using the provider's message format):
| Predicate | Description |
|---|---|
| turnIndex | Match when the assistant turn count equals this value (0-based) |
| latestMessageContains | Match when the last message contains this substring |
| latestMessageMatches | Match when the last message matches this regex pattern |
| latestMessageRole | Match when the last message has this role (e.g. user, tool) |
| containsToolResultFor | Match when the conversation contains a tool result for this tool name |
| semanticMatchAgainst | Opt-in, exploratory: the expected meaning the latest message should express, judged by a runtime LLM. Off by default — ignored unless mockserver.llmSemanticMatchingEnabled is set and a backend resolves. Non-deterministic; for exploration only, never for CI assertions. |
Prompt normalisation
Agent prompts are dynamically assembled, so exact-byte matching can be brittle. Add a normalization object to the predicates to apply deterministic transforms before matching:
"conversationPredicates": {
"latestMessageContains": "search for weather",
"normalization": {
"collapseWhitespace": true,
"lowercase": true,
"sortJsonKeys": true
}
}
Session Isolation
When multiple agents or test threads share a MockServer instance, each conversation needs its own independent state. Use an isolation source to extract a session key from each request (a header, query parameter, or cookie) so conversation state is tracked per session.
Session isolation is configured via the Java client’s conversation builder (isolateBy(IsolationSource.header("x-session-id"))) or the create_llm_conversation MCP tool’s isolateBy parameter. The isolation source is encoded into the expectation’s scenario name — it is not a separate field in the raw expectation JSON. Common isolation sources:
- Header: extract the session key from a request header (e.g. x-session-id)
- Query parameter: extract from a URL query parameter
- Cookie: extract from a cookie value
Example using the create_llm_conversation MCP tool:
{
"provider": "OPENAI",
"path": "/v1/chat/completions",
"isolateBy": {
"source": "header",
"name": "x-session-id"
},
"turns": [
{
"match": { "turnIndex": 0 },
"response": { "text": "Hello!", "stopReason": "stop" }
}
]
}
Each unique isolation key value creates an independent conversation state machine, so multiple agents can run conversations in parallel without interfering with each other.
Cost Budget
When using MockServer as a proxy in front of a real LLM provider, the llmCostBudgetUsd property sets a cumulative cost ceiling. Once the estimated cost of all forwarded LLM completions exceeds the budget, further LLM forwards are blocked with a 429 response. This is a safety net for CI pipelines or development environments where runaway agents could generate unexpected charges.
| Property | Env var | Default | Description |
|---|---|---|---|
| mockserver.llmCostBudgetUsd | MOCKSERVER_LLM_COST_BUDGET_USD | -1.0 (disabled) | Cumulative cost budget in USD. Set to a positive value to enable; negative or unset means no limit. |
The budget is enforced on all forward paths (matched forward actions, proxy-pass, and reverse-proxy routes) and resets on PUT /mockserver/reset. Cost estimation uses an internal pricing table and is approximate — treat it as a safety guard, not an invoice.
Chaos / Fault Injection
Add a chaos object to the httpLlmResponse to test how your application handles LLM failures:
{
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "Normal response text",
"streaming": true
},
"chaos": {
"errorStatus": 503,
"errorProbability": 0.3,
"truncateMode": "MID_STREAM",
"truncateAtFraction": 0.5,
"malformedSse": true
}
}
}
| Field | Description |
|---|---|
| errorStatus | Return this HTTP status as a provider error (e.g. 503, 429). With no errorProbability, always fires. |
| retryAfter | Value for the Retry-After header on an injected error (e.g. "30"). Useful alongside errorStatus: 429 to test retry-after handling. |
| errorProbability | Probability (0.0 – 1.0) that the error fires on each request |
| truncateMode | NONE (default) or MID_STREAM. Must be set to MID_STREAM for truncateAtFraction to take effect. |
| truncateAtFraction | For streaming responses, cut the stream short at this fraction of SSE events (0.0 – 1.0, default 0.5). Only applies when truncateMode is MID_STREAM. |
| malformedSse | Inject a deliberately broken-JSON SSE chunk |
| quotaName / quotaLimit / quotaWindowMillis | Deterministic fixed-window rate limit |
| quotaErrorStatus | HTTP status returned when the quota is exceeded (default 429). Must be between 100 and 599. |
| seed | PRNG seed for reproducible probabilistic faults |
See Chaos Testing & Fault Injection for more about chaos profiles.
Streaming — Client Examples
These examples show how to create a streaming LLM expectation with realistic timing physics. The Java client uses the typed fluent API; all other clients send the expectation as raw JSON.
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.mockserver.client.Llm.jitter;
import static org.mockserver.client.Llm.timeToFirstToken;
import static org.mockserver.client.Llm.tokensPerSecond;
import static org.mockserver.client.LlmMockBuilder.llmMock;
import static org.mockserver.model.Completion.completion;
import static org.mockserver.model.Provider.OPENAI;
llmMock("/v1/chat/completions")
.withProvider(OPENAI)
.withModel("gpt-4o")
.respondingWith(
completion()
.withText("This response is streamed token by token.")
.streaming()
.withStreamingPhysics(
timeToFirstToken(200, MILLISECONDS),
tokensPerSecond(50),
jitter(0.1))
)
.applyTo(mockServerClient);
var mockServerClient = require('mockserver-client').mockServerClient;
mockServerClient("localhost", 1080).mockAnyResponse({
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "This response is streamed token by token.",
"streaming": true,
"streamingPhysics": {
"timeToFirstToken": { "timeUnit": "MILLISECONDS", "value": 200 },
"tokensPerSecond": 50,
"jitter": 0.1,
"seed": 42
}
}
}
}).then(
function () { console.log("expectation created"); },
function (error) { console.log(error); }
);
import requests
requests.put(
"http://localhost:1080/mockserver/expectation",
json={
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "This response is streamed token by token.",
"streaming": True,
"streamingPhysics": {
"timeToFirstToken": {"timeUnit": "MILLISECONDS", "value": 200},
"tokensPerSecond": 50,
"jitter": 0.1,
"seed": 42
}
}
}
}
)
require 'net/http'
require 'json'
uri = URI('http://localhost:1080/mockserver/expectation')
http = Net::HTTP.new(uri.host, uri.port)
req = Net::HTTP::Put.new(uri.path, 'Content-Type' => 'application/json')
req.body = JSON.generate({
'httpRequest' => { 'method' => 'POST', 'path' => '/v1/chat/completions' },
'httpLlmResponse' => {
'provider' => 'OPENAI',
'model' => 'gpt-4o',
'completion' => {
'text' => 'This response is streamed token by token.',
'streaming' => true,
'streamingPhysics' => {
'timeToFirstToken' => { 'timeUnit' => 'MILLISECONDS', 'value' => 200 },
'tokensPerSecond' => 50,
'jitter' => 0.1,
'seed' => 42
}
}
}
})
http.request(req)
package main
import (
"bytes"
"encoding/json"
"net/http"
)
// createStreamingLlmExpectation registers a streaming OpenAI chat-completion
// mock with the MockServer instance listening on localhost:1080.
//
// The expectation matches POST /v1/chat/completions and answers with a
// streamed httpLlmResponse whose timing is shaped by streamingPhysics
// (time to first token, tokens per second, jitter and a fixed seed). The
// function panics if the expectation cannot be encoded or sent, so that a
// broken test setup fails loudly instead of silently running against a
// server with no expectation registered.
func createStreamingLlmExpectation() {
	body, err := json.Marshal(map[string]interface{}{
		"httpRequest": map[string]interface{}{
			"method": "POST",
			"path":   "/v1/chat/completions",
		},
		"httpLlmResponse": map[string]interface{}{
			"provider": "OPENAI",
			"model":    "gpt-4o",
			"completion": map[string]interface{}{
				"text":      "This response is streamed token by token.",
				"streaming": true,
				"streamingPhysics": map[string]interface{}{
					"timeToFirstToken": map[string]interface{}{
						"timeUnit": "MILLISECONDS", "value": 200,
					},
					"tokensPerSecond": 50,
					"jitter":          0.1,
					"seed":            42,
				},
			},
		},
	})
	if err != nil {
		panic(err)
	}

	req, err := http.NewRequest(http.MethodPut,
		"http://localhost:1080/mockserver/expectation", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	// The body must be closed even though it is not read; otherwise the
	// underlying connection is leaked and cannot be reused.
	defer resp.Body.Close()
}
using System.Net.Http;
using System.Text;
using System.Text.Json;
var expectation = new
{
httpRequest = new { method = "POST", path = "/v1/chat/completions" },
httpLlmResponse = new
{
provider = "OPENAI",
model = "gpt-4o",
completion = new
{
text = "This response is streamed token by token.",
streaming = true,
streamingPhysics = new
{
timeToFirstToken = new { timeUnit = "MILLISECONDS", value = 200 },
tokensPerSecond = 50,
jitter = 0.1,
seed = 42
}
}
}
};
using var client = new HttpClient();
var json = JsonSerializer.Serialize(expectation);
await client.PutAsync(
"http://localhost:1080/mockserver/expectation",
new StringContent(json, Encoding.UTF8, "application/json"));
use serde_json::json;
let client = reqwest::blocking::Client::new();
client.put("http://localhost:1080/mockserver/expectation")
.json(&json!({
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "This response is streamed token by token.",
"streaming": true,
"streamingPhysics": {
"timeToFirstToken": { "timeUnit": "MILLISECONDS", "value": 200 },
"tokensPerSecond": 50,
"jitter": 0.1,
"seed": 42
}
}
}
}))
.send()
.unwrap();
<?php
$expectation = [
'httpRequest' => ['method' => 'POST', 'path' => '/v1/chat/completions'],
'httpLlmResponse' => [
'provider' => 'OPENAI',
'model' => 'gpt-4o',
'completion' => [
'text' => 'This response is streamed token by token.',
'streaming' => true,
'streamingPhysics' => [
'timeToFirstToken' => ['timeUnit' => 'MILLISECONDS', 'value' => 200],
'tokensPerSecond' => 50,
'jitter' => 0.1,
'seed' => 42,
],
],
],
];
$ch = curl_init('http://localhost:1080/mockserver/expectation');
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, 'PUT');
curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($expectation));
curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
curl_exec($ch);
curl_close($ch);
curl -v -X PUT "http://localhost:1080/mockserver/expectation" -d '{
"httpRequest": {
"method": "POST",
"path": "/v1/chat/completions"
},
"httpLlmResponse": {
"provider": "OPENAI",
"model": "gpt-4o",
"completion": {
"text": "This response is streamed token by token.",
"streaming": true,
"streamingPhysics": {
"timeToFirstToken": { "timeUnit": "MILLISECONDS", "value": 200 },
"tokensPerSecond": 50,
"jitter": 0.1,
"seed": 42
}
}
}
}'
Java Client API (Simple Completion)
The Java client provides a typed fluent API for creating LLM expectations without hand-assembling JSON:
import static org.mockserver.model.HttpRequest.request;
import static org.mockserver.model.HttpLlmResponse.llmResponse;
import static org.mockserver.model.Completion.completion;
import static org.mockserver.model.Provider.OPENAI;

// Respond to POST /v1/chat/completions with an OpenAI-formatted completion.
// `mockServerClient` is an already-constructed MockServerClient instance.
mockServerClient
    .when(
        request()
            .withMethod("POST")
            .withPath("/v1/chat/completions")
    )
    .respond(
        llmResponse()
            // OPENAI is statically imported above, matching the other Java examples.
            .withProvider(OPENAI)
            .withModel("gpt-4o")
            .withCompletion(
                completion()
                    .withText("Hello from MockServer!")
                    .withStopReason("stop")
            )
    );
Related Pages
- Creating Expectations — full request matcher and action reference
- Chaos Testing & Fault Injection — HTTP and LLM chaos profiles
- MCP Tools Reference — mock_llm_completion and create_llm_conversation tools for AI agents
- Inspect AI Agent Traffic — proxy and inspect real LLM API traffic
- Observability — LLM token/cost Prometheus metrics and OpenTelemetry GenAI spans