"""Ollama HTTP renderer for the OOC→IC translation layer.
``OllamaRenderer`` is a thin, synchronous wrapper around the Ollama
``/api/chat`` endpoint. It is the only place in the translation layer
that makes a network call.
Sync vs async
-------------
The renderer uses the synchronous ``requests`` library (already a pinned
dependency at ``requests==2.32.5``). The ``GameEngine`` is fully
synchronous, and FastAPI runs sync endpoint handlers inside a thread-pool
executor, so a blocking HTTP call here does not stall the event loop.
When the engine is eventually asyncified the upgrade path is:
1. Replace ``requests.post`` with ``await httpx.AsyncClient().post``.
2. Mark ``render`` as ``async def``.
3. Mark ``OOCToICTranslationService.translate`` as ``async def``.
4. Propagate ``await`` up through ``engine.chat/yell/whisper``.
``httpx`` is already in the project dependencies (``>=0.28.1``) so no
new dep is required at that point.
Request structure
-----------------
The ``/api/chat`` payload includes a top-level ``keep_alive`` field
(default ``"5m"``) that tells Ollama how long to keep the model loaded
in memory after the request completes. Without this field Ollama uses
its server default (typically 5 minutes), but after a cold-start the
model may be unloaded before the next request arrives, causing a full
reload on every call. Setting ``keep_alive`` explicitly avoids this.
Deterministic mode
------------------
When ``set_deterministic(seed_int)`` is called (by the service, after
deriving a seed from the IPC hash), temperature is clamped to 0.0 and
the seed is forwarded to Ollama's ``options.seed`` field.
IPC hash sourcing (FUTURE — axis engine integration)
----------------------------------------------------
``set_deterministic`` will be called from ``OOCToICTranslationService``
once the axis engine passes a concrete ``ipc_hash`` through
``service.translate(..., ipc_hash=ipc_hash)``. The service converts the
first 16 hex characters of the hash to an integer::
seed_int = int(ipc_hash[:16], 16)
Until then ``set_deterministic`` is never called and the renderer uses
the configured temperature from ``TranslationLayerConfig``.
"""
from __future__ import annotations
import logging
import requests
logger = logging.getLogger(__name__)
# Temperature used when deterministic mode is not active.
_DEFAULT_TEMPERATURE = 0.7
# Conservative token ceiling for a single line of dialogue.
_DEFAULT_NUM_PREDICT = 128
[docs]
class OllamaRenderer:
"""Synchronous renderer that calls the Ollama ``/api/chat`` endpoint.
One ``OllamaRenderer`` instance is created per ``OOCToICTranslationService``
and reused across all translation calls. The renderer is *stateful* in
one way only: deterministic mode can be armed via ``set_deterministic``,
which persists for the lifetime of the object. This is by design — the
axis engine arms it at the start of a deterministic turn and the service
then calls ``render`` for each character in that turn.
Attributes:
_api_endpoint: Full ``/api/chat`` URL.
_model: Ollama model tag (e.g. ``"gemma2:2b"``).
_timeout: HTTP request timeout in seconds.
_keep_alive: Ollama ``keep_alive`` duration string (e.g.
``"5m"``). Controls how long the model stays
loaded in GPU/CPU memory after each request.
_temperature: Sampling temperature; clamped to 0.0 in deterministic
mode.
_seed: Integer seed forwarded to Ollama when deterministic;
``None`` when non-deterministic.
"""
def __init__(
self,
*,
api_endpoint: str,
model: str,
timeout_seconds: float,
temperature: float = _DEFAULT_TEMPERATURE,
keep_alive: str = "5m",
) -> None:
"""Initialise the renderer.
Args:
api_endpoint: Full Ollama ``/api/chat`` URL.
model: Ollama model tag.
timeout_seconds: HTTP request timeout.
temperature: Default sampling temperature.
keep_alive: Ollama ``keep_alive`` duration string.
Controls how long the model stays loaded
after each request. ``"5m"`` (default)
keeps it warm for 5 minutes; ``"0"``
unloads immediately.
"""
self._api_endpoint = api_endpoint
self._model = model
self._timeout = timeout_seconds
self._keep_alive = keep_alive
self._temperature: float = temperature
self._seed: int | None = None
# ── Deterministic mode ────────────────────────────────────────────────────
[docs]
def set_deterministic(self, seed_int: int) -> None:
"""Arm deterministic mode for subsequent ``render`` calls.
Clamps temperature to 0.0 and stores the seed so that identical
inputs produce identical outputs across runs. This is called by
``OOCToICTranslationService`` when a non-``None`` ``ipc_hash`` is
provided and ``config.deterministic`` is ``True``.
The seed is derived from the IPC hash *by the service*, not here,
to keep hashing logic out of the renderer.
Args:
seed_int: Integer seed forwarded to Ollama's ``options.seed``.
"""
self._temperature = 0.0
self._seed = seed_int
logger.debug("OllamaRenderer: deterministic mode armed (seed=%d)", seed_int)
# ── Primary render method ─────────────────────────────────────────────────
[docs]
def render(self, system_prompt: str, user_message: str) -> str | None:
"""Call Ollama and return the raw response content.
Builds the Ollama request payload, executes a synchronous POST, and
returns the ``message.content`` string from the JSON response.
Returns ``None`` on any network-level failure (timeout, connection
error, non-2xx status). Content-level validation (PASSTHROUGH
sentinel, multi-line output, etc.) is handled by ``OutputValidator``.
Args:
system_prompt: The fully-rendered system prompt (with character
profile injected).
user_message: The original OOC message (used as the ``user``
turn so the model sees both context and input).
Returns:
Raw LLM output string on success, ``None`` on failure.
"""
payload = self._build_payload(system_prompt, user_message)
try:
response = requests.post(
self._api_endpoint,
json=payload,
timeout=self._timeout,
)
response.raise_for_status()
data = response.json()
return data.get("message", {}).get("content", "").strip() or None
except requests.exceptions.Timeout:
logger.warning(
"OllamaRenderer: request timed out after %.1fs (endpoint=%s)",
self._timeout,
self._api_endpoint,
)
return None
except requests.exceptions.ConnectionError:
logger.warning(
"OllamaRenderer: cannot connect to Ollama at %s",
self._api_endpoint,
)
return None
except requests.exceptions.RequestException as exc:
logger.error("OllamaRenderer: request failed: %s", exc)
return None
# ── Internal helpers ──────────────────────────────────────────────────────
def _build_payload(self, system_prompt: str, user_message: str) -> dict:
"""Construct the Ollama ``/api/chat`` request payload.
``stream`` is always ``False`` — we want the full response in a
single JSON object rather than a server-sent-event stream.
``keep_alive`` is included at the top level to control how long
Ollama keeps the model loaded after responding.
Args:
system_prompt: Rendered system prompt text.
user_message: OOC message text.
Returns:
Dict ready to be serialised as the POST body.
"""
options: dict = {
"temperature": self._temperature,
"num_predict": _DEFAULT_NUM_PREDICT,
}
if self._seed is not None:
options["seed"] = self._seed
return {
"model": self._model,
"stream": False,
"keep_alive": self._keep_alive,
"messages": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_message},
],
"options": options,
}