# Source code for mud_server.translation.renderer

"""Ollama HTTP renderer for the OOC→IC translation layer.

``OllamaRenderer`` is a thin, synchronous wrapper around the Ollama
``/api/chat`` endpoint.  It is the only place in the translation layer
that makes a network call.

Sync vs async
-------------
The renderer uses the synchronous ``requests`` library (already a pinned
dependency at ``requests==2.32.5``).  The ``GameEngine`` is fully
synchronous, and FastAPI runs sync endpoint handlers inside a thread-pool
executor, so a blocking HTTP call here does not stall the event loop.

When the engine is eventually asyncified the upgrade path is:
1. Replace ``requests.post`` with ``await httpx.AsyncClient().post``.
2. Mark ``render`` as ``async def``.
3. Mark ``OOCToICTranslationService.translate`` as ``async def``.
4. Propagate ``await`` up through ``engine.chat/yell/whisper``.
``httpx`` is already in the project dependencies (``>=0.28.1``) so no
new dep is required at that point.

Request structure
-----------------
The ``/api/chat`` payload includes a top-level ``keep_alive`` field
(default ``"5m"``) that tells Ollama how long to keep the model loaded
in memory after the request completes.  Without this field Ollama uses
its server default (typically 5 minutes), but after a cold-start the
model may be unloaded before the next request arrives, causing a full
reload on every call.  Setting ``keep_alive`` explicitly avoids this.

Deterministic mode
------------------
When ``set_deterministic(seed_int)`` is called (by the service, after
deriving a seed from the IPC hash), temperature is clamped to 0.0 and
the seed is forwarded to Ollama's ``options.seed`` field.

IPC hash sourcing (FUTURE — axis engine integration)
----------------------------------------------------
``set_deterministic`` will be called from ``OOCToICTranslationService``
once the axis engine passes a concrete ``ipc_hash`` through
``service.translate(..., ipc_hash=ipc_hash)``.  The service converts the
first 16 hex characters of the hash to an integer::

    seed_int = int(ipc_hash[:16], 16)

Until then ``set_deterministic`` is never called and the renderer uses
the configured temperature from ``TranslationLayerConfig``.
"""

from __future__ import annotations

import logging

import requests

# Module-level logger; every renderer diagnostic (timeouts, connection
# failures, deterministic-mode arming) is emitted through it.
logger = logging.getLogger(__name__)

# Temperature used when deterministic mode is not active.
# Serves as the default for ``OllamaRenderer.__init__``'s ``temperature``
# argument; ``set_deterministic`` overrides the instance value with 0.0.
_DEFAULT_TEMPERATURE: float = 0.7

# Conservative token ceiling for a single line of dialogue.
# Sent as ``options.num_predict`` in every request payload built by
# ``OllamaRenderer._build_payload``.
_DEFAULT_NUM_PREDICT: int = 128


class OllamaRenderer:
    """Synchronous renderer that calls the Ollama ``/api/chat`` endpoint.

    One ``OllamaRenderer`` instance is created per
    ``OOCToICTranslationService`` and reused across all translation calls.

    The renderer is *stateful* in one way only: deterministic mode can be
    armed via ``set_deterministic``, which persists for the lifetime of the
    object.  This is by design — the axis engine arms it at the start of a
    deterministic turn and the service then calls ``render`` for each
    character in that turn.

    Attributes:
        _api_endpoint: Full ``/api/chat`` URL.
        _model: Ollama model tag (e.g. ``"gemma2:2b"``).
        _timeout: HTTP request timeout in seconds.
        _keep_alive: Ollama ``keep_alive`` duration string (e.g. ``"5m"``).
            Controls how long the model stays loaded in GPU/CPU memory
            after each request.
        _temperature: Sampling temperature; clamped to 0.0 in deterministic
            mode.
        _seed: Integer seed forwarded to Ollama when deterministic;
            ``None`` when non-deterministic.
    """

    def __init__(
        self,
        *,
        api_endpoint: str,
        model: str,
        timeout_seconds: float,
        temperature: float = _DEFAULT_TEMPERATURE,
        keep_alive: str = "5m",
    ) -> None:
        """Initialise the renderer.

        Args:
            api_endpoint: Full Ollama ``/api/chat`` URL.
            model: Ollama model tag.
            timeout_seconds: HTTP request timeout.
            temperature: Default sampling temperature.
            keep_alive: Ollama ``keep_alive`` duration string.  Controls how
                long the model stays loaded after each request.  ``"5m"``
                (default) keeps it warm for 5 minutes; ``"0"`` unloads
                immediately.
        """
        self._api_endpoint = api_endpoint
        self._model = model
        self._timeout = timeout_seconds
        self._keep_alive = keep_alive
        self._temperature: float = temperature
        # No seed until deterministic mode is armed.
        self._seed: int | None = None

    # ── Deterministic mode ────────────────────────────────────────────────────

    def set_deterministic(self, seed_int: int) -> None:
        """Arm deterministic mode for subsequent ``render`` calls.

        Clamps temperature to 0.0 and stores the seed so that identical
        inputs produce identical outputs across runs.

        This is called by ``OOCToICTranslationService`` when a non-``None``
        ``ipc_hash`` is provided and ``config.deterministic`` is ``True``.
        The seed is derived from the IPC hash *by the service*, not here, to
        keep hashing logic out of the renderer.

        Args:
            seed_int: Integer seed forwarded to Ollama's ``options.seed``.
        """
        self._temperature = 0.0
        self._seed = seed_int
        logger.debug("OllamaRenderer: deterministic mode armed (seed=%d)", seed_int)

    # ── Primary render method ─────────────────────────────────────────────────

    def render(self, system_prompt: str, user_message: str) -> str | None:
        """Call Ollama and return the raw response content.

        Builds the Ollama request payload, executes a synchronous POST, and
        returns the ``message.content`` string from the JSON response.

        Returns ``None`` on any network-level failure (timeout, connection
        error, non-2xx status), when the response body does not have the
        expected ``{"message": {"content": <str>}}`` shape, and when the
        content is empty after stripping whitespace.  Content-level
        validation (PASSTHROUGH sentinel, multi-line output, etc.) is
        handled by ``OutputValidator``.

        Args:
            system_prompt: The fully-rendered system prompt (with character
                profile injected).
            user_message: The original OOC message (used as the ``user``
                turn so the model sees both context and input).

        Returns:
            Raw LLM output string on success, ``None`` on failure.
        """
        payload = self._build_payload(system_prompt, user_message)

        try:
            response = requests.post(
                self._api_endpoint,
                json=payload,
                timeout=self._timeout,
            )
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.Timeout:
            # Must precede ConnectionError: a connect timeout is both.
            logger.warning(
                "OllamaRenderer: request timed out after %.1fs (endpoint=%s)",
                self._timeout,
                self._api_endpoint,
            )
            return None
        except requests.exceptions.ConnectionError:
            logger.warning(
                "OllamaRenderer: cannot connect to Ollama at %s",
                self._api_endpoint,
            )
            return None
        except requests.exceptions.RequestException as exc:
            logger.error("OllamaRenderer: request failed: %s", exc)
            return None

        # A 2xx response can still carry JSON of the wrong shape (a top-level
        # list, ``"message": null``, ``"content": null``).  Chaining ``.get``
        # and ``.strip`` blindly would raise AttributeError into the caller,
        # breaking the "None on failure" contract, so validate each level.
        message = data.get("message", {}) if isinstance(data, dict) else None
        content = message.get("content", "") if isinstance(message, dict) else None
        if not isinstance(content, str):
            logger.warning(
                "OllamaRenderer: malformed response body from %s",
                self._api_endpoint,
            )
            return None

        # Whitespace-only output is treated the same as no output.
        return content.strip() or None

    # ── Internal helpers ──────────────────────────────────────────────────────

    def _build_payload(self, system_prompt: str, user_message: str) -> dict:
        """Construct the Ollama ``/api/chat`` request payload.

        ``stream`` is always ``False`` — we want the full response in a
        single JSON object rather than a server-sent-event stream.

        ``keep_alive`` is included at the top level to control how long
        Ollama keeps the model loaded after responding.

        Args:
            system_prompt: Rendered system prompt text.
            user_message: OOC message text.

        Returns:
            Dict ready to be serialised as the POST body.
        """
        options: dict = {
            "temperature": self._temperature,
            "num_predict": _DEFAULT_NUM_PREDICT,
        }
        # The seed is only sent once deterministic mode has been armed.
        if self._seed is not None:
            options["seed"] = self._seed

        return {
            "model": self._model,
            "stream": False,
            "keep_alive": self._keep_alive,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
            "options": options,
        }