feat: add streaming to markdown, fix raycast mcps exposing

2026-05-21 13:52:48 +02:00
parent 7fc0c9c0b1
commit 11f061070f
6 changed files with 557 additions and 99 deletions
@@ -29,10 +29,12 @@ import json
 import logging
 import os
 import tempfile
+import time
 from pathlib import Path
 from typing import TYPE_CHECKING, Any

 import aiofile
+from anthropic.types import RawContentBlockStopEvent
 from fastapi import FastAPI, HTTPException, Request, status
 from fastapi.responses import JSONResponse

@@ -46,7 +48,7 @@ from beaver_gateway.core.conversation_store import (
    rewrite_messages,
 )
 from beaver_gateway.core.turn_record import TurnRecord
-from beaver_gateway.frontends._accumulate import accumulate
+from beaver_gateway.frontends._accumulate import StreamAccumulator
 from beaver_gateway.frontends._auth import require_token
 from beaver_gateway.frontends.base import Frontend
 from beaver_gateway.frontends.markdown import parser, renderer
@@ -69,6 +71,14 @@ _log = logging.getLogger("beaver_gateway.frontends.markdown")
 __all__ = ["MarkdownFrontend"]


+# Minimum gap, in seconds (compared against ``time.monotonic()`` deltas),
+# between timer-driven re-renders of the assistant turn into the .md file
+# while the backend stream is still open. Trades responsiveness (faster
+# updates to Obsidian sync / Raycast tailers) against write amplification.
+# Each ``RawContentBlockStopEvent`` also forces a flush regardless of the timer.
+_STREAM_FLUSH_DEBOUNCE = 0.4
+
+
 class MarkdownFrontend(Frontend):
    """FastAPI app behind ``POST /chat`` driven by Obsidian-vault files."""

@@ -285,21 +295,23 @@ class MarkdownFrontend(Frontend):
            TurnCapture() if isinstance(backend, ClaudeCodeBackendAdapter) else None
        )

+        kwargs: dict[str, Any] = {}
+        if capture is not None:
+            kwargs["capture"] = capture
+        events = backend.complete(
+            agent=agent, messages=outcome.messages, system=None, **kwargs
+        )
        try:
-            kwargs: dict[str, Any] = {}
-            if capture is not None:
-                kwargs["capture"] = capture
-            events = backend.complete(
-                agent=agent, messages=outcome.messages, system=None, **kwargs
+            message = await self._stream_to_file(
+                events=events,
+                file_path=file_path,
+                parsed=parsed,
+                model=agent.model or agent.name,
+                filename=filename,
            )
-            message = await accumulate(events, model=agent.model or agent.name)
+        except HTTPException:
+            raise
        except Exception as exc:
-            _log.exception("backend failed for %s", filename)
-            error_block = _render_error_block(exc)
-            new_body = renderer.append_to_body(parsed.body, error_block)
-            await _write_atomic(
-                file_path, _reattach_frontmatter(parsed.metadata, new_body)
-            )
            raise HTTPException(
                status.HTTP_500_INTERNAL_SERVER_ERROR, f"backend error: {exc}"
            ) from exc
@@ -346,6 +358,67 @@ class MarkdownFrontend(Frontend):

    # ---- helpers -------------------------------------------------------

+    async def _stream_to_file(
+        self,
+        *,
+        events: Any,
+        file_path: Path,
+        parsed: parser.ParsedFile,
+        model: str,
+        filename: str,
+    ) -> Any:
+        """Drain ``events`` into a ``Message``, flushing partials to disk.
+
+        Flushes happen on each ``RawContentBlockStopEvent`` (natural
+        block boundary, content is markdown-consistent) and whenever at
+        least ``_STREAM_FLUSH_DEBOUNCE`` seconds have elapsed since the
+        previous flush finished. The partial write keeps the as-parsed
+        frontmatter; the post-stream final write in
+        ``_write_assistant_reply`` is what stamps the refreshed
+        fingerprint / agent / conversation_id.
+
+        On exception we still try to flush the last partial and append
+        an error callout, so the human sees both what arrived and why it
+        stopped. That write is best effort: if it fails too, the failure
+        is logged and the *original* exception is the one that
+        propagates, so ``_handle_chat`` maps the real cause to a 500.
+
+        Args:
+            events: Async iterable of backend stream events; each one is
+                fed to a fresh ``StreamAccumulator``.
+            file_path: File rewritten (via ``_write_atomic``) on every
+                flush.
+            parsed: Parsed file; its ``body`` is the base every partial
+                is appended to and its ``metadata`` is re-attached as
+                frontmatter.
+            model: Forwarded to ``StreamAccumulator.finalize``.
+            filename: Used only in log messages.
+
+        Returns:
+            The result of ``acc.finalize(model=model)`` once the stream
+            is exhausted.
+
+        Raises:
+            Exception: Whatever the stream (or a partial flush) raised,
+                re-raised after the error callout has been attempted.
+        """
+        acc = StreamAccumulator()
+
+        async def flush_partial() -> None:
+            # Re-render from the untouched parsed body every time, so
+            # repeated flushes replace the partial instead of stacking.
+            partial = acc.finalize(model=model)
+            if not partial.content:
+                return
+            rendered = renderer.render_assistant_message(partial)
+            new_body = renderer.append_to_body(parsed.body, rendered)
+            await _write_atomic(
+                file_path, _reattach_frontmatter(parsed.metadata, new_body)
+            )
+
+        try:
+            last_flush = time.monotonic()
+            async for ev in events:
+                acc.feed(ev)
+                if (
+                    isinstance(ev, RawContentBlockStopEvent)
+                    or (time.monotonic() - last_flush) >= _STREAM_FLUSH_DEBOUNCE
+                ):
+                    await flush_partial()
+                    # Stamp *after* the awaited write: a slow write must
+                    # not eat the debounce window and trigger a flush on
+                    # every following event.
+                    last_flush = time.monotonic()
+        except Exception as exc:
+            _log.exception("backend failed for %s", filename)
+            try:
+                partial = acc.finalize(model=model)
+                new_body = parsed.body
+                if partial.content:
+                    new_body = renderer.append_to_body(
+                        new_body, renderer.render_assistant_message(partial)
+                    )
+                new_body = renderer.append_to_body(
+                    new_body, _render_error_block(exc)
+                )
+                await _write_atomic(
+                    file_path, _reattach_frontmatter(parsed.metadata, new_body)
+                )
+            except Exception:
+                # Best effort only: never let a failed callout write
+                # replace the exception that actually ended the stream.
+                _log.exception("could not write error callout for %s", filename)
+            # Re-raises the original ``exc`` (the inner handler has exited).
+            raise
+        return acc.finalize(model=model)
+
    async def _write_assistant_reply(
        self,
        *,