Files
beaver-gateway/src/beaver_gateway/frontends/markdown/parser.py
T
2026-05-20 21:30:10 +02:00

188 lines
6.9 KiB
Python

"""Parse a markdown chat file into Anthropic ``MessageParam`` history.
The file format is documented in ``frontends/markdown/__init__.py``:
``### User:`` / ``### Assistant:`` H3 headers split turns, optional
``---`` HRs between turns are visual-only, ``> [!thinking]-`` and
``> [!tool]- <name>`` callouts mark structured assistant content.
For backend consumption we strip thinking and tool_use callouts —
assistant turns become text-only. Rationale: history replay through
claude-code's JSONL injection only needs the *narrated* answer (the
thinking signatures expire and the original tool_results aren't
captured in the renderer's output, so a faithful tool_use round-trip
isn't possible today). The renderer keeps callouts in the file because
they're informational for the human reader; the parser drops them when
shaping the backend's input.
"""
from __future__ import annotations
import re
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any
import frontmatter
if TYPE_CHECKING:
from anthropic.types import MessageParam
__all__ = ["ParsedFile", "last_role", "parse", "resolve_agent"]
# Turn marker — must be exactly ``### User:`` or ``### Assistant:`` on
# its own line. Trailing whitespace tolerated; nothing after the colon
# on the same line (any inline content would mean the user typed
# something that just happens to look like a header, and we'd rather
# misparse than silently fold inline content into a turn).
#
# The gap between ``###`` and the role name is limited to spaces/tabs:
# ``\s+`` would also match a newline, so an empty ``###`` heading
# followed by a line reading ``User:`` would be mistaken for a marker.
# The trailing ``\s*`` is kept so that a ``\r`` left over from CRLF line
# endings is still accepted before ``$``.
_TURN_RE = re.compile(r"^###[ \t]+(User|Assistant):\s*$", re.MULTILINE)
# Callout-start lines we strip from assistant turns when extracting
# text. We don't try to parse the contents — for backend input we just
# need to drop the whole quoted block.
# The pattern only recognises the opening line: ``>`` in column 0, at
# least one whitespace character, then ``[!thinking]`` or ``[!tool]``.
# Whatever follows the closing bracket (fold marker, tool name) is not
# inspected. It is applied with ``.match()`` to one line at a time.
_CALLOUT_START_RE = re.compile(r"^>\s+\[!(thinking|tool)\]")
@dataclass(frozen=True, slots=True)
class ParsedFile:
"""Result of parsing a single chat ``.md``.
``metadata`` is the YAML frontmatter as a plain dict (empty if the
file has none). ``messages`` is the conversation history shaped for
``Backend.complete`` — assistant turns are text-only. ``body`` is the
raw markdown content *after* the frontmatter is stripped; the
renderer needs it when it appends a new assistant turn so it can
preserve whatever the human typed verbatim (including any callouts
or HRs they added).
"""
metadata: dict[str, Any]
body: str
messages: list[MessageParam]
def parse(text: str) -> ParsedFile:
    """Turn the text of a chat ``.md`` into a ``ParsedFile``.

    The frontmatter is split off first; the remaining body is cut into
    turns at the ``### User:`` / ``### Assistant:`` markers. User turns
    have their ``---`` separator lines removed and are trimmed;
    assistant turns are reduced to their spoken text. Turns that end up
    empty are left out of the history.

    A body without any turn marker is treated as one user turn (if it is
    not blank) — the friendly path for "user types into a new file and
    hits send" before any markers exist. When at least one marker is
    present, text in front of the first marker belongs to no turn and
    does not appear in ``messages``; it is still part of ``body``.
    """
    post = frontmatter.loads(text)
    metadata = dict(post.metadata)
    body = post.content
    history: list[MessageParam] = []

    turns = _split_turns(body)
    if turns:
        for role, chunk in turns:
            from_user = role == "user"
            cleaned = (
                _strip_hrs(chunk).strip()
                if from_user
                else _extract_assistant_text(chunk)
            )
            if not cleaned:
                # Nothing left after cleanup: skip the turn entirely.
                continue
            if from_user:
                history.append({"role": "user", "content": cleaned})
            else:
                history.append({"role": "assistant", "content": cleaned})
    else:
        # No markers at all: the whole body is the user's first message.
        whole = body.strip()
        if whole:
            history.append({"role": "user", "content": whole})

    return ParsedFile(metadata=metadata, body=body, messages=history)
def last_role(messages: list[MessageParam]) -> str | None:
    """Report who spoke last: the final message's role, or ``None`` if there are no messages."""
    return messages[-1]["role"] if messages else None
def resolve_agent(
    *, metadata: dict[str, Any], request_override: str | None, default: str | None
) -> str | None:
    """Pick the agent that should handle this chat.

    Sources are consulted in order of precedence:

    1. ``request_override`` — taken from the request body, if non-empty;
    2. ``metadata["agent"]`` — the frontmatter value, if it is a
       non-empty string;
    3. ``default`` — the frontend default, returned as-is.

    The result is ``None`` when nothing applies — the caller responds
    with 400 in that case.
    """
    if request_override:
        return request_override
    candidate = metadata.get("agent")
    # Non-string or empty frontmatter values are ignored, not coerced.
    return candidate if isinstance(candidate, str) and candidate else default
# ---- internals ---------------------------------------------------------
def _split_turns(body: str) -> list[tuple[str, str]]:
    """Cut ``body`` at its turn markers into ``[(role_lc, raw_body), ...]``.

    Each turn's text starts right after its marker line and runs up to
    the start of the next marker (or to EOF for the last turn); the
    marker line itself is not included. The role is lower-cased. No
    whitespace trimming happens here — that is decided per role by the
    caller. Returns an empty list when the body has no markers.
    """
    markers = list(_TURN_RE.finditer(body))
    # A turn stops where the following marker begins; the final turn
    # stops at the end of the body.
    stops = [following.start() for following in markers[1:]]
    stops.append(len(body))
    return [
        (marker.group(1).lower(), body[marker.end():stop])
        for marker, stop in zip(markers, stops)
    ]
def _strip_hrs(raw: str) -> str:
    """Drop decorative ``---`` separator lines (whole-line HRs only).

    A ``---`` mid-paragraph (rare, but possible) stays. Only lines that
    are *exactly* the HR after optional surrounding whitespace are
    removed — those are the ones the renderer emits between turns.

    Lines inside a fenced code block (opened by a run of three or more
    backticks or tildes) are kept verbatim: a ``---`` there is content —
    a YAML document marker, pasted frontmatter — not a separator. An
    unterminated fence extends to the end of the input.

    The kept lines are re-joined with a single newline character, so the
    original line-ending style and a trailing newline are not preserved.
    """
    kept: list[str] = []
    fence = ""  # the opening fence run (e.g. "```") while inside a code block
    for ln in raw.splitlines():
        stripped = ln.strip()
        if fence:
            # A closing fence uses the same character, is at least as
            # long as the opener, and has nothing else on the line.
            if stripped.startswith(fence) and not stripped.strip(fence[0]):
                fence = ""
        elif stripped.startswith(("```", "~~~")):
            run = stripped[: len(stripped) - len(stripped.lstrip(stripped[0]))]
            # Backticks after the opening run mean inline code on one
            # line, not the start of a fenced block.
            if run[0] == "~" or "`" not in stripped[len(run):]:
                fence = run
        elif stripped == "---":
            continue
        kept.append(ln)
    return "\n".join(kept)
def _extract_assistant_text(raw: str) -> str:
    """Strip thinking/tool callouts from an assistant turn, return spoken text.

    Walks the turn line by line:

    * A callout-start line (``> [!thinking]-`` or ``> [!tool]- ...``,
      with ``>`` in column 0) and the contiguous run of ``>``-prefixed
      lines after it are dropped. The callout ends at the first line
      that does not start with ``>`` — so a blank line ends it — and
      that line is then processed normally.
    * Whole-line ``---`` HRs are dropped.
    * Lines inside a fenced code block (opened by a run of three or
      more backticks or tildes) are kept verbatim, so a ``---`` or a
      ``> [!tool]`` line that is part of quoted code survives. An
      unterminated fence extends to the end of the turn.

    The kept lines are joined with newlines, runs of three or more
    newlines are collapsed to a single blank line, and the result is
    trimmed.
    """
    out_lines: list[str] = []
    fence = ""  # the opening fence run (e.g. "```") while inside a code block
    in_callout = False
    for line in raw.splitlines():
        stripped = line.strip()
        if in_callout:
            if stripped.startswith(">"):
                continue
            # First non-quote line: the callout is over and this line
            # falls through to the normal handling below.
            in_callout = False
        if fence:
            # A closing fence uses the same character, is at least as
            # long as the opener, and has nothing else on the line.
            if stripped.startswith(fence) and not stripped.strip(fence[0]):
                fence = ""
            out_lines.append(line)
            continue
        if stripped.startswith(("```", "~~~")):
            run = stripped[: len(stripped) - len(stripped.lstrip(stripped[0]))]
            # Backticks after the opening run mean inline code on one
            # line, not the start of a fenced block.
            if run[0] == "~" or "`" not in stripped[len(run):]:
                fence = run
            out_lines.append(line)
            continue
        if _CALLOUT_START_RE.match(line):
            in_callout = True
            continue
        if stripped == "---":
            continue
        out_lines.append(line)
    # Collapse runs of blank lines that callout-stripping creates
    # (two newlines around a stripped block fold into one).
    text_joined = "\n".join(out_lines)
    text_joined = re.sub(r"\n{3,}", "\n\n", text_joined)
    return text_joined.strip()