beaver-gateway/src/beaver_gateway/frontends/markdown/parser.py

"""Parse a markdown chat file into Anthropic ``MessageParam`` history.

The file format is documented in ``frontends/markdown/__init__.py``:
``### User:`` / ``### Assistant:`` H3 headers split turns, optional
``---`` HRs between turns are visual-only, ``> [!thinking]-`` and
``> [!tool]- <name>`` callouts mark structured assistant content.

For backend consumption we strip thinking and tool_use callouts —
assistant turns become text-only. Rationale: history replay through
claude-code's JSONL injection only needs the *narrated* answer (the
thinking signatures expire and the original tool_results aren't
captured in the renderer's output, so a faithful tool_use round-trip
isn't possible today). The renderer keeps callouts in the file because
they're informational for the human reader; the parser drops them when
shaping the backend's input.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import frontmatter

if TYPE_CHECKING:
    from anthropic.types import MessageParam


# Public API of this module. Everything prefixed with an underscore
# below is an internal helper.
__all__ = ["ParsedFile", "last_role", "parse", "resolve_agent"]


# Turn marker — a line that is exactly ``### User:`` or
# ``### Assistant:`` (capitalised as shown; the match is
# case-sensitive). ``re.MULTILINE`` lets ``^``/``$`` anchor on every
# line of the body, and group 1 captures the role name. Trailing
# whitespace after the colon is tolerated, but any other inline content
# is not: such a line is deliberately *not* recognised as a marker, so
# text that merely looks like a header stays inside the surrounding
# turn instead of silently starting a new one.
_TURN_RE = re.compile(r"^###\s+(User|Assistant):\s*$", re.MULTILINE)

# First line of a ``[!thinking]`` / ``[!tool]`` callout: ``>`` in the
# first column, at least one whitespace character, then the callout
# tag. Nothing after the closing ``]`` is inspected (fold marker, tool
# name, ...). We don't try to parse callout contents — for backend
# input the whole quoted block is simply dropped.
_CALLOUT_START_RE = re.compile(r"^>\s+\[!(thinking|tool)\]")


@dataclass(frozen=True, slots=True)
class ParsedFile:
    """Result of parsing a single chat ``.md``.

    ``metadata`` is the YAML frontmatter as a plain dict (empty if the
    file has none). ``messages`` is the conversation history shaped for
    ``Backend.complete`` — assistant turns are text-only. ``body`` is the
    raw markdown content *after* the frontmatter is stripped; the
    renderer needs it when it appends a new assistant turn so it can
    preserve whatever the human typed verbatim (including any callouts
    or HRs they added).
    """

    metadata: dict[str, Any]
    body: str
    messages: list[MessageParam]


def parse(text: str) -> ParsedFile:
    """Turn the text of a chat ``.md`` file into a ``ParsedFile``.

    The frontmatter is split off first; the remaining body is cut into
    turns at the ``### User:`` / ``### Assistant:`` markers. User turns
    lose their decorative ``---`` lines, assistant turns are reduced to
    their spoken text, and turns that end up empty are skipped.

    A body without any turn marker is the friendly "type into a new
    file and hit send" path: if it is non-blank, the whole body becomes
    a single user message. When markers are present, text that precedes
    the first marker belongs to no turn and is not included in
    ``messages`` (it is still part of ``body``).
    """
    document = frontmatter.loads(text)
    metadata = dict(document.metadata)
    content = document.content
    history: list[MessageParam] = []

    segments = _split_turns(content)
    if segments:
        for role, chunk in segments:
            if role == "user":
                spoken = _strip_hrs(chunk).strip()
                if spoken:
                    history.append({"role": "user", "content": spoken})
                continue
            # Anything that is not a user turn is an assistant turn.
            spoken = _extract_assistant_text(chunk)
            if spoken:
                history.append({"role": "assistant", "content": spoken})
    else:
        # No markers at all: treat the entire body as one user message.
        sole = content.strip()
        if sole:
            history.append({"role": "user", "content": sole})

    return ParsedFile(metadata=metadata, body=content, messages=history)


def last_role(messages: list[MessageParam]) -> str | None:
    """Role of the final message, or ``None`` when there are no messages."""
    return messages[-1]["role"] if messages else None


def resolve_agent(
    *, metadata: dict[str, Any], request_override: str | None, default: str | None
) -> str | None:
    """Pick the agent that should answer this chat.

    Sources are consulted in priority order and the first usable one
    wins:

    1. ``request_override`` — from the request body, if non-empty.
    2. ``metadata["agent"]`` — from the frontmatter, if it is a
       non-empty string.
    3. ``default`` — the frontend default.

    Returns ``None`` when no source yields an agent; the caller
    responds with 400 in that case.
    """
    if request_override:
        return request_override
    from_frontmatter = metadata.get("agent")
    usable = isinstance(from_frontmatter, str) and bool(from_frontmatter)
    return from_frontmatter if usable else default


# ---- internals ---------------------------------------------------------


def _split_turns(body: str) -> list[tuple[str, str]]:
    """Cut ``body`` at its turn markers into ``[(role, raw_text), ...]``.

    ``role`` is the marker's role name lower-cased (``"user"`` or
    ``"assistant"``). ``raw_text`` is everything after the marker line
    up to the start of the next marker, or to the end of ``body`` for
    the last turn; the marker line itself is excluded. No whitespace is
    trimmed here — each role does its own clean-up. Returns an empty
    list when ``body`` contains no marker.

    NOTE(review): markers are searched across the whole body, so a
    marker-looking line inside a fenced code block also starts a turn.
    """
    markers = list(_TURN_RE.finditer(body))
    # Every turn stops where the following marker begins; the final
    # turn runs to the end of the body.
    stops = [following.start() for following in markers[1:]]
    stops.append(len(body))
    return [
        (marker.group(1).lower(), body[marker.end() : stop])
        for marker, stop in zip(markers, stops)
    ]


# Opening line of a fenced code block: optional indentation, a run of
# three or more backticks or tildes, then the (possibly empty) info
# string.
_FENCE_OPEN_RE = re.compile(r"[ \t]*(`{3,}|~{3,})(.*)")


def _closing_fence_index(lines: list[str], start: int) -> int | None:
    """Return the index of the line that closes the fence opened at ``start``.

    Returns ``None`` when ``lines[start]`` does not open a fenced code
    block, or when the fence is never closed. Callers treat the
    unclosed case as ordinary text so that a truncated reply still has
    its trailing separator stripped.
    """
    opened = _FENCE_OPEN_RE.match(lines[start])
    if opened is None:
        return None
    marker, info = opened.groups()
    fence_char = marker[0]
    # A backtick "fence" whose info string contains a backtick is an
    # inline code span (e.g. ```x```), not a block opener.
    if fence_char == "`" and "`" in info:
        return None
    for idx in range(start + 1, len(lines)):
        candidate = lines[idx].strip()
        # A closer is made only of the fence character and is at least
        # as long as the opener.
        if len(candidate) >= len(marker) and not candidate.strip(fence_char):
            return idx
    return None


def _strip_hrs(raw: str) -> str:
    """Drop decorative ``---`` separator lines (whole-line HRs only).

    Only lines that are *exactly* ``---`` after trimming surrounding
    whitespace are removed — those are the ones the renderer emits
    between turns. A ``---`` that shares its line with other text
    stays. Lines inside a closed fenced code block are content, not
    decoration (think of a pasted YAML document), so such blocks are
    copied through untouched.

    Returns the remaining lines joined with ``"\\n"``.
    """
    lines = raw.splitlines()
    kept: list[str] = []
    i = 0
    while i < len(lines):
        fence_end = _closing_fence_index(lines, i)
        if fence_end is not None:
            # Copy the whole fenced block verbatim, fences included.
            kept.extend(lines[i : fence_end + 1])
            i = fence_end + 1
            continue
        if lines[i].strip() != "---":
            kept.append(lines[i])
        i += 1
    return "\n".join(kept)


def _extract_assistant_text(raw: str) -> str:
    """Strip thinking/tool callouts from an assistant turn, return spoken text.

    Walks the turn line by line:

    * A closed fenced code block is copied through verbatim, so code
      samples keep their ``---`` lines, quoted lines and blank runs.
    * A callout-start line (``> [!thinking]-`` or ``> [!tool]- ...``)
      and every directly following line that begins with ``>`` (after
      optional indentation) is dropped. The first line that does not
      begin with ``>`` — a blank line included — ends the callout.
    * Whole-line ``---`` HRs are dropped.
    * Runs of empty lines, typically left behind by a stripped callout,
      are folded into a single empty line.

    The surviving lines are joined with ``"\\n"`` and trimmed.
    """
    lines = raw.splitlines()
    out_lines: list[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        fence_end = _closing_fence_index(lines, i)
        if fence_end is not None:
            out_lines.extend(lines[i : fence_end + 1])
            i = fence_end + 1
            continue
        if _CALLOUT_START_RE.match(line):
            # Skip the whole quote block; the first non-``>`` line is
            # left for the next iteration of the outer loop.
            while i < len(lines) and lines[i].lstrip().startswith(">"):
                i += 1
            continue
        if line.strip() == "---":
            i += 1
            continue
        if line == "" and out_lines and out_lines[-1] == "":
            # Collapse consecutive empty lines outside code fences.
            i += 1
            continue
        out_lines.append(line)
        i += 1
    return "\n".join(out_lines).strip()