refactor: add markdown frontend

2026-05-20 21:30:10 +02:00
parent a7827b2fa6
commit 3dc780c74c
15 changed files with 1721 additions and 141 deletions
@@ -0,0 +1,187 @@
+"""Parse a markdown chat file into Anthropic ``MessageParam`` history.
+
+The file format is documented in ``frontends/markdown/__init__.py``:
+``### User:`` / ``### Assistant:`` H3 headers split turns, optional
+``---`` HRs between turns are visual-only, ``> [!thinking]-`` and
+``> [!tool]- <name>`` callouts mark structured assistant content.
+
+For backend consumption we strip thinking and tool_use callouts —
+assistant turns become text-only. Rationale: history replay through
+claude-code's JSONL injection only needs the *narrated* answer (the
+thinking signatures expire and the original tool_results aren't
+captured in the renderer's output, so a faithful tool_use round-trip
+isn't possible today). The renderer keeps callouts in the file because
+they're informational for the human reader; the parser drops them when
+shaping the backend's input.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import frontmatter
+
+if TYPE_CHECKING:
+    from anthropic.types import MessageParam
+
+
+__all__ = ["ParsedFile", "last_role", "parse", "resolve_agent"]
+
+
+# Turn marker — must be exactly ``### User:`` or ``### Assistant:`` on
+# its own line. Trailing whitespace is tolerated, but nothing else may
+# follow the colon: a line such as ``### User: hello`` is deliberately
+# NOT a marker and stays part of the surrounding text (we would rather
+# miss a look-alike header than silently fold inline content into a
+# new turn).
+#
+# ``[^\S\n]`` means "whitespace other than a newline". Plain ``\s``
+# would let the pattern run across line breaks, so a bare ``###`` line
+# followed by a ``User:`` line would be accepted as a marker. ``\r`` is
+# still allowed before the line end, so CRLF text keeps matching.
+_TURN_RE = re.compile(r"^###[^\S\n]+(User|Assistant):[^\S\n]*$", re.MULTILINE)
+
+# Callout-start lines we strip from assistant turns when extracting
+# text. Applied with ``.match`` to one line at a time, so ``>`` must be
+# the first character of the line. We don't try to parse the contents —
+# for backend input we just need to drop the whole quoted block.
+_CALLOUT_START_RE = re.compile(r"^>\s+\[!(thinking|tool)\]")
+
+
+@dataclass(frozen=True, slots=True)
+class ParsedFile:
+    """Result of parsing a single chat ``.md``.
+
+    ``metadata`` is the YAML frontmatter as a plain dict (empty if the
+    file has none). ``messages`` is the conversation history shaped for
+    ``Backend.complete`` — assistant turns are text-only. ``body`` is the
+    raw markdown content *after* the frontmatter is stripped; the
+    renderer needs it when it appends a new assistant turn so it can
+    preserve whatever the human typed verbatim (including any callouts
+    or HRs they added).
+
+    The dataclass is frozen, which only prevents rebinding the three
+    attributes; the dict and list they refer to remain ordinary mutable
+    containers.
+    """
+
+    # Frontmatter key/value pairs; an empty dict when the file has none.
+    metadata: dict[str, Any]
+    # Markdown content with the frontmatter block removed, otherwise raw.
+    body: str
+    # Turns in file order; every entry's ``content`` is a plain string.
+    messages: list[MessageParam]
+
+
+def parse(text: str) -> ParsedFile:
+    """Turn the text of a chat ``.md`` into a ``ParsedFile``.
+
+    The frontmatter is split off first; the remaining body is cut into
+    turns at the ``### User:`` / ``### Assistant:`` markers. User turns
+    lose their whole-line ``---`` separators, assistant turns
+    additionally lose thinking/tool callouts, and a turn that ends up
+    empty contributes no message.
+
+    A body without any marker is the "type into a fresh file and hit
+    send" case: if it is non-blank, the whole body becomes one user
+    message. Once at least one marker exists, text ahead of the first
+    marker is not part of any turn and does not reach ``messages``.
+    """
+    document = frontmatter.loads(text)
+    metadata = dict(document.metadata)
+    body = document.content
+    history: list[MessageParam] = []
+
+    segments = _split_turns(body)
+    if segments:
+        for role, chunk in segments:
+            if role == "user":
+                cleaned = _strip_hrs(chunk).strip()
+                if cleaned:
+                    history.append({"role": "user", "content": cleaned})
+                continue
+            # The marker pattern only admits two roles, so anything
+            # that is not a user turn is an assistant turn.
+            cleaned = _extract_assistant_text(chunk)
+            if cleaned:
+                history.append({"role": "assistant", "content": cleaned})
+    else:
+        # Marker-less file: the entire body is a single user message.
+        lone = body.strip()
+        if lone:
+            history.append({"role": "user", "content": lone})
+
+    return ParsedFile(metadata=metadata, body=body, messages=history)
+
+
+def last_role(messages: list[MessageParam]) -> str | None:
+    """Report the ``role`` of the final message, or ``None`` if there is none."""
+    return messages[-1]["role"] if messages else None
+
+
+def resolve_agent(
+    *, metadata: dict[str, Any], request_override: str | None, default: str | None
+) -> str | None:
+    """Pick the agent name to use for this chat.
+
+    Sources are consulted in order and the first usable one wins:
+
+    1. ``request_override`` — used whenever it is truthy.
+    2. ``metadata["agent"]`` — used only if it is a non-empty string.
+    3. ``default`` — returned as-is, so the result is ``None`` when no
+       source supplies a name (the caller answers with a 400).
+    """
+    if request_override:
+        return request_override
+
+    from_frontmatter = metadata.get("agent")
+    # Frontmatter is free-form YAML: ignore non-string or empty values.
+    usable = isinstance(from_frontmatter, str) and from_frontmatter != ""
+    return from_frontmatter if usable else default
+
+
+# ---- internals ---------------------------------------------------------
+
+
+def _split_turns(body: str) -> list[tuple[str, str]]:
+    """Cut ``body`` at every turn marker into ``(role, raw_text)`` pairs.
+
+    ``role`` is the marker's captured name lower-cased. ``raw_text``
+    runs from the end of the marker match to the start of the next
+    marker, or to the end of ``body`` for the final turn, so the marker
+    line itself is excluded. Text ahead of the first marker belongs to
+    no turn and is not returned. Nothing is trimmed here — whitespace
+    handling differs per role and is left to the caller. A body without
+    markers yields an empty list.
+    """
+    markers = list(_TURN_RE.finditer(body))
+    # Each turn stops where the following marker starts; the last one
+    # stops at the end of the text.
+    stops = [following.start() for following in markers[1:]]
+    stops.append(len(body))
+    return [
+        (marker.group(1).lower(), body[marker.end() : stop])
+        for marker, stop in zip(markers, stops)
+    ]
+
+
+# Opening/closing line of a fenced code block: up to three spaces of
+# indent, then a run of three or more backticks or tildes. Group 1 is
+# the fence run, group 2 is whatever follows it on the same line.
+_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")
+
+
+def _fenced_line_indices(lines: list[str]) -> set[int]:
+    """Return the indices of lines that belong to *closed* fenced code blocks.
+
+    Both fence lines and everything between them are included. A fence
+    is closed by a later line that uses the same fence character, is at
+    least as long as the opener, and carries nothing but whitespace
+    after the run. An opener that is never closed protects nothing, so
+    malformed input is handled exactly as if the fence were ordinary
+    text.
+    """
+    protected: set[int] = set()
+    opened_at: int | None = None
+    opener = ""
+    for idx, line in enumerate(lines):
+        found = _FENCE_RE.match(line)
+        if found is None:
+            continue
+        run, rest = found.group(1), found.group(2)
+        if opened_at is None:
+            # A backtick fence may not have backticks in its info
+            # string; such a line is inline code, not an opener.
+            if run[0] == "`" and "`" in rest:
+                continue
+            opened_at, opener = idx, run
+        elif run[0] == opener[0] and len(run) >= len(opener) and not rest.strip():
+            protected.update(range(opened_at, idx + 1))
+            opened_at = None
+    return protected
+
+
+def _strip_hrs(raw: str) -> str:
+    """Drop decorative ``---`` separator lines (whole-line HRs only).
+
+    A line is removed when it is exactly ``---`` after trimming
+    surrounding whitespace — the form the renderer emits between turns.
+    A ``---`` embedded in a longer line is left alone, and so is every
+    line inside a closed fenced code block: there ``---`` is content
+    (for example a YAML document separator), not decoration.
+
+    The result is re-joined with ``\\n``, so line endings are normalised
+    and a trailing newline is not preserved.
+    """
+    lines = raw.splitlines()
+    fenced = _fenced_line_indices(lines)
+    return "\n".join(
+        ln for idx, ln in enumerate(lines) if idx in fenced or ln.strip() != "---"
+    )
+
+
+def _extract_assistant_text(raw: str) -> str:
+    """Strip thinking/tool callouts from an assistant turn, return spoken text.
+
+    Walks the turn line by line:
+
+    * Lines inside a closed fenced code block are kept verbatim, so code
+      that merely *shows* a callout, an ``---`` line or several blank
+      lines is not altered.
+    * A callout-start line (``> [!thinking]...`` or ``> [!tool]...`` at
+      the start of the line) drops that line and every following line
+      whose first non-blank character is ``>``. The first line that
+      does not start with ``>`` — a blank line included — ends the
+      callout and is processed normally.
+    * Whole-line ``---`` separators are dropped.
+    * Runs of empty lines left behind are collapsed to a single empty
+      line.
+
+    The remaining lines are joined with ``\\n`` and trimmed.
+    """
+    lines = raw.splitlines()
+    fenced = _fenced_line_indices(lines)
+    kept: list[str] = []
+    i = 0
+    while i < len(lines):
+        line = lines[i]
+        if i in fenced:
+            kept.append(line)
+            i += 1
+            continue
+        if _CALLOUT_START_RE.match(line):
+            # Skip the whole quote block: the start line plus all
+            # consecutive ``>`` lines. The line that ends the block is
+            # left for the next iteration.
+            while i < len(lines) and lines[i].lstrip().startswith(">"):
+                i += 1
+            continue
+        if line.strip() == "---":
+            i += 1
+            continue
+        if line == "" and kept and kept[-1] == "":
+            # Stripping a callout or HR leaves empty lines on both
+            # sides; keep only one of them.
+            i += 1
+            continue
+        kept.append(line)
+        i += 1
+    return "\n".join(kept).strip()