refactor: add markdown frontend
This commit is contained in:
@@ -0,0 +1,187 @@
|
||||
"""Parse a markdown chat file into Anthropic ``MessageParam`` history.
|
||||
|
||||
The file format is documented in ``frontends/markdown/__init__.py``:
|
||||
``### User:`` / ``### Assistant:`` H3 headers split turns, optional
|
||||
``---`` HRs between turns are visual-only, ``> [!thinking]-`` and
|
||||
``> [!tool]- <name>`` callouts mark structured assistant content.
|
||||
|
||||
For backend consumption we strip thinking and tool_use callouts —
|
||||
assistant turns become text-only. Rationale: history replay through
|
||||
claude-code's JSONL injection only needs the *narrated* answer (the
|
||||
thinking signatures expire and the original tool_results aren't
|
||||
captured in the renderer's output, so a faithful tool_use round-trip
|
||||
isn't possible today). The renderer keeps callouts in the file because
|
||||
they're informational for the human reader; the parser drops them when
|
||||
shaping the backend's input.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
import frontmatter
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from anthropic.types import MessageParam
|
||||
|
||||
|
||||
__all__ = ["ParsedFile", "last_role", "parse", "resolve_agent"]
|
||||
|
||||
|
||||
# Turn marker — must be exactly ``### User:`` or ``### Assistant:`` on
# its own line. Trailing horizontal whitespace (and the CR of a CRLF
# line ending) is tolerated; nothing else may follow the colon on the
# same line (any inline content would mean the user typed something that
# just happens to look like a header, and we'd rather misparse than
# silently fold inline content into a turn).
#
# The whitespace classes are ``[^\S\r\n]`` (any whitespace *except* line
# breaks) rather than ``\s``: under ``re.MULTILINE`` a bare ``\s+`` also
# matches newlines, so a lone ``###`` line followed by a ``User:`` line
# would otherwise be accepted as one marker spanning two lines.
_TURN_RE = re.compile(
    r"^###[^\S\r\n]+(User|Assistant):[^\S\r\n]*\r?$", re.MULTILINE
)
|
||||
|
||||
# Recognises the opening line of a ``> [!thinking]`` or ``> [!tool]``
# callout in an assistant turn. Only the start line is matched; the
# callout's contents are never interpreted, because for backend input
# the whole quoted block is simply discarded.
_CALLOUT_START_RE: re.Pattern[str] = re.compile(r"^>\s+\[!(thinking|tool)\]")
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class ParsedFile:
|
||||
"""Result of parsing a single chat ``.md``.
|
||||
|
||||
``metadata`` is the YAML frontmatter as a plain dict (empty if the
|
||||
file has none). ``messages`` is the conversation history shaped for
|
||||
``Backend.complete`` — assistant turns are text-only. ``body`` is the
|
||||
raw markdown content *after* the frontmatter is stripped; the
|
||||
renderer needs it when it appends a new assistant turn so it can
|
||||
preserve whatever the human typed verbatim (including any callouts
|
||||
or HRs they added).
|
||||
"""
|
||||
|
||||
metadata: dict[str, Any]
|
||||
body: str
|
||||
messages: list[MessageParam]
|
||||
|
||||
|
||||
def parse(text: str) -> ParsedFile:
    """Turn the text of a chat ``.md`` into a ``ParsedFile``.

    Frontmatter is split off with ``frontmatter.loads``; the remaining
    body is cut into turns at the ``### User:`` / ``### Assistant:``
    markers. User turns have whole-line ``---`` separators removed,
    assistant turns are reduced to their text, and any turn that is
    empty after trimming is left out of the history.

    When the body contains no turn markers at all, a non-blank body
    becomes a single user message — the friendly path for "user types
    into a new file and hits send" before any markers exist.
    """
    post = frontmatter.loads(text)
    metadata = dict(post.metadata)
    body = post.content

    messages: list[MessageParam] = []
    turns = _split_turns(body)

    if turns:
        for role, raw in turns:
            from_user = role == "user"
            if from_user:
                content = _strip_hrs(raw).strip()
            else:
                content = _extract_assistant_text(raw)
            if not content:
                # Nothing left after trimming/stripping: skip the turn.
                continue
            if from_user:
                messages.append({"role": "user", "content": content})
            else:
                messages.append({"role": "assistant", "content": content})
    else:
        # No markers: the whole body (if any) is one user message.
        lone_message = body.strip()
        if lone_message:
            messages.append({"role": "user", "content": lone_message})

    return ParsedFile(metadata=metadata, body=body, messages=messages)
|
||||
|
||||
|
||||
def last_role(messages: list[MessageParam]) -> str | None:
    """Report which role spoke last in ``messages``.

    Returns the ``"role"`` value of the final entry (``"user"`` or
    ``"assistant"``), or ``None`` when the history is empty.
    """
    return messages[-1]["role"] if messages else None
|
||||
|
||||
|
||||
def resolve_agent(
    *, metadata: dict[str, Any], request_override: str | None, default: str | None
) -> str | None:
    """Pick the agent that should handle this chat.

    Candidates are tried in order of precedence:

    1. ``request_override`` — the request body's explicit choice;
    2. the ``agent`` key of the frontmatter ``metadata``, used only if
       it is a non-empty string;
    3. ``default`` — the frontend's fallback.

    Whatever ``default`` holds is returned when neither of the first two
    applies; a ``None`` result means nothing matched and the caller
    responds with 400.
    """
    if request_override:
        return request_override

    from_frontmatter = metadata.get("agent")
    usable = isinstance(from_frontmatter, str) and bool(from_frontmatter)
    return from_frontmatter if usable else default
|
||||
|
||||
|
||||
# ---- internals ---------------------------------------------------------
|
||||
|
||||
|
||||
def _split_turns(body: str) -> list[tuple[str, str]]:
    """Cut ``body`` at its turn markers into ``[(role, raw_text), ...]``.

    ``role`` is the marker's name lower-cased (``"user"`` or
    ``"assistant"``). ``raw_text`` is everything from the end of that
    marker match up to the start of the next marker, or to the end of
    ``body`` for the last turn; the marker line itself is not included.
    No whitespace is trimmed here — that is done per role by the caller.
    A body without any marker yields an empty list.
    """
    markers = list(_TURN_RE.finditer(body))
    # Each turn stops where the following marker starts; the final turn
    # runs to the end of the body. With no markers the zip is empty.
    stops = [following.start() for following in markers[1:]] + [len(body)]
    return [
        (marker.group(1).lower(), body[marker.end():stop])
        for marker, stop in zip(markers, stops)
    ]
|
||||
|
||||
|
||||
def _strip_hrs(raw: str) -> str:
    """Drop decorative ``---`` separator lines (whole-line HRs only).

    Only lines that are *exactly* ``---`` after trimming surrounding
    whitespace are removed — those are the ones the renderer emits
    between turns. A ``---`` that shares its line with other text stays.

    Lines inside a fenced code block (opened by a run of three or more
    backticks or tildes) are always kept, so a ``---`` that is part of
    quoted code — e.g. a YAML document separator — is not lost. An
    unclosed fence extends to the end of ``raw``.

    Args:
        raw: The raw text of one turn, as sliced out of the file body.

    Returns:
        The remaining lines joined with ``"\\n"``. Line endings are
        normalised by ``str.splitlines`` and a trailing newline is not
        preserved; no other trimming is performed.
    """
    kept: list[str] = []
    fence = ""  # opening fence run (e.g. "```") while inside a code block
    for line in raw.splitlines():
        stripped = line.strip()
        if fence:
            # A closing fence uses the same character, is at least as
            # long as the opener, and has nothing else on the line.
            if stripped.startswith(fence) and not stripped.strip(fence[0]):
                fence = ""
            kept.append(line)
            continue
        marker = stripped[:3]
        if marker in ("```", "~~~"):
            run_len = len(stripped) - len(stripped.lstrip(marker[0]))
            # A backtick fence cannot carry backticks in its info
            # string; such a line is inline code, not a fence opener.
            if marker == "~~~" or "`" not in stripped[run_len:]:
                fence = stripped[:run_len]
        elif stripped == "---":
            continue
        kept.append(line)
    return "\n".join(kept)
|
||||
|
||||
|
||||
def _extract_assistant_text(raw: str) -> str:
    """Strip thinking/tool callouts from an assistant turn, return spoken text.

    The turn is walked line by line:

    * A callout-start line (``> [!thinking]...`` or ``> [!tool]...``, as
      matched by ``_CALLOUT_START_RE``) drops the whole contiguous quote
      block: that line plus every following line that begins with ``>``
      after optional indentation. The first line that does not start
      with ``>`` — a blank line included — ends the callout and is
      processed normally.
    * Whole-line ``---`` HRs are dropped.
    * Lines inside a fenced code block (opened by a run of three or more
      backticks or tildes) are kept as they are, so a ``---`` or a
      ``>``-prefixed line that is part of quoted code is not mistaken
      for a separator or callout. An unclosed fence extends to the end
      of ``raw``.
    * Every other line is kept.

    Args:
        raw: The raw text of one assistant turn.

    Returns:
        The kept lines joined with ``"\\n"``, with runs of three or more
        newlines folded into a single blank line, trimmed of leading and
        trailing whitespace. May be the empty string.
    """
    lines = raw.splitlines()
    kept: list[str] = []
    fence = ""  # opening fence run (e.g. "```") while inside a code block
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        if fence:
            # A closing fence uses the same character, is at least as
            # long as the opener, and has nothing else on the line.
            if stripped.startswith(fence) and not stripped.strip(fence[0]):
                fence = ""
            kept.append(line)
            i += 1
            continue

        if _CALLOUT_START_RE.match(line):
            # Skip the whole quote block; the first non-``>`` line is
            # left for the next iteration of the outer loop.
            while i < len(lines) and lines[i].lstrip().startswith(">"):
                i += 1
            continue

        marker = stripped[:3]
        if marker in ("```", "~~~"):
            run_len = len(stripped) - len(stripped.lstrip(marker[0]))
            # A backtick fence cannot carry backticks in its info
            # string; such a line is inline code, not a fence opener.
            if marker == "~~~" or "`" not in stripped[run_len:]:
                fence = stripped[:run_len]
        elif stripped == "---":
            i += 1
            continue

        kept.append(line)
        i += 1

    # Removing a callout or HR leaves the blank lines that surrounded it
    # adjacent to each other; fold such runs back into one blank line.
    # NOTE: the fold is applied to the whole text, so runs of blank lines
    # inside fenced code are folded as well.
    joined = "\n".join(kept)
    return re.sub(r"\n{3,}", "\n\n", joined).strip()
|
||||
Reference in New Issue
Block a user