"""Parse a markdown chat file into Anthropic ``MessageParam`` history.

The file format is documented in ``frontends/markdown/__init__.py``:
``### User:`` / ``### Assistant:`` H3 headers split turns, optional
``---`` HRs between turns are visual-only, ``> [!thinking]-`` and
``> [!tool]- <name>`` callouts mark structured assistant content.

For backend consumption we strip thinking and tool_use callouts —
assistant turns become text-only. Rationale: history replay through
claude-code's JSONL injection only needs the *narrated* answer (the
thinking signatures expire and the original tool_results aren't
captured in the renderer's output, so a faithful tool_use round-trip
isn't possible today). The renderer keeps callouts in the file because
they're informational for the human reader; the parser drops them when
shaping the backend's input.
"""

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import frontmatter

if TYPE_CHECKING:
    from anthropic.types import MessageParam


# Public names exported by ``from <module> import *``.
__all__ = ["ParsedFile", "last_role", "parse", "resolve_agent"]


# Turn marker — must be exactly ``### User:`` or ``### Assistant:`` on
# its own line (case-sensitive). Trailing whitespace is tolerated, but
# nothing else may follow the colon on the same line: inline content
# would mean the user typed something that just happens to look like a
# header, and we'd rather misparse than silently fold inline content
# into a turn. ``re.MULTILINE`` makes ``^``/``$`` anchor at every line of
# the body rather than only at the start/end of the whole string.
_TURN_RE = re.compile(r"^###\s+(User|Assistant):\s*$", re.MULTILINE)

# Callout-start lines we strip from assistant turns when extracting
# text. We don't try to parse the contents — for backend input we just
# need to drop the whole quoted block. The pattern is applied with
# ``.match`` to one line at a time, so ``>`` must be the very first
# character of the line and be followed by at least one whitespace.
_CALLOUT_START_RE = re.compile(r"^>\s+\[!(thinking|tool)\]")


@dataclass(frozen=True, slots=True)
|
|
class ParsedFile:
|
|
"""Result of parsing a single chat ``.md``.
|
|
|
|
``metadata`` is the YAML frontmatter as a plain dict (empty if the
|
|
file has none). ``messages`` is the conversation history shaped for
|
|
``Backend.complete`` — assistant turns are text-only. ``body`` is the
|
|
raw markdown content *after* the frontmatter is stripped; the
|
|
renderer needs it when it appends a new assistant turn so it can
|
|
preserve whatever the human typed verbatim (including any callouts
|
|
or HRs they added).
|
|
"""
|
|
|
|
metadata: dict[str, Any]
|
|
body: str
|
|
messages: list[MessageParam]
|
|
|
|
|
|
def parse(text: str) -> ParsedFile:
    """Parse a chat ``.md`` into ``(metadata, body, messages)``.

    The frontmatter is split off first; the remaining body is cut into
    turns at the ``### User:`` / ``### Assistant:`` markers. User turns
    have whole-line ``---`` separators removed and are trimmed; assistant
    turns additionally lose their thinking/tool callouts. Turns that end
    up empty after that cleaning are skipped entirely.

    A file with no turn markers but non-empty body is treated as a
    single user turn — the friendly path for "user types into a new
    file and hits send" before any turn markers exist.

    Args:
        text: Full contents of the chat file, frontmatter included.

    Returns:
        A ``ParsedFile`` whose ``body`` is the frontmatter-stripped
        markdown and whose ``messages`` list follows file order.

    NOTE(review): once at least one marker exists, any text *before* the
    first marker belongs to no turn and is therefore absent from
    ``messages`` (it is still present in ``body``). Confirm that this is
    intended for files that started life on the marker-less path.
    """
    parsed = frontmatter.loads(text)
    metadata = dict(parsed.metadata)
    body = parsed.content

    messages: list[MessageParam] = []
    turns = _split_turns(body)

    if not turns:
        # Marker-less file: the whole body (if any) is one user message.
        stripped = body.strip()
        if stripped:
            messages.append({"role": "user", "content": stripped})
        return ParsedFile(metadata=metadata, body=body, messages=messages)

    for role, raw in turns:
        if role == "user":
            text_content = _strip_hrs(raw).strip()
            if text_content:
                messages.append({"role": "user", "content": text_content})
        else:
            # ``_extract_assistant_text`` already returns trimmed text.
            text_content = _extract_assistant_text(raw)
            if text_content:
                messages.append({"role": "assistant", "content": text_content})

    return ParsedFile(metadata=metadata, body=body, messages=messages)


def last_role(messages: list[MessageParam]) -> str | None:
    """Return the role of the final message, or ``None`` for an empty list.

    Args:
        messages: Conversation history as produced by ``parse``.

    Returns:
        The ``"role"`` value of the last entry (``"user"`` or
        ``"assistant"`` for parser output), or ``None`` when there are
        no messages at all.
    """
    return messages[-1]["role"] if messages else None


def resolve_agent(
    *, metadata: dict[str, Any], request_override: str | None, default: str | None
) -> str | None:
    """Resolve the agent for this chat.

    Precedence: request body override > frontmatter > frontend default.

    Args:
        metadata: Parsed frontmatter; only its ``"agent"`` key is read.
        request_override: Agent named in the request body, if any. An
            empty string counts as "not given".
        default: Frontend-level fallback, if any.

    Returns:
        The first non-empty candidate in precedence order. A frontmatter
        ``agent`` is only honoured when it is a non-empty ``str``; any
        other type is ignored. Returns ``None`` if none match — caller
        responds with 400.
    """
    if request_override:
        return request_override

    fm_agent = metadata.get("agent")
    if isinstance(fm_agent, str) and fm_agent:
        return fm_agent

    return default


# ---- internals ---------------------------------------------------------


def _split_turns(body: str) -> list[tuple[str, str]]:
    """Walk turn markers, return ``[(role_lc, raw_body), ...]``.

    Body for each turn is everything between the end of this marker
    match and the start of the next marker (or EOF). The marker itself
    is dropped, and so is any text that precedes the first marker. We
    don't trim whitespace here — that's per-role.

    Args:
        body: Markdown content with the frontmatter already removed.

    Returns:
        One ``(role, raw_text)`` pair per marker, in file order, with the
        role lower-cased (``"user"`` / ``"assistant"``). Empty when the
        body contains no markers.
    """
    matches = list(_TURN_RE.finditer(body))
    # Each turn stops where the next marker begins; the last one runs to
    # EOF. With no matches both sequences are empty and so is the result.
    stops = [m.start() for m in matches[1:]] + [len(body)]
    return [
        (m.group(1).lower(), body[m.end():stop])
        for m, stop in zip(matches, stops)
    ]


def _strip_hrs(raw: str) -> str:
    """Drop decorative ``---`` separator lines (whole-line HRs only).

    A line is removed only when it is *exactly* ``---`` after stripping
    surrounding whitespace — those are the ones the renderer emits
    between turns. A ``---`` that shares its line with other text, or a
    longer dash run such as ``----``, stays.

    Args:
        raw: Raw text of one turn.

    Returns:
        The remaining lines joined with ``"\\n"``. Because the text is
        split with ``str.splitlines`` and re-joined, a trailing newline
        is not preserved and line endings come back as ``"\\n"``.
    """
    return "\n".join(ln for ln in raw.splitlines() if ln.strip() != "---")


def _extract_assistant_text(raw: str) -> str:
    """Strip thinking/tool callouts from an assistant turn, return spoken text.

    Walks line by line. When we see a callout-start line (``> [!thinking]-``
    or ``> [!tool]- ...``), that line and every directly following line
    whose first non-whitespace character is ``>`` are skipped. The first
    line that does not start with ``>`` — a blank line included — ends
    the callout and is processed normally. HR lines (``---``) are
    dropped. Everything else is kept and joined, runs of three or more
    newlines are collapsed to a single blank line, and the result is
    trimmed.

    Args:
        raw: Raw text of one assistant turn.

    Returns:
        The narrated text only; ``""`` if nothing is left.

    NOTE(review): fenced code blocks are not recognised here, so a
    ``---`` line or a callout-looking line inside a fence is removed as
    well, and blank-line runs inside a fence are collapsed too.
    """
    kept: list[str] = []
    in_callout = False
    for line in raw.splitlines():
        if in_callout:
            if line.lstrip().startswith(">"):
                # Still inside the quote block that the callout opened.
                continue
            # First non-quote line closes the callout; handle it below.
            in_callout = False
        if _CALLOUT_START_RE.match(line):
            in_callout = True
            continue
        if line.strip() == "---":
            continue
        kept.append(line)

    # Collapse runs of blank lines that callout-stripping creates
    # (the blank lines on either side of a removed block fold into one).
    joined = "\n".join(kept)
    return re.sub(r"\n{3,}", "\n\n", joined).strip()