feat: 1-to-1 message render + web data-lake backend

This commit is contained in:
h
2026-05-31 01:27:40 +02:00
parent f0afb7ec5b
commit 75425d1bee
110 changed files with 10199 additions and 54 deletions
@@ -1,3 +1,4 @@
from userbot.modules.avatars.downloader import capture_avatar
from userbot.modules.avatars.repository import note_avatar
__all__ = ["capture_avatar"]
__all__ = ["capture_avatar", "note_avatar"]
@@ -1,3 +1,5 @@
import json
import asyncpg
_INSERT_AVATAR = """
@@ -13,6 +15,16 @@ SELECT 1 FROM avatars
WHERE account_id = $1 AND owner_id = $2 AND unique_id = $3
"""
_GET_FILE = """
SELECT raw ->> 'file_id' AS file_id, downloaded FROM avatars
WHERE account_id = $1 AND owner_kind = $2 AND owner_id = $3 AND unique_id = $4
"""
_MARK_DOWNLOADED = """
UPDATE avatars SET downloaded = true, storage_key = $5, file_size = $6
WHERE account_id = $1 AND owner_kind = $2 AND owner_id = $3 AND unique_id = $4
"""
async def avatar_exists(
pool: asyncpg.Pool, account_id: int, owner_id: int, unique_id: str
@@ -46,3 +58,54 @@ async def insert_avatar( # noqa: PLR0913
downloaded,
raw,
)
async def note_avatar( # noqa: PLR0913
pool: asyncpg.Pool,
account_id: int,
owner_id: int,
owner_kind: str,
unique_id: str,
file_id: str,
) -> None:
await insert_avatar(
pool,
account_id,
owner_id,
owner_kind,
unique_id,
None,
None,
None,
json.dumps({"file_id": file_id}),
downloaded=False,
)
async def get_avatar_file(
pool: asyncpg.Pool, account_id: int, owner_kind: str, owner_id: int, unique_id: str
) -> tuple[str | None, bool] | None:
row = await pool.fetchrow(_GET_FILE, account_id, owner_kind, owner_id, unique_id)
if row is None:
return None
return row["file_id"], row["downloaded"]
async def mark_avatar_downloaded( # noqa: PLR0913
pool: asyncpg.Pool,
account_id: int,
owner_kind: str,
owner_id: int,
unique_id: str,
storage_key: str,
file_size: int,
) -> None:
await pool.execute(
_MARK_DOWNLOADED,
account_id,
owner_kind,
owner_id,
unique_id,
storage_key,
file_size,
)
@@ -1,6 +1,7 @@
import asyncpg
from pyrogram import Client
from userbot.modules.capture.identity import ChatMetaCache, PeerIdentityCache
from userbot.modules.contacts import ContactCache
from userbot.modules.folders import FolderCache
from userbot.modules.watches import WatchCache
@@ -25,6 +26,8 @@ class CaptureContext:
self.folders = folders
self.contacts = contacts
self.watches = WatchCache(pool, account_id)
self.peer_identity = PeerIdentityCache()
self.chat_meta = ChatMetaCache()
self.policies = PolicySet()
async def reload_policies(self) -> None:
@@ -0,0 +1,110 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from userbot.modules.capture.chat_meta import chat_kind
from userbot.modules.profiles.parse import ProfileFields, snapshot_from_high_level
from userbot.modules.profiles.repository import get_peer, write_profile
from utils.policy.models import ChatKind
if TYPE_CHECKING:
import asyncpg
from pyrogram.types import Message
from userbot.modules.capture.context import CaptureContext
class PeerIdentityCache:
def __init__(self) -> None:
self._cache: dict[int, ProfileFields | None] = {}
async def changed(
self, pool: asyncpg.Pool, account_id: int, peer_id: int, fields: ProfileFields
) -> bool:
if peer_id not in self._cache:
self._cache[peer_id] = await get_peer(pool, account_id, peer_id)
if self._cache[peer_id] == fields:
return False
self._cache[peer_id] = fields
return True
class ChatMetaCache:
def __init__(self) -> None:
self._cache: dict[int, tuple[str | None, str | None]] = {}
async def changed(
self,
pool: asyncpg.Pool,
account_id: int,
chat_id: int,
meta: tuple[str | None, str | None],
) -> bool:
from userbot.modules.groups.repository import ( # noqa: PLC0415
get_latest_chat_meta,
)
if chat_id not in self._cache:
self._cache[chat_id] = await get_latest_chat_meta(pool, account_id, chat_id)
if self._cache[chat_id] == meta:
return False
self._cache[chat_id] = meta
return True
async def _capture_peer(message: Message, ctx: CaptureContext) -> None:
user = message.from_user
if user is None:
return
fields, photo_file_id, photo_unique_id = snapshot_from_high_level(user)
if not await ctx.peer_identity.changed(ctx.pool, ctx.account_id, user.id, fields):
return
await write_profile(ctx.pool, ctx.account_id, user.id, fields, str(user))
if photo_file_id and photo_unique_id:
from userbot.modules.avatars import note_avatar # noqa: PLC0415
await note_avatar(
ctx.pool, ctx.account_id, user.id, "peer", photo_unique_id, photo_file_id
)
async def _capture_chat(message: Message, ctx: CaptureContext) -> None:
chat = message.chat
if (
chat is None
or chat.id is None
or message.date is None
or chat_kind(chat.type) is ChatKind.DM
):
return
photo = chat.photo
photo_unique_id = photo.big_photo_unique_id if photo else None
photo_file_id = photo.big_file_id if photo else None
meta = (chat.title, photo_unique_id)
if not await ctx.chat_meta.changed(ctx.pool, ctx.account_id, chat.id, meta):
return
from userbot.modules.groups.repository import insert_chat_history # noqa: PLC0415
await insert_chat_history(
ctx.pool,
ctx.account_id,
chat.id,
message.id,
"meta",
chat.title,
photo_unique_id,
None,
message.date,
str(message),
)
if photo_file_id and photo_unique_id:
from userbot.modules.avatars import note_avatar # noqa: PLC0415
await note_avatar(
ctx.pool, ctx.account_id, chat.id, "chat", photo_unique_id, photo_file_id
)
async def capture_identity(message: Message, ctx: CaptureContext) -> None:
await _capture_peer(message, ctx)
await _capture_chat(message, ctx)
@@ -51,6 +51,9 @@ async def capture_message(
has_media=message.media is not None,
is_self_destruct=self_destruct_ttl(message) is not None,
)
from userbot.modules.capture.identity import capture_identity # noqa: PLC0415
await capture_identity(message, ctx)
await capture_media(client, message, ctx, chat_id, message.id, toggles)
buttons = callbacks(message)
if buttons:
@@ -23,28 +23,71 @@ INSERT INTO messages
has_media, is_self_destruct, edited_at)
VALUES ($1, $2, $3, $4, $5, $6, $7::jsonb, $8, $9, now())
ON CONFLICT (account_id, chat_id, message_id, date) DO UPDATE SET
text = EXCLUDED.text,
raw = EXCLUDED.raw,
has_media = EXCLUDED.has_media,
is_self_destruct = EXCLUDED.is_self_destruct,
edited_at = now()
"""
_INSERT_VERSION = """
INSERT INTO message_versions
(account_id, chat_id, message_id, observed_at, edit_date, text, raw)
VALUES ($1, $2, $3, now(), $4, $5, $6::jsonb)
VALUES ($1, $2, $3, clock_timestamp(), $4, $5, $6::jsonb)
ON CONFLICT DO NOTHING
"""
_SNAPSHOT_ORIGINAL = """
INSERT INTO message_versions
(account_id, chat_id, message_id, observed_at, edit_date, text, raw)
SELECT account_id, chat_id, message_id, clock_timestamp(), NULL, text, raw
FROM messages m
WHERE m.account_id = $1 AND m.chat_id = $2 AND m.message_id = $3
AND NOT EXISTS (
SELECT 1 FROM message_versions v
WHERE v.account_id = m.account_id AND v.chat_id = m.chat_id
AND v.message_id = m.message_id
)
ON CONFLICT DO NOTHING
"""
_CURRENT_CONTENT = """
SELECT
m.text AS text,
(SELECT d.unique_id FROM media d
WHERE d.account_id = m.account_id AND d.chat_id = m.chat_id
AND d.message_id = m.message_id) AS media_unique_id
FROM messages m
WHERE m.account_id = $1 AND m.chat_id = $2 AND m.message_id = $3
ORDER BY m.date DESC LIMIT 1
"""
_CURRENT_MEDIA = """
SELECT unique_id, storage_key, file_size, downloaded FROM media
WHERE account_id = $1 AND chat_id = $2 AND message_id = $3
"""
_INSERT_MEDIA_VERSION = """
INSERT INTO media_versions
(account_id, chat_id, message_id, observed_at, kind, storage_key,
file_size, mime, ttl_seconds, unique_id)
VALUES ($1, $2, $3, clock_timestamp(), $4, $5, $6, $7, $8, $9)
ON CONFLICT (account_id, chat_id, message_id, storage_key) DO NOTHING
"""
_INSERT_MEDIA = """
INSERT INTO media
(account_id, chat_id, message_id, kind, storage_key, file_size, mime,
ttl_seconds, downloaded)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
ttl_seconds, downloaded, unique_id)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
ON CONFLICT (account_id, chat_id, message_id) DO UPDATE SET
kind = EXCLUDED.kind,
storage_key = EXCLUDED.storage_key,
file_size = EXCLUDED.file_size,
mime = EXCLUDED.mime,
ttl_seconds = EXCLUDED.ttl_seconds,
downloaded = EXCLUDED.downloaded
downloaded = EXCLUDED.downloaded,
unique_id = EXCLUDED.unique_id
"""
@@ -111,11 +154,20 @@ async def add_version( # noqa: PLR0913
text: str | None,
raw: str,
edit_date: datetime | None,
media_unique_id: str | None,
*,
has_media: bool,
is_self_destruct: bool,
) -> None:
) -> bool:
async with pool.acquire() as conn, conn.transaction():
current = await conn.fetchrow(_CURRENT_CONTENT, account_id, chat_id, message_id)
text_changed = current is None or current["text"] != text
media_changed = (
current is not None and current["media_unique_id"] != media_unique_id
)
if not (text_changed or media_changed):
return False
await conn.execute(_SNAPSHOT_ORIGINAL, account_id, chat_id, message_id)
await conn.execute(
_TOUCH_EDITED,
account_id,
@@ -131,6 +183,13 @@ async def add_version( # noqa: PLR0913
await conn.execute(
_INSERT_VERSION, account_id, chat_id, message_id, edit_date, text, raw
)
return True
async def current_media(
pool: asyncpg.Pool, account_id: int, chat_id: int, message_id: int
) -> asyncpg.Record | None:
return await pool.fetchrow(_CURRENT_MEDIA, account_id, chat_id, message_id)
async def insert_media( # noqa: PLR0913
@@ -143,21 +202,37 @@ async def insert_media( # noqa: PLR0913
file_size: int | None,
mime: str | None,
ttl_seconds: int | None,
unique_id: str | None,
*,
downloaded: bool,
) -> None:
await pool.execute(
_INSERT_MEDIA,
account_id,
chat_id,
message_id,
kind,
storage_key,
file_size,
mime,
ttl_seconds,
downloaded,
)
async with pool.acquire() as conn, conn.transaction():
await conn.execute(
_INSERT_MEDIA,
account_id,
chat_id,
message_id,
kind,
storage_key,
file_size,
mime,
ttl_seconds,
downloaded,
unique_id,
)
if storage_key is not None:
await conn.execute(
_INSERT_MEDIA_VERSION,
account_id,
chat_id,
message_id,
kind,
storage_key,
file_size,
mime,
ttl_seconds,
unique_id,
)
async def insert_callbacks(
@@ -17,6 +17,26 @@ VALUES ($1, $2, $3, $4, $5, $6, $7, $8::jsonb)
ON CONFLICT (account_id, chat_id, message_id, user_id) DO NOTHING
"""
_LATEST_TITLE = """
SELECT title FROM chat_history
WHERE account_id = $1 AND chat_id = $2 AND title IS NOT NULL
ORDER BY ts DESC LIMIT 1
"""
_LATEST_PHOTO = """
SELECT photo_unique_id FROM chat_history
WHERE account_id = $1 AND chat_id = $2 AND photo_unique_id IS NOT NULL
ORDER BY ts DESC LIMIT 1
"""
async def get_latest_chat_meta(
pool: asyncpg.Pool, account_id: int, chat_id: int
) -> tuple[str | None, str | None]:
title = await pool.fetchval(_LATEST_TITLE, account_id, chat_id)
photo_unique_id = await pool.fetchval(_LATEST_PHOTO, account_id, chat_id)
return title, photo_unique_id
async def insert_chat_history( # noqa: PLR0913
pool: asyncpg.Pool,
@@ -1,3 +1,9 @@
from userbot.modules.jobs.handlers import backfill, fetch_media, transcribe
from userbot.modules.jobs.handlers import (
backfill,
enrich_chat,
fetch_avatar,
fetch_media,
transcribe,
)
__all__ = ["backfill", "fetch_media", "transcribe"]
__all__ = ["backfill", "enrich_chat", "fetch_avatar", "fetch_media", "transcribe"]
@@ -0,0 +1,91 @@
from datetime import UTC, datetime
from pyrogram import Client
from pyrogram.errors import BadRequest, Forbidden
from pyrogram.types import User
from userbot.modules.avatars import note_avatar
from userbot.modules.capture.context import CaptureContext
from userbot.modules.groups.repository import insert_chat_history
from userbot.modules.jobs.context import JobContext
from userbot.modules.jobs.registry import register
from userbot.modules.profiles.parse import snapshot_from_high_level
from userbot.modules.profiles.repository import write_profile
MEMBER_CAP = 200
_MISSING_SENDERS = """
SELECT DISTINCT sender_id FROM messages
WHERE account_id = $1 AND chat_id = $2 AND sender_id > 0
AND NOT EXISTS (
SELECT 1 FROM peers p WHERE p.account_id = $1 AND p.peer_id = messages.sender_id
)
LIMIT 100
"""
async def _save_user(ctx: CaptureContext, user: User) -> None:
fields, photo_file_id, photo_unique_id = snapshot_from_high_level(user)
await write_profile(ctx.pool, ctx.account_id, user.id, fields, str(user))
if photo_file_id and photo_unique_id:
await note_avatar(
ctx.pool, ctx.account_id, user.id, "peer", photo_unique_id, photo_file_id
)
async def _enrich_chat_meta(client: Client, ctx: CaptureContext, chat_id: int) -> None:
chat = await client.get_chat(chat_id)
photo = chat.photo
photo_unique_id = photo.big_photo_unique_id if photo else None
photo_file_id = photo.big_file_id if photo else None
await insert_chat_history(
ctx.pool,
ctx.account_id,
chat_id,
0,
"meta",
chat.title,
photo_unique_id,
None,
datetime.now(UTC),
str(chat),
)
if photo_file_id and photo_unique_id:
await note_avatar(
ctx.pool, ctx.account_id, chat_id, "chat", photo_unique_id, photo_file_id
)
async def _enrich_members(client: Client, ctx: CaptureContext, chat_id: int) -> None:
try:
async for member in client.get_chat_members(chat_id, limit=MEMBER_CAP):
if isinstance(member.user, User):
await _save_user(ctx, member.user)
except (BadRequest, Forbidden):
return
async def _enrich_senders(client: Client, ctx: CaptureContext, chat_id: int) -> None:
rows = await ctx.pool.fetch(_MISSING_SENDERS, ctx.account_id, chat_id)
ids = [row["sender_id"] for row in rows]
for sender_id in ids:
try:
user = await client.get_users(sender_id)
except BadRequest:
continue
if isinstance(user, User):
await _save_user(ctx, user)
@register("enrich_chat")
async def enrich_chat(ctx: JobContext) -> None:
client = ctx.client
if client is None:
return
capture = getattr(client, "capture", None)
if capture is None:
return
chat_id = ctx.job.params["chat_id"]
await _enrich_chat_meta(client, capture, chat_id)
await _enrich_members(client, capture, chat_id)
await _enrich_senders(client, capture, chat_id)
@@ -0,0 +1,40 @@
from io import BytesIO
from userbot.modules.avatars.repository import get_avatar_file, mark_avatar_downloaded
from userbot.modules.jobs.context import JobContext
from userbot.modules.jobs.registry import register
@register("fetch_avatar")
async def fetch_avatar(ctx: JobContext) -> None:
client = ctx.client
if client is None:
return
capture = getattr(client, "capture", None)
if capture is None:
return
owner_kind = ctx.job.params["owner_kind"]
owner_id = ctx.job.params["owner_id"]
unique_id = ctx.job.params["unique_id"]
found = await get_avatar_file(
ctx.pool, ctx.account_id, owner_kind, owner_id, unique_id
)
if found is None:
return
file_id, downloaded = found
if downloaded or file_id is None:
return
buffer = await client.download_media(file_id, in_memory=True)
if not isinstance(buffer, BytesIO):
return
data = buffer.getvalue()
storage_key = capture.storage.put(data)
await mark_avatar_downloaded(
ctx.pool,
ctx.account_id,
owner_kind,
owner_id,
unique_id,
storage_key,
len(data),
)
@@ -1,3 +1,7 @@
from userbot.modules.media.downloader import capture_media, self_destruct_ttl
from userbot.modules.media.downloader import (
capture_media,
media_unique_id,
self_destruct_ttl,
)
__all__ = ["capture_media", "self_destruct_ttl"]
__all__ = ["capture_media", "media_unique_id", "self_destruct_ttl"]
@@ -33,6 +33,11 @@ def self_destruct_ttl(message: Message) -> int | None:
return getattr(obj, "ttl_seconds", None) if obj is not None else None
def media_unique_id(message: Message) -> str | None:
_, obj = media_object(message)
return getattr(obj, "file_unique_id", None) if obj is not None else None
async def capture_media( # noqa: PLR0913
client: Client,
message: Message,
@@ -44,6 +49,7 @@ async def capture_media( # noqa: PLR0913
kind, obj = media_object(message)
if obj is None:
return
unique_id = getattr(obj, "file_unique_id", None)
ttl = getattr(obj, "ttl_seconds", None)
want = toggles.self_destruct_media if ttl else toggles.media
file_size = getattr(obj, "file_size", None)
@@ -51,12 +57,25 @@ async def capture_media( # noqa: PLR0913
storage_key: str | None = None
downloaded = False
if want:
buffer = await client.download_media(message, in_memory=True)
if isinstance(buffer, BytesIO):
data = buffer.getvalue()
storage_key = ctx.storage.put(data)
file_size = len(data)
existing = await repository.current_media(
ctx.pool, ctx.account_id, chat_id, message_id
)
if (
existing is not None
and existing["downloaded"]
and existing["unique_id"] == unique_id
and existing["storage_key"] is not None
):
storage_key = existing["storage_key"]
file_size = existing["file_size"]
downloaded = True
else:
buffer = await client.download_media(message, in_memory=True)
if isinstance(buffer, BytesIO):
data = buffer.getvalue()
storage_key = ctx.storage.put(data)
file_size = len(data)
downloaded = True
await repository.insert_media(
ctx.pool,
ctx.account_id,
@@ -67,5 +86,6 @@ async def capture_media( # noqa: PLR0913
file_size,
mime,
ttl,
unique_id,
downloaded=downloaded,
)
@@ -1,7 +1,13 @@
from userbot.modules.profiles.parse import (
ProfileFields,
active_username,
snapshot_from_high_level,
snapshot_from_user,
)
__all__ = ["ProfileFields", "active_username", "snapshot_from_user"]
__all__ = [
"ProfileFields",
"active_username",
"snapshot_from_high_level",
"snapshot_from_user",
]
@@ -37,3 +37,20 @@ def snapshot_from_user(
is_deleted_account=bool(getattr(raw_user, "deleted", False)),
)
return fields, photo_file_id, photo_unique_id
def snapshot_from_high_level(
user: User,
) -> tuple[ProfileFields, str | None, str | None]:
photo = user.photo
photo_unique_id = photo.big_photo_unique_id if photo else None
photo_file_id = photo.big_file_id if photo else None
fields = ProfileFields(
first_name=user.first_name,
last_name=user.last_name,
username=user.username,
phone=user.phone_number,
photo_unique_id=photo_unique_id,
is_deleted_account=bool(user.is_deleted),
)
return fields, photo_file_id, photo_unique_id