# microblog.pub — app/source.py
# Markdown rendering with fediverse extensions (auto-links, mentions, hashtags).
import re
2022-08-24 18:12:10 +00:00
import typing
2022-06-22 18:11:22 +00:00
2022-11-12 09:04:37 +00:00
from loguru import logger
2022-10-04 18:26:01 +00:00
from mistletoe import Document # type: ignore
2023-01-06 20:21:53 +00:00
from mistletoe.block_token import CodeFence # type: ignore
2022-10-04 18:26:01 +00:00
from mistletoe.html_renderer import HTMLRenderer # type: ignore
from mistletoe.span_token import SpanToken # type: ignore
from pygments.formatters import HtmlFormatter # type: ignore
from pygments.lexers import get_lexer_by_name as get_lexer # type: ignore
2023-01-06 20:21:53 +00:00
from pygments.util import ClassNotFound # type: ignore
2022-06-29 06:56:39 +00:00
from sqlalchemy import select
2022-06-22 18:11:22 +00:00
from app import webfinger
from app.config import BASE_URL
2022-10-04 18:26:01 +00:00
from app.config import CODE_HIGHLIGHTING_THEME
2022-06-29 18:43:17 +00:00
from app.database import AsyncSession
2022-06-27 18:55:44 +00:00
from app.utils import emoji
2022-06-22 18:11:22 +00:00
2022-08-24 18:12:10 +00:00
if typing.TYPE_CHECKING:
from app.actor import Actor
2022-10-04 18:26:01 +00:00
_FORMATTER = HtmlFormatter(style=CODE_HIGHLIGHTING_THEME)
_HASHTAG_REGEX = re.compile(r"(#[\d\w]+)")
2022-10-05 18:05:16 +00:00
_MENTION_REGEX = re.compile(r"(@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+)")
_URL_REGEX = re.compile(
"(https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*))" # noqa: E501
)
2022-06-22 18:11:22 +00:00
2022-10-04 18:26:01 +00:00
class AutoLink(SpanToken):
    """Span token turning bare URLs into links (no inner parsing)."""

    parse_inner = False
    pattern = _URL_REGEX
    precedence = 1

    def __init__(self, match_obj: re.Match) -> None:
        # The whole matched URL is both the href and the link text.
        self.target = match_obj.group(0)
2022-06-22 18:11:22 +00:00
2022-10-04 18:26:01 +00:00
class Mention(SpanToken):
    """Span token matching fediverse ``@user@domain`` mentions."""

    parse_inner = False
    pattern = _MENTION_REGEX
    precedence = 10

    def __init__(self, match_obj: re.Match) -> None:
        # Keep the full matched handle, e.g. "@toto@example.com".
        self.target = match_obj.group(0)
class Hashtag(SpanToken):
    """Span token matching ``#hashtag`` occurrences."""

    parse_inner = False
    pattern = _HASHTAG_REGEX
    precedence = 10

    def __init__(self, match_obj: re.Match) -> None:
        # Keep the full matched text, including the leading "#".
        self.target = match_obj.group(0)
class CustomRenderer(HTMLRenderer):
    """Mistletoe HTML renderer with URL/mention/hashtag linkification.

    ActivityPub tag dicts (``Mention``/``Hashtag``) discovered while
    rendering are accumulated in ``self.tags`` for the caller.
    """

    def __init__(
        self,
        mentioned_actors: typing.Optional[dict[str, "Actor"]] = None,
        enable_mentionify: bool = True,
        enable_hashtagify: bool = True,
    ) -> None:
        extra_tokens = []
        if enable_mentionify:
            extra_tokens.append(Mention)
        if enable_hashtagify:
            extra_tokens.append(Hashtag)
        super().__init__(AutoLink, *extra_tokens)

        self.tags: list[dict[str, str]] = []
        # Fixed: previously a shared mutable default argument (`= {}`);
        # use a None sentinel and build a fresh dict per instance.
        self.mentioned_actors = (
            mentioned_actors if mentioned_actors is not None else {}
        )

    def render_auto_link(self, token: AutoLink) -> str:
        """Render a bare URL as a link whose text is the URL itself."""
        template = '<a href="{target}" rel="noopener">{inner}</a>'
        target = self.escape_url(token.target)
        return template.format(target=target, inner=target)

    def render_mention(self, token: Mention) -> str:
        """Render an ``@user@domain`` mention as an h-card link.

        Falls back to the raw matched text when the actor was not
        prefetched (unknown/unresolvable handle).
        """
        mention = token.target
        # The mention regex can swallow a sentence-ending period
        # (e.g. "@toto@example.com."); strip it for the lookup and
        # re-append it after the generated link.
        suffix = ""
        if mention.endswith("."):
            mention = mention[:-1]
            suffix = "."

        actor = self.mentioned_actors.get(mention)
        if not actor:
            # Fixed: return the original matched text (including any
            # trailing period) instead of silently dropping the suffix.
            return token.target

        self.tags.append(dict(type="Mention", href=actor.ap_id, name=mention))

        link = f'<span class="h-card"><a href="{actor.url}" class="u-url mention">{actor.handle}</a></span>{suffix}'  # noqa: E501
        return link

    def render_hashtag(self, token: Hashtag) -> str:
        """Render a ``#tag`` as a link to the local tag timeline."""
        tag = token.target[1:]
        link = f'<a href="{BASE_URL}/t/{tag.lower()}" class="mention hashtag" rel="tag">#<span>{tag}</span></a>'  # noqa: E501
        self.tags.append(
            dict(
                href=f"{BASE_URL}/t/{tag.lower()}",
                name=token.target.lower(),
                type="Hashtag",
            )
        )
        return link

    def render_block_code(self, token: CodeFence) -> str:
        """Render a fenced code block, tagging it with a Pygments lexer
        alias (for client-side highlighting) when the language is known."""
        lexer_attr = ""
        try:
            lexer = get_lexer(token.language)
            lexer_attr = f' data-microblogpub-lexer="{lexer.aliases[0]}"'
        except ClassNotFound:
            # Unknown/empty language: emit a plain <code> block.
            pass

        code = token.children[0].content
        return f"<pre><code{lexer_attr}>\n{code}\n</code></pre>"
2022-06-22 18:11:22 +00:00
2022-10-04 18:26:01 +00:00
async def _prefetch_mentioned_actors(
    db_session: AsyncSession,
    content: str,
) -> dict[str, "Actor"]:
    """Resolve every ``@user@domain`` mention in *content* to an Actor.

    Each handle is looked up in the local DB first, then via WebFinger +
    remote fetch. Handles that fail to resolve are logged and skipped —
    this is best-effort prefetching, not validation.

    Returns a mapping of (dot-stripped) mention handle -> Actor.
    """
    from app import models
    from app.actor import fetch_actor

    actors: dict[str, "Actor"] = {}
    for mention in _MENTION_REGEX.findall(content):
        # XXX: the regex catches stuff like `@toto@example.com.`
        mention = mention.removesuffix(".")
        # Fixed: dedup AFTER stripping the trailing dot, so `@a@b.c`
        # and `@a@b.c.` don't trigger two lookups for the same handle.
        if mention in actors:
            continue
        try:
            # Sanity check: the handle must be exactly @username@domain
            # (unpacking raises ValueError otherwise, caught below).
            _, _username, _domain = mention.split("@")
            actor = (
                await db_session.execute(
                    select(models.Actor).where(
                        models.Actor.handle == mention,
                        models.Actor.is_deleted.is_(False),
                    )
                )
            ).scalar_one_or_none()
            if not actor:
                actor_url = await webfinger.get_actor_url(mention)
                if not actor_url:
                    # FIXME(ts): raise an error?
                    continue
                actor = await fetch_actor(db_session, actor_url)

            actors[mention] = actor
        except Exception:
            logger.exception(f"Failed to prefetch {mention}")

    return actors
2022-10-05 18:05:16 +00:00
def hashtagify(
    content: str,
) -> tuple[str, list[dict[str, str]]]:
    """Render markdown *content*, linkifying hashtags (but not mentions).

    Returns ``(html, tags)`` where *tags* contains the Hashtag tag dicts
    collected during rendering plus any custom-emoji tags.
    """
    collected: list[dict[str, str]] = []

    renderer = CustomRenderer(
        mentioned_actors={},
        enable_mentionify=False,
        enable_hashtagify=True,
    )
    with renderer:
        html = renderer.render(Document(content))
    collected.extend(renderer.tags)

    # Handle custom emoji
    collected.extend(emoji.tags(content))

    return html, collected
2022-06-22 18:11:22 +00:00
2022-06-29 18:43:17 +00:00
async def markdownify(
    db_session: AsyncSession,
    content: str,
    enable_mentionify: bool = True,
    enable_hashtagify: bool = True,
) -> tuple[str, list[dict[str, str]], list["Actor"]]:
    """Render markdown *content* to HTML with fediverse extensions.

    URLs are auto-linked; ``#hashtags`` and ``@user@domain`` mentions are
    linkified when the corresponding flag is enabled (mentions require
    DB/WebFinger lookups via *db_session*).

    Returns ``(html, tags, mentioned_actors)`` where *tags* is a
    deduplicated list of Hashtag/Mention/Emoji tag dicts and
    *mentioned_actors* lists the resolved Actor objects.
    """
    tags: list[dict[str, str]] = []
    mentioned_actors: dict[str, "Actor"] = {}
    if enable_mentionify:
        mentioned_actors = await _prefetch_mentioned_actors(db_session, content)

    with CustomRenderer(
        mentioned_actors=mentioned_actors,
        enable_mentionify=enable_mentionify,
        enable_hashtagify=enable_hashtagify,
    ) as renderer:
        rendered_content = renderer.render(Document(content))
        tags.extend(renderer.tags)

    # Handle custom emoji
    tags.extend(emoji.tags(content))

    return rendered_content, dedup_tags(tags), list(mentioned_actors.values())
2022-10-05 18:05:16 +00:00
def dedup_tags(tags: list[dict[str, str]]) -> list[dict[str, str]]:
    """Drop duplicate tags, keyed on ``(type, name)``.

    Order is preserved; the first occurrence of each key wins.
    """
    seen: set[tuple[str, str]] = set()
    deduped: list[dict[str, str]] = []
    for tag in tags:
        key = (tag["type"], tag["name"])
        if key not in seen:
            seen.add(key)
            deduped.append(tag)
    return deduped