Support for rendering a subset of HTML tags in status content

Code is adapted from GPL3-licensed muv by @seonon https://github.com/seonon/muv
2023-03-23 22:47:56 -04:00 · 2023-03-23 22:47:56 -04:00 · 6b2c3f09bf
commit 6b2c3f09bf
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 *.egg-info/
 *.pyc
 .pypirc
+.vscode
 /.cache/
 /.coverage
 /.env
@ -14,3 +15,4 @@
 debug.log
 /pyrightconfig.json
 /book
+/venv
--- a/toot/tui/constants.py
+++ b/toot/tui/constants.py
@ -37,7 +37,38 @@ PALETTE = [
    ('yellow_bold', 'yellow,bold', ''),
    ('red', 'dark red', ''),
    ('warning', 'light red', ''),
-    ('white_bold', 'white,bold', '')
+    ('white_bold', 'white,bold', ''),
+
+    # HTML tag styling
+
+    # note, anchor styling is often overridden
+    # by class names in Mastodon statuses
+    # so you won't see the italics.
+    ('a', ',italics', ''),
+    ('em', 'white,italics', ''),
+    ('i', 'white,italics', ''),
+
+    ('strong', 'white,bold', ''),
+    ('b', 'white,bold', ''),
+
+    ('u', 'white,underline', ''),
+
+    ('del', 'white, strikethrough', ''),
+
+    ('code', 'white, standout', ''),
+    ('pre', 'white, standout', ''),
+
+    ('blockquote', 'light gray', ''),
+
+    ('h1', 'yellow, bold', ''),
+    ('h2', 'dark red, bold', ''),
+    ('h3', 'yellow, bold', ''),
+    ('h4', 'yellow, bold', ''),
+    ('h5', 'yellow, bold', ''),
+    ('h6', 'yellow, bold', ''),
+
+    ('class_mention_hashtag', 'light cyan,bold', ''),
+
 ]

 VISIBILITY_OPTIONS = [
--- a/toot/tui/richtext.py
+++ b/toot/tui/richtext.py
@ -0,0 +1,267 @@
+"""
+richtext
+"""
+from typing import List, Union
+import urwid
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
+
+
+class ContentParser:
+    """Render a limited subset of HTML as urwid widgets and text markup.
+
+    A tag's handler is looked up by name (see ``get_handler``). Handlers for
+    inline tags return urwid text markup; handlers for block tags return an
+    ``urwid.Widget``. Tags without a handler of their own are rendered as
+    inline text.
+
+    Typical use: ``ContentParser().html_to_widgets(html)``.
+    """
+
+    def __init__(self, config=None):
+        """Build the explicit tag-name-to-handler table.
+
+        ``config`` is not used; it defaults to ``None`` rather than a shared
+        mutable ``{}``.
+        """
+        # entries here take precedence over any ``_<name>`` method
+        inline_tags = ("b", "i", "code", "em", "strong", "del")
+        self.tag_to_method = {name: self.inline_tag_to_text for name in inline_tags}
+
+    def get_handler(self, name):
+        """Return the handler for the tag called ``name``.
+
+        ``tag_to_method`` is consulted first, then a method named
+        ``_<name>``, and finally ``inline_tag_to_text``. Any tag name found
+        in the HTML is looked up this way, so every attribute named
+        ``_<word>`` should be a tag handler.
+        """
+        fallback = getattr(self, "_" + name, self.inline_tag_to_text)
+        return self.tag_to_method.get(name, fallback)
+
+    def html_to_widgets(self, html) -> List[urwid.Widget]:
+        """Convert an HTML fragment to a list of urwid widgets.
+
+        Each top-level node yields one widget. Block handlers supply their
+        own; inline markup and bare text are wrapped in a padded
+        ``urwid.Text``. Whitespace-only text is skipped.
+        """
+        widgets: List[urwid.Widget] = []
+        soup = BeautifulSoup(html.replace('&apos;', "'"), "html.parser")
+        for node in soup.body or soup:
+            if isinstance(node, Tag):
+                markup = self.get_handler(node.name)(node)
+            elif type(node) is NavigableString and node.strip():
+                # Text outside any tag must not be lost. The exact type test
+                # leaves out NavigableString subclasses such as comments.
+                markup = str(node)
+            else:
+                # whitespace between tags, or a node that is not plain text
+                continue
+            if not isinstance(markup, urwid.Widget):
+                # text markup, so create a padded text widget
+                markup = urwid.Padding(
+                    urwid.Text(markup),
+                    align="left",
+                    width=("relative", 100),
+                    min_width=None,
+                )
+            widgets.append(markup)
+        return widgets
+
+    def inline_tag_to_text(self, tag) -> Union[str, tuple]:
+        """Convert an inline tag to text markup, recursively.
+
+        Returns ``(tag.name, markups)`` so the tag name acts as the display
+        attribute, or ``""`` when the tag has no children.
+        """
+        markups = self.process_inline_tag_children(tag)
+        return (tag.name, markups) if markups else ""
+
+    def process_inline_tag_children(self, tag) -> list:
+        """Return the markup for each child of ``tag``, in document order.
+
+        Child tags go through their handler; text nodes are kept unchanged.
+        """
+        return [
+            self.get_handler(child.name)(child) if isinstance(child, Tag) else child
+            for child in tag.children
+        ]
+
+    def process_block_tag_children(self, tag) -> List[urwid.Widget]:
+        """Return the widgets that make up the body of a block tag.
+
+        Runs of inline markup are gathered into one ``urwid.Text`` styled
+        with ``tag.name``; nested widgets are kept as they are. Everything
+        stays in document order, so text lying between two nested widgets is
+        shown between them instead of after the last one.
+        """
+        widgets: List[urwid.Widget] = []
+        pending = []  # inline markup seen since the last nested widget
+
+        def flush():
+            """Turn the pending inline markup, if any, into one Text widget."""
+            if pending:
+                widgets.append(urwid.Text((tag.name, pending[:])))
+                pending.clear()
+
+        for child in tag.children:
+            if isinstance(child, Tag):
+                result = self.get_handler(child.name)(child)
+            else:
+                result = child  # plain text node
+            if isinstance(result, urwid.Widget):
+                flush()
+                widgets.append(result)
+            else:
+                pending.append(result)
+        flush()
+        return widgets
+
+    def get_style_name(self, tag) -> str:
+        """Return the display attribute name to use for ``tag``.
+
+        A tag with classes maps to ``"class_"`` plus its class names joined
+        by ``"_"`` (e.g. ``class_mention_hashtag``); otherwise the tag name.
+        """
+        # TODO: think about whitelisting allowed classes,
+        # or blacklisting classes we do not want.
+        # Classes to whitelist: "mention" "hashtag"
+        # used in anchor tags
+        # Classes to blacklist: "invisible" used in Akkoma
+        # anchor titles
+        classes = tag.attrs.get("class")
+        if classes:
+            return "class_" + "_".join(classes)
+        return tag.name
+
+    # Tag handlers start here.
+    # Tags not explicitly listed are "supported" by
+    # rendering as text.
+    # Inline tags return text markup for urwid.Text
+    # Block tags return urwid.Widget
+
+    def basic_block_tag_handler(self, tag) -> urwid.Widget:
+        """Default for block tags that need no special treatment."""
+        return urwid.Pile(self.process_block_tag_children(tag))
+
+    def _a(self, tag) -> Union[str, tuple]:
+        """Return anchor markup styled by class, or by tag name without one.
+
+        Hashtag anchors have a class of "mention hashtag", which gives the
+        style "class_mention_hashtag"; set this up in constants.py to
+        control highlighting of hashtags.
+        """
+        markups = self.process_inline_tag_children(tag)
+        return (self.get_style_name(tag), markups) if markups else ""
+
+    def _blockquote(self, tag) -> urwid.Widget:
+        """Render a blockquote as a padded pile with a bar down its left."""
+        body = urwid.Padding(
+            urwid.Pile(self.process_block_tag_children(tag)),
+            align="left",
+            width=("relative", 100),
+            min_width=None,
+            left=1,
+            right=1,
+        )
+        # a LineBox with every border part blanked except the left line
+        quote = urwid.LineBox(
+            body,
+            tlcorner="",
+            tline="",
+            lline="│",
+            trcorner="",
+            blcorner="",
+            rline="",
+            bline="",
+            brcorner="",
+        )
+        return urwid.Pile([urwid.AttrMap(quote, "blockquote")])
+
+    def _br(self, tag) -> tuple:
+        """Return a newline as markup, styled with the tag's own name."""
+        return (tag.name, "\n")
+
+    # Block tags with no special rendering share the basic handler.
+    _div = basic_block_tag_handler
+    _li = basic_block_tag_handler
+    _p = basic_block_tag_handler
+
+    # Glitch-soc and Pleroma allow <H1>...<H6> in content
+    # Mastodon (PR #23913) does not; header tags are converted to <STRONG>
+    _h1 = basic_block_tag_handler
+    _h2 = basic_block_tag_handler
+    _h3 = basic_block_tag_handler
+    _h4 = basic_block_tag_handler
+    _h5 = basic_block_tag_handler
+    _h6 = basic_block_tag_handler
+
+    def _ol(self, tag) -> urwid.Widget:
+        """Render an ordered list."""
+        return self.list_widget(tag, ordered=True)
+
+    def _ul(self, tag) -> urwid.Widget:
+        """Render an unordered list."""
+        return self.list_widget(tag, ordered=False)
+
+    def _pre(self, tag) -> urwid.Widget:
+        """Render preformatted text as a padded pile under a blank line.
+
+        <PRE> tag spec says that text should not wrap, but horizontal screen
+        space is at a premium and we have no horizontal scroll bar, so allow
+        wrapping.
+        """
+        rows = [urwid.Divider(" "), *self.process_block_tag_children(tag)]
+        padded = urwid.Padding(
+            urwid.Pile(rows),
+            align="left",
+            width=("relative", 100),
+            min_width=None,
+            left=1,
+            right=1,
+        )
+        return urwid.Pile([urwid.AttrMap(padded, "pre")])
+
+    def _span(self, tag) -> Union[str, tuple]:
+        """Return span markup styled by its own class or else by its parent.
+
+        A span inherits its parent's class definition unless it has a
+        specific class definition of its own.
+        """
+        markups = self.process_inline_tag_children(tag)
+        if not markups:
+            return ""
+        if "class" in tag.attrs:
+            owner = tag
+        elif tag.parent:
+            owner = tag.parent
+        else:
+            # no class and no parent: fall back to the span's own name
+            return (tag.name, markups)
+        return (self.get_style_name(owner), markups)
+
+    def list_widget(self, tag, ordered=False) -> urwid.Widget:
+        """Render the direct <li> children of a list tag as a pile.
+
+        Items are numbered "1.", "2.", ... when ``ordered`` is true and
+        marked with "*" otherwise.
+        """
+        handler = self.get_handler("li")
+        widgets = []
+        for number, li in enumerate(tag.find_all("li", recursive=False), start=1):
+            markup = handler(li)
+            if isinstance(markup, urwid.Widget):
+                # block content: the marker gets a column of its own
+                marker = urwid.Text(("li", [str(number) + "."] if ordered else "*"))
+                widgets.append(
+                    urwid.Columns(
+                        [marker, ("weight", 9999, markup)], dividechars=1, min_width=4
+                    )
+                )
+            else:
+                # inline content (only if "li" is given a handler that
+                # returns markup): marker and text share one Text widget
+                prefix = [str(number), ". "] if ordered else ["* "]
+                widgets.append(urwid.Text(("li", prefix + [markup])))
+        return urwid.Pile(widgets)
--- a/toot/tui/timeline.py
+++ b/toot/tui/timeline.py
@ -7,11 +7,11 @@ from typing import List, Optional

 from .entities import Status
 from .scroll import Scrollable, ScrollBar
-from .utils import highlight_hashtags, parse_datetime, highlight_keys
+from .utils import parse_datetime, highlight_keys
 from .widgets import SelectableText, SelectableColumns
+from .richtext import ContentParser
 from toot.tui import app
 from toot.tui.utils import time_ago
-from toot.utils import format_content
 from toot.utils.language import language_name

 logger = logging.getLogger("toot")
@ -341,8 +341,12 @@ class StatusDetails(urwid.Pile):
            yield ("pack", urwid.Text(("content_warning", "Marked as sensitive. Press S to view.")))
        else:
            content = status.original.translation if status.original.show_translation else status.data["content"]
-            for line in format_content(content):
-                yield ("pack", urwid.Text(highlight_hashtags(line, self.followed_tags)))
+
+            parser = ContentParser()
+            widgetlist = parser.html_to_widgets(content)
+
+            for line in widgetlist:
+                yield (line)

            media = status.data["media_attachments"]
            if media: