Extract beautiful soup parsing code

2023-04-20 10:54:29 +02:00 · 2023-04-20 10:54:29 +02:00 · b99a193704
commit b99a193704
--- a/toot/tui/richtext.py
+++ b/toot/tui/richtext.py
@@ -4,8 +4,9 @@ richtext
 from typing import List, Tuple
 import urwid
 import unicodedata
+
+from toot.utils import bs4_parse
 from .constants import PALETTE
-from bs4 import BeautifulSoup
 from bs4.element import NavigableString, Tag


@@ -21,7 +22,7 @@ class ContentParser:
        """Convert html to urwid widgets"""
        widgets: List[urwid.Widget] = []
        html = unicodedata.normalize("NFKC", html)
-        soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
+        soup = bs4_parse(html)
        for e in soup.body or soup:
            if isinstance(e, NavigableString):
                continue
--- a/toot/utils/__init__.py
+++ b/toot/utils/__init__.py
@@ -24,15 +24,22 @@ def str_bool_nullable(b):

 def get_text(html):
    """Converts html to text, strips all tags."""
+    text = bs4_parse(html).get_text()
+    return unicodedata.normalize("NFKC", text)
+
+
+def bs4_parse(html: str) -> BeautifulSoup:
+    # Versions of BeautifulSoup before 4.8.0 do not convert &apos; to '
+    # correctly so replace it before decoding. Required in case someone still
+    # uses an older version.
+    html = html.replace("&apos;", "'")

    # Ignore warnings made by BeautifulSoup, if passed something that looks like
    # a file (e.g. a dot which matches current dict), it will warn that the file
    # should be opened instead of passing a filename.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
-        text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
-
-    return unicodedata.normalize('NFKC', text)
+        return BeautifulSoup(html, "html.parser")


 def parse_html(html):