diff --git a/toot/tui/richtext.py b/toot/tui/richtext.py
index 6d83804..d311d1f 100644
--- a/toot/tui/richtext.py
+++ b/toot/tui/richtext.py
@@ -4,8 +4,9 @@ richtext
 from typing import List, Tuple
 import urwid
 import unicodedata
+
+from toot.utils import bs4_parse
 from .constants import PALETTE
-from bs4 import BeautifulSoup
 from bs4.element import NavigableString, Tag
 
 
@@ -21,7 +22,7 @@ class ContentParser:
         """Convert html to urwid widgets"""
         widgets: List[urwid.Widget] = []
         html = unicodedata.normalize("NFKC", html)
-        soup = BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
+        soup = bs4_parse(html)
         for e in soup.body or soup:
             if isinstance(e, NavigableString):
                 continue
diff --git a/toot/utils/__init__.py b/toot/utils/__init__.py
index e8103ac..7b47dc4 100644
--- a/toot/utils/__init__.py
+++ b/toot/utils/__init__.py
@@ -24,15 +24,22 @@ def str_bool_nullable(b):
 
 def get_text(html):
     """Converts html to text, strips all tags."""
+    text = bs4_parse(html).get_text()
+    return unicodedata.normalize("NFKC", text)
+
+
+def bs4_parse(html: str) -> BeautifulSoup:
+    # Versions of BeautifulSoup before 4.8.0 do not convert &apos; to '
+    # correctly so replace it before decoding. Required in case someone still
+    # uses an older version.
+    html = html.replace("&apos;", "'")
 
     # Ignore warnings made by BeautifulSoup, if passed something that looks like
     # a file (e.g. a dot which matches current dict), it will warn that the file
     # should be opened instead of passing a filename.
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
-        text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
-
-    return unicodedata.normalize('NFKC', text)
+        return BeautifulSoup(html, "html.parser")
 
 
 def parse_html(html):