Extract beautiful soup parsing code

richtext
Ivan Habunek 2023-04-20 10:54:29 +02:00
rodzic 2298357480
commit b99a193704
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: CDBD63C43A30BB95
2 zmienionych plików z 13 dodań i 5 usunięć

Wyświetl plik

@ -4,8 +4,9 @@ richtext
from typing import List, Tuple
import urwid
import unicodedata
from toot.utils import bs4_parse
from .constants import PALETTE
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
@ -21,7 +22,7 @@ class ContentParser:
"""Convert html to urwid widgets"""
widgets: List[urwid.Widget] = []
html = unicodedata.normalize("NFKC", html)
soup = BeautifulSoup(html.replace("'", "'"), "html.parser")
soup = bs4_parse(html)
for e in soup.body or soup:
if isinstance(e, NavigableString):
continue

Wyświetl plik

@ -24,15 +24,22 @@ def str_bool_nullable(b):
def get_text(html):
    """Return the plain text of ``html`` with every tag stripped.

    The markup is parsed via ``bs4_parse`` and the extracted text is
    NFKC-normalized before it is returned.
    """
    soup = bs4_parse(html)
    return unicodedata.normalize("NFKC", soup.get_text())
def bs4_parse(html: str) -> BeautifulSoup:
    """Parse ``html`` with the ``html.parser`` backend and return the soup.

    Args:
        html: Markup to parse.

    Returns:
        The ``BeautifulSoup`` object built from ``html``.
    """
    # Versions of BeautifulSoup before 4.8.0 do not convert &apos; to '
    # correctly so replace it before decoding. Required in case someone still
    # uses an older version.
    html = html.replace("&apos;", "'")

    # Ignore warnings made by BeautifulSoup: if passed something that looks
    # like a file (e.g. a dot, which matches the current directory), it will
    # warn that the file should be opened instead of passing a filename.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return BeautifulSoup(html, "html.parser")
def parse_html(html):