kopia lustrzana https://gitlab.com/jaywink/federation
96 wiersze
2.8 KiB
Python
96 wiersze
2.8 KiB
Python
import re
|
|
from typing import Set, List
|
|
from urllib.parse import urlparse
|
|
|
|
from bs4 import BeautifulSoup
|
|
from bs4.element import NavigableString
|
|
from commonmark import commonmark
|
|
|
|
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
|
|
TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
|
|
# This will match non matching braces. I don't think it's an issue.
|
|
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
|
|
# based on https://stackoverflow.com/a/6041965
|
|
URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|^|(?<=[("<\s]))+(?:[\w\-]+(?:(?:\.[\w\-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))',
|
|
re.UNICODE)
|
|
|
|
def decode_if_bytes(text):
|
|
try:
|
|
return text.decode("utf-8")
|
|
except AttributeError:
|
|
return text
|
|
|
|
|
|
def encode_if_text(text):
|
|
try:
|
|
return bytes(text, encoding="utf-8")
|
|
except TypeError:
|
|
return text
|
|
|
|
|
|
def find_tags(text: str) -> Set[str]:
|
|
"""Find tags in text.
|
|
|
|
Ignore tags inside code blocks.
|
|
|
|
Returns a set of tags.
|
|
|
|
"""
|
|
tags = find_elements(BeautifulSoup(commonmark(text, ignore_html_blocks=True), 'html.parser'),
|
|
TAG_PATTERN)
|
|
return set([tag.text.lstrip('#').lower() for tag in tags])
|
|
|
|
|
|
def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]:
|
|
"""
|
|
Split a BeautifulSoup tree strings according to a pattern, replacing each element
|
|
with a NavigableString. The returned list can be used to linkify the found
|
|
elements.
|
|
|
|
:param soup: BeautifulSoup instance of the content being searched
|
|
:param pattern: Compiled regular expression defined using a single group
|
|
:return: A NavigableString list attached to the original soup
|
|
"""
|
|
final = []
|
|
for candidate in soup.find_all(string=True):
|
|
if candidate.parent.name == 'code': continue
|
|
ns = [NavigableString(r) for r in pattern.split(candidate.text) if r]
|
|
found = [s for s in ns if pattern.match(s.text)]
|
|
if found:
|
|
candidate.replace_with(*ns)
|
|
final.extend(found)
|
|
return final
|
|
|
|
|
|
def get_path_from_url(url: str) -> str:
|
|
"""
|
|
Return only the path part of an URL.
|
|
"""
|
|
parsed = urlparse(url)
|
|
return parsed.path
|
|
|
|
|
|
|
|
def test_tag(tag: str) -> bool:
|
|
"""Test a word whether it could be accepted as a tag."""
|
|
if not tag:
|
|
return False
|
|
for char in ILLEGAL_TAG_CHARS:
|
|
if char in tag:
|
|
return False
|
|
return True
|
|
|
|
|
|
def validate_handle(handle):
|
|
"""
|
|
Very basic handle validation as per
|
|
https://diaspora.github.io/diaspora_federation/federation/types.html#diaspora-id
|
|
"""
|
|
return re.match(r"[a-z0-9\-_.]+@[^@/]+\.[^@/]+", handle, flags=re.IGNORECASE) is not None
|
|
|
|
|
|
def with_slash(url):
|
|
if url.endswith('/'):
|
|
return url
|
|
return f"{url}/"
|