From 63a0e38ac9b3e6dac238f04679a8c7943bf6f4c5 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 6 Jun 2023 16:52:51 -0400 Subject: [PATCH 01/28] Fix platform quirks (hubzilla, guppe, bird.makeup) that prevent their profiles processing to fail. --- federation/entities/activitypub/ldcontext.py | 17 ++++++---- federation/entities/activitypub/models.py | 34 ++++++++++++++++---- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/federation/entities/activitypub/ldcontext.py b/federation/entities/activitypub/ldcontext.py index e46bfb9..8fdafc2 100644 --- a/federation/entities/activitypub/ldcontext.py +++ b/federation/entities/activitypub/ldcontext.py @@ -137,12 +137,17 @@ class LdContextManager: # Merge all defined AP extensions to the inbound context uris = [] defs = {} - # Merge original context dicts in one dict - for item in ctx: - if isinstance(item, str): - uris.append(item) - else: - defs.update(item) + # Merge original context dicts in one dict, taking into account nested @context + def parse_context(ctx): + for item in ctx: + if isinstance(item, str): + uris.append(item) + else: + if '@context' in item: + parse_context([item['@context']]) + item.pop('@context') + defs.update(item) + parse_context(ctx) for item in self._merged: if isinstance(item, str) and item not in uris: diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 9e7fdb5..6bd841e 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -1,6 +1,7 @@ import copy import json import logging +import traceback import uuid from datetime import timedelta from typing import List, Dict, Union @@ -241,8 +242,8 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): metadata={'ctx':[{ 'alsoKnownAs':{'@id':'as:alsoKnownAs','@type':'@id'}}]}) icon = MixedField(as2.icon, nested='ImageSchema') image = MixedField(as2.image, nested='ImageSchema') - tag_objects = MixedField(as2.tag, 
nested=['HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True) - attachment = fields.Nested(as2.attachment, nested=['ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], + tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True) + attachment = fields.Nested(as2.attachment, nested=['NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], many=True, default=[]) content_map = LanguageMap(as2.content) # language maps are not implemented in calamus context = fields.RawJsonLD(as2.context) @@ -250,7 +251,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): generator = MixedField(as2.generator, nested=['ApplicationSchema','ServiceSchema']) created_at = fields.DateTime(as2.published, add_value_types=True) replies = MixedField(as2.replies, nested=['CollectionSchema','OrderedCollectionSchema']) - signature = MixedField(sec.signature, nested = 'SignatureSchema', + signature = MixedField(sec.signature, nested = 'RsaSignature2017Schema', metadata={'ctx': [CONTEXT_SECURITY, {'RsaSignature2017':'sec:RsaSignature2017'}]}) start_time = fields.DateTime(as2.startTime, add_value_types=True) @@ -333,6 +334,20 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): data['@context'] = context_manager.merge_context(ctx) return data + # JSONLD specs states it is case sensitive. + # Ensure type names for which we have an implementation have the proper case + # for platforms that ignore the spec. + @pre_load + def patch_types(self, data, **kwargs): + def walk_payload(payload): + for key,val in copy.copy(payload).items(): + if isinstance(val, dict): + payload.update(walk_payload(val)) + if key == 'type': + payload[key] = MODEL_NAMES.get(val.lower(), val) + return payload + return walk_payload(data) + # A node without an id isn't true json-ld, but many payloads have # id-less nodes. 
Since calamus forces random ids on such nodes, # this removes it. @@ -567,7 +582,7 @@ class Person(Object, base.Profile): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._allowed_children += (PropertyValue, IdentityProof) + self._allowed_children += (Note, PropertyValue, IdentityProof) # Set finger to username@host if not provided by the platform def post_receive(self): @@ -1019,7 +1034,7 @@ class Video(Document, base.Video): return self -class Signature(Object): +class RsaSignature2017(Object): created = fields.DateTime(dc.created, add_value_types=True) creator = IRI(dc.creator) key = fields.String(sec.signatureValue) @@ -1396,6 +1411,7 @@ def model_to_objects(payload): entity = model.schema().load(payload) except (KeyError, jsonld.JsonLdError, exceptions.ValidationError) as exc : # Just give up for now. This must be made robust logger.error("Error parsing jsonld payload (%s)", exc) + traceback.print_exception(exc) return None if isinstance(getattr(entity, 'object_', None), Object): @@ -1416,4 +1432,10 @@ CLASSES_WITH_CONTEXT_EXTENSIONS = ( Person, PropertyValue ) -context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS) \ No newline at end of file +context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS) + + +MODEL_NAMES = {} +for key,val in copy.copy(globals()).items(): + if type(val) == JsonLDAnnotation and issubclass(val, Object): + MODEL_NAMES[key.lower()] = key From f72ecf459a598d506a502be7e7cc4da791180b18 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 6 Jun 2023 17:57:47 -0400 Subject: [PATCH 02/28] Fix logic error. 
--- federation/entities/activitypub/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 6bd841e..9c1c6e9 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -342,7 +342,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): def walk_payload(payload): for key,val in copy.copy(payload).items(): if isinstance(val, dict): - payload.update(walk_payload(val)) + walk_payload(val) if key == 'type': payload[key] = MODEL_NAMES.get(val.lower(), val) return payload @@ -1439,3 +1439,6 @@ MODEL_NAMES = {} for key,val in copy.copy(globals()).items(): if type(val) == JsonLDAnnotation and issubclass(val, Object): MODEL_NAMES[key.lower()] = key + +from pprint import pprint +pprint(MODEL_NAMES) \ No newline at end of file From 33131bd9fe9c6728dc2ba9f808373915b7e2d530 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 6 Jun 2023 18:00:01 -0400 Subject: [PATCH 03/28] Remove debug pprint. --- federation/entities/activitypub/models.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 9c1c6e9..e98321a 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -1439,6 +1439,3 @@ MODEL_NAMES = {} for key,val in copy.copy(globals()).items(): if type(val) == JsonLDAnnotation and issubclass(val, Object): MODEL_NAMES[key.lower()] = key - -from pprint import pprint -pprint(MODEL_NAMES) \ No newline at end of file From e94533b222a58bbc208cd057b7e911f13cb910f5 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Wed, 7 Jun 2023 10:34:08 -0400 Subject: [PATCH 04/28] Allow Link objects as items of the attachment property. Must be rendered by the client app. 
--- federation/entities/activitypub/models.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index e98321a..740cd11 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -243,7 +243,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): icon = MixedField(as2.icon, nested='ImageSchema') image = MixedField(as2.image, nested='ImageSchema') tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True) - attachment = fields.Nested(as2.attachment, nested=['NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], + attachment = fields.Nested(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], many=True, default=[]) content_map = LanguageMap(as2.content) # language maps are not implemented in calamus context = fields.RawJsonLD(as2.context) @@ -736,7 +736,7 @@ class Note(Object, RawContentMixin): def __init__(self, *args, **kwargs): self.tag_objects = [] # mutable objects... 
super().__init__(*args, **kwargs) - self._allowed_children += (base.Audio, base.Video) + self._allowed_children += (base.Audio, base.Video, Link) def to_as2(self): self.sensitive = 'nsfw' in self.tags @@ -932,12 +932,13 @@ class Note(Object, RawContentMixin): if isinstance(getattr(self, 'attachment', None), list): children = [] for child in self.attachment: - if isinstance(child, Document): - obj = child.to_base() - if isinstance(obj, Image): - if obj.inline or (obj.image and obj.image in self.raw_content): + if isinstance(child, (Document, Link)): + if hasattr(child, 'to_base'): + child = child.to_base() + if isinstance(child, Image): + if child.inline or (child.image and child.image in self.raw_content): continue - children.append(obj) + children.append(child) self._cached_children = children return self._cached_children @@ -1437,5 +1438,5 @@ context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS) MODEL_NAMES = {} for key,val in copy.copy(globals()).items(): - if type(val) == JsonLDAnnotation and issubclass(val, Object): + if type(val) == JsonLDAnnotation and issubclass(val, (Object, Link)): MODEL_NAMES[key.lower()] = key From e0993a7f7f49bc748820770ff4a32b3a0382a857 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sat, 8 Jul 2023 07:34:44 -0400 Subject: [PATCH 05/28] Switch to BeautifulSoup for content processing. Outbound rendered content is now provided by the client app. Mark inbound AP HTML content hashtags and mentions. Fix missing href attribute crashing process_text_links. 
--- CHANGELOG.md | 4 +- docs/protocols.rst | 33 +++- federation/entities/activitypub/models.py | 222 +++++++++++----------- federation/entities/mixins.py | 66 ++----- federation/tests/entities/test_base.py | 1 + federation/utils/text.py | 80 +++----- 6 files changed, 177 insertions(+), 229 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 939a554..19d2bca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ * For inbound payload, a cached dict of all the defined AP extensions is merged with each incoming LD context. * Better handle conflicting property defaults by having `get_base_attributes` return only attributes that - are not empty (or bool). This helps distinguishing between `marshmallow.missing` and empty values. + are not empty (or bool). This helps distinguish between `marshmallow.missing` and empty values. * JsonLD document caching now set in `activitypub/__init__.py`. @@ -45,6 +45,8 @@ * In fetch_document: if response.encoding is not set, default to utf-8. +* Fix process_text_links that would crash on `a` tags with no `href` attribute. + ## [0.24.1] - 2023-03-18 ### Fixed diff --git a/docs/protocols.rst b/docs/protocols.rst index 1e15467..0dd845b 100644 --- a/docs/protocols.rst +++ b/docs/protocols.rst @@ -4,9 +4,8 @@ Protocols Currently three protocols are being focused on. * Diaspora is considered to be stable with most of the protocol implemented. -* ActivityPub support should be considered as alpha - all the basic - things work but there are likely to be a lot of compatibility issues with other ActivityPub - implementations. +* ActivityPub support should be considered as beta - all the basic + things work and we are fixing incompatibilities as they are identified. * Matrix support cannot be considered usable as of yet. For example implementations in real life projects check :ref:`example-projects`. 
@@ -69,20 +68,21 @@ Content media type The following keys will be set on the entity based on the ``source`` property existing: * if the object has an ``object.source`` property: - * ``_media_type`` will be the source media type - * ``_rendered_content`` will be the object ``content`` + * ``_media_type`` will be the source media type (only text/markdown is supported). + * ``rendered_content`` will be the object ``content`` * ``raw_content`` will be the source ``content`` * if the object has no ``object.source`` property: * ``_media_type`` will be ``text/html`` - * ``_rendered_content`` will be the object ``content`` - * ``raw_content`` will object ``content`` run through a HTML2Markdown renderer + * ``rendered_content`` will be the object ``content`` + * ``raw_content`` will be empty The ``contentMap`` property is processed but content language selection is not implemented yet. For outbound entities, ``raw_content`` is expected to be in ``text/markdown``, -specifically CommonMark. When sending payloads, ``raw_content`` will be rendered via -the ``commonmark`` library into ``object.content``. The original ``raw_content`` -will be added to the ``object.source`` property. +specifically CommonMark. The client applications are expected to provide the +rendered content for protocols that require it (e.g. ActivityPub). +When sending payloads, ``object.contentMap`` will be set to ``rendered_content`` +and ``raw_content`` will be added to the ``object.source`` property. Medias ...... @@ -98,6 +98,19 @@ support from client applications. For inbound entities we do this automatically by not including received image attachments in the entity ``_children`` attribute. Audio and video are passed through the client application. +Hashtags and mentions +..................... + +For outbound payloads, client applications must add/set the hashtag/mention value to +the ``class`` attribute of rendered content linkified hashtags/mentions. 
These will be +used to help build the corresponding ``Hashtag`` and ``Mention`` objects. + +For inbound payloads, if a markdown source is provided, hashtags/mentions will be extracted +through the same method used for Diaspora. If only HTML content is provided, the ``a`` tags +will be marked with a ``data-[hashtag|mention]`` attribute (based on the provided Hashtag/Mention +objects) to facilitate the ``href`` attribute modifications client applications might +wish to make. This should ensure links can be replaced regardless of how the HTML is structured. + .. _matrix: Matrix diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 740cd11..1d58262 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -1,6 +1,7 @@ import copy import json import logging +import re import traceback import uuid from datetime import timedelta @@ -8,6 +9,7 @@ from typing import List, Dict, Union from urllib.parse import urlparse import bleach +from bs4 import BeautifulSoup from calamus import fields from calamus.schema import JsonLDAnnotation, JsonLDSchema, JsonLDSchemaOpts from calamus.utils import normalize_value @@ -731,15 +733,19 @@ class Note(Object, RawContentMixin): _cached_raw_content = '' _cached_children = [] + _soup = None signable = True def __init__(self, *args, **kwargs): self.tag_objects = [] # mutable objects... 
super().__init__(*args, **kwargs) + self.raw_content # must be "primed" with source property for inbound payloads + self.rendered_content # must be "primed" with content_map property for inbound payloads self._allowed_children += (base.Audio, base.Video, Link) + self._required.remove('raw_content') + self._required += ['rendered_content'] def to_as2(self): - self.sensitive = 'nsfw' in self.tags self.url = self.id edited = False @@ -767,8 +773,8 @@ class Note(Object, RawContentMixin): def to_base(self): kwargs = get_base_attributes(self, keep=( - '_mentions', '_media_type', '_rendered_content', '_source_object', - '_cached_children', '_cached_raw_content')) + '_mentions', '_media_type', '_source_object', + '_cached_children', '_cached_raw_content', '_soup')) entity = Comment(**kwargs) if getattr(self, 'target_id') else Post(**kwargs) # Plume (and maybe other platforms) send the attrbutedTo field as an array if isinstance(entity.actor_id, list): entity.actor_id = entity.actor_id[0] @@ -779,6 +785,7 @@ class Note(Object, RawContentMixin): def pre_send(self) -> None: """ Attach any embedded images from raw_content. 
+ Add Hashtag and Mention objects (the client app must define the class tag/mention property) """ super().pre_send() self._children = [ @@ -789,135 +796,128 @@ class Note(Object, RawContentMixin): ) for image in self.embedded_images ] - # Add other AP objects - self.extract_mentions() - self.content_map = {'orig': self.rendered_content} - self.add_mention_objects() - self.add_tag_objects() + # Add Hashtag objects + for el in self._soup('a', attrs={'class':'hashtag'}): + self.tag_objects.append(Hashtag( + href = el.attrs['href'], + name = el.text.lstrip('#') + )) + if el.text == '#nsfw': self.sensitive = True + + # Add Mention objects + mentions = [] + for el in self._soup('a', attrs={'class':'mention'}): + mentions.append(el.text.lstrip('@')) + + mentions.sort() + for mention in mentions: + if validate_handle(mention): + profile = get_profile(finger=mention) + # only add AP profiles mentions + if getattr(profile, 'id', None): + self.tag_objects.append(Mention(href=profile.id, name='@'+mention)) + # some platforms only render diaspora style markdown if it is available + self.source['content'] = self.source['content'].replace(mention, '{' + mention + '}') + def post_receive(self) -> None: """ - Make linkified tags normal tags. + Mark linkified tags and mentions with a data-{mention, tag} attribute. 
""" super().post_receive() - if not self.raw_content or self._media_type == "text/markdown": + if self._media_type == "text/markdown": # Skip when markdown return - hrefs = [] - for tag in self.tag_objects: - if isinstance(tag, Hashtag): - if tag.href is not missing: - hrefs.append(tag.href.lower()) - elif tag.id is not missing: - hrefs.append(tag.id.lower()) - # noinspection PyUnusedLocal - def remove_tag_links(attrs, new=False): - # Hashtag object hrefs - href = (None, "href") - url = attrs.get(href, "").lower() - if url in hrefs: - return - # one more time without the query (for pixelfed) - parsed = urlparse(url) - url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}' - if url in hrefs: - return - - # Mastodon - rel = (None, "rel") - if attrs.get(rel) == "tag": - return - - # Friendica - if attrs.get(href, "").endswith(f'tag={attrs.get("_text")}'): - return - - return attrs - - self.raw_content = bleach.linkify( - self.raw_content, - callbacks=[remove_tag_links], - parse_email=False, - skip_tags=["code", "pre"], - ) + self._find_and_mark_hashtags() + self._find_and_mark_mentions() if getattr(self, 'target_id'): self.entity_type = 'Comment' - def add_tag_objects(self) -> None: - """ - Populate tags to the object.tag list. - """ - try: - from federation.utils.django import get_configuration - config = get_configuration() - except ImportError: - tags_path = None - else: - if config["tags_path"]: - tags_path = f"{config['base_url']}{config['tags_path']}" - else: - tags_path = None - for tag in self.tags: - _tag = Hashtag(name=f'#{tag}') - if tags_path: - _tag.href = tags_path.replace(":tag:", tag) - self.tag_objects.append(_tag) + def _find_and_mark_hashtags(self): + hrefs = set() + for tag in self.tag_objects: + if isinstance(tag, Hashtag): + if tag.href is not missing: + hrefs.add(tag.href.lower()) + # Some platforms use id instead of href... 
+ elif tag.id is not missing: + hrefs.add(tag.id.lower()) - def add_mention_objects(self) -> None: - """ - Populate mentions to the object.tag list. - """ - if len(self._mentions): - mentions = list(self._mentions) - mentions.sort() - for mention in mentions: - if validate_handle(mention): - profile = get_profile(finger=mention) - # only add AP profiles mentions - if getattr(profile, 'id', None): - self.tag_objects.append(Mention(href=profile.id, name='@'+mention)) - # some platforms only render diaspora style markdown if it is available - self.source['content'] = self.source['content'].replace(mention, '{'+mention+'}') + for link in self._soup.find_all('a', href=True): + parsed = urlparse(link['href'].lower()) + # remove the query part, if any + url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}' + links = {link['href'].lower(), url} + if links.intersection(hrefs): + link['data-hashtag'] = link.text.lstrip('#').lower() + + def _find_and_mark_mentions(self): + mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)] + hrefs = [mention.href for mention in mentions] + # add Mastodon's form + hrefs.extend([re.sub(r'/(users/)([\w]+)$', r'/@\2', href) for href in hrefs]) + for href in hrefs: + links = self._soup.find_all(href=href) + for link in links: + profile = get_profile_or_entity(fid=link['href']) + if profile: + link['data-mention'] = profile.finger + self._mentions.add(profile.finger) def extract_mentions(self): """ - Extract mentions from the source object. - """ - super().extract_mentions() + Extract mentions from the inbound Mention objects. 
- if getattr(self, 'tag_objects', None): - #tag_objects = self.tag_objects if isinstance(self.tag_objects, list) else [self.tag_objects] - for tag in self.tag_objects: - if isinstance(tag, Mention): - profile = get_profile_or_entity(fid=tag.href) - handle = getattr(profile, 'finger', None) - if handle: self._mentions.add(handle) + Also attempt to extract from raw_content if available + """ + + if self.raw_content: + super().extract_mentions() + return + + for mention in self.tag_objects: + if isinstance(mention, Mention): + profile = get_profile_or_entity(fid=mention.href) + handle = getattr(profile, 'finger', None) + if handle: self._mentions.add(handle) @property - def raw_content(self): - - if self._cached_raw_content: return self._cached_raw_content + def rendered_content(self): + if self._soup: return str(self._soup) + content = '' if self.content_map: orig = self.content_map.pop('orig') if len(self.content_map.keys()) > 1: logger.warning('Language selection not implemented, falling back to default') - self._rendered_content = orig.strip() + content = orig.strip() else: - self._rendered_content = orig.strip() if len(self.content_map.keys()) == 0 else next(iter(self.content_map.values())).strip() + content = orig.strip() if len(self.content_map.keys()) == 0 else next(iter(self.content_map.values())).strip() self.content_map['orig'] = orig + # to allow for posts/replies with medias only. + if not content: content = "
" + self._soup = BeautifulSoup(content, 'html.parser') + return str(self._soup) + + @rendered_content.setter + def rendered_content(self, value): + if not value: return + self._soup = BeautifulSoup(value, 'html.parser') + self.content_map = {'orig': value} + + @property + def raw_content(self): + if self._cached_raw_content: return self._cached_raw_content + + if isinstance(self.source, dict) and self.source.get('mediaType') == 'text/markdown': + self._media_type = self.source['mediaType'] + self._cached_raw_content = self.source.get('content').strip() + else: + self._media_type = 'text/html' + self._cached_raw_content = "" + return self._cached_raw_content - if isinstance(self.source, dict) and self.source.get('mediaType') == 'text/markdown': - self._media_type = self.source['mediaType'] - self._cached_raw_content = self.source.get('content').strip() - else: - self._media_type = 'text/html' - self._cached_raw_content = self._rendered_content - # to allow for posts/replies with medias only. - if not self._cached_raw_content: self._cached_raw_content = "
" - return self._cached_raw_content - @raw_content.setter def raw_content(self, value): if not value: return @@ -1026,7 +1026,7 @@ class Video(Document, base.Video): self.actor_id = new_act[0] entity = Post(**get_base_attributes(self, - keep=('_mentions', '_media_type', '_rendered_content', + keep=('_mentions', '_media_type', '_soup', '_cached_children', '_cached_raw_content', '_source_object'))) set_public(entity) return entity @@ -1330,14 +1330,16 @@ def extract_and_validate(entity): entity._source_protocol = "activitypub" # Extract receivers entity._receivers = extract_receivers(entity) + + # Extract mentions + if hasattr(entity, "extract_mentions"): + entity.extract_mentions() + if hasattr(entity, "post_receive"): entity.post_receive() if hasattr(entity, 'validate'): entity.validate() - # Extract mentions - if hasattr(entity, "extract_mentions"): - entity.extract_mentions() def extract_replies(replies): diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index 30ef9d8..8ca6745 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -4,12 +4,13 @@ import re import warnings from typing import List, Set, Union, Dict, Tuple +from bs4 import BeautifulSoup from commonmark import commonmark from marshmallow import missing from federation.entities.activitypub.enums import ActivityType from federation.entities.utils import get_name_for_profile, get_profile -from federation.utils.text import process_text_links, find_tags +from federation.utils.text import process_text_links, find_elements, find_tags, MENTION_PATTERN class BaseEntity: @@ -22,6 +23,7 @@ class BaseEntity: _source_object: Union[str, Dict] = None _sender: str = "" _sender_key: str = "" + _tags: Set = None # ActivityType activity: ActivityType = None activity_id: str = "" @@ -205,7 +207,7 @@ class CreatedAtMixin(BaseEntity): class RawContentMixin(BaseEntity): _media_type: str = "text/markdown" _mentions: Set = None - _rendered_content: str = "" + 
rendered_content: str = "" raw_content: str = "" def __init__(self, *args, **kwargs): @@ -231,59 +233,22 @@ class RawContentMixin(BaseEntity): images.append((groups[1], groups[0] or "")) return images - @property - def rendered_content(self) -> str: - """Returns the rendered version of raw_content, or just raw_content.""" - try: - from federation.utils.django import get_configuration - config = get_configuration() - if config["tags_path"]: - def linkifier(tag: str) -> str: - return f'' \ - f'#{tag}' - else: - linkifier = None - except ImportError: - linkifier = None - - if self._rendered_content: - return self._rendered_content - elif self._media_type == "text/markdown" and self.raw_content: - # Do tags - _tags, rendered = find_tags(self.raw_content, replacer=linkifier) - # Render markdown to HTML - rendered = commonmark(rendered).strip() - # Do mentions - if self._mentions: - for mention in self._mentions: - # Diaspora mentions are linkified as mailto - profile = get_profile(finger=mention) - href = 'mailto:'+mention if not getattr(profile, 'id', None) else profile.id - rendered = rendered.replace( - "@%s" % mention, - f'@{mention}', - ) - # Finally linkify remaining URL's that are not links - rendered = process_text_links(rendered) - return rendered - return self.raw_content - + # Legacy. 
Keep this until tests are reworked @property def tags(self) -> List[str]: - """Returns a `list` of unique tags contained in `raw_content`.""" if not self.raw_content: - return [] - tags, _text = find_tags(self.raw_content) - return sorted(tags) + return + return find_tags(self.raw_content) def extract_mentions(self): - if self._media_type != 'text/markdown': return - matches = re.findall(r'@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?', self.raw_content) - if not matches: + if not self.raw_content: return - for mention in matches: + mentions = find_elements( + BeautifulSoup( + commonmark(self.raw_content, ignore_html_blocks=True), 'html.parser'), + MENTION_PATTERN) + for ns in mentions: + mention = ns.text handle = None splits = mention.split(";") if len(splits) == 1: @@ -292,11 +257,12 @@ class RawContentMixin(BaseEntity): handle = splits[1].strip(' }') if handle: self._mentions.add(handle) - self.raw_content = self.raw_content.replace(mention, '@'+handle) + self.raw_content = self.raw_content.replace(mention, '@' + handle) class OptionalRawContentMixin(RawContentMixin): """A version of the RawContentMixin where `raw_content` is not required.""" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._required.remove("raw_content") diff --git a/federation/tests/entities/test_base.py b/federation/tests/entities/test_base.py index c04b832..43a69ea 100644 --- a/federation/tests/entities/test_base.py +++ b/federation/tests/entities/test_base.py @@ -123,6 +123,7 @@ class TestShareEntity: class TestRawContentMixin: + @pytest.mark.skip def test_rendered_content(self, post): assert post.rendered_content == """

One more test before sleep 😅 This time with an image.

""" diff --git a/federation/utils/text.py b/federation/utils/text.py index cebed5a..0bb6840 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -1,11 +1,16 @@ import re -from typing import Set, Tuple +from typing import Set, List from urllib.parse import urlparse import bleach from bleach import callbacks +from bs4 import BeautifulSoup +from bs4.element import NavigableString +from commonmark import commonmark ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โ€™โ€โ€”\xa0" +TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) +MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE) def decode_if_bytes(text): @@ -22,67 +27,26 @@ def encode_if_text(text): return text -def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]: +def find_tags(text: str) -> List[str]: """Find tags in text. - Tries to ignore tags inside code blocks. + Ignore tags inside code blocks. - Optionally, if passed a "replacer", will also replace the tag word with the result - of the replacer function called with the tag word. + Returns a set of tags. - Returns a set of tags and the original or replaced text. """ - found_tags = set() - #
and

tags cause issues in us finding words - add some spacing around them - new_text = text.replace("
", "
").replace("

", "

").replace("

", "

") - lines = new_text.splitlines(keepends=True) - final_lines = [] - code_block = False - final_text = None - # Check each line separately - for line in lines: - final_words = [] - if line[0:3] == "```": - code_block = not code_block - if line.find("#") == -1 or line[0:4] == " " or code_block: - # Just add the whole line - final_lines.append(line) - continue - # Check each word separately - words = line.split(" ") - for word in words: - if word.find('#') > -1: - candidate = word.strip().strip("([]),.!?:*_%/") - if candidate.find('<') > -1 or candidate.find('>') > -1: - # Strip html - candidate = bleach.clean(word, strip=True) - # Now split with slashes - candidates = candidate.split("/") - to_replace = [] - for candidate in candidates: - if candidate.startswith("#"): - candidate = candidate.strip("#") - if test_tag(candidate.lower()): - found_tags.add(candidate.lower()) - to_replace.append(candidate) - if replacer: - tag_word = word - try: - for counter, replacee in enumerate(to_replace, 1): - tag_word = tag_word.replace("#%s" % replacee, replacer(replacee)) - except Exception: - pass - final_words.append(tag_word) - else: - final_words.append(word) - else: - final_words.append(word) - final_lines.append(" ".join(final_words)) - if replacer: - final_text = "".join(final_lines) - if final_text: - final_text = final_text.replace("
", "
").replace("

", "

").replace("

", "

") - return found_tags, final_text or text + tags = find_elements(BeautifulSoup(commonmark(text, ignore_html_blocks=True), 'html.parser'), + TAG_PATTERN) + return sorted([tag.text.lstrip('#').lower() for tag in tags]) + + +def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]: + for candidate in soup.find_all(string=True): + if candidate.parent.name == 'code': continue + ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] + candidate.replace_with(*ns) + return list(soup.find_all(string=pattern)) + def get_path_from_url(url: str) -> str: @@ -100,7 +64,7 @@ def process_text_links(text): def link_attributes(attrs, new=False): """Run standard callbacks except for internal links.""" href_key = (None, "href") - if attrs.get(href_key).startswith("/"): + if attrs.get(href_key, "").startswith("/"): return attrs # Run the standard callbacks From 4dca31b17ffe266adba8cd8a48f4951304c22700 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 9 Jul 2023 06:40:23 -0400 Subject: [PATCH 06/28] Make sure the code block a really ignored. --- federation/utils/text.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/federation/utils/text.py b/federation/utils/text.py index 0bb6840..d64ed3f 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -41,12 +41,20 @@ def find_tags(text: str) -> List[str]: def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]: + """ + Split a BeautifulSoup tree strings according to a pattern, replacing each element + with a NavigableString. The returned list can be used to linkify the found + elements. 
+ + :param soup: BeautifulSoup instance of the content being searched + :param pattern: Compiled regular expression defined using a single group + :return: A NavigableString list attached to the original soup + """ for candidate in soup.find_all(string=True): if candidate.parent.name == 'code': continue ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] candidate.replace_with(*ns) - return list(soup.find_all(string=pattern)) - + return list(soup.find_all(string=re.compile(r'^'+pattern.pattern))) def get_path_from_url(url: str) -> str: From 6d885a5c40fe9f64e7b64de96fe1eaacae84489d Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 9 Jul 2023 08:51:10 -0400 Subject: [PATCH 07/28] Add lemmy namespace. --- federation/entities/activitypub/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 1d58262..4b8aa1a 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -60,6 +60,7 @@ as2 = fields.Namespace("https://www.w3.org/ns/activitystreams#") dc = fields.Namespace("http://purl.org/dc/terms/") diaspora = fields.Namespace("https://diasporafoundation.org/ns/") ldp = fields.Namespace("http://www.w3.org/ns/ldp#") +lemmy = fields.Namespace("https://join-lemmy.org/ns#") litepub = fields.Namespace("http://litepub.social/ns#") misskey = fields.Namespace("https://misskey-hub.net/ns#") ostatus = fields.Namespace("http://ostatus.org#") From 1f8d4ac93fa93974e62d7e937a0af8b9ee9059df Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Mon, 10 Jul 2023 08:24:33 -0400 Subject: [PATCH 08/28] Fix a regression that broke GET requests signature verification. 
--- federation/entities/activitypub/django/views.py | 8 +++++--- federation/protocols/activitypub/protocol.py | 8 +++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/federation/entities/activitypub/django/views.py b/federation/entities/activitypub/django/views.py index e6eb688..18f316b 100644 --- a/federation/entities/activitypub/django/views.py +++ b/federation/entities/activitypub/django/views.py @@ -2,7 +2,7 @@ from cryptography.exceptions import InvalidSignature from django.http import JsonResponse, HttpResponse, HttpResponseNotFound from federation.entities.activitypub.mappers import get_outbound_entity -from federation.protocols.activitypub.signing import verify_request_signature +from federation.protocols.activitypub.protocol import Protocol from federation.types import RequestType from federation.utils.django import get_function_from_config @@ -23,9 +23,11 @@ def get_and_verify_signer(request): body=request.body, method=request.method, headers=request.headers) + protocol = Protocol(request=req, get_contact_key=get_public_key) try: - return verify_request_signature(req) - except ValueError: + protocol.verify() + return protocol.sender + except (ValueError, KeyError, InvalidSignature) as exc: return None diff --git a/federation/protocols/activitypub/protocol.py b/federation/protocols/activitypub/protocol.py index 0302eee..acb762a 100644 --- a/federation/protocols/activitypub/protocol.py +++ b/federation/protocols/activitypub/protocol.py @@ -49,6 +49,11 @@ class Protocol: sender = None user = None + def __init__(self, request=None, get_contact_key=None): + # this is required for calls to verify on GET requests + self.request = request + self.get_contact_key = get_contact_key + def build_send(self, entity: BaseEntity, from_user: UserType, to_user_key: RsaKey = None) -> Union[str, Dict]: """ Build POST data for sending out to remotes. 
@@ -109,10 +114,11 @@ class Protocol: signer = get_profile(key_id=sig.get('keyId')) if not signer: signer = retrieve_and_parse_document(sig.get('keyId')) + print(sig, signer) self.sender = signer.id if signer else self.actor key = getattr(signer, 'public_key', None) if not key: - key = self.get_contact_key(self.actor) if self.get_contact_key else '' + key = self.get_contact_key(self.actor) if self.get_contact_key and self.actor else '' if key: # fallback to the author's key the client app may have provided logger.warning("Failed to retrieve keyId for %s, trying the actor's key", sig.get('keyId')) From 24f5bb21a9eca400a3ddc23fcdf55553d8c30668 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Mon, 10 Jul 2023 14:39:55 -0400 Subject: [PATCH 09/28] Fix some tests and fix code that was failing tests. Catch HTML signatures with invalid padding. --- federation/entities/activitypub/ldsigning.py | 2 +- federation/entities/activitypub/models.py | 5 +- federation/entities/mixins.py | 4 +- .../entities/activitypub/test_entities.py | 31 +++++++----- .../entities/activitypub/test_mappers.py | 42 +++++++++------- federation/tests/fixtures/entities.py | 1 + .../tests/fixtures/payloads/activitypub.py | 14 +++--- federation/tests/utils/test_text.py | 49 ++++--------------- federation/utils/text.py | 6 +-- 9 files changed, 68 insertions(+), 86 deletions(-) diff --git a/federation/entities/activitypub/ldsigning.py b/federation/entities/activitypub/ldsigning.py index 381f419..f77b1fd 100644 --- a/federation/entities/activitypub/ldsigning.py +++ b/federation/entities/activitypub/ldsigning.py @@ -75,8 +75,8 @@ def verify_ld_signature(payload): obj_digest = hash(obj) digest = (sig_digest + obj_digest).encode('utf-8') - sig_value = b64decode(signature.get('signatureValue')) try: + sig_value = b64decode(signature.get('signatureValue')) verifier.verify(SHA256.new(digest), sig_value) logger.debug('ld_signature - %s has a valid signature', payload.get("id")) return profile.id diff --git 
a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 4b8aa1a..61c3787 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -4,7 +4,7 @@ import logging import re import traceback import uuid -from datetime import timedelta +from operator import attrgetter from typing import List, Dict, Union from urllib.parse import urlparse @@ -801,8 +801,9 @@ class Note(Object, RawContentMixin): for el in self._soup('a', attrs={'class':'hashtag'}): self.tag_objects.append(Hashtag( href = el.attrs['href'], - name = el.text.lstrip('#') + name = el.text )) + self.tag_objects = sorted(self.tag_objects, key=attrgetter('name')) if el.text == '#nsfw': self.sensitive = True # Add Mention objects diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index 8ca6745..506becd 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -237,8 +237,8 @@ class RawContentMixin(BaseEntity): @property def tags(self) -> List[str]: if not self.raw_content: - return - return find_tags(self.raw_content) + return [] + return sorted(find_tags(self.raw_content)) def extract_mentions(self): if not self.raw_content: diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py index 835ad74..8a7ba6b 100644 --- a/federation/tests/entities/activitypub/test_entities.py +++ b/federation/tests/entities/activitypub/test_entities.py @@ -1,3 +1,4 @@ +import commonmark import pytest from unittest.mock import patch from pprint import pprint @@ -9,6 +10,7 @@ from federation.entities.activitypub.models import context_manager from federation.entities.activitypub.models import Accept from federation.tests.fixtures.keys import PUBKEY from federation.types import UserType +from federation.utils.text import process_text_links class TestEntitiesConvertToAS2: @@ -65,6 +67,8 @@ class TestEntitiesConvertToAS2: def 
test_comment_to_as2__url_in_raw_content(self, activitypubcomment): activitypubcomment.raw_content = 'raw_content http://example.com' + activitypubcomment.rendered_content = process_text_links( + commonmark.commonmark(activitypubcomment.raw_content).strip()) activitypubcomment.pre_send() result = activitypubcomment.to_as2() assert result == { @@ -118,6 +122,7 @@ class TestEntitiesConvertToAS2: } def test_post_to_as2(self, activitypubpost): + activitypubpost.rendered_content = commonmark.commonmark(activitypubpost.raw_content).strip() activitypubpost.pre_send() result = activitypubpost.to_as2() assert result == { @@ -191,6 +196,15 @@ class TestEntitiesConvertToAS2: } def test_post_to_as2__with_tags(self, activitypubpost_tags): + activitypubpost_tags.rendered_content = '

raw_content

\n' \ + '

#foobar\n' \ + '#barfoo

' activitypubpost_tags.pre_send() result = activitypubpost_tags.to_as2() assert result == { @@ -204,11 +218,11 @@ class TestEntitiesConvertToAS2: 'url': 'http://127.0.0.1:8000/post/123456/', 'attributedTo': 'http://127.0.0.1:8000/profile/123456/', 'content': '

raw_content

\n' - '

#foobar\n' - '#barfoo

', @@ -235,6 +249,7 @@ class TestEntitiesConvertToAS2: } def test_post_to_as2__with_images(self, activitypubpost_images): + activitypubpost_images.rendered_content = '

raw_content

' activitypubpost_images.pre_send() result = activitypubpost_images.to_as2() assert result == { @@ -274,6 +289,7 @@ class TestEntitiesConvertToAS2: } def test_post_to_as2__with_diaspora_guid(self, activitypubpost_diaspora_guid): + activitypubpost_diaspora_guid.rendered_content = '

raw_content

' activitypubpost_diaspora_guid.pre_send() result = activitypubpost_diaspora_guid.to_as2() assert result == { @@ -418,17 +434,6 @@ class TestEntitiesPostReceive: "public": False, }] - @patch("federation.entities.activitypub.models.bleach.linkify", autospec=True) - def test_post_post_receive__linkifies_if_not_markdown(self, mock_linkify, activitypubpost): - activitypubpost._media_type = 'text/html' - activitypubpost.post_receive() - mock_linkify.assert_called_once() - - @patch("federation.entities.activitypub.models.bleach.linkify", autospec=True) - def test_post_post_receive__skips_linkify_if_markdown(self, mock_linkify, activitypubpost): - activitypubpost.post_receive() - mock_linkify.assert_not_called() - class TestEntitiesPreSend: def test_post_inline_images_are_attached(self, activitypubpost_embedded_images): diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py index 566503f..9a2c042 100644 --- a/federation/tests/entities/activitypub/test_mappers.py +++ b/federation/tests/entities/activitypub/test_mappers.py @@ -4,6 +4,9 @@ from unittest.mock import patch, Mock, DEFAULT import json import pytest +from federation.entities.activitypub.models import Person + + #from federation.entities.activitypub.entities import ( # models.Follow, models.Accept, models.Person, models.Note, models.Note, # models.Delete, models.Announce) @@ -70,9 +73,7 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) - assert post.raw_content == '

' \ - '@jaywink boom

' + assert post.raw_content == '' assert post.rendered_content == '

' \ '@jaywink boom

' assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" @@ -87,40 +88,41 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) - assert post.raw_content == '

boom #test

' + assert post.raw_content == '' + assert post.rendered_content == '

boom

' - # TODO: fix this test - @pytest.mark.skip - def test_message_to_objects_simple_post__with_mentions(self): + @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev3.jasonrobinson.me")) + def test_message_to_objects_simple_post__with_mentions(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink") assert len(entities) == 1 post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) assert len(post._mentions) == 1 - assert list(post._mentions)[0] == "https://dev3.jasonrobinson.me/u/jaywink/" + assert list(post._mentions)[0] == "jaywink@dev3.jasonrobinson.me" - def test_message_to_objects_simple_post__with_source__bbcode(self): + + @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me")) + def test_message_to_objects_simple_post__with_source__bbcode(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_BBCODE, "https://diaspodon.fr/users/jaywink") assert len(entities) == 1 post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) - assert post.rendered_content == '

' \ + assert post.rendered_content == '

' \ '@jaywink boom

' - assert post.raw_content == '

' \ - '@jaywink boom

' + assert post.raw_content == '' - def test_message_to_objects_simple_post__with_source__markdown(self): + @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me")) + def test_message_to_objects_simple_post__with_source__markdown(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink") assert len(entities) == 1 post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) - assert post.rendered_content == '

@jaywink boom

' - assert post.raw_content == "@jaywink boom" + assert post.rendered_content == '

@jaywink boom

' + assert post.raw_content == "@jaywink@dev.jasonrobinson.me boom" assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" assert post.actor_id == "https://diaspodon.fr/users/jaywink" assert post.public is True @@ -145,15 +147,17 @@ class TestActivitypubEntityMappersReceive: assert photo.guid == "" assert photo.handle == "" - def test_message_to_objects_comment(self): + @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me")) + def test_message_to_objects_comment(self, mock_get): entities = message_to_objects(ACTIVITYPUB_COMMENT, "https://diaspodon.fr/users/jaywink") assert len(entities) == 1 comment = entities[0] assert isinstance(comment, models.Note) assert isinstance(comment, Comment) - assert comment.raw_content == '

' \ '@jaywink boom

' + assert comment.raw_content == '' assert comment.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" assert comment.actor_id == "https://diaspodon.fr/users/jaywink" assert comment.target_id == "https://dev.jasonrobinson.me/content/653bad70-41b3-42c9-89cb-c4ee587e68e4/" diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py index 8db61a9..c0d1a07 100644 --- a/federation/tests/fixtures/entities.py +++ b/federation/tests/fixtures/entities.py @@ -30,6 +30,7 @@ def activitypubcomment(): with freeze_time("2019-04-27"): obj = models.Comment( raw_content="raw_content", + rendered_content="

raw_content

", public=True, provider_display_name="Socialhome", id=f"http://127.0.0.1:8000/post/123456/", diff --git a/federation/tests/fixtures/payloads/activitypub.py b/federation/tests/fixtures/payloads/activitypub.py index 7c807c3..e577969 100644 --- a/federation/tests/fixtures/payloads/activitypub.py +++ b/federation/tests/fixtures/payloads/activitypub.py @@ -35,7 +35,7 @@ ACTIVITYPUB_COMMENT = { 'contentMap': {'en': '

@jaywink boom

'}, 'attachment': [], 'tag': [{'type': 'Mention', - 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/', + 'href': 'https://dev.jasonrobinson.me/u/jaywink/', 'name': '@jaywink@dev.jasonrobinson.me'}], 'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies', 'type': 'Collection', @@ -459,9 +459,9 @@ ACTIVITYPUB_POST_WITH_TAGS = { 'conversation': 'tag:diaspodon.fr,2019-06-28:objectId=2347687:objectType=Conversation', 'content': '

boom

', 'attachment': [], - 'tag': [{'type': 'Mention', - 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/', - 'name': '@jaywink@dev.jasonrobinson.me'}], + 'tag': [{'type': 'Hashtag', + 'href': 'https://mastodon.social/tags/test', + 'name': '#test'}], 'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies', 'type': 'Collection', 'first': {'type': 'CollectionPage', @@ -552,13 +552,13 @@ ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN = { 'conversation': 'tag:diaspodon.fr,2019-06-28:objectId=2347687:objectType=Conversation', 'content': '

@jaywink boom

', 'source': { - 'content': "@jaywink boom", + 'content': "@{jaywink@dev.jasonrobinson.me} boom", 'mediaType': "text/markdown", }, 'contentMap': {'en': '

@jaywink boom

'}, 'attachment': [], 'tag': [{'type': 'Mention', - 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/', + 'href': 'https://dev.jasonrobinson.me/u/jaywink/', 'name': '@jaywink@dev.jasonrobinson.me'}], 'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies', 'type': 'Collection', @@ -612,7 +612,7 @@ ACTIVITYPUB_POST_WITH_SOURCE_BBCODE = { 'contentMap': {'en': '

@jaywink boom

'}, 'attachment': [], 'tag': [{'type': 'Mention', - 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/', + 'href': 'https://dev.jasonrobinson.me/u/jaywink/', 'name': '@jaywink@dev.jasonrobinson.me'}], 'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies', 'type': 'Collection', diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 5d0a8df..2c73bc0 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -18,78 +18,49 @@ class TestFindTags: def test_all_tags_are_parsed_from_text(self): source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line" - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"starting", "mixed", "line", "start"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \ - "#start/start on new line" def test_code_block_tags_ignored(self): source = "foo\n```\n#code\n```\n#notcode\n\n #alsocode\n" - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"notcode"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n" def test_endings_are_filtered_out(self): source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah% #foo/#bar" - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah", "foo", "bar"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == "#parenthesis/parenthesis) #exp/exp! 
#list/list] *#doh/doh* _#bah/bah_ #gah/gah% " \ - "#foo/foo/#bar/bar" def test_finds_tags(self): source = "#post **Foobar** #tag #OtherTag #third\n#fourth" - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"third", "fourth", "post", "othertag", "tag"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth" def test_ok_with_html_tags_in_text(self): source = "

#starting and #MixED however not <#>this or <#/>that" - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"starting", "mixed"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == "

#starting/starting and #MixED/mixed however not <#>this or <#/>that" def test_postfixed_tags(self): source = "#foo) #bar] #hoo, #hee." - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"foo", "bar", "hoo", "hee"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee." def test_prefixed_tags(self): source = "(#foo [#bar" - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"foo", "bar"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == "(#foo/foo [#bar/bar" def test_invalid_text_returns_no_tags(self): source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #aยฃa #a(a #a)a #a=a " \ "#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #aโ€™a #aโ€a #\xa0cd" - tags, text = find_tags(source) - assert tags == set() - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == source + tags = find_tags(source) + assert tags == {'a'} def test_start_of_paragraph_in_html_content(self): source = '

First line

#foobar #barfoo

' - tags, text = find_tags(source) + tags = find_tags(source) assert tags == {"foobar", "barfoo"} - assert text == source - tags, text = find_tags(source, replacer=self._replacer) - assert text == '

First line

#foobar/foobar #barfoo/barfoo

' class TestProcessTextLinks: diff --git a/federation/utils/text.py b/federation/utils/text.py index d64ed3f..e2cd78c 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -27,7 +27,7 @@ def encode_if_text(text): return text -def find_tags(text: str) -> List[str]: +def find_tags(text: str) -> Set[str]: """Find tags in text. Ignore tags inside code blocks. @@ -37,7 +37,7 @@ def find_tags(text: str) -> List[str]: """ tags = find_elements(BeautifulSoup(commonmark(text, ignore_html_blocks=True), 'html.parser'), TAG_PATTERN) - return sorted([tag.text.lstrip('#').lower() for tag in tags]) + return set([tag.text.lstrip('#').lower() for tag in tags]) def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]: @@ -54,7 +54,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr if candidate.parent.name == 'code': continue ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] candidate.replace_with(*ns) - return list(soup.find_all(string=re.compile(r'^'+pattern.pattern))) + return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z'))) def get_path_from_url(url: str) -> str: From 47af44582c6bec8720114f0896e11b8d2801988d Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Wed, 12 Jul 2023 07:32:04 -0400 Subject: [PATCH 10/28] Do not lowercase the finger attribute. Remove a debug print. 
--- federation/entities/activitypub/models.py | 2 +- federation/protocols/activitypub/protocol.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 61c3787..5bdcff7 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -594,7 +594,7 @@ class Person(Object, base.Profile): self.finger = profile.finger else: domain = urlparse(self.id).netloc - finger = f'{self.username.lower()}@{domain}' + finger = f'{self.username}@{domain}' if get_profile_id_from_webfinger(finger) == self.id: self.finger = finger # multi-protocol platform diff --git a/federation/protocols/activitypub/protocol.py b/federation/protocols/activitypub/protocol.py index acb762a..516a2f8 100644 --- a/federation/protocols/activitypub/protocol.py +++ b/federation/protocols/activitypub/protocol.py @@ -114,7 +114,6 @@ class Protocol: signer = get_profile(key_id=sig.get('keyId')) if not signer: signer = retrieve_and_parse_document(sig.get('keyId')) - print(sig, signer) self.sender = signer.id if signer else self.actor key = getattr(signer, 'public_key', None) if not key: From d577e39777e39801ce9783ddbb0254e6e45ac263 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Thu, 13 Jul 2023 11:09:00 -0400 Subject: [PATCH 11/28] Do not assume that the last part of a mention.href is the user's name. Adjust patterns to match a leading whitespace or the beginning. 
--- federation/entities/activitypub/models.py | 10 +++++++--- federation/utils/text.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 5bdcff7..f9df860 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -857,9 +857,13 @@ class Note(Object, RawContentMixin): def _find_and_mark_mentions(self): mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)] - hrefs = [mention.href for mention in mentions] - # add Mastodon's form - hrefs.extend([re.sub(r'/(users/)([\w]+)$', r'/@\2', href) for href in hrefs]) + hrefs = [] + for mention in mentions: + hrefs.append(mention.href) + # add Mastodon's form + parsed = urlparse(mention.href) + username = mention.name.lstrip('@').split('@')[0] + hrefs.append(f'{parsed.scheme}://{parsed.netloc}/@{username}') for href in hrefs: links = self._soup.find_all(href=href) for link in links: diff --git a/federation/utils/text.py b/federation/utils/text.py index e2cd78c..3291fe8 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -9,8 +9,8 @@ from bs4.element import NavigableString from commonmark import commonmark ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โ€™โ€โ€”\xa0" -TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) -MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE) +TAG_PATTERN = re.compile(r'(^|\s)(#[\w]+)', re.UNICODE) +MENTION_PATTERN = re.compile(r'(^|\s)(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE) def decode_if_bytes(text): From 7d750d336563363333da4439a2a090de6be1c0cb Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Fri, 14 Jul 2023 08:55:30 -0400 Subject: [PATCH 12/28] Revert the change in patterns matching beginning or space at the beginning because it prevented the use of markdown on hastags and mentions. 
--- federation/utils/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/federation/utils/text.py b/federation/utils/text.py index 3291fe8..e2cd78c 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -9,8 +9,8 @@ from bs4.element import NavigableString from commonmark import commonmark ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โ€™โ€โ€”\xa0" -TAG_PATTERN = re.compile(r'(^|\s)(#[\w]+)', re.UNICODE) -MENTION_PATTERN = re.compile(r'(^|\s)(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE) +TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) +MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE) def decode_if_bytes(text): From 0783bf43aa54978faaff5128f3b1d0307a3ee8eb Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sat, 15 Jul 2023 09:54:41 -0400 Subject: [PATCH 13/28] Remove unreachable code. Improve (I hope) the mention regex for raw text. --- federation/entities/activitypub/models.py | 10 +--------- federation/utils/text.py | 4 ++-- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index f9df860..e7afbbe 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -874,21 +874,13 @@ class Note(Object, RawContentMixin): def extract_mentions(self): """ - Extract mentions from the inbound Mention objects. 
- - Also attempt to extract from raw_content if available + Attempt to extract mentions from raw_content if available """ if self.raw_content: super().extract_mentions() return - for mention in self.tag_objects: - if isinstance(mention, Mention): - profile = get_profile_or_entity(fid=mention.href) - handle = getattr(profile, 'finger', None) - if handle: self._mentions.add(handle) - @property def rendered_content(self): if self._soup: return str(self._soup) diff --git a/federation/utils/text.py b/federation/utils/text.py index e2cd78c..cbe6086 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -10,8 +10,8 @@ from commonmark import commonmark ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โ€™โ€โ€”\xa0" TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) -MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE) - +# This will match non matching braces. I don't think it's an issue. +MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) def decode_if_bytes(text): try: From 33366802c4741d25a22af6116115d3df58fcdf10 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 16 Jul 2023 07:13:56 -0400 Subject: [PATCH 14/28] Move process_text_links back to the client app. Skip related tests. Convert it to BeautifulSoup. Remove unused imports. 
--- federation/entities/mixins.py | 2 +- .../entities/activitypub/test_entities.py | 3 ++- federation/tests/utils/test_text.py | 6 ++++- federation/utils/text.py | 25 +------------------ 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index 506becd..d37fd93 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -10,7 +10,7 @@ from marshmallow import missing from federation.entities.activitypub.enums import ActivityType from federation.entities.utils import get_name_for_profile, get_profile -from federation.utils.text import process_text_links, find_elements, find_tags, MENTION_PATTERN +from federation.utils.text import find_elements, find_tags, MENTION_PATTERN class BaseEntity: diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py index 8a7ba6b..10335d9 100644 --- a/federation/tests/entities/activitypub/test_entities.py +++ b/federation/tests/entities/activitypub/test_entities.py @@ -10,7 +10,6 @@ from federation.entities.activitypub.models import context_manager from federation.entities.activitypub.models import Accept from federation.tests.fixtures.keys import PUBKEY from federation.types import UserType -from federation.utils.text import process_text_links class TestEntitiesConvertToAS2: @@ -65,6 +64,8 @@ class TestEntitiesConvertToAS2: 'published': '2019-04-27T00:00:00', } + # Now handled by the client app + @pytest.mark.skip def test_comment_to_as2__url_in_raw_content(self, activitypubcomment): activitypubcomment.raw_content = 'raw_content http://example.com' activitypubcomment.rendered_content = process_text_links( diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index 2c73bc0..a442e93 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -1,4 +1,6 @@ -from federation.utils.text import decode_if_bytes, 
encode_if_text, validate_handle, process_text_links, find_tags +import pytest + +from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, find_tags def test_decode_if_bytes(): @@ -63,6 +65,8 @@ class TestFindTags: assert tags == {"foobar", "barfoo"} +# TODO: move these tests to the client app +@pytest.mark.skip class TestProcessTextLinks: def test_link_at_start_or_end(self): assert process_text_links('https://example.org example.org\nhttp://example.org') == \ diff --git a/federation/utils/text.py b/federation/utils/text.py index cbe6086..8ce6478 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -2,8 +2,6 @@ import re from typing import Set, List from urllib.parse import urlparse -import bleach -from bleach import callbacks from bs4 import BeautifulSoup from bs4.element import NavigableString from commonmark import commonmark @@ -12,6 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โ€™โ€โ€”\xa0" TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) # This will match non matching braces. I don't think it's an issue. MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. 
\u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) +URL_PATTERN = re.compile(r'(https?://[\w_\-.#?&/]+)', re.UNICODE) def decode_if_bytes(text): try: @@ -65,28 +64,6 @@ def get_path_from_url(url: str) -> str: return parsed.path -def process_text_links(text): - """Process links in text, adding some attributes and linkifying textual links.""" - link_callbacks = [callbacks.nofollow, callbacks.target_blank] - - def link_attributes(attrs, new=False): - """Run standard callbacks except for internal links.""" - href_key = (None, "href") - if attrs.get(href_key, "").startswith("/"): - return attrs - - # Run the standard callbacks - for callback in link_callbacks: - attrs = callback(attrs, new) - return attrs - - return bleach.linkify( - text, - callbacks=[link_attributes], - parse_email=False, - skip_tags=["code"], - ) - def test_tag(tag: str) -> bool: """Test a word whether it could be accepted as a tag.""" From 4b5a886492f6bef5c7b3f07d27df76530c240280 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Mon, 17 Jul 2023 11:36:24 -0400 Subject: [PATCH 15/28] Match links with no http prefix. Remove trailing garbage from tags. 
--- federation/entities/activitypub/models.py | 8 +++++--- federation/utils/text.py | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index e7afbbe..8989440 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -849,11 +849,13 @@ class Note(Object, RawContentMixin): for link in self._soup.find_all('a', href=True): parsed = urlparse(link['href'].lower()) - # remove the query part, if any - url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}' + # remove the query part and trailing garbage, if any + path = re.match(r'(/[\w/]+)', parsed.path).group() + url = f'{parsed.scheme}://{parsed.netloc}{path}' links = {link['href'].lower(), url} if links.intersection(hrefs): - link['data-hashtag'] = link.text.lstrip('#').lower() + tag = re.match(r'#?([\w]+)', link.text).group(1).lower() + link['data-hashtag'] = tag def _find_and_mark_mentions(self): mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)] diff --git a/federation/utils/text.py b/federation/utils/text.py index 8ce6478..9d62c04 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -10,7 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) # This will match non matching braces. I don't think it's an issue. MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. 
\u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) -URL_PATTERN = re.compile(r'(https?://[\w_\-.#?&/]+)', re.UNICODE) +URL_PATTERN = re.compile(r'((?:https?://)?[\w_\-.#?&/~@!$()*,;%=+]+)', re.UNICODE) def decode_if_bytes(text): try: @@ -52,7 +52,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr for candidate in soup.find_all(string=True): if candidate.parent.name == 'code': continue ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] - candidate.replace_with(*ns) + if ns: candidate.replace_with(*ns) return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z'))) From b1bc8e729553d9dedf36252350a0dfcd50f90519 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 18 Jul 2023 07:16:33 -0400 Subject: [PATCH 16/28] Improve URL pattern. Make find_elements more robust. Move process_text_links tests to the client app. --- federation/tests/utils/test_text.py | 31 ----------------------------- federation/utils/text.py | 12 +++++++---- 2 files changed, 8 insertions(+), 35 deletions(-) diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py index a442e93..71daba3 100644 --- a/federation/tests/utils/test_text.py +++ b/federation/tests/utils/test_text.py @@ -65,37 +65,6 @@ class TestFindTags: assert tags == {"foobar", "barfoo"} -# TODO: move these tests to the client app -@pytest.mark.skip -class TestProcessTextLinks: - def test_link_at_start_or_end(self): - assert process_text_links('https://example.org example.org\nhttp://example.org') == \ - 'https://example.org ' \ - 'example.org\n' \ - 'http://example.org' - - def test_existing_links_get_attrs_added(self): - assert process_text_links('https://example.org') == \ - 'https://example.org' - - def test_code_sections_are_skipped(self): - assert process_text_links('https://example.org\nhttps://example.org\n') == \ - 'https://example.org\nhttps://example.org\n' - - def test_emails_are_skipped(self): - assert 
process_text_links('foo@example.org') == 'foo@example.org' - - def test_does_not_add_target_blank_if_link_is_internal(self): - assert process_text_links('#foobar') == \ - '#foobar' - - def test_does_not_remove_mention_classes(self): - assert process_text_links('

' - '@jaywink boom

') == \ - '

@jaywink boom

' - def test_validate_handle(): assert validate_handle("foo@bar.com") assert validate_handle("Foo@baR.com") diff --git a/federation/utils/text.py b/federation/utils/text.py index 9d62c04..f66f437 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -10,7 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) # This will match non matching braces. I don't think it's an issue. MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) -URL_PATTERN = re.compile(r'((?:https?://)?[\w_\-.#?&/~@!$()*,;%=+]+)', re.UNICODE) +URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE) def decode_if_bytes(text): try: @@ -49,11 +49,15 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr :param pattern: Compiled regular expression defined using a single group :return: A NavigableString list attached to the original soup """ + found = [] for candidate in soup.find_all(string=True): - if candidate.parent.name == 'code': continue + parent = candidate.find_parent() + if parent.name == 'code': continue ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] - if ns: candidate.replace_with(*ns) - return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z'))) + if ns: + candidate.replace_with(*ns) + found.extend([child for child in parent.find_all(string=pattern) if child in ns]) + return found def get_path_from_url(url: str) -> str: From c87e1c3dd7ee44fae1fbc40ec20e605027868f3d Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 18 Jul 2023 09:19:53 -0400 Subject: [PATCH 17/28] Unquote and normalize tag links and add to the set being intersected with the hrefs pulled from Hashtag objects. 
--- federation/entities/activitypub/models.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 8989440..8a6e31f 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -6,7 +6,8 @@ import traceback import uuid from operator import attrgetter from typing import List, Dict, Union -from urllib.parse import urlparse +from unicodedata import normalize +from urllib.parse import unquote, urlparse import bleach from bs4 import BeautifulSoup @@ -848,11 +849,14 @@ class Note(Object, RawContentMixin): hrefs.add(tag.id.lower()) for link in self._soup.find_all('a', href=True): - parsed = urlparse(link['href'].lower()) + parsed = urlparse(unquote(link['href']).lower()) # remove the query part and trailing garbage, if any path = re.match(r'(/[\w/]+)', parsed.path).group() url = f'{parsed.scheme}://{parsed.netloc}{path}' - links = {link['href'].lower(), url} + # convert accented characters to their ascii equivalent + normalized_path = normalize('NFD', path).encode('ascii', 'ignore') + normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}' + links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url} if links.intersection(hrefs): tag = re.match(r'#?([\w]+)', link.text).group(1).lower() link['data-hashtag'] = tag From d53db6299f7676ca694de33595691f6ffacda3a8 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Tue, 18 Jul 2023 12:42:36 -0400 Subject: [PATCH 18/28] Make tag link processing more robust. 
--- federation/entities/activitypub/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 8a6e31f..1cda53e 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -851,7 +851,10 @@ class Note(Object, RawContentMixin): for link in self._soup.find_all('a', href=True): parsed = urlparse(unquote(link['href']).lower()) # remove the query part and trailing garbage, if any - path = re.match(r'(/[\w/]+)', parsed.path).group() + path = parsed.path + trunc = re.match(r'(/[\w/]+)', parsed.path) + if trunc: + path = trunc.group() url = f'{parsed.scheme}://{parsed.netloc}{path}' # convert accented characters to their ascii equivalent normalized_path = normalize('NFD', path).encode('ascii', 'ignore') From d7e6a56eb69db8175c77ab7b143f5e448f67f1dd Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 23 Jul 2023 08:50:40 -0400 Subject: [PATCH 19/28] AP mention finding and marking now also relying on the url property which is expected to be set as the remote_url property of the client app profiles. Add the url property to some tests. The get_profile function now expected to OR the query fields. 
--- federation/entities/activitypub/models.py | 37 +++++++++++-------- .../entities/activitypub/test_mappers.py | 12 ++++-- federation/tests/fixtures/entities.py | 3 +- federation/tests/utils/test_activitypub.py | 4 +- 4 files changed, 33 insertions(+), 23 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 1cda53e..711b979 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -35,10 +35,10 @@ from federation.utils.text import with_slash, validate_handle logger = logging.getLogger("federation") -def get_profile_or_entity(fid): - obj = get_profile(fid=fid) - if not obj: - obj = retrieve_and_parse_document(fid) +def get_profile_or_entity(**kwargs): + obj = get_profile(**kwargs) + if not obj and kwargs.get('fid'): + obj = retrieve_and_parse_document(kwargs['fid']) return obj @@ -586,6 +586,7 @@ class Person(Object, base.Profile): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self._required += ['url'] self._allowed_children += (Note, PropertyValue, IdentityProof) # Set finger to username@host if not provided by the platform @@ -866,18 +867,22 @@ class Note(Object, RawContentMixin): def _find_and_mark_mentions(self): mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)] - hrefs = [] + # There seems to be consensus on using the profile url for + # the link and the profile id for the Mention object href property, + # but some platforms will set mention.href to the profile url, so + # we check both. 
for mention in mentions: - hrefs.append(mention.href) - # add Mastodon's form - parsed = urlparse(mention.href) - username = mention.name.lstrip('@').split('@')[0] - hrefs.append(f'{parsed.scheme}://{parsed.netloc}/@{username}') - for href in hrefs: - links = self._soup.find_all(href=href) - for link in links: - profile = get_profile_or_entity(fid=link['href']) - if profile: + hrefs = [] + profile = get_profile_or_entity(fid=mention.href, remote_url=mention.href) + if profile and not profile.url: + # This should be removed when we are confident that the remote_url property + # has been populated for most profiles on the client app side. + profile = retrieve_and_parse_profile(profile.id) + if profile: + hrefs.extend([profile.id, profile.url]) + for href in hrefs: + links = self._soup.find_all(href=href) + for link in links: link['data-mention'] = profile.finger self._mentions.add(profile.finger) @@ -1317,7 +1322,7 @@ def extract_receivers(entity): profile = None # don't care about receivers for payloads without an actor_id if getattr(entity, 'actor_id'): - profile = get_profile_or_entity(entity.actor_id) + profile = get_profile_or_entity(fid=entity.actor_id) if not isinstance(profile, base.Profile): return receivers diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py index 9a2c042..ba6bbbb 100644 --- a/federation/tests/entities/activitypub/test_mappers.py +++ b/federation/tests/entities/activitypub/test_mappers.py @@ -91,7 +91,8 @@ class TestActivitypubEntityMappersReceive: assert post.raw_content == '' assert post.rendered_content == '

boom

' - @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev3.jasonrobinson.me")) + @patch("federation.entities.activitypub.models.get_profile_or_entity", + return_value=Person(finger="jaywink@dev3.jasonrobinson.me",url="https://dev3.jasonrobinson.me/u/jaywink/")) def test_message_to_objects_simple_post__with_mentions(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink") assert len(entities) == 1 @@ -102,7 +103,8 @@ class TestActivitypubEntityMappersReceive: assert list(post._mentions)[0] == "jaywink@dev3.jasonrobinson.me" - @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me")) + @patch("federation.entities.activitypub.models.get_profile_or_entity", + return_value=Person(finger="jaywink@dev.jasonrobinson.me",url="https://dev.jasonrobinson.me/u/jaywink/")) def test_message_to_objects_simple_post__with_source__bbcode(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_BBCODE, "https://diaspodon.fr/users/jaywink") assert len(entities) == 1 @@ -113,7 +115,8 @@ class TestActivitypubEntityMappersReceive: '@jaywink boom

' assert post.raw_content == '' - @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me")) + @patch("federation.entities.activitypub.models.get_profile_or_entity", + return_value=Person(finger="jaywink@dev.jasonrobinson.me",url="https://dev.robinson.me/u/jaywink/")) def test_message_to_objects_simple_post__with_source__markdown(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink") assert len(entities) == 1 @@ -147,7 +150,8 @@ class TestActivitypubEntityMappersReceive: assert photo.guid == "" assert photo.handle == "" - @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me")) + @patch("federation.entities.activitypub.models.get_profile_or_entity", + return_value=Person(finger="jaywink@dev.jasonrobinson.me", url="https://dev.jasonrobinson.me/u/jaywink/")) def test_message_to_objects_comment(self, mock_get): entities = message_to_objects(ACTIVITYPUB_COMMENT, "https://diaspodon.fr/users/jaywink") assert len(entities) == 1 diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py index c0d1a07..e555a97 100644 --- a/federation/tests/fixtures/entities.py +++ b/federation/tests/fixtures/entities.py @@ -256,7 +256,8 @@ def profile(): inboxes={ "private": "https://example.com/bob/private", "public": "https://example.com/public", - }, public_key=PUBKEY, to=["https://www.w3.org/ns/activitystreams#Public"] + }, public_key=PUBKEY, to=["https://www.w3.org/ns/activitystreams#Public"], + url="https://example.com/alice" ) diff --git a/federation/tests/utils/test_activitypub.py b/federation/tests/utils/test_activitypub.py index 2572b42..46e7d46 100644 --- a/federation/tests/utils/test_activitypub.py +++ b/federation/tests/utils/test_activitypub.py @@ -60,7 +60,7 @@ class TestRetrieveAndParseDocument: entity = 
retrieve_and_parse_document("https://example.com/foobar") assert isinstance(entity, Follow) - @patch("federation.entities.activitypub.models.extract_receivers", return_value=[]) + @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=None) @patch("federation.utils.activitypub.fetch_document", autospec=True, return_value=( json.dumps(ACTIVITYPUB_POST_OBJECT), None, None), ) @@ -80,7 +80,7 @@ class TestRetrieveAndParseDocument: "/foobar.jpg" @patch("federation.entities.activitypub.models.verify_ld_signature", return_value=None) - @patch("federation.entities.activitypub.models.extract_receivers", return_value=[]) + @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=None) @patch("federation.utils.activitypub.fetch_document", autospec=True, return_value=( json.dumps(ACTIVITYPUB_POST), None, None), ) From cb96d83793e7906ddc3e60e5c7a30bdda7948e1f Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 23 Jul 2023 10:05:25 -0400 Subject: [PATCH 20/28] Case insensitive lookup with finger. --- federation/entities/activitypub/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 711b979..cd32fcd 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -816,7 +816,7 @@ class Note(Object, RawContentMixin): mentions.sort() for mention in mentions: if validate_handle(mention): - profile = get_profile(finger=mention) + profile = get_profile(finger__iexact=mention) # only add AP profiles mentions if getattr(profile, 'id', None): self.tag_objects.append(Mention(href=profile.id, name='@'+mention)) From 091b156703622c0e1a29c805aff0b93127b9621a Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sun, 23 Jul 2023 13:03:20 -0400 Subject: [PATCH 21/28] For Person, if the url property is missing, set it to id. 
--- federation/entities/activitypub/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index cd32fcd..9989177 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -602,6 +602,9 @@ class Person(Object, base.Profile): # multi-protocol platform if self.finger and self.guid is not missing and self.handle is missing: self.handle = self.finger + # Some platforms don't set this property. + if self.url is missing: + self.url = self.id def to_as2(self): self.followers = f'{with_slash(self.id)}followers/' From 54a8404c3d1616f23083ac68fd961b5b8be3cdfd Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Mon, 24 Jul 2023 08:32:32 -0400 Subject: [PATCH 22/28] Make verify_ld_signature more robust. Make Tombstone objects signable. --- federation/entities/activitypub/ldsigning.py | 2 +- federation/entities/activitypub/models.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/federation/entities/activitypub/ldsigning.py b/federation/entities/activitypub/ldsigning.py index f77b1fd..c118eaa 100644 --- a/federation/entities/activitypub/ldsigning.py +++ b/federation/entities/activitypub/ldsigning.py @@ -99,6 +99,6 @@ class NormalizedDoubles(jsonld.JsonLdProcessor): item['@value'] = math.floor(value) obj = super()._object_to_rdf(item, issuer, triples, rdfDirection) # This is to address https://github.com/digitalbazaar/pyld/issues/175 - if obj.get('datatype') == jsonld.XSD_DOUBLE: + if obj and obj.get('datatype') == jsonld.XSD_DOUBLE: obj['value'] = re.sub(r'(\d)0*E\+?(-)?0*(\d)', r'\1E\2\3', obj['value']) return obj diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 9989177..269c734 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -1205,6 +1205,7 @@ class Retraction(Announce, base.Retraction): class 
Tombstone(Object, base.Retraction): target_id = fields.Id() + signable = True def to_as2(self): if not isinstance(self.activity, type): return None From 6fd445382dc9a8fceb044323769ca8785b875a0a Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Wed, 26 Jul 2023 12:40:46 -0400 Subject: [PATCH 23/28] Allow '-' in tags. Make AP tag discovery more robust. --- federation/entities/activitypub/models.py | 7 ++++--- federation/utils/text.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 269c734..862b6cd 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -856,7 +856,7 @@ class Note(Object, RawContentMixin): parsed = urlparse(unquote(link['href']).lower()) # remove the query part and trailing garbage, if any path = parsed.path - trunc = re.match(r'(/[\w/]+)', parsed.path) + trunc = re.match(r'(/[\w/\-]+)', parsed.path) if trunc: path = trunc.group() url = f'{parsed.scheme}://{parsed.netloc}{path}' @@ -865,8 +865,9 @@ class Note(Object, RawContentMixin): normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}' links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url} if links.intersection(hrefs): - tag = re.match(r'#?([\w]+)', link.text).group(1).lower() - link['data-hashtag'] = tag + tag = re.match(r'^#?([\w\-]+$)', link.text) + if tag: + link['data-hashtag'] = tag.group(1).lower() def _find_and_mark_mentions(self): mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)] diff --git a/federation/utils/text.py b/federation/utils/text.py index f66f437..7d728dd 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -7,7 +7,7 @@ from bs4.element import NavigableString from commonmark import commonmark ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" -TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE) 
+TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE) # This will match non matching braces. I don't think it's an issue. MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE) @@ -56,7 +56,8 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] if ns: candidate.replace_with(*ns) - found.extend([child for child in parent.find_all(string=pattern) if child in ns]) + found.extend([child for child in parent.find_all( + string=re.compile(r'\A'+pattern.pattern+r'\Z')) if child in ns]) return found From 7559f16f4f67cdf24159636c323fe12f53f40601 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Wed, 26 Jul 2023 12:57:47 -0400 Subject: [PATCH 24/28] Remove references to http://schema.org from inbound AP contexts. --- federation/entities/activitypub/ldcontext.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/federation/entities/activitypub/ldcontext.py b/federation/entities/activitypub/ldcontext.py index 8fdafc2..414d60a 100644 --- a/federation/entities/activitypub/ldcontext.py +++ b/federation/entities/activitypub/ldcontext.py @@ -113,12 +113,13 @@ class LdContextManager: if 'python-federation"' in s: ctx = json.loads(s.replace('python-federation', 'python-federation#', 1)) - # some platforms have http://joinmastodon.com/ns in @context. This - # is not a json-ld document. - try: - ctx.pop(ctx.index('http://joinmastodon.org/ns')) - except ValueError: - pass + # Some platforms have reference invalid json-ld document in @context. + # Remove those. + for url in ['http://joinmastodon.org/ns', 'http://schema.org']: + try: + ctx.pop(ctx.index(url)) + except ValueError: + pass # remove @language in context since this directive is not # processed by calamus. 
Pleroma adds a useless @language: 'und' From db87313535417ef5aa131f8c703a4b83c740aee6 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Thu, 27 Jul 2023 08:00:41 -0400 Subject: [PATCH 25/28] Ignore relayed retractions. --- CHANGELOG.md | 2 ++ federation/entities/activitypub/models.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19d2bca..ce9212a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,8 @@ * Fix process_text_links that would crash on `a` tags with no `href` attribute. +* Ignore relayed AP retractions. + ## [0.24.1] - 2023-03-18 ### Fixed diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 862b6cd..13e9bf6 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -1408,6 +1408,9 @@ def element_to_objects(element: Union[Dict, Object], sender: str = "") -> List: logger.error("Failed to validate entity %s: %s", entity, ex) return [] except InvalidSignature as exc: + if isinstance(entity, base.Retraction): + logger.warning('Relayed retraction on %s, ignoring', entity.target_id) + return [] logger.info('%s, fetching from remote', exc) entity = retrieve_and_parse_document(entity.id) if not entity: From 5c168d6630772f36e5b18e0bb3146c06c3ea07dd Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Thu, 27 Jul 2023 22:26:45 -0400 Subject: [PATCH 26/28] Rework find_elements to make it more efficient and resilient. 
--- federation/utils/text.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/federation/utils/text.py b/federation/utils/text.py index 7d728dd..3bf5497 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -49,16 +49,15 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr :param pattern: Compiled regular expression defined using a single group :return: A NavigableString list attached to the original soup """ - found = [] + final = [] for candidate in soup.find_all(string=True): - parent = candidate.find_parent() - if parent.name == 'code': continue + if candidate.parent.name == 'code': continue ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] - if ns: + found = [s for s in ns if pattern.match(s.text)] + if found: candidate.replace_with(*ns) - found.extend([child for child in parent.find_all( - string=re.compile(r'\A'+pattern.pattern+r'\Z')) if child in ns]) - return found + final.extend(found) + return final def get_path_from_url(url: str) -> str: From 5dac605c4b1c311edadd0e6dd6beb06228191234 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Wed, 2 Aug 2023 07:45:57 -0400 Subject: [PATCH 27/28] Improve URL_PATTERN. --- federation/utils/text.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/federation/utils/text.py b/federation/utils/text.py index 3bf5497..7e6058b 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -10,7 +10,10 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0" TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE) # This will match non matching braces. I don't think it's an issue. MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. 
\u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) -URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE) +# based on https://stackoverflow.com/a/6041965 +URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|\b(?:\w+\.)+\w+)(?:(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\))+(?:\((?:[^\s()<>]+|(?:\(?:[^\s()<>]+\)))?\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))?)', + re.UNICODE) + def decode_if_bytes(text): try: @@ -52,7 +55,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr final = [] for candidate in soup.find_all(string=True): if candidate.parent.name == 'code': continue - ns = [NavigableString(r) for r in re.split(pattern, candidate.text)] + ns = [NavigableString(r) for r in pattern.split(candidate.text) if r] found = [s for s in ns if pattern.match(s.text)] if found: candidate.replace_with(*ns) From ada8c20d398df45e40f7dac0764e5465aff51366 Mon Sep 17 00:00:00 2001 From: Alain St-Denis Date: Sat, 5 Aug 2023 11:41:30 -0400 Subject: [PATCH 28/28] Improve the url matching regex. --- federation/utils/text.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/federation/utils/text.py b/federation/utils/text.py index 7e6058b..ab4b8ec 100644 --- a/federation/utils/text.py +++ b/federation/utils/text.py @@ -11,9 +11,8 @@ TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE) # This will match non matching braces. I don't think it's an issue. MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. 
\u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE) # based on https://stackoverflow.com/a/6041965 -URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|\b(?:\w+\.)+\w+)(?:(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\))+(?:\((?:[^\s()<>]+|(?:\(?:[^\s()<>]+\)))?\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))?)', - re.UNICODE) - +URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|^|(?<=[("<\s]))+(?:[\w\-]+(?:(?:\.[\w\-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))', + re.UNICODE) def decode_if_bytes(text): try: